diff --git a/src/commands.rs b/src/commands.rs index 337919e..899435b 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -1,6 +1,6 @@ //! Command functions that implement each of the `granges` subcommands. -use std::{path::PathBuf, io, fs::File}; +use std::{fs::File, io, path::PathBuf}; use csv::WriterBuilder; @@ -11,7 +11,7 @@ use crate::{ tsv::BED_TSV, }, prelude::*, - ranges::{operations::adjust_range, GenomicRangeRecordEmpty, GenomicRangeRecord}, + ranges::{operations::adjust_range, GenomicRangeRecord, GenomicRangeRecordEmpty}, reporting::{CommandOutput, Report}, test_utilities::{random_granges, random_granges_mock_bed5}, Position, PositionOffset, @@ -94,9 +94,9 @@ pub fn granges_adjust( if skipped_ranges > 0 { report.add_issue(format!( - "{} ranges were removed because their widths after adjustment were ≤ 0", - skipped_ranges - )) + "{} ranges were removed because their widths after adjustment were ≤ 0", + skipped_ranges + )) } } } else { @@ -161,7 +161,7 @@ pub fn granges_filter( right_path: &PathBuf, output: Option<&PathBuf>, skip_missing: bool, - ) -> Result, GRangesError> { +) -> Result, GRangesError> { let genome = read_seqlens(seqlens)?; let seqnames: Vec = genome.keys().cloned().collect(); @@ -200,7 +200,7 @@ pub fn granges_filter( right_gr = GRanges::from_iter( right.try_unwrap_data().retain_seqnames(&seqnames), &genome, - )?; + )?; } else { left_gr = GRangesEmpty::from_iter(left, &genome)?; right_gr = GRanges::from_iter(right.try_unwrap_data(), &genome)?; @@ -243,7 +243,7 @@ pub fn granges_filter( right_gr = GRanges::from_iter( right.try_unwrap_data().retain_seqnames(&seqnames), &genome, - )?; + )?; } else { left_gr = GRanges::from_iter(left.try_unwrap_data(), &genome)?; right_gr = GRanges::from_iter(right.try_unwrap_data(), &genome)?; @@ -292,7 +292,7 @@ pub fn granges_flank( output: Option<&PathBuf>, skip_missing: bool, mode: ProcessingMode, - ) -> Result, GRangesError> { +) -> Result, GRangesError> { let genome = read_seqlens(seqlens)?; let seqnames: Vec = genome.keys().cloned().collect(); let ranges_iter = GenomicRangesFile::parsing_iterator(bedfile)?; @@ -424,7 +424,7 @@ pub fn granges_map( operations: Vec, output: Option<&PathBuf>, skip_missing: bool, - ) -> Result, GRangesError> { +) -> Result, GRangesError> { let genome = read_seqlens(seqlens)?; let seqnames: Vec = genome.keys().cloned().collect(); let report = Report::new(); @@ -478,7 +478,9 @@ pub fn granges_map( operations .iter() .map(|operation| { - operation.run(&mut overlap_scores).into_serializable(&BED_TSV) + operation + .run(&mut overlap_scores) + .into_serializable(&BED_TSV) }) .collect::>() })?; @@ -495,7 +497,7 @@ pub fn granges_windows( step: Option, chop: bool, output: Option>, - ) -> Result, GRangesError> { +) -> Result, GRangesError> { let genome = read_seqlens(seqlens)?; GRangesEmpty::from_windows(&genome, width, step, chop)?.write_to_tsv(output, &BED_TSV)?; let report = Report::new(); @@ -509,7 +511,7 @@ pub fn granges_random_bed( output: Option>, sort: bool, bed5: bool, - ) -> Result, GRangesError> { +) -> Result, GRangesError> { // get the genome info let genome = read_seqlens(seqlens)?; diff --git a/src/granges.rs b/src/granges.rs index 3c74a36..62d4048 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -1280,15 +1280,50 @@ impl GRangesEmpty where CL: IterableRangeContainer, { - /// Filter out ranges that do *not* have at least overlap with the `right` ranges. + /// Exclude genomic ranges in this object that have any overlaps + /// with the `right` set of genomic ranges. /// - /// In database lingo, this is a type of *filtering join*, in particular a *semi join*. - /// See Hadley Wickham's excellent [R for Data - /// Science](https://r4ds.hadley.nz/joins.html#filtering-joins) for more information. + /// This will exclude the *entire* left range, if it had *any* + /// overlaps with *any* right genomic ranges. /// - /// Note that this consumes the `self` [`GRanges`] object, turning it into a new - /// [`GRanges`]. + /// This is a type of *filtering join*, in particular a *anti-join*. + /// See Hadley Wickham's [R for Data Science](https://r4ds.hadley.nz/joins.html#filtering-joins) for more information. + pub fn antifilter_overlaps<'a, M: Clone + 'a, DR: 'a>( + self, + // right: &GRanges, DR>, + right: &'a impl AsGRangesRef<'a, COITrees, DR>, + ) -> Result, GRangesError> { + let mut gr = GRangesEmpty::new_vec(&self.seqlens()); + + let right_ref = right.as_granges_ref(); + + for (seqname, left_ranges) in self.0.ranges.iter() { + for left_range in left_ranges.iter_ranges() { + if let Some(right_ranges) = right_ref.ranges.get(seqname) { + let has_overlaps = + right_ranges.count_overlaps(left_range.start(), left_range.end()) > 0; + if !has_overlaps { + gr.push_range(seqname, left_range.start(), left_range.end())?; + } + } else { + // if this left range's chrom doesn't exist in right, it doesn't have + // overlaps, so we push + gr.push_range(seqname, left_range.start(), left_range.end())?; + } + } + } + Ok(gr) + } + + /// Retain only genomic ranges that have at least one overlap with the `right` + /// set of genomic ranges. The whole range will be retained. /// + /// This will retain the *entire* left range only if it had at least one + /// basepair overlap with *any* right genomic range. + /// + /// This is a type of *filtering join*, in particular a *semi-join*. + /// See Hadley Wickham's [R for Data + /// Science](https://r4ds.hadley.nz/joins.html#filtering-joins) for more information. pub fn filter_overlaps<'a, M: Clone + 'a, DR: 'a>( self, // right: &GRanges, DR>, @@ -1319,19 +1354,42 @@ impl GRanges> where CL: IterableRangeContainer, { - /// Filter out ranges that do *not* have at least overlap with the `right` ranges. + /// Retain only genomic ranges that have at least one overlap with the `right` + /// set of genomic ranges. The whole range will be retained. /// - /// In database lingo, this is a type of *filtering join*, in particular a *semi join*. - /// See Hadley Wickham's excellent [R for Data - /// Science](https://r4ds.hadley.nz/joins.html#filtering-joins) for more information. + /// This will retain the *entire* left range only if it had at least one + /// basepair overlap with *any* right genomic range. /// - /// Note that this consumes the `self` [`GRanges`] object, turning it into a new - /// [`GRanges>`]. The data container is rebuilt from indices - /// into a new [`Vec`] where `U` is the associated type [`IndexedDataContainer::Item`], - /// which represents the individual data element in the data container. + /// This is a type of *filtering join*, in particular a *semi-join*. + /// See Hadley Wickham's [R for Data + /// Science](https://r4ds.hadley.nz/joins.html#filtering-joins) for more information. pub fn filter_overlaps<'a, M: Clone + 'a, DR: 'a>( + self, + right: &'a impl AsGRangesRef<'a, COITrees, DR>, + ) -> Result>, GRangesError> { + self._filter_overlaps_base(right, false) + } + + /// Exclude genomic ranges in this object that have any overlaps + /// with the `right` set of genomic ranges. + /// + /// This will exclude the *entire* left range, if it had *any* + /// overlaps with *any* right genomic ranges. + /// + /// This is a type of *filtering join*, in particular a *anti-join*. + /// See Hadley Wickham's [R for Data Science](https://r4ds.hadley.nz/joins.html#filtering-joins) for more information. + pub fn antifilter_overlaps<'a, M: Clone + 'a, DR: 'a>( + self, + right: &'a impl AsGRangesRef<'a, COITrees, DR>, + ) -> Result>, GRangesError> { + self._filter_overlaps_base(right, true) + } + + // internal base function for handling the cases above + fn _filter_overlaps_base<'a, M: Clone + 'a, DR: 'a>( mut self, right: &'a impl AsGRangesRef<'a, COITrees, DR>, + anti: bool, ) -> Result>, GRangesError> { let mut gr: GRanges> = GRanges::new_vec(&self.seqlens()); @@ -1345,11 +1403,11 @@ where for (seqname, left_ranges) in self.ranges.iter() { for left_range in left_ranges.iter_ranges() { if let Some(right_ranges) = right_ref.ranges.get(seqname) { - let num_overlaps = - right_ranges.count_overlaps(left_range.start(), left_range.end()); - if num_overlaps == 0 { - // no overlaps -- skip - } else { + let has_overlaps = + right_ranges.count_overlaps(left_range.start(), left_range.end()) > 0; + // XOR with anti + let passes_filter = has_overlaps != anti; + if passes_filter { gr.push_range_with_index( seqname, left_range.start(), @@ -1361,6 +1419,19 @@ where new_indices.push(current_index); current_index += 1; } + } else if anti { + // if this left range's chrom doesn't exist in right, it doesn't have + // overlaps, so we push + gr.push_range_with_index( + seqname, + left_range.start(), + left_range.end(), + current_index, + )?; + // unwrap should be safe, since this is an indexed GRanges + old_indices.insert(left_range.index().unwrap()); + new_indices.push(current_index); + current_index += 1; } } } @@ -1511,6 +1582,52 @@ mod tests { assert_eq!(gr_filtered.len(), 3); } + #[test] + fn granges_antifilter_overlaps() { + // ANTI version of above + let seqlens = seqlens! { "chr1" => 10}; + + // test 1: one range that overlaps only the first range + let gr = granges_test_case_01(); + let mut gr_keep: GRangesEmpty = GRangesEmpty::new_vec(&seqlens); + gr_keep.push_range("chr1", 0, 1).unwrap(); + let gr_keep = gr_keep.into_coitrees().unwrap(); + + let gr_filtered = gr.antifilter_overlaps(&gr_keep).unwrap(); + assert_eq!(gr_filtered.len(), 4); + + // test 2: one range that overlaps the first two ranges + let gr = granges_test_case_01(); + let mut gr_keep: GRangesEmpty = GRangesEmpty::new_vec(&seqlens); + gr_keep.push_range("chr1", 0, 5).unwrap(); + let gr_keep = gr_keep.into_coitrees().unwrap(); + + let gr_filtered = gr.antifilter_overlaps(&gr_keep).unwrap(); + assert_eq!(gr_filtered.len(), 3); + + // test 3: one range that overlaps no ranges + let gr = granges_test_case_01(); + let mut gr_keep: GRangesEmpty = GRangesEmpty::new_vec(&seqlens); + gr_keep.push_range("chr1", 8, 9).unwrap(); + let gr_keep = gr_keep.into_coitrees().unwrap(); + + let gr_filtered = gr.clone().antifilter_overlaps(&gr_keep).unwrap(); + assert_eq!(gr_filtered.len(), 5); + assert_eq!(gr, gr_filtered); + + // test 4: ranges on two chromosomes: first overlaps two ranges, second + // overlaps one + let gr = granges_test_case_01(); + let seqlens = seqlens! { "chr1" => 10, "chr2" => 10 }; + let mut gr_keep: GRangesEmpty = GRangesEmpty::new_vec(&seqlens); + gr_keep.push_range("chr1", 4, 7).unwrap(); + gr_keep.push_range("chr2", 10, 12).unwrap(); + let gr_keep = gr_keep.into_coitrees().unwrap(); + + let gr_filtered = gr.antifilter_overlaps(&gr_keep).unwrap(); + assert_eq!(gr_filtered.len(), 2); + } + #[test] fn test_flanking_left() { let gr = granges_test_case_02(); diff --git a/tests/bedtools_validation.rs b/tests/bedtools_validation.rs index 1cf1cb4..49ebf7c 100644 --- a/tests/bedtools_validation.rs +++ b/tests/bedtools_validation.rs @@ -3,9 +3,7 @@ use granges::{ commands::granges_random_bed, io::{parsers::bed::bed_missing, InputStream}, - prelude::{ - read_seqlens, BedlikeIterator, GRanges, GenomicRangesFile, TsvRecordIterator, - }, + prelude::{read_seqlens, BedlikeIterator, GRanges, GenomicRangesFile, TsvRecordIterator}, ranges::GenomicRangeRecord, test_utilities::{granges_binary_path, random_bed3file, random_bed5file, temp_bedfile}, };