New traits and operation functionality for bedtools map-like functionality.

 - Simplified `granges map` to handle only a BED3 left file and a BED5 right file.
 - New `Operation`, a new operations command-line interface, and a new `GRangesError` variant (see the sketch after this list).
 - New `map_data()` method.
 - Fixed BED5 having the wrong field.
 - New partial code for BED6.
 - New `JoinDataOperations` trait, with implementations for combined
   join data types.
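
As a usage sketch of the new `Operation` API (not code from this commit): the `run` bounds below require `Ord`, which plain `f64` does not implement, so this example uses `ordered_float::OrderedFloat` purely for illustration; the `granges::data::operations` import path is likewise assumed from the file layout.

```rust
use ordered_float::OrderedFloat;
// Import path assumed from src/data/operations.rs; adjust to the crate's actual layout.
use granges::data::operations::{Operation, OperationResult};

fn main() {
    // Stand-in for a BED5 score column, e.g. as extracted with `map_data()`.
    let scores: Vec<OrderedFloat<f64>> =
        vec![1.0, 2.0, 4.0].into_iter().map(OrderedFloat).collect();

    match Operation::Mean.run(&scores) {
        Some(OperationResult::Float(mean)) => println!("mean = {}", mean),
        Some(OperationResult::String(collapsed)) => println!("{}", collapsed),
        None => eprintln!("no data to summarize"),
    }
}
```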
vsbuffalo committed Feb 25, 2024
1 parent 1b928d5 commit ba6a03c
Showing 8 changed files with 318 additions and 190 deletions.
117 changes: 29 additions & 88 deletions src/commands.rs
@@ -4,7 +4,7 @@ use std::path::PathBuf;

use crate::{
data::operations::Operation,
io::{parsers::GenomicRangesParser, OutputStream},
io::{parsers::{GenomicRangesParser, Bed5Iterator}, OutputStream},
prelude::*,
ranges::{operations::adjust_range, GenomicRangeEmptyRecord, GenomicRangeRecord},
reporting::{CommandOutput, Report},
@@ -398,6 +398,9 @@ pub fn granges_flank(
Ok(CommandOutput::new((), report))
}


/// # Developer Notes
/// This function is a great way to see GRanges' methods in action.
pub fn granges_map(
seqlens: impl Into<PathBuf>,
left_path: &PathBuf,
@@ -410,100 +413,38 @@ pub fn granges_map(
let seqnames: Vec<String> = genome.keys().cloned().collect();
let mut report = Report::new();

let left_iter = GenomicRangesFile::parsing_iterator(left_path)?;
let right_iter = GenomicRangesFile::parsing_iterator(right_path)?;
let left_iter = Bed3Iterator::new(left_path)?;
let right_iter = Bed5Iterator::new(right_path)?;

match (left_iter, right_iter) {
(GenomicRangesParser::Bed5(left), GenomicRangesParser::Bed5(right)) => {
let left_gr;
let right_gr;
let left_gr;
let right_gr;

if skip_missing {
left_gr = GRanges::from_iter(left.retain_seqnames(&seqnames), &genome)?;
right_gr = GRanges::from_iter(right.retain_seqnames(&seqnames), &genome)?;
} else {
left_gr = GRanges::from_iter(left, &genome)?;
right_gr = GRanges::from_iter(right, &genome)?;
}

let right_gr = right_gr.into_coitrees()?;

let left_join = left_gr.left_overlaps(&right_gr)?;

// TODO -- map function
// left_join.to_tsv(output)?;

Ok(CommandOutput::new((), report))
}
(GenomicRangesParser::Bed3(left), GenomicRangesParser::Bedlike(right)) => {
todo!();
let left_gr;
let right_gr;

if skip_missing {
left_gr = GRangesEmpty::from_iter(left.retain_seqnames(&seqnames), &genome)?;
right_gr = GRanges::from_iter(
right.try_unwrap_data().retain_seqnames(&seqnames),
&genome,
)?;
} else {
left_gr = GRangesEmpty::from_iter(left, &genome)?;
right_gr = GRanges::from_iter(right.try_unwrap_data(), &genome)?;
}

let right_gr = right_gr.into_coitrees()?;

let intersection = left_gr.filter_overlaps(&right_gr)?;
intersection.to_tsv(output)?;

Ok(CommandOutput::new((), report))
}
(GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bed3(right)) => {
let left_gr;
let right_gr;

if skip_missing {
left_gr =
GRanges::from_iter(left.try_unwrap_data().retain_seqnames(&seqnames), &genome)?;
right_gr = GRangesEmpty::from_iter(right.retain_seqnames(&seqnames), &genome)?;
} else {
left_gr = GRanges::from_iter(left.try_unwrap_data(), &genome)?;
right_gr = GRangesEmpty::from_iter(right, &genome)?;
}

let right_gr = right_gr.into_coitrees()?;
if skip_missing {
left_gr = GRangesEmpty::from_iter(left_iter.retain_seqnames(&seqnames), &genome)?;
right_gr = GRanges::from_iter(right_iter.retain_seqnames(&seqnames), &genome)?;
} else {
left_gr = GRangesEmpty::from_iter(left_iter, &genome)?;
right_gr = GRanges::from_iter(right_iter, &genome)?;
}

let intersection = left_gr.filter_overlaps(&right_gr)?;
intersection.to_tsv(output)?;
let right_gr = right_gr.into_coitrees()?
.map_data(|bed5_cols| {
// extract out just the score
bed5_cols.score
})?;

Ok(CommandOutput::new((), report))
}
(GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bedlike(right)) => {
todo!();
let left_gr;
let right_gr;
let left_join = left_gr.left_overlaps(&right_gr)?;

if skip_missing {
left_gr =
GRanges::from_iter(left.try_unwrap_data().retain_seqnames(&seqnames), &genome)?;
right_gr = GRanges::from_iter(
right.try_unwrap_data().retain_seqnames(&seqnames),
&genome,
)?;
} else {
left_gr = GRanges::from_iter(left.try_unwrap_data(), &genome)?;
right_gr = GRanges::from_iter(right.try_unwrap_data(), &genome)?;
}
let new_column = left_join.map_over_joins(|join_data| {
    // summarize the right-hand (BED5 score) data with each requested operation;
    // `right_data` is an assumed accessor, not shown in this hunk
    let data = join_data.right_data;
    operations.iter().map(|operation| operation.run(&data)).collect::<Vec<_>>()
})?;

let right_gr = right_gr.into_coitrees()?;

let intersection = left_gr.filter_overlaps(&right_gr)?;
intersection.to_tsv(output)?;
// TODO -- map function
// left_join.to_tsv(output)?;

Ok(CommandOutput::new((), report))
}
_ => Err(GRangesError::UnsupportedGenomicRangesFileFormat),
}
Ok(CommandOutput::new((), report))
}

/// Generate a random BED-like file with genomic ranges.
Expand All @@ -512,7 +453,7 @@ pub fn granges_random_bed(
num: usize,
output: Option<impl Into<PathBuf>>,
sort: bool,
) -> Result<CommandOutput<()>, GRangesError> {
) -> Result<CommandOutput<()>, GRangesError> {
// get the genome info
let genome = read_seqlens(seqlens)?;

78 changes: 42 additions & 36 deletions src/data/operations.rs
@@ -4,6 +4,8 @@
use num_traits::{Float, ToPrimitive};
use std::iter::Sum;

use crate::error::GRangesError;

/// Calculate the median.
///
/// This will clone and turn `numbers` into a `Vec`.
@@ -18,6 +20,15 @@ pub fn median<F: Float + Ord + Sum>(numbers: &[F]) -> F {
}
}

/// A subset of types that represent operation results types.
pub enum OperationResult<T>
where
T: Float,
{
Float(T),
String(String),
}

/// The (subset of) standard `bedtools map` operations.
pub enum Operation {
Sum,
@@ -28,42 +39,37 @@ pub enum Operation {
Collapse,
}

pub enum OperationResult<T>
where
T: Float,
{
Float(T),
String(String),
}

pub fn float_compute<T>(operation: Operation, data: &[T]) -> Option<OperationResult<T>>
where
T: Float + Sum<T> + ToPrimitive + Ord + Clone + ToString,
{
match operation {
Operation::Sum => {
let sum: T = data.iter().cloned().sum();
Some(OperationResult::Float(sum))
}
Operation::Min => data.iter().cloned().min().map(OperationResult::Float),
Operation::Max => data.iter().cloned().max().map(OperationResult::Float),
Operation::Mean => {
if data.is_empty() {
None
} else {
let sum: T = data.iter().cloned().sum();
let mean = sum / T::from(data.len()).unwrap();
Some(OperationResult::Float(mean))
impl Operation {
/// Do a particular (summarizing) operation on some generic data.
pub fn run<T>(&self, data: &[T]) -> Option<OperationResult<T>>
where
T: Float + Sum<T> + ToPrimitive + Ord + Clone + ToString,
{
match self {
Operation::Sum => {
let sum: T = data.iter().cloned().sum();
Some(OperationResult::Float(sum))
}
Operation::Min => data.iter().cloned().min().map(OperationResult::Float),
Operation::Max => data.iter().cloned().max().map(OperationResult::Float),
Operation::Mean => {
if data.is_empty() {
None
} else {
let sum: T = data.iter().cloned().sum();
let mean = sum / T::from(data.len()).unwrap();
Some(OperationResult::Float(mean))
}
}
Operation::Median => Some(OperationResult::Float(median(data))),
Operation::Collapse => {
let collapsed = data
.iter()
.map(|num| num.to_string())
.collect::<Vec<_>>()
.join(", ");
Some(OperationResult::String(collapsed))
}
}
}
Operation::Median => Some(OperationResult::Float(median(data))),
Operation::Collapse => {
let collapsed = data
.iter()
.map(|num| num.to_string())
.collect::<Vec<_>>()
.join(", ");
Some(OperationResult::String(collapsed))
}
}
}
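
The commit message also mentions a new operations command-line interface. A minimal sketch of how an operation name might be parsed into `Operation` — this `FromStr` impl and the accepted strings are assumptions, not part of this diff, though the `use crate::error::GRangesError;` added above and the new error variant below point in this direction:

```rust
use std::str::FromStr;

impl FromStr for Operation {
    type Err = GRangesError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "sum" => Ok(Operation::Sum),
            "min" => Ok(Operation::Min),
            "max" => Ok(Operation::Max),
            "mean" => Ok(Operation::Mean),
            "median" => Ok(Operation::Median),
            "collapse" => Ok(Operation::Collapse),
            other => Err(GRangesError::NoSuchOperation(other.to_string())),
        }
    }
}
```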
2 changes: 2 additions & 0 deletions src/error.rs
@@ -64,4 +64,6 @@ pub enum GRangesError {
UnsupportedGenomicRangesFileFormat,
#[error("Command line argument error: {0}")]
ArgumentError(#[from] clap::error::Error),
#[error("No such operation: {0}")]
NoSuchOperation(String),
}
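
Assuming the variant carries the offending name as a `String` (as written above), the thiserror attribute formats it into the message; a small check of the rendered error (the import path is assumed):

```rust
// Import path assumed from src/error.rs.
use granges::error::GRangesError;

fn main() {
    // Hypothetical unrecognized operation name typed on the command line.
    let err = GRangesError::NoSuchOperation("mode".to_string());
    assert_eq!(err.to_string(), "No such operation: mode");
}
```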