Skip to content

Commit

Permalink
Doc-writing session.
Browse files Browse the repository at this point in the history
 - Added lots of module, type, and function level documentation.
 - F'ing sick ASCII diagrams.
  • Loading branch information
vsbuffalo committed Feb 23, 2024
1 parent 6278515 commit 2b8aa5a
Show file tree
Hide file tree
Showing 20 changed files with 562 additions and 295 deletions.
6 changes: 5 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,14 @@ noodles = { version = "0.63.0", features = ["core", "bed"] }
rand = "0.8.5"
tempfile = "3.10.0"
thiserror = "1.0.57"
polars = { version = "0.37.0", optional = true }

[features]
# cli = [ "clap" ] // TODO make feature
# cli = [ "clap" ] # TODO make feature
dev-commands = [ ]
test_bigranges = [] # NOTE: not used yet, for tests on large files
polars = ["dep:polars"]


[profile.release]
opt-level = 3
Expand Down
257 changes: 167 additions & 90 deletions src/commands.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
//! Command functions that implement each of the `granges` subcommands.

use std::path::PathBuf;

use crate::{
io::{parsers::GenomicRangesParser, OutputFile},
io::{parsers::GenomicRangesParser, OutputStream},
prelude::*,
ranges::{operations::adjust_range, GenomicRangeEmptyRecord, GenomicRangeRecord},
reporting::{CommandOutput, Report},
Expand All @@ -10,13 +12,37 @@ use crate::{
Position, PositionOffset,
};

/// An `enum` to indicate whether an streaming or in-memory algorithm should be used.
#[derive(Clone)]
pub enum ProcessingMode {
Streaming,
InMemory,
}

/// Adjust the genomic ranges in a bedfile by some specified amount.
/// Adjusts genomic ranges in a BED file by a specified amount.
///
/// This function modifies the start and end positions of each range in the input BED file based on
/// the provided offsets, ensuring the adjusted ranges do not exceed the sequence lengths specified
/// in the `seqlens` file. If sorting is enabled, the output will be sorted based on the sequence names
/// and start positions.
///
/// # Arguments
///
/// * `bedfile` - A reference to a `PathBuf` for the input BED file.
/// * `seqlens` - A reference to a `PathBuf` for the file containing sequence lengths.
/// * `both` - A [`PositionOffset`] specifying how much to adjust the start and end positions.
/// * `output` - An optional reference to a `PathBuf` where the adjusted ranges will be written. Writes
/// to stdout if `None`.
/// * `sort` - A boolean indicating whether to sort the output.
///
/// # Returns
///
/// A `Result` wrapping [`CommandOutput<()>`] on success, or [`GRangesError`] on failure.
///
/// # Errors
///
/// Returns `GRangesError` if the input BED file or sequence lengths file cannot be read, or if
/// an adjusted range exceeds the sequence boundaries.
pub fn granges_adjust(
bedfile: &PathBuf,
seqlens: &PathBuf,
Expand All @@ -27,8 +53,8 @@ pub fn granges_adjust(
let genome = read_seqlens(seqlens)?;

// Setup Output stream -- header is None for now (TODO).
let output_stream = output.map_or(OutputFile::new_stdout(None), |file| {
OutputFile::new(file, None)
let output_stream = output.map_or(OutputStream::new_stdout(None), |file| {
OutputStream::new(file, None)
});
let mut writer = output_stream.writer()?;

Expand Down Expand Up @@ -76,6 +102,9 @@ pub fn granges_adjust(
let gr = GRangesEmpty::from_iter(iter, &genome)?;
gr.adjust_ranges(-both, both).to_tsv(output)?
}
GenomicRangesParser::Bed5(iter) => {
unimplemented!()
}
GenomicRangesParser::Bedlike(iter) => {
// Note the call to try_unwrap_data() here: this is because
// we know that the records *do* have data. Unwrapping the Option<String>
Expand All @@ -92,7 +121,28 @@ pub fn granges_adjust(
Ok(CommandOutput::new((), report))
}

/// Retain only the ranges that have at least one overlap with another set of ranges.
/// Filters genomic ranges based on overlaps with another set of ranges.
///
/// Retains only the ranges from the `left_path` file that have at least one overlap with
/// the ranges in the `right_path` file. The function can optionally skip ranges that do not
/// exist in the provided sequence lengths (e.g. a "genome" file in `bedtools` lingo).
///
/// # Arguments
///
/// * `seqlens` - A reference to a `PathBuf` for the file containing sequence lengths.
/// * `left_path` - A reference to a `PathBuf` for the input BED file containing the ranges to filter.
/// * `right_path` - A reference to a `PathBuf` for the BED file containing ranges to check for overlaps.
/// * `output` - An optional reference to a `PathBuf` where the filtered ranges will be written. Writes
/// to stdout if `None`.
/// * `skip_missing` - A boolean indicating whether to skip ranges missing in the sequence lengths file.
///
/// # Returns
///
/// A `Result` wrapping [`CommandOutput<()>`] on success, or [`GRangesError`] on failure.
///
/// # Errors
///
/// Returns [`GRangesError`] if any input file cannot be read, or if there's an issue processing the ranges.
pub fn granges_filter(
seqlens: &PathBuf,
left_path: &PathBuf,
Expand Down Expand Up @@ -198,7 +248,30 @@ pub fn granges_filter(
}
}

/// Adjust the genomic ranges in a bedfile by some specified amount.
/// Generates flanking regions for genomic ranges in a BED file.
///
/// For each range in the input BED file, this function computes the flanking regions based on
/// the specified left and right offsets. The flanking regions are constrained by the sequence lengths
/// provided in the `seqlens` file. The function supports both in-memory and streaming modes for processing.
///
/// # Arguments
///
/// * `seqlens` - A reference to a `PathBuf` for the file containing sequence lengths.
/// * `bedfile` - A reference to a `PathBuf` for the input BED file.
/// * `left` - An optional `Position` specifying the left flank size.
/// * `right` - An optional `Position` specifying the right flank size.
/// * `output` - An optional reference to a `PathBuf` for the output file. Writes to stdout if `None`.
/// * `skip_missing` - A boolean indicating whether to skip ranges missing in the sequence lengths file.
/// * `mode` - A [`ProcessingMode`] indicating whether to use in-memory or streaming processing.
///
/// # Returns
///
/// A `Result` wrapping [`CommandOutput<()>`] on success, or [`GRangesError`] on failure.
///
/// # Errors
///
/// Returns [`GRangesError`] if the input BED file or sequence lengths file cannot be read, or if there's
/// an issue generating the flanking regions.
pub fn granges_flank(
seqlens: &PathBuf,
bedfile: &PathBuf,
Expand Down Expand Up @@ -226,6 +299,9 @@ pub fn granges_flank(
};
gr.flanking_ranges(left, right)?.to_tsv(output)?
}
GenomicRangesParser::Bed5(_iter) => {
unimplemented!()
}
GenomicRangesParser::Bedlike(iter) => {
let gr = if skip_missing {
GRanges::from_iter(iter.try_unwrap_data().retain_seqnames(&seqnames), &genome)?
Expand All @@ -240,8 +316,8 @@ pub fn granges_flank(
},
ProcessingMode::Streaming => {
// Setup Output stream -- header is None for now (TODO).
let output_stream = output.map_or(OutputFile::new_stdout(None), |file| {
OutputFile::new(file, None)
let output_stream = output.map_or(OutputStream::new_stdout(None), |file| {
OutputStream::new(file, None)
});
let mut writer = output_stream.writer()?;

Expand Down Expand Up @@ -278,6 +354,9 @@ pub fn granges_flank(
}
}
}
GenomicRangesParser::Bed5(_iter) => {
unimplemented!()
}
GenomicRangesParser::Bedlike(iter) => {
if skip_missing {
for record in iter.retain_seqnames(&seqnames) {
Expand Down Expand Up @@ -318,122 +397,120 @@ pub fn granges_flank(
Ok(CommandOutput::new((), report))
}


pub fn granges_map(
seqlens: impl Into<PathBuf>,
left_path: &PathBuf,
right_path: &PathBuf,
output: Option<&PathBuf>,
skip_missing: bool)
-> Result<CommandOutput<()>, GRangesError> {
let genome = read_seqlens(seqlens)?;
let seqnames: Vec<String> = genome.keys().cloned().collect();
let mut report = Report::new();
skip_missing: bool,
) -> Result<CommandOutput<()>, GRangesError> {
let genome = read_seqlens(seqlens)?;
let seqnames: Vec<String> = genome.keys().cloned().collect();
let mut report = Report::new();

let left_iter = GenomicRangesFile::parsing_iterator(left_path)?;
let right_iter = GenomicRangesFile::parsing_iterator(right_path)?;
let left_iter = GenomicRangesFile::parsing_iterator(left_path)?;
let right_iter = GenomicRangesFile::parsing_iterator(right_path)?;

match (left_iter, right_iter) {
(GenomicRangesParser::Bed5(left), GenomicRangesParser::Bed5(right)) => {
let left_gr;
let right_gr;

match (left_iter, right_iter) {
(GenomicRangesParser::Bed5(left), GenomicRangesParser::Bed5(right)) => {
let left_gr;
let right_gr;
if skip_missing {
left_gr = GRanges::from_iter(left.retain_seqnames(&seqnames), &genome)?;
right_gr = GRanges::from_iter(right.retain_seqnames(&seqnames), &genome)?;
} else {
left_gr = GRanges::from_iter(left, &genome)?;
right_gr = GRanges::from_iter(right, &genome)?;
}

if skip_missing {
left_gr = GRanges::from_iter(left.retain_seqnames(&seqnames), &genome)?;
right_gr = GRanges::from_iter(right.retain_seqnames(&seqnames), &genome)?;
} else {
left_gr = GRanges::from_iter(left, &genome)?;
right_gr = GRanges::from_iter(right, &genome)?;
}
let right_gr = right_gr.into_coitrees()?;

let right_gr = right_gr.into_coitrees()?;
let left_join = left_gr.left_overlaps(&right_gr)?;

let left_join = left_gr.left_overlaps(&right_gr)?;
// TODO -- map function
// left_join.to_tsv(output)?;

left_join.to_tsv(output)?;
Ok(CommandOutput::new((), report))
}
(GenomicRangesParser::Bed3(left), GenomicRangesParser::Bedlike(right)) => {
todo!();
let left_gr;
let right_gr;

Ok(CommandOutput::new((), report))
if skip_missing {
left_gr = GRangesEmpty::from_iter(left.retain_seqnames(&seqnames), &genome)?;
right_gr = GRanges::from_iter(
right.try_unwrap_data().retain_seqnames(&seqnames),
&genome,
)?;
} else {
left_gr = GRangesEmpty::from_iter(left, &genome)?;
right_gr = GRanges::from_iter(right.try_unwrap_data(), &genome)?;
}
(GenomicRangesParser::Bed3(left), GenomicRangesParser::Bedlike(right)) => {
todo!();
let left_gr;
let right_gr;

if skip_missing {
left_gr = GRangesEmpty::from_iter(left.retain_seqnames(&seqnames), &genome)?;
right_gr = GRanges::from_iter(
right.try_unwrap_data().retain_seqnames(&seqnames),
&genome,
)?;
} else {
left_gr = GRangesEmpty::from_iter(left, &genome)?;
right_gr = GRanges::from_iter(right.try_unwrap_data(), &genome)?;
}

let right_gr = right_gr.into_coitrees()?;
let right_gr = right_gr.into_coitrees()?;

let intersection = left_gr.filter_overlaps(&right_gr)?;
intersection.to_tsv(output)?;
let intersection = left_gr.filter_overlaps(&right_gr)?;
intersection.to_tsv(output)?;

Ok(CommandOutput::new((), report))
Ok(CommandOutput::new((), report))
}
(GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bed3(right)) => {
let left_gr;
let right_gr;

if skip_missing {
left_gr =
GRanges::from_iter(left.try_unwrap_data().retain_seqnames(&seqnames), &genome)?;
right_gr = GRangesEmpty::from_iter(right.retain_seqnames(&seqnames), &genome)?;
} else {
left_gr = GRanges::from_iter(left.try_unwrap_data(), &genome)?;
right_gr = GRangesEmpty::from_iter(right, &genome)?;
}
(GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bed3(right)) => {
let left_gr;
let right_gr;

if skip_missing {
left_gr =
GRanges::from_iter(left.try_unwrap_data().retain_seqnames(&seqnames), &genome)?;
right_gr = GRangesEmpty::from_iter(right.retain_seqnames(&seqnames), &genome)?;
} else {
left_gr = GRanges::from_iter(left.try_unwrap_data(), &genome)?;
right_gr = GRangesEmpty::from_iter(right, &genome)?;
}

let right_gr = right_gr.into_coitrees()?;
let right_gr = right_gr.into_coitrees()?;

let intersection = left_gr.filter_overlaps(&right_gr)?;
intersection.to_tsv(output)?;

let intersection = left_gr.filter_overlaps(&right_gr)?;
intersection.to_tsv(output)?;
Ok(CommandOutput::new((), report))
}
(GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bedlike(right)) => {
todo!();
let left_gr;
let right_gr;

Ok(CommandOutput::new((), report))
if skip_missing {
left_gr =
GRanges::from_iter(left.try_unwrap_data().retain_seqnames(&seqnames), &genome)?;
right_gr = GRanges::from_iter(
right.try_unwrap_data().retain_seqnames(&seqnames),
&genome,
)?;
} else {
left_gr = GRanges::from_iter(left.try_unwrap_data(), &genome)?;
right_gr = GRanges::from_iter(right.try_unwrap_data(), &genome)?;
}
(GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bedlike(right)) => {
todo!();
let left_gr;
let right_gr;

if skip_missing {
left_gr =
GRanges::from_iter(left.try_unwrap_data().retain_seqnames(&seqnames), &genome)?;
right_gr = GRanges::from_iter(
right.try_unwrap_data().retain_seqnames(&seqnames),
&genome,
)?;
} else {
left_gr = GRanges::from_iter(left.try_unwrap_data(), &genome)?;
right_gr = GRanges::from_iter(right.try_unwrap_data(), &genome)?;
}

let right_gr = right_gr.into_coitrees()?;
let right_gr = right_gr.into_coitrees()?;

let intersection = left_gr.filter_overlaps(&right_gr)?;
intersection.to_tsv(output)?;
let intersection = left_gr.filter_overlaps(&right_gr)?;
intersection.to_tsv(output)?;

Ok(CommandOutput::new((), report))
}
_ => Err(GRangesError::UnsupportedGenomicRangesFileFormat),
Ok(CommandOutput::new((), report))
}
_ => Err(GRangesError::UnsupportedGenomicRangesFileFormat),
}

}

/// Generate a random BED-like file with genomic ranges.
pub fn granges_random_bed(
seqlens: impl Into<PathBuf>,
num: usize,
output: Option<impl Into<PathBuf>>,
sort: bool,
) -> Result<CommandOutput<()>, GRangesError> {
) -> Result<CommandOutput<()>, GRangesError> {
// get the genome info
let genome = read_seqlens(seqlens)?;

Expand Down
1 change: 1 addition & 0 deletions src/data/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
//! Data container implementations.
//!

use crate::traits::DataContainer;

Expand Down
Loading

0 comments on commit 2b8aa5a

Please sign in to comment.