Skip to content

Commit

Permalink
New cleaner handling of BED3 vs BED+ variants, new GenericRange trait…
Browse files Browse the repository at this point in the history
…, new integration tests against bedtools

 - new integration testing against bedtools
 - InputFile can detect column number
 - GenericRange trait, and simplified adjust_range() function
 - new iterator methods that more seamlessly handle BED3 BED+
 - granges adjust command line function uses enum variant to handle BED3 vs BED+
 - new range sorting methods
 - additional to_tsv methods
 - added bedtools install to CI
  • Loading branch information
vsbuffalo committed Feb 16, 2024
1 parent 828d9e2 commit 31b84db
Show file tree
Hide file tree
Showing 16 changed files with 448 additions and 177 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ jobs:

steps:
- uses: actions/checkout@v3
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y bedtools
- name: Build
run: cargo build --verbose
- name: Run tests
Expand Down
10 changes: 9 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,18 @@ thiserror = "1.0.57"
# cli = [ "clap" ] // TODO make feature
dev-commands = [ ]

[profile.release]
opt-level = 3

[profile.dev]
opt-level = 1

[[bin]]
name = "granges"
path = "src/main/mod.rs"
# required-features = ["cli"]

[dev-dependencies]
tempfile = "3.10.0"



107 changes: 107 additions & 0 deletions src/commands.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
use std::path::PathBuf;

use crate::{prelude::*, PositionOffset, reporting::{CommandOutput, Report}, io::OutputFile, ranges::operations::adjust_range, test_utilities::random_granges};


/// Adjust the genomic ranges in a bedfile by some specified amount.
// NOTE: we don't do build the full GRanges objects here, for efficiency.
// But it would be a good benchmark to see how much slower that is.
pub fn granges_adjust(
bedfile: &PathBuf,
seqlens: &PathBuf,
both: PositionOffset,
output: Option<&PathBuf>,
sort: bool,
) -> Result<CommandOutput<()>, GRangesError> {
let genome = read_seqlens(seqlens)?;

// input iterator
let bedlike_iterator = BedlikeIterator::new(bedfile)?;

// output stream -- header is None for now (TODO)
let output_stream = output.map_or(OutputFile::new_stdout(None), |file| {
OutputFile::new(file, None)
});
let mut writer = output_stream.writer()?;

// for reporting stuff to the user
let mut report = Report::new();

// if we need to sort, we need to accumulate ranges in memory
enum GRangesVariant {
Indexed(GRanges<VecRangesIndexed, Vec<String>>),
Empty(GRanges<VecRangesEmpty, ()>),
}

let number_columns = bedlike_iterator.number_columns();
let mut gr = match number_columns {
3 => GRangesVariant::Empty(GRanges::new_vec(&genome)),
n if n > 3 => GRangesVariant::Indexed(GRanges::new_vec(&genome)),
_ => panic!("Unexpected number of columns"),
};

let mut skipped_ranges = 0;
for record in bedlike_iterator {
let range = record?;
let seqname = &range.seqname;
let length = *genome
.get(seqname)
.ok_or(GRangesError::MissingSequence(seqname.to_string()))?;

let possibly_adjusted_range = adjust_range(range, -both, both, length);

if let Some(range_adjusted) = possibly_adjusted_range {
if !sort {
writer.write_all(&range_adjusted.to_tsv().into_bytes())?;
} else {
// we need to sort, so we build up the appropriate type of GRanges
// object, depending on if we need to hold data or not.
match gr {
GRangesVariant::Empty(ref mut obj) => obj.push_range_empty(&range_adjusted.seqname, range_adjusted.start, range_adjusted.end)?,
GRangesVariant::Indexed(ref mut obj) => obj.push_range_with_data(&range_adjusted.seqname, range_adjusted.start, range_adjusted.end, range_adjusted.data)?,
}
}
} else {
skipped_ranges += 1;
}

if skipped_ranges > 0 {
report.add_issue(format!(
"{} ranges were removed because their widths after adjustment were ≤ 0",
skipped_ranges
))
}
}

// if we need to sort the ranges, do that and then output them
if sort {
match gr {
GRangesVariant::Empty(obj) => obj.sort().to_bed3(output)?,
GRangesVariant::Indexed(obj) => obj.sort().to_tsv(output)?,
}

}
Ok(CommandOutput::new((), report))
}

/// Generate a random BED-like file with genomic ranges.
pub fn granges_random_bed(
seqlens: impl Into<PathBuf>,
num: u32,
output: Option<impl Into<PathBuf>>,
sort: bool,
) -> Result<CommandOutput<()>, GRangesError> {
// get the genome info
let genome = read_seqlens(seqlens)?;

let mut gr = random_granges(&genome, num.try_into().unwrap())?;

if sort {
gr = gr.sort();
}

gr.to_bed3(output)?;

let report = Report::new();
Ok(CommandOutput::new((), report))
}
2 changes: 2 additions & 0 deletions src/data/vec.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
//! Data container implementations for [`Vec<U>`].

use crate::traits::IndexedDataContainer;


/// Trait methods for the commonly-used `Vec<U>` data container.
///
Expand Down
37 changes: 35 additions & 2 deletions src/granges.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use crate::{
vec::{VecRanges, VecRangesEmpty, VecRangesIndexed},
RangeEmpty, RangeIndexed, RangeRecord,
},
traits::{RangeContainer, RangesIterable, TsvSerialize},
traits::{RangeContainer, RangesIterable, TsvSerialize, GenericRange, IndexedDataContainer},
Position,
};

Expand Down Expand Up @@ -47,7 +47,7 @@ where
}
}

impl<R: Clone, T> GRanges<VecRanges<R>, T> {
impl<R: GenericRange, T> GRanges<VecRanges<R>, T> {
/// Create a new [`GRanges`] object, with vector storage for ranges and data.
///
/// This combination of range and data containers is used when loading data into
Expand All @@ -68,6 +68,16 @@ impl<R: Clone, T> GRanges<VecRanges<R>, T> {
}
Self { ranges, data: None }
}

/// Consume this [`GRanges`] object and sort the ranges.
pub fn sort(mut self) -> Self {
self.ranges
.values_mut()
.for_each(|ranges| {
ranges.sort()
});
self
}
}

impl<U> GRanges<VecRangesIndexed, Vec<U>> {
Expand Down Expand Up @@ -96,6 +106,29 @@ impl<U> GRanges<VecRangesIndexed, Vec<U>> {
}
}

impl<'a, T> GRanges<VecRanges<RangeIndexed>, T>
where T: IndexedDataContainer<'a>,
T: TsvSerialize,
<T as IndexedDataContainer<'a>>::Item: TsvSerialize
{
///
pub fn to_tsv(&'a self, output: Option<impl Into<PathBuf>>) -> Result<(), GRangesError> {
// output stream -- header is None for now (TODO)
let output = output.map_or(OutputFile::new_stdout(None), |file| {
OutputFile::new(file, None)
});
let mut writer = output.writer()?;

let seqnames = self.seqnames();
for range in self.iter_ranges() {
let record = range.to_record(&seqnames, self.data.as_ref().unwrap());
writeln!(writer, "{}", record.to_tsv())?;
}
Ok(())
}

}

impl GRanges<VecRangesEmpty, ()> {
/// Push an empty range (no data) to the [`VecRangesEmpty`] range container.
pub fn push_range_empty(
Expand Down
19 changes: 17 additions & 2 deletions src/io/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ impl InputFile {
}

/// Collects comment lines and/or a line at the start of the file.
pub fn collect_metadata(&mut self, comment: &str, header: Option<&str>) -> io::Result<bool> {
pub fn collect_metadata(&mut self, comment: &str, header: Option<&str>)
-> io::Result<bool> {
let mut buf_reader = self.reader()?;
let mut comments = Vec::new();
let mut line = String::new();
Expand Down Expand Up @@ -122,6 +123,20 @@ impl InputFile {
Ok(self.skip_lines > 0)
}


/// Detect the number of columns *from the first line*, according to some delimiter.
/// This is not robust against ragged delimited data formats.
pub fn detect_columns(&mut self, delim: &str) -> Result<usize, GRangesError> {
let mut skipped_lines = 0;
let mut buf_reader = self.reader()?;
while skipped_lines < self.skip_lines {
skipped_lines += 1;
}
let mut line = String::new();
buf_reader.read_line(&mut line)?;
Ok(line.split(delim).count())
}

/// Method to continue reading after skipping the comment and header lines.
pub fn continue_reading(&self) -> io::Result<BufReader<Box<dyn Read>>> {
let mut buf_reader = self.reader()?;
Expand Down Expand Up @@ -197,7 +212,7 @@ impl OutputFile {
Box::new(BufWriter::new(File::create(path)?))
}
}
OutputDestination::Stdout => Box::new(io::stdout()),
OutputDestination::Stdout => Box::new(BufWriter::new(io::stdout())),
};
// write header if one is set
if let Some(entries) = &self.header {
Expand Down
15 changes: 13 additions & 2 deletions src/io/parsers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,12 @@ impl GeneralRangeRecordIterator<()> for Bed3RecordIterator {
}
}

/// An extensible TSV parser, which uses a supplied parser function to
/// convert a line into a [`RangeRecord<U>`], a range with generic associated
/// data.
pub struct TsvRecordIterator<F, U> {
reader: BufReader<Box<dyn std::io::Read>>,
num_columns: usize,
parser: F,
phantom: std::marker::PhantomData<U>,
}
Expand All @@ -68,12 +72,17 @@ impl<F, U> TsvRecordIterator<F, U>
where
F: Fn(&str) -> Result<RangeRecord<U>, GRangesError>,
{
/// Create a new [`TsvRecordIterator`], which parses lines from the supplied
/// file path into [`RangeRecord<U>`] using the specified parsing function.
pub fn new(filepath: impl Into<PathBuf>, parser: F) -> Result<Self, GRangesError> {
let input_file = InputFile::new(filepath);
let mut input_file = InputFile::new(filepath);
let _has_metadata = input_file.collect_metadata("#", None);
let num_columns = input_file.detect_columns("\t")?;
let reader = input_file.continue_reading()?;

Ok(Self {
reader,
num_columns,
parser,
phantom: std::marker::PhantomData,
})
Expand Down Expand Up @@ -108,12 +117,14 @@ pub struct BedlikeIterator {

impl BedlikeIterator {
pub fn new(filepath: impl Into<PathBuf>) -> Result<Self, GRangesError> {
// Wrap the parse_bedlike_to_range_record function to conform with TsvRecordIterator's expectations.
let parser: fn(&str) -> Result<RangeRecord<String>, GRangesError> = parse_bed_lazy;

let iter = TsvRecordIterator::new(filepath, parser)?;
Ok(Self { iter })
}
pub fn number_columns(&self) -> usize {
self.iter.num_columns
}
}

impl Iterator for BedlikeIterator {
Expand Down
9 changes: 8 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,18 @@ pub use indexmap;
pub mod error;
pub mod granges;
pub mod io;
pub mod data;
pub mod iterators;
pub mod join;
pub mod ranges;
pub mod test_utilities;
pub mod traits;

// bringing these CLI modules into lib.rs rather than main/ allows for
// use in integration tests and other Rust-side command line work
pub mod commands;
pub mod reporting;

pub type Position = u32;
pub type PositionOffset = i32; // signed variant

Expand All @@ -24,7 +30,8 @@ pub mod prelude {

pub use crate::ranges::vec::{VecRangesEmpty, VecRangesIndexed};
pub use crate::traits::{
GeneralRangeRecordIterator, RangesIntoIterable, RangesIterable, TsvSerialize,
IndexedDataContainer,
GeneralRangeRecordIterator, GenericRange, RangesIntoIterable, RangesIterable, TsvSerialize,
};

pub use crate::seqlens;
Expand Down
Loading

0 comments on commit 31b84db

Please sign in to comment.