Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New csv + serde parsing, with cleaner type inference; new `TsvRecordI… #3

Merged
merged 1 commit into from
Feb 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ bytes = "1.5.0"
ndarray-npy = { version = "0.8.1", optional = true }
num-traits = "0.2.18"
lazy_static = "1.4.0"
medians = "3.0.6"
csv = "1.3.0"
serde = { version = "1.0.197", features = ["derive"] }

[features]
# cli = [ "clap" ] # TODO make feature
Expand All @@ -54,3 +55,7 @@ criterion = { version = "0.5.1", features = ["html_reports"] }
name = "bedtools_comparison"
harness = false

[[bench]]
name = "io"
harness = false

2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ let results_gr = left_gr
// Find overlaps
.left_overlaps(&right_gr)?
// Summarize overlap data
.map_over_joins(mean_score)?;
.map_joins(mean_score)?;
```

where `mean_score()` is:
Expand Down
66 changes: 66 additions & 0 deletions benches/io.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
use criterion::{criterion_group, criterion_main, Criterion};
use csv::{self, ReaderBuilder};
use granges::io::parsers::Bed5Addition;
use granges::ranges::GenomicRangeRecord;
use granges::test_utilities::random_bed5file;
use granges::{prelude::*, Position};

#[derive(Debug, serde::Deserialize, PartialEq)]
struct Bed5Row {
seqname: String,
start: Position,
end: Position,
name: String,
score: f64,
}

const BED_LENGTH: usize = 1_000_000;

fn bench_io_shootout(c: &mut Criterion) {
// create the benchmark group
let mut group = c.benchmark_group("adjust");

// create the test data
let input_bedfile = random_bed5file(BED_LENGTH);

let genome = read_seqlens("tests_data/hg38_seqlens.tsv").unwrap();

// configure the sample size for the group
group.sample_size(10);

// Bed5Iterator
group.bench_function("bed5iterator", |b| {
b.iter(|| {
let iter = Bed5Iterator::new(input_bedfile.path()).unwrap();
let gr = GRanges::from_iter(iter, &genome).unwrap();
gr.len()
});
});

// CSV
group.bench_function("csv", |b| {
b.iter(|| {
let mut rdr = ReaderBuilder::new()
.delimiter(b'\t')
.has_headers(false)
.from_path(input_bedfile.path())
.unwrap();

let iter = rdr.deserialize();
let mut gr: GRanges<VecRangesIndexed, Vec<Bed5Addition>> = GRanges::new_vec(&genome);

for result in iter {
let row: GenomicRangeRecord<Bed5Addition> = result.unwrap();
let data = Bed5Addition {
name: row.data.name,
score: row.data.score,
};
gr.push_range(&row.seqname, row.start, row.end, data)
.unwrap();
}
});
});
}

criterion_group!(benches, bench_io_shootout,);
criterion_main!(benches);
21 changes: 11 additions & 10 deletions examples/mean_score.rs
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
use granges::{prelude::*, join::CombinedJoinDataLeftEmpty};
use granges::{join::CombinedJoinDataLeftEmpty, prelude::*};

// Our overlap data processing function.
pub fn mean_score(join_data: CombinedJoinDataLeftEmpty<Option<f64>>) -> f64 {
// Get the "right data" -- the BED5 scores.
let overlap_scores: Vec<f64> = join_data.right_data.into_iter()
let overlap_scores: Vec<f64> = join_data
.right_data
.into_iter()
// filter out missing values ('.' in BED)
.filter_map(|x| x).collect();
.filter_map(|x| x)
.collect();

// calculate the mean
let score_sum: f64 = overlap_scores.iter().sum();
score_sum / (overlap_scores.len() as f64)
}


fn try_main() -> Result<(), granges::error::GRangesError> {
// Mock sequence lengths (e.g. "genome" file)
let genome = seqlens!("chr1" => 100, "chr2" => 100);
Expand All @@ -30,18 +32,17 @@ fn try_main() -> Result<(), granges::error::GRangesError> {
// Convert to interval trees.
.into_coitrees()?
// Extract out just the score from the additional BED5 columns.
.map_data(|bed5_cols| {
bed5_cols.score
})?;
.map_data(|bed5_cols| bed5_cols.score)?;

// Compute overlaps and combine scores into mean.
let results_gr = left_gr
.left_overlaps(&right_gr)?
.map_over_joins(mean_score)?;
.map_joins(mean_score)?;

results_gr.to_tsv(None::<String>, &BED_TSV)?;
Ok(())
}

fn main() { try_main().unwrap(); }

fn main() {
try_main().unwrap();
}
2 changes: 1 addition & 1 deletion src/commands.rs
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ pub fn granges_map(
let left_join_gr = left_gr.left_overlaps(&right_gr)?;

// Process all the overlaps.
let result_gr = left_join_gr.map_over_joins(|join_data| {
let result_gr = left_join_gr.map_joins(|join_data| {
// Get the "right data" -- the BED5 scores
let mut overlap_scores: Vec<f64> = join_data
.right_data
Expand Down
7 changes: 3 additions & 4 deletions src/data/operations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ pub fn median<F: Float + Sum>(numbers: &mut [F]) -> Option<F> {
}
let mid = numbers.len() / 2;
if numbers.len() % 2 == 0 {
numbers.select_nth_unstable_by(mid-1, |a, b| a.partial_cmp(b).unwrap());
let lower = numbers[mid-1];
numbers.select_nth_unstable_by(mid - 1, |a, b| a.partial_cmp(b).unwrap());
let lower = numbers[mid - 1];
numbers.select_nth_unstable_by(mid, |a, b| a.partial_cmp(b).unwrap());
let upper = numbers[mid];
Some((lower + upper) / F::from(2.0).unwrap())
Expand Down Expand Up @@ -93,7 +93,7 @@ impl Operation {

#[cfg(test)]
mod tests {
use super::*;
use super::*;

#[test]
fn test_median_empty() {
Expand All @@ -118,5 +118,4 @@ mod tests {
let mut numbers = vec![-3.0, -1.0, -2.0];
assert_eq!(median(&mut numbers), Some(-2.0));
}

}
3 changes: 3 additions & 0 deletions src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ pub enum GRangesError {
#[error("File reading error: {0}. Please check if the file exists and you have permission to read it.")]
IOError(#[from] std::io::Error),

#[error("File parsing error: {0}")]
TsvParsingError(#[from] csv::Error),

// File parsing related errors
#[error("Could not determine the file type based on its extension. Ensure the file has a standard genomic data extension (.bed, .gff, etc.).")]
CouldNotDetectRangesFiletype,
Expand Down
12 changes: 6 additions & 6 deletions src/granges.rs
Original file line number Diff line number Diff line change
Expand Up @@ -806,7 +806,7 @@ where
/// See [`CombinedJoinData`] and its convenience methods, which are designed
/// to help downstream statistical calculations that could use the number of overlapping
/// basepairs, overlapping fraction, etc.
pub fn map_over_joins<F, V>(
pub fn map_joins<F, V>(
mut self,
func: F,
) -> Result<GRanges<VecRangesIndexed, Vec<V>>, GRangesError>
Expand Down Expand Up @@ -847,7 +847,7 @@ impl GRanges<VecRangesIndexed, JoinDataBothEmpty> {
/// to help downstream statistical calculations that could use the number of overlapping
/// basepairs, overlapping fraction, etc.

pub fn map_over_joins<F, V>(
pub fn map_joins<F, V>(
mut self,
func: F,
) -> Result<GRanges<VecRangesIndexed, Vec<V>>, GRangesError>
Expand Down Expand Up @@ -885,7 +885,7 @@ where
/// See [`CombinedJoinDataRightEmpty`] and its convenience methods, which are designed
/// to help downstream statistical calculations that could use the number of overlapping
/// basepairs, overlapping fraction, etc.
pub fn map_over_joins<F, V>(
pub fn map_joins<F, V>(
mut self,
func: F,
) -> Result<GRanges<VecRangesIndexed, Vec<V>>, GRangesError>
Expand Down Expand Up @@ -924,7 +924,7 @@ where
/// to help downstream statistical calculations that could use the number of overlapping
/// basepairs, overlapping fraction, etc.

pub fn map_over_joins<F, V>(
pub fn map_joins<F, V>(
mut self,
func: F,
) -> Result<GRanges<VecRangesIndexed, Vec<V>>, GRangesError>
Expand Down Expand Up @@ -1470,7 +1470,7 @@ mod tests {
}

#[test]
fn test_map_over_joins() {
fn test_map_oins() {
let sl = seqlens!("chr1" => 50);
let windows: GRangesEmpty<VecRangesEmpty> =
GRangesEmpty::from_windows(&sl, 10, None, true).unwrap();
Expand All @@ -1487,7 +1487,7 @@ mod tests {
let joined_results = windows
.left_overlaps(&right_gr)
.unwrap()
.map_over_joins(|join_data| {
.map_joins(|join_data| {
let overlap_scores = join_data.right_data;
overlap_scores.iter().sum::<f64>()
})
Expand Down
4 changes: 2 additions & 2 deletions src/io/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ pub mod tsv;

pub use file::{InputStream, OutputStream};
pub use parsers::{
Bed3Iterator, Bed5Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParser,
TsvRecordIterator,
bed::Bed3Iterator, bed::Bed5Iterator, bed::BedlikeIterator, tsv::TsvRecordIterator,
GenomicRangesFile, GenomicRangesParser,
};
pub use tsv::BED_TSV;
Loading
Loading