From 3e22670a6ab67167990f6f898543c04e0a72e090 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Sun, 25 Feb 2024 12:46:36 -0800 Subject: [PATCH] Added new `granges windows` subcommand, fixed bug in `from_windows()`. - Fixed window behavior to match `bedtools makewindows`. - Fixed tests. - New test utility functions for generating random BED5 files. - New validation against `bedtools makewindows`. --- src/commands.rs | 37 ++++++++++++---- src/granges.rs | 63 ++++++++++++++++++--------- src/io/parsers.rs | 4 ++ src/main/mod.rs | 82 +++++++++++++++++++++++++++++++++--- src/test_utilities.rs | 49 ++++++++++++++++++++- tests/bedtools_validation.rs | 45 ++++++++++++++++++++ 6 files changed, 243 insertions(+), 37 deletions(-) diff --git a/src/commands.rs b/src/commands.rs index 2499d76..ce1defa 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -12,7 +12,7 @@ use crate::{ prelude::*, ranges::{operations::adjust_range, GenomicRangeEmptyRecord, GenomicRangeRecord}, reporting::{CommandOutput, Report}, - test_utilities::random_granges, + test_utilities::{random_granges, random_granges_mock_bed5}, traits::TsvSerialize, Position, PositionOffset, }; @@ -469,23 +469,44 @@ pub fn granges_map( Ok(CommandOutput::new((), report)) } +/// Generate a BED3 file of genomic windows. +pub fn granges_windows( + seqlens: impl Into, + width: Position, + step: Option, + chop: bool, + output: Option>, +) -> Result, GRangesError> { + let genome = read_seqlens(seqlens)?; + GRangesEmpty::from_windows(&genome, width, step, chop)?.to_tsv(output, &BED_TSV)?; + let report = Report::new(); + Ok(CommandOutput::new((), report)) +} + /// Generate a random BED-like file with genomic ranges. pub fn granges_random_bed( seqlens: impl Into, num: usize, output: Option>, sort: bool, + bed5: bool, ) -> Result, GRangesError> { // get the genome info let genome = read_seqlens(seqlens)?; - let mut gr = random_granges(&genome, num)?; - - if sort { - gr = gr.sort(); - } - - gr.to_tsv(output, &BED_TSV)?; + if bed5 { + let mut gr = random_granges_mock_bed5(&genome, num)?; + if sort { + gr = gr.sort() + } + gr.to_tsv(output, &BED_TSV)?; + } else { + let mut gr = random_granges(&genome, num)?; + if sort { + gr = gr.sort(); + } + gr.to_tsv(output, &BED_TSV)?; + }; let report = Report::new(); Ok(CommandOutput::new((), report)) diff --git a/src/granges.rs b/src/granges.rs index 3c4a93b..442a24a 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -258,14 +258,14 @@ impl<'a, R: IterableRangeContainer> GenomicRangesTsvSerialize<'a, R> for GRanges /// will be a BED3 file. /// /// # Arguments - /// * `output`: either `None` (for standard out) or file path. If the filepath - /// ends in `.gz`, the output will be gzip-compressed. + /// * `output`: either `None` (for standard out) or file path. /// * `config`: a [`TsvConfig`], which contains the TSV output settings. fn to_tsv( &'a self, output: Option>, config: &TsvConfig, ) -> Result<(), GRangesError> { + // TODO gzip output handling // output stream -- header is None for now (TODO) let output = output.map_or(OutputStream::new_stdout(None), |file| { OutputStream::new(file, None) @@ -435,29 +435,26 @@ impl GRangesEmpty { ) -> Result, GRangesError> { let mut gr: GRangesEmpty = GRangesEmpty::new_vec(seqlens); - // have we encountered a remainder chunk? - - let mut remainder = false; // iterate over each chromosome and create windows for (seqname, len) in seqlens { let mut start = 0; while start < *len { - let mut end = start + width - 1; + let mut end = start + width; if end >= *len { + // the end is past the sequence length if chop { + // do not add any remainder break; } else { - end = std::cmp::min(end, len - 1); - } - } - if end - start + 1 < width { - if remainder { - break; + // truncate end, push, and break + end = std::cmp::min(end, *len); + gr.push_range(seqname, start, end)?; } - remainder = true; + } else { + // push a normal window + gr.push_range(seqname, start, end)?; } - gr.push_range(seqname, start, end + 1)?; start += step.unwrap_or(width); } } @@ -1386,11 +1383,11 @@ mod tests { #[test] fn test_from_windows() { - let sl = seqlens!( "chr1" => 35); + let sl = seqlens!( "chr1" => 35 ); + // Test chop let gr = GRangesEmpty::from_windows(&sl, 10, None, true).unwrap(); assert_eq!(gr.len(), 3, "{:?}", gr); - dbg!(&gr); gr.iter_ranges().for_each(|r| assert_eq!(r.width(), 10)); // Test no chop @@ -1398,19 +1395,22 @@ mod tests { assert_eq!(gr.iter_ranges().last().unwrap().width(), 5); // Test sliding with step of 2, with chop - let sl = seqlens!( "chr1" => 21); + let sl = seqlens!( "chr1" => 21, "chr2" => 13 ); let gr = GRangesEmpty::from_windows(&sl, 10, Some(2), true).unwrap(); - let mut expected_ranges_chop: Vec<(String, Position, Position)> = vec![ + let expected_ranges_chop: Vec<(String, Position, Position)> = vec![ ("chr1", 0, 10), ("chr1", 2, 12), ("chr1", 4, 14), ("chr1", 6, 16), ("chr1", 8, 18), ("chr1", 10, 20), + ("chr2", 0, 10), + ("chr2", 2, 12), ] .into_iter() .map(|(seq, s, e)| (seq.to_string(), s, e)) .collect(); + let seqnames = sl.keys().map(|x| x.to_string()).collect::>(); let actual_ranges: Vec<(String, Position, Position)> = gr .iter_ranges() @@ -1420,13 +1420,36 @@ mod tests { // The same as above, without chop -- goes to seqlen with little remainder. let gr = GRangesEmpty::from_windows(&sl, 10, Some(2), false).unwrap(); - expected_ranges_chop.push(("chr1".to_string(), 12, 21)); + let expected_ranges_no_chop: Vec<(String, Position, Position)> = vec![ + ("chr1", 0, 10), + ("chr1", 2, 12), + ("chr1", 4, 14), + ("chr1", 6, 16), + ("chr1", 8, 18), + ("chr1", 10, 20), + ("chr1", 12, 21), + ("chr1", 14, 21), + ("chr1", 16, 21), + ("chr1", 18, 21), + ("chr1", 20, 21), + ("chr2", 0, 10), + ("chr2", 2, 12), + ("chr2", 4, 13), + ("chr2", 6, 13), + ("chr2", 8, 13), + ("chr2", 10, 13), + ("chr2", 12, 13), + ] + .into_iter() + .map(|(seq, s, e)| (seq.to_string(), s, e)) + .collect(); + let actual_ranges: Vec<(String, Position, Position)> = gr .iter_ranges() .map(|r| (r.seqname(&seqnames), r.start(), r.end())) .collect(); - assert_eq!(actual_ranges, expected_ranges_chop); + assert_eq!(actual_ranges, expected_ranges_no_chop); } #[test] diff --git a/src/io/parsers.rs b/src/io/parsers.rs index fa47ec2..117fed8 100644 --- a/src/io/parsers.rs +++ b/src/io/parsers.rs @@ -427,6 +427,10 @@ impl TsvSerialize for Option { } /// The additional two BED5 columns. +/// +/// # Fields +/// * `name`: the feature name. +/// * `score`: a score. #[derive(Clone, Debug)] pub struct Bed5Addition { pub name: String, diff --git a/src/main/mod.rs b/src/main/mod.rs index 282e379..54caeaa 100644 --- a/src/main/mod.rs +++ b/src/main/mod.rs @@ -2,7 +2,9 @@ use std::path::PathBuf; use clap::{Parser, Subcommand}; use granges::{ - commands::{granges_adjust, granges_filter, granges_flank, granges_map, ProcessingMode}, + commands::{ + granges_adjust, granges_filter, granges_flank, granges_map, granges_windows, ProcessingMode, + }, data::operations::Operation, prelude::GRangesError, Position, PositionOffset, @@ -17,12 +19,27 @@ usage: granges [--help] Subcommands: - adjust: Adjust each genomic range, e.g. to add a kilobase to each end. + adjust: Adjust each genomic range, e.g. to add a kilobase to each end. + + filter: Filter the left ranges based on whether they have at least one + overlap with a right range. This is equivalent to a filtering + "semi-join" in SQL terminology. - filter: Filter the left ranges based on whether they have at least one - overlap with a right range. This is equivalent to a filtering - "semi-join" in SQL terminology. + map: Compute the left grouped overlaps between the left genomic ranges + and right genomic ranges, and apply one or more operations to the + score column of the right BED5 file. + windows: Create a set of genomic windows of the specified width (in basepairs), + stepping the specified step size (the width, by default). + + +NOTE: granges is under active development. It is not currently meant to be +a full replacement for other genomic ranges software, such as bedtools. The +command line functionality currently used for testing and benchmarking. + +Please request features and report any issues on GitHub: +https://github.com/vsbuffalo/granges/issues + "#; #[derive(Parser)] @@ -38,6 +55,8 @@ struct Cli { #[derive(Subcommand)] enum Commands { + /// Adjust the start, end, or both coordinates of each range by some + /// specified amount. Adjust { /// A TSV genome file of chromosome names and their lengths #[arg(short, long, required = true)] @@ -60,6 +79,8 @@ enum Commands { sort: bool, // TODO add skip_missing here }, + /// Filter out the left ranges that do not have overlaps with any + /// right ranges. This is a "semi-join" in SQL terminology. Filter { /// A TSV genome file of chromosome names and their lengths #[arg(short, long, required = true)] @@ -82,6 +103,7 @@ enum Commands { #[arg(short, long)] skip_missing: bool, }, + /// Compute the flanking regions for each range. Flank { /// A TSV genome file of chromosome names and their lengths #[arg(short, long, required = true)] @@ -116,6 +138,11 @@ enum Commands { #[arg(long)] in_mem: bool, }, + /// Do a "left grouped join", on the specified left and right genomic ranges, + /// and apply one or more functions to the BED5 scores for all right genomic + /// ranges. + /// + /// This is analogous to 'bedtools map'. Map { /// A TSV genome file of chromosome names and their lengths #[arg(short, long, required = true)] @@ -142,6 +169,36 @@ enum Commands { #[arg(short, long)] skip_missing: bool, }, + /// Create a set of genomic windows ranges using the specified width + /// and step size, and output to BED3. + /// + /// If --chop is set, the "remainder" windows at the end of a chromosome + /// that would have width less than that specified by --width are chopped + /// off. + /// + /// This is analogous to 'bedtools makewindows'. + Windows { + /// A TSV genome file of chromosome names and their lengths + #[arg(short, long, required = true)] + genome: PathBuf, + + /// Width (in basepairs) of each window. + #[arg(short, long)] + width: Position, + + /// Step width (by default: window size). + #[arg(short, long)] + step: Option, + + /// If last window remainder is shorter than width, remove? + #[arg(short, long)] + chop: bool, + + /// An optional output file (standard output will be used if not specified) + #[arg(short, long)] + output: Option, + }, + #[cfg(feature = "dev-commands")] RandomBed { /// a TSV genome file of chromosome names and their lengths @@ -159,6 +216,10 @@ enum Commands { /// sort the ranges #[arg(short, long)] sort: bool, + + /// add two additional columns (feature name and score) to mimic a BED5 file + #[arg(short, long)] + bed5: bool, }, } @@ -240,14 +301,21 @@ fn run() -> Result<(), GRangesError> { *skip_missing, ) } - + Some(Commands::Windows { + genome, + width, + step, + chop, + output, + }) => granges_windows(genome, *width, *step, *chop, output.as_ref()), #[cfg(feature = "dev-commands")] Some(Commands::RandomBed { genome, num, output, sort, - }) => granges_random_bed(genome, *num, output.as_ref(), *sort), + bed5, + }) => granges_random_bed(genome, *num, output.as_ref(), *sort, *bed5), None => { println!("{}\n", INFO); std::process::exit(1); diff --git a/src/test_utilities.rs b/src/test_utilities.rs index 23b6e6b..a10031b 100644 --- a/src/test_utilities.rs +++ b/src/test_utilities.rs @@ -8,6 +8,7 @@ use crate::{ create_granges_with_seqlens, error::GRangesError, granges::GRangesEmpty, + io::parsers::Bed5Addition, prelude::{GRanges, VecRangesIndexed}, ranges::{ coitrees::COITrees, @@ -20,8 +21,8 @@ use crate::{ use genomap::GenomeMap; use indexmap::IndexMap; #[cfg(feature = "ndarray")] -use ndarray::{s, Array1, Array2}; -use rand::{seq::SliceRandom, thread_rng, Rng}; +use ndarray::{Array1, Array2}; +use rand::{distributions::Uniform, seq::SliceRandom, thread_rng, Rng}; use tempfile::{Builder, NamedTempFile}; /// Get the path to the `grange` command line tool after a build. @@ -110,6 +111,48 @@ pub fn random_granges( Ok(gr) } +/// Generate random strings, e.g. for mock feature names. +fn generate_random_string(n: usize) -> String { + let mut rng = rand::thread_rng(); + let letters: Vec = ('a'..='z').collect(); + let letters_dist = Uniform::from(0..letters.len()); + + (0..n).map(|_| letters[rng.sample(letters_dist)]).collect() +} + +/// Generate a random float value, e.g. for a mock BED "score". +fn generate_random_uniform(start: f64, end: f64) -> f64 { + let mut rng = rand::thread_rng(); + let uniform = Uniform::new(start, end); // Specify the range + rng.sample(uniform) +} + +/// Build a random [`GRanges`] using a set of sequence lengths, +/// with BED5 like data. +pub fn random_granges_mock_bed5( + seqlens: &IndexMap, + num: usize, +) -> Result>, GRangesError> { + let mut rng = thread_rng(); + + let mut gr = GRanges::new_vec(seqlens); + + let seqnames: Vec = seqlens.keys().cloned().collect(); + for _ in 0..num { + let seqname = seqnames.choose(&mut rng).unwrap(); + let chrom_len = *seqlens + .get(seqname) + .ok_or_else(|| GRangesError::MissingSequence(seqname.clone()))?; + let (start, end) = random_range(chrom_len); + let bed5_cols = Bed5Addition { + name: generate_random_string(8), + score: generate_random_uniform(0.0, 1.0), + }; + gr.push_range(seqname, start, end, bed5_cols)?; + } + Ok(gr) +} + /// Build random [`COITrees`] from a random [`VecRanges`]. pub fn random_coitrees() -> COITrees<()> { let vr = random_vecranges(100); @@ -133,6 +176,7 @@ pub fn random_bed3file(length: usize) -> NamedTempFile { length, Some(temp_bedfile.path()), true, + false, ) .expect("could not generate random BED file"); temp_bedfile @@ -149,6 +193,7 @@ pub fn random_bed5file(length: usize) -> NamedTempFile { length, Some(temp_bedfile.path()), true, + true, ) .expect("could not generate random BED file"); temp_bedfile diff --git a/tests/bedtools_validation.rs b/tests/bedtools_validation.rs index 8a8228e..b19a9d6 100644 --- a/tests/bedtools_validation.rs +++ b/tests/bedtools_validation.rs @@ -16,6 +16,7 @@ fn test_random_bed3file_filetype_detect() { 100_000, Some(&random_bedfile_path), true, + false, ) .expect("could not generate random BED file"); @@ -35,6 +36,7 @@ fn test_against_bedtools_slop() { 100_000, Some(&random_bedfile_path), true, + false, ) .expect("could not generate random BED file"); @@ -92,6 +94,7 @@ fn test_against_bedtools_intersect_wa() { num_ranges, Some(&random_bedfile_right), true, + false, ) .expect("could not generate random BED file"); @@ -141,6 +144,7 @@ fn test_against_bedtools_flank() { num_ranges, Some(&random_bedfile), true, + false, ) .expect("could not generate random BED file"); @@ -183,3 +187,44 @@ fn test_against_bedtools_flank() { assert_eq!(bedtools_ranges, granges_ranges); } + +#[test] +fn test_against_bedtools_makewindows() { + // some weird widths, steps to try to catch remainder issues + let widths = vec![131123, 1_000_0013]; + let steps = vec![10_001, 10_113]; + + for width in widths.iter() { + for step in steps.iter() { + let bedtools_output = Command::new("bedtools") + .arg("makewindows") + .arg("-g") + .arg("tests_data/hg38_seqlens.tsv") + .arg("-w") + .arg(width.to_string()) + .arg("-s") + .arg(step.to_string()) + .output() + .expect("bedtools slop failed"); + + let granges_output = Command::new(granges_binary_path()) + .arg("windows") + .arg("--genome") + .arg("tests_data/hg38_seqlens.tsv") + .arg("--width") + .arg(width.to_string()) + .arg("--step") + .arg(step.to_string()) + .output() + .expect("granges windows failed"); + + assert!(bedtools_output.status.success(), "{:?}", bedtools_output); + assert!(granges_output.status.success(), "{:?}", granges_output); + + assert_eq!( + String::from_utf8_lossy(&bedtools_output.stdout), + String::from_utf8_lossy(&granges_output.stdout) + ); + } + } +}