From 7448e8288ab63dec42a73b519d2110ea813c4c53 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Sat, 18 May 2024 11:35:40 -0700 Subject: [PATCH] New additional benchmarking to check scaling with input size. - Also changed `unwrap()` to `expect()`. --- additional_benchmarks/Snakefile | 57 +++++++++++++++++++ .../rscripts/plyranges_join_overlap_inner.r | 18 ++++++ src/granges.rs | 3 +- 3 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 additional_benchmarks/Snakefile create mode 100644 additional_benchmarks/rscripts/plyranges_join_overlap_inner.r diff --git a/additional_benchmarks/Snakefile b/additional_benchmarks/Snakefile new file mode 100644 index 0000000..58b100c --- /dev/null +++ b/additional_benchmarks/Snakefile @@ -0,0 +1,57 @@ +import numpy as np + +GRANGES = "../target/release/granges" +SEQLENS = "../tests_data/hg38_seqlens.tsv" +NREPS = 1 + +rule granges: + output: GRANGES + shell: + """ + (cargo build --release --features=dev-commands) + """ + +rule random_bed: + input: seqlens=SEQLENS, granges=GRANGES + output: "random_bed/{size}__{rep}.bed.gz" + shell: + """ + {input.granges} random-bed --sort --num {wildcards.size} {input.seqlens} | gzip > {output} + """ + +rule granges_filter: + input: a="random_bed/{size}__A.bed.gz", b="random_bed/{size}__B.bed.gz", + genome=SEQLENS, granges=GRANGES + output: "results/granges_filter__size_{size}.bed" + benchmark: repeat("benchmarks/granges_filter__size_{size}.tsv", NREPS) + shell: + """ + {input.granges} filter --left {input.a} --right {input.b} --genome {input.genome} > {output} + """ + + +rule bedtools_intersect: + input: a="random_bed/{size}__A.bed.gz", b="random_bed/{size}__B.bed.gz" + output: "results/bedtools_intersect__size_{size}.bed" + benchmark: repeat("benchmarks/bedtools_intersect__size_{size}.tsv", NREPS) + shell: + """ + bedtools intersect -a {input.a} -b {input.b} > {output} + """ + +rule plyranges_join_overlap_inner: + input: a="random_bed/{size}__A.bed.gz", b="random_bed/{size}__B.bed.gz" + output: "results/plyranges_join_overlap_inner__size_{size}.bed" + benchmark: repeat("benchmarks/plyranges_join_overlap_inner__size_{size}.tsv", NREPS) + shell: + """ + Rscript Rscripts/plyranges_join_overlap_inner.r {input.a} {input.b} > {output} + """ + +tools = ["bedtools_intersect", "granges_filter", "plyranges_join_overlap_inner"] +sizes = np.logspace(3, 9, 10) + +all_benches = expand("results/{tool}__size_{size}.bed", tool=tools, size=sizes) + +rule all: + input: all_benches diff --git a/additional_benchmarks/rscripts/plyranges_join_overlap_inner.r b/additional_benchmarks/rscripts/plyranges_join_overlap_inner.r new file mode 100644 index 0000000..911ef5e --- /dev/null +++ b/additional_benchmarks/rscripts/plyranges_join_overlap_inner.r @@ -0,0 +1,18 @@ +library(plyranges) + +args <- commandArgs(trailingOnly = TRUE) +if (length(args) != 3) { + stop("Usage: Rscript script.R ") +} + +bed_file_a <- args[1] +bed_file_b <- args[2] +output_file <- args[3] + +# load data into mem +granges_a <- read_bed(bed_file_a) +granges_b <- read_bed(bed_file_b) + +intersection_result <- join_overlap_inner(granges_a, granges_b) + +write_bed(intersection_result, output_file) diff --git a/src/granges.rs b/src/granges.rs index e5b0a7d..08973c8 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -491,7 +491,8 @@ where for (seqname, indices) in self.data_indices()? { let mut new_data = Vec::new(); for index in indices { - let value = data.get(index).unwrap(); + let value = data.get(index) + .expect("Invalid index in GRanges::data_by_seqname(). This indicates invalid data indices (developer error)."); new_data.push((*value).clone()); } all_new_data.insert(&seqname, new_data)?;