From 3588443cfd7fcbc29a669efe4d6852e7605dcae5 Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Sat, 18 May 2024 16:39:20 -0700 Subject: [PATCH] New benchmarking --- additional_benchmarks/Snakefile | 89 +++++++++++++++++-- .../rscripts/plyranges_map.r | 37 ++++++++ 2 files changed, 121 insertions(+), 5 deletions(-) create mode 100644 additional_benchmarks/rscripts/plyranges_map.r diff --git a/additional_benchmarks/Snakefile b/additional_benchmarks/Snakefile index 0fe6696..60c3316 100644 --- a/additional_benchmarks/Snakefile +++ b/additional_benchmarks/Snakefile @@ -3,9 +3,9 @@ import numpy as np RSCRIPT = "~/.conda/envs/granges_benchmark/bin/R" GRANGES = "../target/release/granges" SEQLENS = "../tests_data/hg38_seqlens.tsv" -NREPS = 50 +NREPS = 100 -localrules: granges +localrules: granges, windows rule granges: output: GRANGES @@ -14,9 +14,11 @@ rule granges: (cargo build --release --features=dev-commands) """ +## ----- random BED files ----- + rule random_bed: input: seqlens=SEQLENS, granges=GRANGES - output: "random_bed/{size}__{rep}.bed.gz" + output: "random_bed/{size,[0-9]+}__{rep}.bed.gz" resources: runtime=30, mem_mb_per_cpu=1800, @@ -26,6 +28,29 @@ rule random_bed: {input.granges} random-bed --sort --num {wildcards.size} {input.seqlens} | gzip > {output} """ +rule random_bed_scores: + input: seqlens=SEQLENS, granges=GRANGES + output: "random_bed/scores/{size,[0-9]+}__{rep}.bed.gz" + resources: + runtime=30, + mem_mb_per_cpu=1800, + cpus_per_task=28 + shell: + """ + {input.granges} random-bed --sort --num {wildcards.size} --scores {input.seqlens} | gzip > {output} + """ + +rule windows: + input: seqlens=SEQLENS, granges=GRANGES + output: "windows/width_{width}.bed" + shell: + """ + {input.granges} windows --genome {input.seqlens} --width {wildcards.width} --output {output} + """ + + +## ----- granges filter / bedtools intersect / plyranges join overlap inner ----- + rule granges_filter: input: a="random_bed/{size}__A.bed.gz", b="random_bed/{size}__B.bed.gz", genome=SEQLENS, granges=GRANGES @@ -40,7 +65,6 @@ rule granges_filter: """ {input.granges} filter --left {input.a} --right {input.b} --genome {input.genome} > {output} """ - rule bedtools_intersect: input: a="random_bed/{size}__A.bed.gz", b="random_bed/{size}__B.bed.gz" @@ -71,7 +95,62 @@ rule plyranges_join_overlap_inner: {params.rscript} Rscripts/plyranges_join_overlap_inner.r {input.a} {input.b} > {output} """ -tools = ["bedtools_intersect", "granges_filter"] +## ----- granges map / bedtools map ----- +WINDOWS = "windows/width_100000.bed" + +rule granges_map: + input: right=WINDOWS, left="random_bed/scores/{size}__A.bed.gz", + genome=SEQLENS, granges=GRANGES + output: "results/granges_map__size_{size}.bed" + benchmark: repeat("benchmarks/granges_map__size_{size}.tsv", NREPS) + resources: + partition="savio3", + runtime=300, + mem_mb=96000, + cpus_per_task=40 + shell: + """ + {input.granges} map --left {input.left} --right {input.right} --genome {input.genome} --func "min,max,mean,sum,median" --output {output} + """ + +rule bedtools_map: + input: right=WINDOWS, left="random_bed/scores/{size}__A.bed.gz" + output: "results/bedtools_map__size_{size}.bed" + benchmark: repeat("benchmarks/bedtools_map__size_{size}.tsv", NREPS) + resources: + partition="savio3", + runtime=300, + mem_mb=96000, + cpus_per_task=40 + shell: + """ + bedtools map -a {input.left} -b {input.right} -c 5 -o "min,max,mean,sum,median" > {output} + """ + +rule plyranges_map: + input: + right=WINDOWS, + left="random_bed/scores/{size}__A.bed.gz" + output: + "results/plyranges_map__size_{size}.bed" + benchmark: + repeat("benchmarks/plyranges_map__size_{size}.tsv", NREPS) + resources: + partition="savio3", + runtime=1000, + mem_mb=96000, + cpus_per_task=40 + params: rscript = RSCRIPT + shell: + """ + {params.rscript} Rscripts/plyranges_map.r {input.left} {input.right} > {output} + """ + + +tools = ["bedtools_intersect", "granges_filter", "plyranges_join_overlap_inner"] +tools += ["bedtools_map", "granges_map", "plyranges_map"] + +# BED file sizes sizes = np.logspace(3, 7, 10).astype('int') all_results = expand("results/{tool}__size_{size}.bed", tool=tools, size=sizes) diff --git a/additional_benchmarks/rscripts/plyranges_map.r b/additional_benchmarks/rscripts/plyranges_map.r new file mode 100644 index 0000000..638dc23 --- /dev/null +++ b/additional_benchmarks/rscripts/plyranges_map.r @@ -0,0 +1,37 @@ +library(plyranges) + +args <- commandArgs(trailingOnly = TRUE) + +if (length(args) != 3) { + stop("Usage: Rscript script.R ") +} + +bed_file_a <- args[1] +bed_file_b <- args[2] +output_file <- args[3] + +# Load data into memory +granges_a <- read_bed(bed_file_a) +granges_b <- read_bed(bed_file_b) + +# Perform the map operation +mapped_result <- join_overlap_left( + granges_a, + granges_b, + suffix = c(".left", ".right") +) %>% + select( + seqnames, + start.left, + end.left, + width = width.left, + strand, + min = min(score.right), + max = max(score.right), + mean = mean(score.right), + sum = sum(score.right), + median = median(score.right) + ) + +# Write the mapped result to a BED file +write_bed(mapped_result, output_file)