Skip to content

Commit

Permalink
New benchmarking
Browse files Browse the repository at this point in the history
  • Loading branch information
vsbuffalo committed May 18, 2024
1 parent 211c056 commit 3588443
Show file tree
Hide file tree
Showing 2 changed files with 121 additions and 5 deletions.
89 changes: 84 additions & 5 deletions additional_benchmarks/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ import numpy as np
RSCRIPT = "~/.conda/envs/granges_benchmark/bin/R"
GRANGES = "../target/release/granges"
SEQLENS = "../tests_data/hg38_seqlens.tsv"
NREPS = 50
NREPS = 100

localrules: granges
localrules: granges, windows

rule granges:
output: GRANGES
Expand All @@ -14,9 +14,11 @@ rule granges:
(cargo build --release --features=dev-commands)
"""

## ----- random BED files -----

rule random_bed:
input: seqlens=SEQLENS, granges=GRANGES
output: "random_bed/{size}__{rep}.bed.gz"
output: "random_bed/{size,[0-9]+}__{rep}.bed.gz"
resources:
runtime=30,
mem_mb_per_cpu=1800,
Expand All @@ -26,6 +28,29 @@ rule random_bed:
{input.granges} random-bed --sort --num {wildcards.size} {input.seqlens} | gzip > {output}
"""

rule random_bed_scores:
input: seqlens=SEQLENS, granges=GRANGES
output: "random_bed/scores/{size,[0-9]+}__{rep}.bed.gz"
resources:
runtime=30,
mem_mb_per_cpu=1800,
cpus_per_task=28
shell:
"""
{input.granges} random-bed --sort --num {wildcards.size} --scores {input.seqlens} | gzip > {output}
"""

rule windows:
input: seqlens=SEQLENS, granges=GRANGES
output: "windows/width_{width}.bed"
shell:
"""
{input.granges} windows --genome {input.seqlens} --width {wildcards.width} --output {output}
"""


## ----- granges filter / bedtools intersect / plyranges join overlap inner -----

rule granges_filter:
input: a="random_bed/{size}__A.bed.gz", b="random_bed/{size}__B.bed.gz",
genome=SEQLENS, granges=GRANGES
Expand All @@ -40,7 +65,6 @@ rule granges_filter:
"""
{input.granges} filter --left {input.a} --right {input.b} --genome {input.genome} > {output}
"""


rule bedtools_intersect:
input: a="random_bed/{size}__A.bed.gz", b="random_bed/{size}__B.bed.gz"
Expand Down Expand Up @@ -71,7 +95,62 @@ rule plyranges_join_overlap_inner:
{params.rscript} Rscripts/plyranges_join_overlap_inner.r {input.a} {input.b} > {output}
"""

tools = ["bedtools_intersect", "granges_filter"]
## ----- granges map / bedtools map -----
WINDOWS = "windows/width_100000.bed"

rule granges_map:
input: right=WINDOWS, left="random_bed/scores/{size}__A.bed.gz",
genome=SEQLENS, granges=GRANGES
output: "results/granges_map__size_{size}.bed"
benchmark: repeat("benchmarks/granges_map__size_{size}.tsv", NREPS)
resources:
partition="savio3",
runtime=300,
mem_mb=96000,
cpus_per_task=40
shell:
"""
{input.granges} map --left {input.left} --right {input.right} --genome {input.genome} --func "min,max,mean,sum,median" --output {output}
"""

rule bedtools_map:
input: right=WINDOWS, left="random_bed/scores/{size}__A.bed.gz"
output: "results/bedtools_map__size_{size}.bed"
benchmark: repeat("benchmarks/bedtools_map__size_{size}.tsv", NREPS)
resources:
partition="savio3",
runtime=300,
mem_mb=96000,
cpus_per_task=40
shell:
"""
bedtools map -a {input.left} -b {input.right} -c 5 -o "min,max,mean,sum,median" > {output}
"""

rule plyranges_map:
input:
right=WINDOWS,
left="random_bed/scores/{size}__A.bed.gz"
output:
"results/plyranges_map__size_{size}.bed"
benchmark:
repeat("benchmarks/plyranges_map__size_{size}.tsv", NREPS)
resources:
partition="savio3",
runtime=1000,
mem_mb=96000,
cpus_per_task=40
params: rscript = RSCRIPT
shell:
"""
{params.rscript} Rscripts/plyranges_map.r {input.left} {input.right} > {output}
"""


tools = ["bedtools_intersect", "granges_filter", "plyranges_join_overlap_inner"]
tools += ["bedtools_map", "granges_map", "plyranges_map"]

# BED file sizes
sizes = np.logspace(3, 7, 10).astype('int')

all_results = expand("results/{tool}__size_{size}.bed", tool=tools, size=sizes)
Expand Down
37 changes: 37 additions & 0 deletions additional_benchmarks/rscripts/plyranges_map.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
library(plyranges)

args <- commandArgs(trailingOnly = TRUE)

if (length(args) != 3) {
stop("Usage: Rscript script.R <bed_file_a> <bed_file_b> <output_file>")
}

bed_file_a <- args[1]
bed_file_b <- args[2]
output_file <- args[3]

# Load data into memory
granges_a <- read_bed(bed_file_a)
granges_b <- read_bed(bed_file_b)

# Perform the map operation
mapped_result <- join_overlap_left(
granges_a,
granges_b,
suffix = c(".left", ".right")
) %>%
select(
seqnames,
start.left,
end.left,
width = width.left,
strand,
min = min(score.right),
max = max(score.right),
mean = mean(score.right),
sum = sum(score.right),
median = median(score.right)
)

# Write the mapped result to a BED file
write_bed(mapped_result, output_file)

0 comments on commit 3588443

Please sign in to comment.