diff --git a/additional_benchmarks/Snakefile b/additional_benchmarks/Snakefile index 60c3316..3a968d6 100644 --- a/additional_benchmarks/Snakefile +++ b/additional_benchmarks/Snakefile @@ -1,6 +1,6 @@ import numpy as np -RSCRIPT = "~/.conda/envs/granges_benchmark/bin/R" +RSCRIPT = "~/.conda/envs/granges_benchmark/bin/Rscript" GRANGES = "../target/release/granges" SEQLENS = "../tests_data/hg38_seqlens.tsv" NREPS = 100 @@ -58,7 +58,7 @@ rule granges_filter: benchmark: repeat("benchmarks/granges_filter__size_{size}.tsv", NREPS) resources: partition="savio3", - runtime=300, + runtime=600, mem_mb=96000, cpus_per_task=40 shell: @@ -72,7 +72,7 @@ rule bedtools_intersect: benchmark: repeat("benchmarks/bedtools_intersect__size_{size}.tsv", NREPS) resources: partition="savio3", - runtime=300, + runtime=600, mem_mb=96000, cpus_per_task=40 shell: @@ -87,25 +87,25 @@ rule plyranges_join_overlap_inner: benchmark: repeat("benchmarks/plyranges_join_overlap_inner__size_{size}.tsv", NREPS) resources: partition="savio3", - runtime=300, + runtime=600, mem_mb=96000, cpus_per_task=40 shell: """ - {params.rscript} Rscripts/plyranges_join_overlap_inner.r {input.a} {input.b} > {output} + {params.rscript} rscripts/plyranges_join_overlap_inner.r {input.a} {input.b} {output} """ ## ----- granges map / bedtools map ----- WINDOWS = "windows/width_100000.bed" rule granges_map: - input: right=WINDOWS, left="random_bed/scores/{size}__A.bed.gz", + input: left=WINDOWS, right="random_bed/scores/{size}__A.bed.gz", genome=SEQLENS, granges=GRANGES output: "results/granges_map__size_{size}.bed" benchmark: repeat("benchmarks/granges_map__size_{size}.tsv", NREPS) resources: partition="savio3", - runtime=300, + runtime=600, mem_mb=96000, cpus_per_task=40 shell: @@ -114,12 +114,12 @@ rule granges_map: """ rule bedtools_map: - input: right=WINDOWS, left="random_bed/scores/{size}__A.bed.gz" + input: left=WINDOWS, right="random_bed/scores/{size}__A.bed.gz" output: "results/bedtools_map__size_{size}.bed" benchmark: repeat("benchmarks/bedtools_map__size_{size}.tsv", NREPS) resources: partition="savio3", - runtime=300, + runtime=600, mem_mb=96000, cpus_per_task=40 shell: @@ -129,8 +129,8 @@ rule bedtools_map: rule plyranges_map: input: - right=WINDOWS, - left="random_bed/scores/{size}__A.bed.gz" + left=WINDOWS, + right="random_bed/scores/{size}__A.bed.gz" output: "results/plyranges_map__size_{size}.bed" benchmark: @@ -143,7 +143,7 @@ rule plyranges_map: params: rscript = RSCRIPT shell: """ - {params.rscript} Rscripts/plyranges_map.r {input.left} {input.right} > {output} + {params.rscript} rscripts/plyranges_map.r {input.left} {input.right} {output} """ diff --git a/additional_benchmarks/rscripts/plyranges_map.r b/additional_benchmarks/rscripts/plyranges_map.r index 638dc23..f886d36 100644 --- a/additional_benchmarks/rscripts/plyranges_map.r +++ b/additional_benchmarks/rscripts/plyranges_map.r @@ -1,4 +1,5 @@ library(plyranges) +library(rtracklayer) args <- commandArgs(trailingOnly = TRUE) @@ -15,23 +16,16 @@ granges_a <- read_bed(bed_file_a) granges_b <- read_bed(bed_file_b) # Perform the map operation -mapped_result <- join_overlap_left( - granges_a, - granges_b, - suffix = c(".left", ".right") -) %>% - select( - seqnames, - start.left, - end.left, - width = width.left, - strand, - min = min(score.right), - max = max(score.right), - mean = mean(score.right), - sum = sum(score.right), - median = median(score.right) +mapped_granges <- granges_a %>% + join_overlap_inner(granges_b) %>% + group_by(seqnames, start, end, strand) %>% + summarise( + min = min(score, na.rm = TRUE), + max = max(score, na.rm = TRUE), + mean = mean(score, na.rm = TRUE), + sum = sum(score, na.rm = TRUE), + median = median(score, na.rm = TRUE) ) -# Write the mapped result to a BED file -write_bed(mapped_result, output_file) +mapped_df <- as.data.frame(mapped_granges) +write.table(mapped_df, file = output_file, sep = "\t", quote = FALSE, row.names = FALSE)