-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
New additional benchmarks for preprint.
- This adds some Snakemake/Python-based benchmarks for clearer comparison to other tools. - Also changed `unwrap()` to `expect()`.
- Loading branch information
Showing
8 changed files
with
351 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Additional Benchmarks | ||
|
||
## Environment | ||
|
||
mamba create -n granges_benchmark -c conda-forge -c bioconda --file requirements.txt --yes | ||
pip install snakemake-executor-plugin-slurm | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,221 @@ | ||
import numpy as np | ||
|
||
RSCRIPT = "~/.conda/envs/granges_benchmark/bin/Rscript" | ||
GRANGES = "../target/release/granges" | ||
SEQLENS = "../tests_data/hg38_seqlens.tsv" | ||
NREPS = 100 | ||
|
||
localrules: granges, windows | ||
|
||
rule granges: | ||
output: GRANGES | ||
shell: | ||
""" | ||
(cargo build --release --features=dev-commands) | ||
""" | ||
|
||
## ----- random BED files ----- | ||
|
||
rule random_bed: | ||
input: seqlens=SEQLENS, granges=GRANGES | ||
output: "random_bed/{size,[0-9]+}__{rep}.bed.gz" | ||
resources: | ||
runtime=30, | ||
mem_mb_per_cpu=1800, | ||
cpus_per_task=28 | ||
shell: | ||
""" | ||
{input.granges} random-bed --sort --num {wildcards.size} {input.seqlens} | gzip > {output} | ||
""" | ||
|
||
rule random_bed_scores: | ||
input: seqlens=SEQLENS, granges=GRANGES | ||
output: "random_bed/scores/{size,[0-9]+}__{rep}.bed.gz" | ||
resources: | ||
runtime=30, | ||
mem_mb_per_cpu=1800, | ||
cpus_per_task=28 | ||
shell: | ||
""" | ||
{input.granges} random-bed --sort --num {wildcards.size} --scores {input.seqlens} | gzip > {output} | ||
""" | ||
|
||
rule windows: | ||
input: seqlens=SEQLENS, granges=GRANGES | ||
output: "windows/width_{width}.bed" | ||
shell: | ||
""" | ||
{input.granges} windows --genome {input.seqlens} --width {wildcards.width} --output {output} | ||
""" | ||
|
||
|
||
## ----- granges filter / bedtools intersect / plyranges join overlap inner ----- | ||
|
||
rule granges_filter: | ||
input: a="random_bed/{size}__A.bed.gz", b="random_bed/{size}__B.bed.gz", | ||
genome=SEQLENS, granges=GRANGES | ||
output: "results/granges_filter__size_{size}.bed" | ||
benchmark: repeat("benchmarks/granges_filter__size_{size}.tsv", NREPS) | ||
resources: | ||
partition="savio3", | ||
runtime=600, | ||
mem_mb=96000, | ||
cpus_per_task=40 | ||
shell: | ||
""" | ||
{input.granges} filter --left {input.a} --right {input.b} --genome {input.genome} > {output} | ||
""" | ||
|
||
rule bedtools_intersect: | ||
input: a="random_bed/{size}__A.bed.gz", b="random_bed/{size}__B.bed.gz" | ||
output: "results/bedtools_intersect__size_{size}.bed" | ||
benchmark: repeat("benchmarks/bedtools_intersect__size_{size}.tsv", NREPS) | ||
resources: | ||
partition="savio3", | ||
runtime=600, | ||
mem_mb=96000, | ||
cpus_per_task=40 | ||
shell: | ||
""" | ||
bedtools intersect -a {input.a} -b {input.b} > {output} | ||
""" | ||
|
||
rule plyranges_join_overlap_inner: | ||
input: a="random_bed/{size}__A.bed.gz", b="random_bed/{size}__B.bed.gz" | ||
output: "results/plyranges_join_overlap_inner__size_{size}.bed" | ||
params: rscript = RSCRIPT | ||
benchmark: repeat("benchmarks/plyranges_join_overlap_inner__size_{size}.tsv", NREPS) | ||
resources: | ||
partition="savio3", | ||
runtime=600, | ||
mem_mb=96000, | ||
cpus_per_task=40 | ||
shell: | ||
""" | ||
{params.rscript} rscripts/plyranges_join_overlap_inner.r {input.a} {input.b} {output} | ||
""" | ||
|
||
## ----- granges map / bedtools map ----- | ||
WINDOWS = "windows/width_100000.bed" | ||
|
||
rule granges_map: | ||
input: left=WINDOWS, right="random_bed/scores/{size}__A.bed.gz", | ||
genome=SEQLENS, granges=GRANGES | ||
output: "results/granges_map__size_{size}.bed" | ||
benchmark: repeat("benchmarks/granges_map__size_{size}.tsv", NREPS) | ||
resources: | ||
partition="savio3", | ||
runtime=600, | ||
mem_mb=96000, | ||
cpus_per_task=40 | ||
shell: | ||
""" | ||
{input.granges} map --left {input.left} --right {input.right} --genome {input.genome} --func "min,max,mean,sum,median" --output {output} | ||
""" | ||
|
||
rule bedtools_map: | ||
input: left=WINDOWS, right="random_bed/scores/{size}__A.bed.gz" | ||
output: "results/bedtools_map__size_{size}.bed" | ||
benchmark: repeat("benchmarks/bedtools_map__size_{size}.tsv", NREPS) | ||
resources: | ||
partition="savio3", | ||
runtime=600, | ||
mem_mb=96000, | ||
cpus_per_task=40 | ||
shell: | ||
""" | ||
bedtools map -a {input.left} -b {input.right} -c 5 -o "min,max,mean,sum,median" > {output} | ||
""" | ||
|
||
rule plyranges_map: | ||
input: | ||
left=WINDOWS, | ||
right="random_bed/scores/{size}__A.bed.gz" | ||
output: | ||
"results/plyranges_map__size_{size}.bed" | ||
benchmark: | ||
repeat("benchmarks/plyranges_map__size_{size}.tsv", NREPS) | ||
resources: | ||
partition="savio3", | ||
runtime=1000, | ||
mem_mb=96000, | ||
cpus_per_task=40 | ||
params: rscript = RSCRIPT | ||
shell: | ||
""" | ||
{params.rscript} rscripts/plyranges_map.r {input.left} {input.right} {output} | ||
""" | ||
|
||
|
||
tools = ["bedtools_intersect", "granges_filter", "plyranges_join_overlap_inner"] | ||
tools += ["bedtools_map", "granges_map", "plyranges_map"] | ||
|
||
# BED file sizes | ||
sizes = np.logspace(3, 7, 10).astype('int') | ||
|
||
all_results = expand("results/{tool}__size_{size}.bed", tool=tools, size=sizes) | ||
|
||
rule all: | ||
input: all_results | ||
|
||
|
||
rule combine: | ||
input: expand("benchmarks/{tool}__size_{size}.tsv", tool=tools, size=sizes) | ||
output: "combined_benchmarks.tsv" | ||
run: | ||
import pandas as pd | ||
|
||
def read_tsv(file): | ||
return pd.read_csv(file, sep="\t") | ||
|
||
dfs = [] | ||
for tool in tools: | ||
for size in sizes: | ||
file = f"benchmarks/{tool}__size_{size}.tsv" | ||
df = read_tsv(file) | ||
df["tool"] = tool | ||
df["size"] = size | ||
dfs.append(df) | ||
|
||
combined_df = pd.concat(dfs, ignore_index=True) | ||
combined_df.to_csv(output[0], sep="\t", index=False) | ||
|
||
rule figure: | ||
input: "combined_benchmarks.tsv" | ||
output: "benchmark_figure.pdf" | ||
run: | ||
import matplotlib.pyplot as plt | ||
import matplotlib as mpl | ||
import pandas as pd | ||
import numpy as np | ||
mpl.use('agg') | ||
|
||
d = pd.read_csv(input[0], sep="\t") | ||
ds = d.groupby(['tool', 'size'], as_index=False)['s'].median() | ||
ds = ds.loc[ds['size'] > 1e3] | ||
|
||
tools = d['tool'].unique() | ||
num_tools = len(tools) | ||
colors = plt.cm.Paired.colors | ||
|
||
fig, ax = plt.subplots() | ||
|
||
i = 0 | ||
for tool in ds['tool'].unique(): | ||
data = ds[ds['tool'] == tool] | ||
tool, cmd = tool.split('_')[:2] | ||
|
||
color = colors[i] | ||
i += 1 | ||
ax.plot(data['size'], data['s'], '-o', label=f"{tool} {cmd}", color=color,) | ||
|
||
ax.semilogx() | ||
ax.semilogy() | ||
ax.set_xlabel('Number of BED ranges') | ||
ax.set_ylabel('Time (s)') | ||
ax.legend(fontsize=8, frameon=False) | ||
plt.tight_layout() | ||
plt.savefig(output[0]) | ||
|
||
rule merge: | ||
input: "benchmark_figure.pdf" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
snakemake | ||
numpy | ||
scipy | ||
matplotlib | ||
bedtools | ||
R |
18 changes: 18 additions & 0 deletions
18
additional_benchmarks/rscripts/plyranges_join_overlap_inner.r
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
library(plyranges) | ||
|
||
args <- commandArgs(trailingOnly = TRUE) | ||
if (length(args) != 3) { | ||
stop("Usage: Rscript script.R <bed_file_a> <bed_file_b> <output_file>") | ||
} | ||
|
||
bed_file_a <- args[1] | ||
bed_file_b <- args[2] | ||
output_file <- args[3] | ||
|
||
# load data into mem | ||
granges_a <- read_bed(bed_file_a) | ||
granges_b <- read_bed(bed_file_b) | ||
|
||
intersection_result <- join_overlap_inner(granges_a, granges_b) | ||
|
||
write_bed(intersection_result, output_file) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
library(plyranges) | ||
library(rtracklayer) | ||
|
||
args <- commandArgs(trailingOnly = TRUE) | ||
|
||
if (length(args) != 3) { | ||
stop("Usage: Rscript script.R <bed_file_a> <bed_file_b> <output_file>") | ||
} | ||
|
||
bed_file_a <- args[1] | ||
bed_file_b <- args[2] | ||
output_file <- args[3] | ||
|
||
# Load data into memory | ||
granges_a <- read_bed(bed_file_a) | ||
granges_b <- read_bed(bed_file_b) | ||
|
||
# Perform the map operation | ||
mapped_granges <- granges_a %>% | ||
join_overlap_inner(granges_b) %>% | ||
group_by(seqnames, start, end, strand) %>% | ||
summarise( | ||
min = min(score, na.rm = TRUE), | ||
max = max(score, na.rm = TRUE), | ||
mean = mean(score, na.rm = TRUE), | ||
sum = sum(score, na.rm = TRUE), | ||
median = median(score, na.rm = TRUE) | ||
) | ||
|
||
mapped_df <- as.data.frame(mapped_granges) | ||
write.table(mapped_df, file = output_file, sep = "\t", quote = FALSE, row.names = FALSE) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
use granges::prelude::*; | ||
|
||
// Our overlap data processing function - a weighted mean. | ||
pub fn weighted_mean_score(join_data: CombinedJoinDataLeftEmpty<Option<f64>>) -> f64 { | ||
// Get the widths of overlaps. | ||
let overlap_widths = join_data.join().overlap_widths(); | ||
|
||
// Get the "right data" -- the BED5 scores. | ||
let overlap_scores: Vec<Option<f64>> = join_data | ||
.right_data | ||
.into_iter() | ||
.collect(); | ||
|
||
// Calculate the weighted mean | ||
let mut score_sum = 0.0; | ||
let mut total_width = 0.0; | ||
for (score, width) in overlap_scores.iter().zip(overlap_widths) { | ||
if let Some(value) = score { | ||
score_sum += value * (width as f64); | ||
total_width += width as f64; | ||
} else { | ||
// This is a missing value, a '.' in BED. | ||
continue; | ||
} | ||
} | ||
if total_width > 0.0 { | ||
score_sum / total_width | ||
} else { | ||
0.0 | ||
} | ||
} | ||
|
||
fn try_main() -> Result<(), granges::error::GRangesError> { | ||
// Load in the chromosome lengths. | ||
let genome = read_seqlens("tests_data/hg38_seqlens.tsv")?; | ||
|
||
// Create 1Mb windows, non-sliding, chopping off | ||
// remainder. | ||
let width = 1_000_000; | ||
let windows = GRangesEmpty::from_windows(&genome, width, None, true)?; | ||
|
||
// Create a new iterator over a BED5 file. | ||
let scores_iter = Bed5Iterator::new("tests_data/scores.bed.gz")?; | ||
|
||
// Load the ranges from the iterator into a GRanges object. | ||
let scores = GRanges::from_iter(scores_iter, &genome)? | ||
// Convert to interval trees. | ||
.into_coitrees()? | ||
// Extract out just the score from the additional | ||
// BED5 columns. | ||
.map_data(|bed5_cols| bed5_cols.score)?; | ||
|
||
windows | ||
// Do a "left overlap join" with scores. | ||
.left_overlaps(&scores)? | ||
// Process each overlapping join by applying | ||
// a weighted mean function. | ||
.map_joins(weighted_mean_score)? | ||
// Write to a BED file. | ||
.write_to_tsv(Some("tests_data/weighted_means.bed"), &BED_TSV)?; | ||
Ok(()) | ||
} | ||
|
||
fn main() { | ||
try_main().unwrap(); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.