Skip to content

Commit

Permalink
New additional benchmarks for preprint.
Browse files Browse the repository at this point in the history
 - This adds some Snakemake/Python-based benchmarks for
   clearer comparison to other tools.
 - Also changed `unwrap()` to `expect()`.
  • Loading branch information
vsbuffalo committed May 28, 2024
1 parent 39217cd commit e5b19eb
Show file tree
Hide file tree
Showing 8 changed files with 351 additions and 1 deletion.
7 changes: 7 additions & 0 deletions additional_benchmarks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Additional Benchmarks

## Environment

mamba create -n granges_benchmark -c conda-forge -c bioconda --file requirements.txt --yes
pip install snakemake-executor-plugin-slurm

221 changes: 221 additions & 0 deletions additional_benchmarks/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
import numpy as np

RSCRIPT = "~/.conda/envs/granges_benchmark/bin/Rscript"
GRANGES = "../target/release/granges"
SEQLENS = "../tests_data/hg38_seqlens.tsv"
NREPS = 100

localrules: granges, windows

rule granges:
output: GRANGES
shell:
"""
(cargo build --release --features=dev-commands)
"""

## ----- random BED files -----

rule random_bed:
input: seqlens=SEQLENS, granges=GRANGES
output: "random_bed/{size,[0-9]+}__{rep}.bed.gz"
resources:
runtime=30,
mem_mb_per_cpu=1800,
cpus_per_task=28
shell:
"""
{input.granges} random-bed --sort --num {wildcards.size} {input.seqlens} | gzip > {output}
"""

rule random_bed_scores:
input: seqlens=SEQLENS, granges=GRANGES
output: "random_bed/scores/{size,[0-9]+}__{rep}.bed.gz"
resources:
runtime=30,
mem_mb_per_cpu=1800,
cpus_per_task=28
shell:
"""
{input.granges} random-bed --sort --num {wildcards.size} --scores {input.seqlens} | gzip > {output}
"""

rule windows:
input: seqlens=SEQLENS, granges=GRANGES
output: "windows/width_{width}.bed"
shell:
"""
{input.granges} windows --genome {input.seqlens} --width {wildcards.width} --output {output}
"""


## ----- granges filter / bedtools intersect / plyranges join overlap inner -----

rule granges_filter:
input: a="random_bed/{size}__A.bed.gz", b="random_bed/{size}__B.bed.gz",
genome=SEQLENS, granges=GRANGES
output: "results/granges_filter__size_{size}.bed"
benchmark: repeat("benchmarks/granges_filter__size_{size}.tsv", NREPS)
resources:
partition="savio3",
runtime=600,
mem_mb=96000,
cpus_per_task=40
shell:
"""
{input.granges} filter --left {input.a} --right {input.b} --genome {input.genome} > {output}
"""

rule bedtools_intersect:
input: a="random_bed/{size}__A.bed.gz", b="random_bed/{size}__B.bed.gz"
output: "results/bedtools_intersect__size_{size}.bed"
benchmark: repeat("benchmarks/bedtools_intersect__size_{size}.tsv", NREPS)
resources:
partition="savio3",
runtime=600,
mem_mb=96000,
cpus_per_task=40
shell:
"""
bedtools intersect -a {input.a} -b {input.b} > {output}
"""

rule plyranges_join_overlap_inner:
input: a="random_bed/{size}__A.bed.gz", b="random_bed/{size}__B.bed.gz"
output: "results/plyranges_join_overlap_inner__size_{size}.bed"
params: rscript = RSCRIPT
benchmark: repeat("benchmarks/plyranges_join_overlap_inner__size_{size}.tsv", NREPS)
resources:
partition="savio3",
runtime=600,
mem_mb=96000,
cpus_per_task=40
shell:
"""
{params.rscript} rscripts/plyranges_join_overlap_inner.r {input.a} {input.b} {output}
"""

## ----- granges map / bedtools map -----
WINDOWS = "windows/width_100000.bed"

rule granges_map:
input: left=WINDOWS, right="random_bed/scores/{size}__A.bed.gz",
genome=SEQLENS, granges=GRANGES
output: "results/granges_map__size_{size}.bed"
benchmark: repeat("benchmarks/granges_map__size_{size}.tsv", NREPS)
resources:
partition="savio3",
runtime=600,
mem_mb=96000,
cpus_per_task=40
shell:
"""
{input.granges} map --left {input.left} --right {input.right} --genome {input.genome} --func "min,max,mean,sum,median" --output {output}
"""

rule bedtools_map:
input: left=WINDOWS, right="random_bed/scores/{size}__A.bed.gz"
output: "results/bedtools_map__size_{size}.bed"
benchmark: repeat("benchmarks/bedtools_map__size_{size}.tsv", NREPS)
resources:
partition="savio3",
runtime=600,
mem_mb=96000,
cpus_per_task=40
shell:
"""
bedtools map -a {input.left} -b {input.right} -c 5 -o "min,max,mean,sum,median" > {output}
"""

rule plyranges_map:
input:
left=WINDOWS,
right="random_bed/scores/{size}__A.bed.gz"
output:
"results/plyranges_map__size_{size}.bed"
benchmark:
repeat("benchmarks/plyranges_map__size_{size}.tsv", NREPS)
resources:
partition="savio3",
runtime=1000,
mem_mb=96000,
cpus_per_task=40
params: rscript = RSCRIPT
shell:
"""
{params.rscript} rscripts/plyranges_map.r {input.left} {input.right} {output}
"""


tools = ["bedtools_intersect", "granges_filter", "plyranges_join_overlap_inner"]
tools += ["bedtools_map", "granges_map", "plyranges_map"]

# BED file sizes
sizes = np.logspace(3, 7, 10).astype('int')

all_results = expand("results/{tool}__size_{size}.bed", tool=tools, size=sizes)

rule all:
input: all_results


rule combine:
input: expand("benchmarks/{tool}__size_{size}.tsv", tool=tools, size=sizes)
output: "combined_benchmarks.tsv"
run:
import pandas as pd

def read_tsv(file):
return pd.read_csv(file, sep="\t")

dfs = []
for tool in tools:
for size in sizes:
file = f"benchmarks/{tool}__size_{size}.tsv"
df = read_tsv(file)
df["tool"] = tool
df["size"] = size
dfs.append(df)

combined_df = pd.concat(dfs, ignore_index=True)
combined_df.to_csv(output[0], sep="\t", index=False)

rule figure:
input: "combined_benchmarks.tsv"
output: "benchmark_figure.pdf"
run:
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import numpy as np
mpl.use('agg')

d = pd.read_csv(input[0], sep="\t")
ds = d.groupby(['tool', 'size'], as_index=False)['s'].median()
ds = ds.loc[ds['size'] > 1e3]

tools = d['tool'].unique()
num_tools = len(tools)
colors = plt.cm.Paired.colors

fig, ax = plt.subplots()

i = 0
for tool in ds['tool'].unique():
data = ds[ds['tool'] == tool]
tool, cmd = tool.split('_')[:2]

color = colors[i]
i += 1
ax.plot(data['size'], data['s'], '-o', label=f"{tool} {cmd}", color=color,)

ax.semilogx()
ax.semilogy()
ax.set_xlabel('Number of BED ranges')
ax.set_ylabel('Time (s)')
ax.legend(fontsize=8, frameon=False)
plt.tight_layout()
plt.savefig(output[0])

rule merge:
input: "benchmark_figure.pdf"
6 changes: 6 additions & 0 deletions additional_benchmarks/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
snakemake
numpy
scipy
matplotlib
bedtools
R
18 changes: 18 additions & 0 deletions additional_benchmarks/rscripts/plyranges_join_overlap_inner.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
library(plyranges)

args <- commandArgs(trailingOnly = TRUE)
if (length(args) != 3) {
stop("Usage: Rscript script.R <bed_file_a> <bed_file_b> <output_file>")
}

bed_file_a <- args[1]
bed_file_b <- args[2]
output_file <- args[3]

# load data into mem
granges_a <- read_bed(bed_file_a)
granges_b <- read_bed(bed_file_b)

intersection_result <- join_overlap_inner(granges_a, granges_b)

write_bed(intersection_result, output_file)
31 changes: 31 additions & 0 deletions additional_benchmarks/rscripts/plyranges_map.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
library(plyranges)
library(rtracklayer)

args <- commandArgs(trailingOnly = TRUE)

if (length(args) != 3) {
stop("Usage: Rscript script.R <bed_file_a> <bed_file_b> <output_file>")
}

bed_file_a <- args[1]
bed_file_b <- args[2]
output_file <- args[3]

# Load data into memory
granges_a <- read_bed(bed_file_a)
granges_b <- read_bed(bed_file_b)

# Perform the map operation
mapped_granges <- granges_a %>%
join_overlap_inner(granges_b) %>%
group_by(seqnames, start, end, strand) %>%
summarise(
min = min(score, na.rm = TRUE),
max = max(score, na.rm = TRUE),
mean = mean(score, na.rm = TRUE),
sum = sum(score, na.rm = TRUE),
median = median(score, na.rm = TRUE)
)

mapped_df <- as.data.frame(mapped_granges)
write.table(mapped_df, file = output_file, sep = "\t", quote = FALSE, row.names = FALSE)
66 changes: 66 additions & 0 deletions examples/weighted_mean.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
use granges::prelude::*;

// Our overlap data processing function - a weighted mean.
pub fn weighted_mean_score(join_data: CombinedJoinDataLeftEmpty<Option<f64>>) -> f64 {
// Get the widths of overlaps.
let overlap_widths = join_data.join().overlap_widths();

// Get the "right data" -- the BED5 scores.
let overlap_scores: Vec<Option<f64>> = join_data
.right_data
.into_iter()
.collect();

// Calculate the weighted mean
let mut score_sum = 0.0;
let mut total_width = 0.0;
for (score, width) in overlap_scores.iter().zip(overlap_widths) {
if let Some(value) = score {
score_sum += value * (width as f64);
total_width += width as f64;
} else {
// This is a missing value, a '.' in BED.
continue;
}
}
if total_width > 0.0 {
score_sum / total_width
} else {
0.0
}
}

fn try_main() -> Result<(), granges::error::GRangesError> {
// Load in the chromosome lengths.
let genome = read_seqlens("tests_data/hg38_seqlens.tsv")?;

// Create 1Mb windows, non-sliding, chopping off
// remainder.
let width = 1_000_000;
let windows = GRangesEmpty::from_windows(&genome, width, None, true)?;

// Create a new iterator over a BED5 file.
let scores_iter = Bed5Iterator::new("tests_data/scores.bed.gz")?;

// Load the ranges from the iterator into a GRanges object.
let scores = GRanges::from_iter(scores_iter, &genome)?
// Convert to interval trees.
.into_coitrees()?
// Extract out just the score from the additional
// BED5 columns.
.map_data(|bed5_cols| bed5_cols.score)?;

windows
// Do a "left overlap join" with scores.
.left_overlaps(&scores)?
// Process each overlapping join by applying
// a weighted mean function.
.map_joins(weighted_mean_score)?
// Write to a BED file.
.write_to_tsv(Some("tests_data/weighted_means.bed"), &BED_TSV)?;
Ok(())
}

fn main() {
try_main().unwrap();
}
3 changes: 2 additions & 1 deletion src/granges.rs
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,8 @@ where
for (seqname, indices) in self.data_indices()? {
let mut new_data = Vec::new();
for index in indices {
let value = data.get(index).unwrap();
let value = data.get(index)
.expect("Invalid index in GRanges::data_by_seqname(). This indicates invalid data indices (developer error).");
new_data.push((*value).clone());
}
all_new_data.insert(&seqname, new_data)?;
Expand Down
Binary file added tests_data/scores.bed.gz
Binary file not shown.

0 comments on commit e5b19eb

Please sign in to comment.