diff --git a/additional_benchmarks/README.md b/additional_benchmarks/README.md new file mode 100644 index 0000000..094548a --- /dev/null +++ b/additional_benchmarks/README.md @@ -0,0 +1,7 @@ +# Additional Benchmarks + +## Environment + + mamba create -n granges_benchmark -c conda-forge -c bioconda --file requirements.txt --yes + pip install snakemake-executor-plugin-slurm + diff --git a/additional_benchmarks/Snakefile b/additional_benchmarks/Snakefile new file mode 100644 index 0000000..101b221 --- /dev/null +++ b/additional_benchmarks/Snakefile @@ -0,0 +1,221 @@ +import numpy as np + +RSCRIPT = "~/.conda/envs/granges_benchmark/bin/Rscript" +GRANGES = "../target/release/granges" +SEQLENS = "../tests_data/hg38_seqlens.tsv" +NREPS = 100 + +localrules: granges, windows + +rule granges: + output: GRANGES + shell: + """ + (cargo build --release --features=dev-commands) + """ + +## ----- random BED files ----- + +rule random_bed: + input: seqlens=SEQLENS, granges=GRANGES + output: "random_bed/{size,[0-9]+}__{rep}.bed.gz" + resources: + runtime=30, + mem_mb_per_cpu=1800, + cpus_per_task=28 + shell: + """ + {input.granges} random-bed --sort --num {wildcards.size} {input.seqlens} | gzip > {output} + """ + +rule random_bed_scores: + input: seqlens=SEQLENS, granges=GRANGES + output: "random_bed/scores/{size,[0-9]+}__{rep}.bed.gz" + resources: + runtime=30, + mem_mb_per_cpu=1800, + cpus_per_task=28 + shell: + """ + {input.granges} random-bed --sort --num {wildcards.size} --scores {input.seqlens} | gzip > {output} + """ + +rule windows: + input: seqlens=SEQLENS, granges=GRANGES + output: "windows/width_{width}.bed" + shell: + """ + {input.granges} windows --genome {input.seqlens} --width {wildcards.width} --output {output} + """ + + +## ----- granges filter / bedtools intersect / plyranges join overlap inner ----- + +rule granges_filter: + input: a="random_bed/{size}__A.bed.gz", b="random_bed/{size}__B.bed.gz", + genome=SEQLENS, granges=GRANGES + output: "results/granges_filter__size_{size}.bed" + benchmark: repeat("benchmarks/granges_filter__size_{size}.tsv", NREPS) + resources: + partition="savio3", + runtime=600, + mem_mb=96000, + cpus_per_task=40 + shell: + """ + {input.granges} filter --left {input.a} --right {input.b} --genome {input.genome} > {output} + """ + +rule bedtools_intersect: + input: a="random_bed/{size}__A.bed.gz", b="random_bed/{size}__B.bed.gz" + output: "results/bedtools_intersect__size_{size}.bed" + benchmark: repeat("benchmarks/bedtools_intersect__size_{size}.tsv", NREPS) + resources: + partition="savio3", + runtime=600, + mem_mb=96000, + cpus_per_task=40 + shell: + """ + bedtools intersect -a {input.a} -b {input.b} > {output} + """ + +rule plyranges_join_overlap_inner: + input: a="random_bed/{size}__A.bed.gz", b="random_bed/{size}__B.bed.gz" + output: "results/plyranges_join_overlap_inner__size_{size}.bed" + params: rscript = RSCRIPT + benchmark: repeat("benchmarks/plyranges_join_overlap_inner__size_{size}.tsv", NREPS) + resources: + partition="savio3", + runtime=600, + mem_mb=96000, + cpus_per_task=40 + shell: + """ + {params.rscript} rscripts/plyranges_join_overlap_inner.r {input.a} {input.b} {output} + """ + +## ----- granges map / bedtools map ----- +WINDOWS = "windows/width_100000.bed" + +rule granges_map: + input: left=WINDOWS, right="random_bed/scores/{size}__A.bed.gz", + genome=SEQLENS, granges=GRANGES + output: "results/granges_map__size_{size}.bed" + benchmark: repeat("benchmarks/granges_map__size_{size}.tsv", NREPS) + resources: + partition="savio3", + runtime=600, + mem_mb=96000, + cpus_per_task=40 + shell: + """ + {input.granges} map --left {input.left} --right {input.right} --genome {input.genome} --func "min,max,mean,sum,median" --output {output} + """ + +rule bedtools_map: + input: left=WINDOWS, right="random_bed/scores/{size}__A.bed.gz" + output: "results/bedtools_map__size_{size}.bed" + benchmark: repeat("benchmarks/bedtools_map__size_{size}.tsv", NREPS) + resources: + partition="savio3", + runtime=600, + mem_mb=96000, + cpus_per_task=40 + shell: + """ + bedtools map -a {input.left} -b {input.right} -c 5 -o "min,max,mean,sum,median" > {output} + """ + +rule plyranges_map: + input: + left=WINDOWS, + right="random_bed/scores/{size}__A.bed.gz" + output: + "results/plyranges_map__size_{size}.bed" + benchmark: + repeat("benchmarks/plyranges_map__size_{size}.tsv", NREPS) + resources: + partition="savio3", + runtime=1000, + mem_mb=96000, + cpus_per_task=40 + params: rscript = RSCRIPT + shell: + """ + {params.rscript} rscripts/plyranges_map.r {input.left} {input.right} {output} + """ + + +tools = ["bedtools_intersect", "granges_filter", "plyranges_join_overlap_inner"] +tools += ["bedtools_map", "granges_map", "plyranges_map"] + +# BED file sizes +sizes = np.logspace(3, 7, 10).astype('int') + +all_results = expand("results/{tool}__size_{size}.bed", tool=tools, size=sizes) + +rule all: + input: all_results + + +rule combine: + input: expand("benchmarks/{tool}__size_{size}.tsv", tool=tools, size=sizes) + output: "combined_benchmarks.tsv" + run: + import pandas as pd + + def read_tsv(file): + return pd.read_csv(file, sep="\t") + + dfs = [] + for tool in tools: + for size in sizes: + file = f"benchmarks/{tool}__size_{size}.tsv" + df = read_tsv(file) + df["tool"] = tool + df["size"] = size + dfs.append(df) + + combined_df = pd.concat(dfs, ignore_index=True) + combined_df.to_csv(output[0], sep="\t", index=False) + +rule figure: + input: "combined_benchmarks.tsv" + output: "benchmark_figure.pdf" + run: + import matplotlib.pyplot as plt + import matplotlib as mpl + import pandas as pd + import numpy as np + mpl.use('agg') + + d = pd.read_csv(input[0], sep="\t") + ds = d.groupby(['tool', 'size'], as_index=False)['s'].median() + ds = ds.loc[ds['size'] > 1e3] + + tools = d['tool'].unique() + num_tools = len(tools) + colors = plt.cm.Paired.colors + + fig, ax = plt.subplots() + + i = 0 + for tool in ds['tool'].unique(): + data = ds[ds['tool'] == tool] + tool, cmd = tool.split('_')[:2] + + color = colors[i] + i += 1 + ax.plot(data['size'], data['s'], '-o', label=f"{tool} {cmd}", color=color,) + + ax.semilogx() + ax.semilogy() + ax.set_xlabel('Number of BED ranges') + ax.set_ylabel('Time (s)') + ax.legend(fontsize=8, frameon=False) + plt.tight_layout() + plt.savefig(output[0]) + +rule merge: + input: "benchmark_figure.pdf" diff --git a/additional_benchmarks/requirements.txt b/additional_benchmarks/requirements.txt new file mode 100644 index 0000000..7b80722 --- /dev/null +++ b/additional_benchmarks/requirements.txt @@ -0,0 +1,6 @@ +snakemake +numpy +scipy +matplotlib +bedtools +R diff --git a/additional_benchmarks/rscripts/plyranges_join_overlap_inner.r b/additional_benchmarks/rscripts/plyranges_join_overlap_inner.r new file mode 100644 index 0000000..911ef5e --- /dev/null +++ b/additional_benchmarks/rscripts/plyranges_join_overlap_inner.r @@ -0,0 +1,18 @@ +library(plyranges) + +args <- commandArgs(trailingOnly = TRUE) +if (length(args) != 3) { + stop("Usage: Rscript script.R ") +} + +bed_file_a <- args[1] +bed_file_b <- args[2] +output_file <- args[3] + +# load data into mem +granges_a <- read_bed(bed_file_a) +granges_b <- read_bed(bed_file_b) + +intersection_result <- join_overlap_inner(granges_a, granges_b) + +write_bed(intersection_result, output_file) diff --git a/additional_benchmarks/rscripts/plyranges_map.r b/additional_benchmarks/rscripts/plyranges_map.r new file mode 100644 index 0000000..f886d36 --- /dev/null +++ b/additional_benchmarks/rscripts/plyranges_map.r @@ -0,0 +1,31 @@ +library(plyranges) +library(rtracklayer) + +args <- commandArgs(trailingOnly = TRUE) + +if (length(args) != 3) { + stop("Usage: Rscript script.R ") +} + +bed_file_a <- args[1] +bed_file_b <- args[2] +output_file <- args[3] + +# Load data into memory +granges_a <- read_bed(bed_file_a) +granges_b <- read_bed(bed_file_b) + +# Perform the map operation +mapped_granges <- granges_a %>% + join_overlap_inner(granges_b) %>% + group_by(seqnames, start, end, strand) %>% + summarise( + min = min(score, na.rm = TRUE), + max = max(score, na.rm = TRUE), + mean = mean(score, na.rm = TRUE), + sum = sum(score, na.rm = TRUE), + median = median(score, na.rm = TRUE) + ) + +mapped_df <- as.data.frame(mapped_granges) +write.table(mapped_df, file = output_file, sep = "\t", quote = FALSE, row.names = FALSE) diff --git a/examples/weighted_mean.rs b/examples/weighted_mean.rs new file mode 100644 index 0000000..7ce719a --- /dev/null +++ b/examples/weighted_mean.rs @@ -0,0 +1,66 @@ +use granges::prelude::*; + +// Our overlap data processing function - a weighted mean. +pub fn weighted_mean_score(join_data: CombinedJoinDataLeftEmpty>) -> f64 { + // Get the widths of overlaps. + let overlap_widths = join_data.join().overlap_widths(); + + // Get the "right data" -- the BED5 scores. + let overlap_scores: Vec> = join_data + .right_data + .into_iter() + .collect(); + + // Calculate the weighted mean + let mut score_sum = 0.0; + let mut total_width = 0.0; + for (score, width) in overlap_scores.iter().zip(overlap_widths) { + if let Some(value) = score { + score_sum += value * (width as f64); + total_width += width as f64; + } else { + // This is a missing value, a '.' in BED. + continue; + } + } + if total_width > 0.0 { + score_sum / total_width + } else { + 0.0 + } +} + +fn try_main() -> Result<(), granges::error::GRangesError> { + // Load in the chromosome lengths. + let genome = read_seqlens("tests_data/hg38_seqlens.tsv")?; + + // Create 1Mb windows, non-sliding, chopping off + // remainder. + let width = 1_000_000; + let windows = GRangesEmpty::from_windows(&genome, width, None, true)?; + + // Create a new iterator over a BED5 file. + let scores_iter = Bed5Iterator::new("tests_data/scores.bed.gz")?; + + // Load the ranges from the iterator into a GRanges object. + let scores = GRanges::from_iter(scores_iter, &genome)? + // Convert to interval trees. + .into_coitrees()? + // Extract out just the score from the additional + // BED5 columns. + .map_data(|bed5_cols| bed5_cols.score)?; + + windows + // Do a "left overlap join" with scores. + .left_overlaps(&scores)? + // Process each overlapping join by applying + // a weighted mean function. + .map_joins(weighted_mean_score)? + // Write to a BED file. + .write_to_tsv(Some("tests_data/weighted_means.bed"), &BED_TSV)?; + Ok(()) +} + +fn main() { + try_main().unwrap(); +} diff --git a/src/granges.rs b/src/granges.rs index e5b0a7d..08973c8 100644 --- a/src/granges.rs +++ b/src/granges.rs @@ -491,7 +491,8 @@ where for (seqname, indices) in self.data_indices()? { let mut new_data = Vec::new(); for index in indices { - let value = data.get(index).unwrap(); + let value = data.get(index) + .expect("Invalid index in GRanges::data_by_seqname(). This indicates invalid data indices (developer error)."); new_data.push((*value).clone()); } all_new_data.insert(&seqname, new_data)?; diff --git a/tests_data/scores.bed.gz b/tests_data/scores.bed.gz new file mode 100644 index 0000000..f547018 Binary files /dev/null and b/tests_data/scores.bed.gz differ