Merge pull request #48 from ginkgobioworks/gfa-path-export
Gfa path export
dkhofer authored Sep 23, 2024
2 parents 43e73df + 2b9b0fa commit 75f0212
Showing 7 changed files with 311 additions and 18 deletions.
2 changes: 1 addition & 1 deletion migrations/core/01-initial/up.sql
@@ -85,7 +85,7 @@ CREATE TABLE path_edges (
FOREIGN KEY(edge_id) REFERENCES edges(id),
FOREIGN KEY(path_id) REFERENCES path(id)
) STRICT;
CREATE UNIQUE INDEX path_edges_uidx ON path_edges(path_id, edge_id);
CREATE UNIQUE INDEX path_edges_uidx ON path_edges(path_id, edge_id, index_in_path);

CREATE TABLE block_group_edges (
id INTEGER PRIMARY KEY NOT NULL,
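The widened index presumably accounts for a path that traverses the same edge at more than one position, which plain (path_id, edge_id) uniqueness would reject. A minimal sketch of that case, mirroring the Path::create call shape used in the tests below (function and argument names here are illustrative, not part of this change):

// Sketch only: the same edge used at two positions of one path now maps to
// distinct (path_id, edge_id, index_in_path) rows instead of violating the index.
fn example_repeated_edge(conn: &rusqlite::Connection, block_group_id: i32, edge_a: i32, edge_b: i32) {
    crate::models::path::Path::create(conn, "repeated-edge-path", block_group_id, &[edge_a, edge_b, edge_a]);
}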
239 changes: 226 additions & 13 deletions src/exports/gfa.rs
@@ -1,4 +1,5 @@
use itertools::Itertools;
use petgraph::prelude::DiGraphMap;
use rusqlite::Connection;
use std::collections::{HashMap, HashSet};
use std::fs;
@@ -13,6 +14,7 @@ use crate::models::{
collection::Collection,
edge::{Edge, GroupBlock},
path::Path,
path_edge::PathEdge,
sequence::Sequence,
strand::Strand,
};
@@ -32,7 +34,7 @@ pub fn export_gfa(conn: &Connection, collection_name: &str, filename: &PathBuf)

let (graph, edges_by_node_pair) = Edge::build_graph(conn, &edges, &blocks);

let mut file = File::create(filename).unwrap();
let file = File::create(filename).unwrap();
let mut writer = BufWriter::new(file);

let mut terminal_block_ids = HashSet::new();
@@ -43,6 +45,33 @@ pub fn export_gfa(conn: &Connection, collection_name: &str, filename: &PathBuf)
terminal_block_ids.insert(block.id);
continue;
}
}

write_segments(&mut writer, blocks.clone(), terminal_block_ids.clone());
write_links(
&mut writer,
graph,
edges_by_node_pair.clone(),
terminal_block_ids,
);
write_paths(
&mut writer,
conn,
collection_name,
edges_by_node_pair,
&blocks,
);
}
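The exporter is now split into the three helpers below; the entry point itself is still called as in the tests further down. A minimal usage sketch, with an illustrative output path and collection name:

// Usage sketch (not part of this diff): export an existing collection to a GFA file.
// Assumes the database behind `conn` already contains "test collection".
fn example_export(conn: &rusqlite::Connection) {
    let out_path = std::path::PathBuf::from("/tmp/example.gfa");
    export_gfa(conn, "test collection", &out_path);
}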

fn write_segments(
writer: &mut BufWriter<File>,
blocks: Vec<GroupBlock>,
terminal_block_ids: HashSet<i32>,
) {
for block in &blocks {
if terminal_block_ids.contains(&block.id) {
continue;
}
writer
.write_all(&segment_line(&block.sequence, block.id as usize).into_bytes())
.unwrap_or_else(|_| {
@@ -52,13 +81,18 @@ pub fn export_gfa(conn: &Connection, collection_name: &str, filename: &PathBuf)
)
});
}
}

let blocks_by_id = blocks
.clone()
.into_iter()
.map(|block| (block.id, block))
.collect::<HashMap<i32, GroupBlock>>();
fn segment_line(sequence: &str, index: usize) -> String {
format!("S\t{}\t{}\t{}\n", index + 1, sequence, "*")
}

fn write_links(
writer: &mut BufWriter<File>,
graph: DiGraphMap<i32, ()>,
edges_by_node_pair: HashMap<(i32, i32), Edge>,
terminal_block_ids: HashSet<i32>,
) {
for (source, target, ()) in graph.all_edges() {
if terminal_block_ids.contains(&source) || terminal_block_ids.contains(&target) {
continue;
@@ -77,10 +111,6 @@ pub fn export_gfa(conn: &Connection, collection_name: &str, filename: &PathBuf)
}
}

fn segment_line(sequence: &str, index: usize) -> String {
format!("S\t{}\t{}\t{}\n", index, sequence, "*")
}

fn link_line(
source_index: i32,
source_strand: Strand,
@@ -89,10 +119,100 @@ fn link_line(
) -> String {
format!(
"L\t{}\t{}\t{}\t{}\t*\n",
source_index, source_strand, target_index, target_strand
source_index + 1,
source_strand,
target_index + 1,
target_strand
)
}

// NOTE: A path is an immutable list of edges, but the sequence between the target of one edge and
// the source of the next may later be "split" by operations that add edges whose sources or targets
// fall between those of a consecutive pair of path edges. This function handles that case by
// collecting all the nodes between the target of one edge and the source of the next.
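// A hedged illustration with made-up coordinates: suppose consecutive path edges point into the
// same sequence at coordinates ..10 and 25.., and a later edit adds an edge touching coordinate 18.
// The former 10..25 block is now two blocks, 10..18 and 18..25, so walking blocks_by_hash_and_start
// from the block starting at 10 until reaching the block ending at 25 yields both of their IDs.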
fn nodes_for_edges(
edge1: &Edge,
edge2: &Edge,
blocks_by_hash_and_start: &HashMap<(&str, i32), GroupBlock>,
blocks_by_hash_and_end: &HashMap<(&str, i32), GroupBlock>,
) -> Vec<i32> {
let mut current_block = blocks_by_hash_and_start
.get(&(edge1.target_hash.as_str(), edge1.target_coordinate))
.unwrap();
let end_block = blocks_by_hash_and_end
.get(&(edge2.source_hash.as_str(), edge2.source_coordinate))
.unwrap();
let mut node_ids = vec![];
while current_block.id != end_block.id {
node_ids.push(current_block.id);
// Advance to the block whose start coincides with the current block's end on the same sequence.
current_block = blocks_by_hash_and_start
.get(&(current_block.sequence_hash.as_str(), current_block.end))
.unwrap();
}
node_ids.push(end_block.id);

node_ids
}

fn write_paths(
writer: &mut BufWriter<File>,
conn: &Connection,
collection_name: &str,
edges_by_node_pair: HashMap<(i32, i32), Edge>,
blocks: &[GroupBlock],
) {
let paths = Path::get_paths_for_collection(conn, collection_name);
let edges_by_path_id =
PathEdge::edges_for_paths(conn, paths.iter().map(|path| path.id).collect());
let node_pairs_by_edge_id = edges_by_node_pair
.iter()
.map(|(node_pair, edge)| (edge.id, *node_pair))
.collect::<HashMap<i32, (i32, i32)>>();

let blocks_by_hash_and_start = blocks
.iter()
.map(|block| ((block.sequence_hash.as_str(), block.start), block.clone()))
.collect::<HashMap<(&str, i32), GroupBlock>>();
let blocks_by_hash_and_end = blocks
.iter()
.map(|block| ((block.sequence_hash.as_str(), block.end), block.clone()))
.collect::<HashMap<(&str, i32), GroupBlock>>();

for path in paths {
let edges_for_path = edges_by_path_id.get(&path.id).unwrap();
let mut node_ids = vec![];
let mut node_strands = vec![];
for (edge1, edge2) in edges_for_path.iter().tuple_windows() {
let current_node_ids = nodes_for_edges(
edge1,
edge2,
&blocks_by_hash_and_start,
&blocks_by_hash_and_end,
);
for node_id in &current_node_ids {
node_ids.push(*node_id);
node_strands.push(edge1.target_strand);
}
}

writer
.write_all(&path_line(&path.name, &node_ids, &node_strands).into_bytes())
.unwrap_or_else(|_| panic!("Error writing path {} to GFA stream", path.name));
}
}

fn path_line(path_name: &str, node_ids: &[i32], node_strands: &[Strand]) -> String {
let nodes = node_ids
.iter()
.zip(node_strands.iter())
.map(|(node_id, node_strand)| format!("{}{}", *node_id + 1, node_strand))
.collect::<Vec<String>>()
.join(",");
format!("P\t{}\t{}\n", path_name, nodes)
}
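A quick illustrative check of the shared one-based numbering; the link and path expectations are given as comments because they assume a forward Strand value that displays as "+", which is outside this diff:

#[test]
fn test_gfa_lines_are_one_based() {
    // Block ID 0 is written as GFA segment "1".
    assert_eq!(segment_line("AAAA", 0), "S\t1\tAAAA\t*\n");
    // Likewise, link_line(0, fwd, 1, fwd) would yield "L\t1\t+\t2\t+\t*\n", and
    // path_line("p1", &[0, 1], &[fwd, fwd]) would yield "P\tp1\t1+,2+\n",
    // assuming `fwd` is a forward Strand value.
}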

mod tests {
use rusqlite::Connection;
// Note this useful idiom: importing names from outer (for mod tests) scope.
@@ -103,7 +223,7 @@ mod tests {
block_group::BlockGroup, block_group_edge::BlockGroupEdge, collection::Collection,
edge::Edge, sequence::Sequence,
};
use crate::test_helpers::get_connection;
use crate::test_helpers::{get_connection, setup_gen_dir};
use tempfile::tempdir;

#[test]
@@ -112,7 +232,7 @@ mod tests {
let conn = get_connection(None);

let collection_name = "test collection";
let collection = Collection::create(&conn, collection_name);
Collection::create(&conn, collection_name);
let block_group = BlockGroup::create(&conn, collection_name, None, "test block group");
let sequence1 = Sequence::new()
.sequence_type("DNA")
@@ -192,6 +312,14 @@ mod tests {
block_group.id,
&[edge1.id, edge2.id, edge3.id, edge4.id, edge5.id],
);

Path::create(
&conn,
"1234",
block_group.id,
&[edge1.id, edge2.id, edge3.id, edge4.id, edge5.id],
);

let all_sequences = BlockGroup::get_all_sequences(&conn, block_group.id);

let temp_dir = tempdir().expect("Couldn't get handle to temp directory");
@@ -208,5 +336,90 @@ mod tests {
let all_sequences2 = BlockGroup::get_all_sequences(&conn, block_group2.id);

assert_eq!(all_sequences, all_sequences2);

let paths = Path::get_paths_for_collection(&conn, "test collection 2");
assert_eq!(paths.len(), 1);
assert_eq!(Path::sequence(&conn, paths[0].clone()), "AAAATTTTGGGGCCCC");
}

#[test]
fn test_simple_round_trip() {
setup_gen_dir();
let mut gfa_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
gfa_path.push("fixtures/simple.gfa");
let collection_name = "test".to_string();
let conn = &get_connection(None);
import_gfa(&gfa_path, &collection_name, conn);

let block_group_id = BlockGroup::get_id(conn, &collection_name, None, "");
let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id);

let temp_dir = tempdir().expect("Couldn't get handle to temp directory");
let mut gfa_path = PathBuf::from(temp_dir.path());
gfa_path.push("intermediate.gfa");

export_gfa(conn, &collection_name, &gfa_path);
import_gfa(&gfa_path, "test collection 2", conn);

let block_group2 = Collection::get_block_groups(conn, "test collection 2")
.pop()
.unwrap();
let all_sequences2 = BlockGroup::get_all_sequences(conn, block_group2.id);

assert_eq!(all_sequences, all_sequences2);
}

#[test]
fn test_anderson_round_trip() {
setup_gen_dir();
let mut gfa_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
gfa_path.push("fixtures/anderson_promoters.gfa");
let collection_name = "anderson promoters".to_string();
let conn = &get_connection(None);
import_gfa(&gfa_path, &collection_name, conn);

let block_group_id = BlockGroup::get_id(conn, &collection_name, None, "");
let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id);

let temp_dir = tempdir().expect("Couldn't get handle to temp directory");
let mut gfa_path = PathBuf::from(temp_dir.path());
gfa_path.push("intermediate.gfa");

export_gfa(conn, &collection_name, &gfa_path);
import_gfa(&gfa_path, "anderson promoters 2", conn);

let block_group2 = Collection::get_block_groups(conn, "anderson promoters 2")
.pop()
.unwrap();
let all_sequences2 = BlockGroup::get_all_sequences(conn, block_group2.id);

assert_eq!(all_sequences, all_sequences2);
}

#[test]
fn test_reverse_strand_round_trip() {
setup_gen_dir();
let mut gfa_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
gfa_path.push("fixtures/reverse_strand.gfa");
let collection_name = "test".to_string();
let conn = &get_connection(None);
import_gfa(&gfa_path, &collection_name, conn);

let block_group_id = BlockGroup::get_id(conn, &collection_name, None, "");
let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id);

let temp_dir = tempdir().expect("Couldn't get handle to temp directory");
let mut gfa_path = PathBuf::from(temp_dir.path());
gfa_path.push("intermediate.gfa");

export_gfa(conn, &collection_name, &gfa_path);
import_gfa(&gfa_path, "test collection 2", conn);

let block_group2 = Collection::get_block_groups(conn, "test collection 2")
.pop()
.unwrap();
let all_sequences2 = BlockGroup::get_all_sequences(conn, block_group2.id);

assert_eq!(all_sequences, all_sequences2);
}
}
29 changes: 29 additions & 0 deletions src/imports/gfa.rs
@@ -302,4 +302,33 @@ mod tests {
let result = Path::sequence(conn, path);
assert_eq!(result, "TATGCCAGCTGCGAATA");
}

#[test]
fn test_import_anderson_promoters() {
let mut gfa_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
gfa_path.push("fixtures/anderson_promoters.gfa");
let collection_name = "anderson promoters".to_string();
let conn = &get_connection(None);
import_gfa(&gfa_path, &collection_name, conn);

let paths = Path::get_paths_for_collection(conn, &collection_name);
assert_eq!(paths.len(), 20);

let block_group_id = BlockGroup::get_id(conn, &collection_name, None, "");
let path = Path::get_paths(
conn,
"select * from path where block_group_id = ?1 AND name = ?2",
vec![
SQLValue::from(block_group_id),
SQLValue::from("BBa_J23100".to_string()),
],
)[0]
.clone();

let result = Path::sequence(conn, path);
let expected_sequence_parts = vec!["T", "T", "G", "A", "C", "G", "GCTAGCTCAG", "T", "CCT", "A", "GG", "T", "A", "C", "A", "G",
"TGCTAGCTACTAGTGAAAGAGGAGAAATACTAGATGGCTTCCTCCGAAGACGTTATCAAAGAGTTCATGCGTTTCAAAGTTCGTATGGAAGGTTCCGTTAACGGTCACGAGTTCGAAATCGAAGGTGAAGGTGAAGGTCGTCCGTACGAAGGTACCCAGACCGCTAAACTGAAAGTTACCAAAGGTGGTCCGCTGCCGTTCGCTTGGGACATCCTGTCCCCGCAGTTCCAGTACGGTTCCAAAGCTTACGTTAAACACCCGGCTGACATCCCGGACTACCTGAAACTGTCCTTCCCGGAAGGTTTCAAATGGGAACGTGTTATGAACTTCGAAGACGGTGGTGTTGTTACCGTTACCCAGGACTCCTCCCTGCAAGACGGTGAGTTCATCTACAAAGTTAAACTGCGTGGTACCAACTTCCCGTCCGACGGTCCGGTTATGCAGAAAAAAACCATGGGTTGGGAAGCTTCCACCGAACGTATGTACCCGGAAGACGGTGCTCTGAAAGGTGAAATCAAAATGCGTCTGAAACTGAAAGACGGTGGTCACTACGACGCTGAAGTTAAAACCACCTACATGGCTAAAAAACCGGTTCAGCTGCCGGGTGCTTACAAAACCGACATCAAACTGGACATCACCTCCCACAACGAAGACTACACCATCGTTGAACAGTACGAACGTGCTGAAGGTCGTCACTCCACCGGTGCTTAATAACGCTGATAGTGCTAGTGTAGATCGCTACTAGAGCCAGGCATCAAATAAAACGAAAGGCTCAGTCGAAAGACTGGGCCTTTCGTTTTATCTGTTGTTTGTCGGTGAACGCTCTCTACTAGAGTCACACTGGCTCACCTTCGGGTGGGCCTTTCTGCGTTTATATACTAGAAGCGGCCGCTGCAGGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATACGGTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATGTGAGCAAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGGACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACCTGACGTCTAAGAAACCATTATTATCATGACATTAACCTATAAAAATAGGCGTATCACGAGGCAGAATTTCAGATAAAAAAAATCCTTAGCTTTCGCTAAGGATGATTTCTGGAATTCGCGGCCGCATCTAGAG"];
let expected_sequence = expected_sequence_parts.join("");
assert_eq!(result, expected_sequence);
}
}
2 changes: 1 addition & 1 deletion src/models/block_group.rs
@@ -160,7 +160,7 @@ impl BlockGroup {
BlockGroupEdge::bulk_create(conn, target_block_group_id, &edge_ids);

for path in existing_paths {
let edge_ids = PathEdge::edges_for(conn, path.id)
let edge_ids = PathEdge::edges_for_path(conn, path.id)
.into_iter()
.map(|edge| edge.id)
.collect::<Vec<i32>>();
2 changes: 2 additions & 0 deletions src/models/collection.rs
@@ -2,6 +2,7 @@ use rusqlite::types::Value;
use rusqlite::{params_from_iter, Connection};

use crate::models::block_group::BlockGroup;
use crate::models::path::Path;

#[derive(Clone, Debug)]
pub struct Collection {
@@ -15,6 +16,7 @@ impl Collection {
.unwrap();
stmt.exists([name]).unwrap()
}

pub fn create(conn: &Connection, name: &str) -> Collection {
let mut stmt = conn
.prepare("INSERT INTO collection (name) VALUES (?1) RETURNING *")