Topological correctness #54

Open
wants to merge 16 commits into main
5 changes: 5 additions & 0 deletions fixtures/aa.gfa
@@ -0,0 +1,5 @@
+H VN:Z:1.2
+S 1 A SN:Z:123 SO:i:0 SR:i:0
+S 2 A SN:Z:123 SO:i:0 SR:i:0
+L 1 + 2 + *
+P 124 1+,2+ 0M
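
This fixture is the interesting case for topological correctness: both S records carry the same one-base sequence A, so any graph identity keyed on sequence content alone would merge segments 1 and 2 into a single node and collapse path 124. A minimal standalone sketch of that failure mode (illustrative only, not code from this PR; the content hash is stubbed as the sequence string itself):

use std::collections::HashMap;

fn main() {
    // Both segments in fixtures/aa.gfa carry the same sequence "A", so a
    // content hash (stubbed here as the sequence string) cannot tell them apart.
    let segments = [("1", "A"), ("2", "A")];

    let mut by_hash: HashMap<&str, &str> = HashMap::new();
    for (name, seq) in segments {
        by_hash.insert(seq, name); // the second insert overwrites the first
    }
    assert_eq!(by_hash.len(), 1); // two segments, one surviving entry: topology lost

    let mut by_node_id: HashMap<i32, &str> = HashMap::new();
    for (node_id, (name, _seq)) in segments.into_iter().enumerate() {
        by_node_id.insert(node_id as i32, name); // one node per segment
    }
    assert_eq!(by_node_id.len(), 2); // topology preserved
}

Keying graph structure by a per-segment node id, as the rest of this PR does, keeps the two A segments distinct.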
37 changes: 23 additions & 14 deletions migrations/core/01-initial/up.sql
@@ -19,8 +19,14 @@ CREATE TABLE sequence (
length INTEGER NOT NULL
) STRICT;

+CREATE TABLE nodes (
+id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
+sequence_hash TEXT NOT NULL,
+FOREIGN KEY(sequence_hash) REFERENCES sequence(hash)
+) STRICT;
+
CREATE TABLE block_group (
-id INTEGER PRIMARY KEY NOT NULL,
+id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
collection_name TEXT NOT NULL,
sample_name TEXT,
name TEXT NOT NULL,
@@ -31,7 +37,7 @@ CREATE UNIQUE INDEX block_group_uidx ON block_group(collection_name, sample_name
CREATE UNIQUE INDEX block_group_null_sample_uidx ON block_group(collection_name, name) WHERE sample_name is null;

CREATE TABLE path (
-id INTEGER PRIMARY KEY NOT NULL,
+id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
block_group_id INTEGER NOT NULL,
name TEXT NOT NULL,
FOREIGN KEY(block_group_id) REFERENCES block_group(id)
@@ -40,7 +46,7 @@ CREATE UNIQUE INDEX path_uidx ON path(block_group_id, name);

-- an operation from a vcf can impact multiple paths and samples, so operation is not faceted on that
CREATE TABLE operation (
-id INTEGER PRIMARY KEY NOT NULL,
+id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
parent_id INTEGER,
collection_name TEXT NOT NULL,
change_type TEXT NOT NULL,
@@ -49,36 +55,36 @@ CREATE TABLE operation (
) STRICT;

CREATE TABLE file_addition (
-id INTEGER PRIMARY KEY NOT NULL,
+id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
file_path TEXT NOT NULL,
file_type TEXT NOT NULL
) STRICT;

CREATE TABLE operation_summary (
-id INTEGER PRIMARY KEY NOT NULL,
+id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
operation_id INTEGER NOT NULL,
summary TEXT NOT NULL,
FOREIGN KEY(operation_id) REFERENCES operation(id)
) STRICT;

CREATE TABLE edges (
-id INTEGER PRIMARY KEY NOT NULL,
-source_hash TEXT NOT NULL,
+id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
+source_node_id INTEGER,
source_coordinate INTEGER NOT NULL,
source_strand TEXT NOT NULL,
-target_hash TEXT NOT NULL,
+target_node_id INTEGER,
target_coordinate INTEGER NOT NULL,
target_strand TEXT NOT NULL,
chromosome_index INTEGER NOT NULL,
phased INTEGER NOT NULL,
-FOREIGN KEY(source_hash) REFERENCES sequence(hash),
-FOREIGN KEY(target_hash) REFERENCES sequence(hash),
+FOREIGN KEY(source_node_id) REFERENCES nodes(id),
+FOREIGN KEY(target_node_id) REFERENCES nodes(id),
constraint chk_phased check (phased in (0, 1))
) STRICT;
-CREATE UNIQUE INDEX edge_uidx ON edges(source_hash, source_coordinate, source_strand, target_hash, target_coordinate, target_strand, chromosome_index, phased);
+CREATE UNIQUE INDEX edge_uidx ON edges(source_node_id, source_coordinate, source_strand, target_node_id, target_coordinate, target_strand, chromosome_index, phased);

CREATE TABLE path_edges (
-id INTEGER PRIMARY KEY NOT NULL,
+id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
path_id INTEGER NOT NULL,
index_in_path INTEGER NOT NULL,
edge_id INTEGER NOT NULL,
@@ -88,19 +94,22 @@ CREATE TABLE path_edges (
CREATE UNIQUE INDEX path_edges_uidx ON path_edges(path_id, edge_id, index_in_path);

CREATE TABLE block_group_edges (
-id INTEGER PRIMARY KEY NOT NULL,
+id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL,
block_group_id INTEGER NOT NULL,
edge_id INTEGER NOT NULL,
FOREIGN KEY(block_group_id) REFERENCES block_group(id),
FOREIGN KEY(edge_id) REFERENCES edges(id)
) STRICT;
CREATE UNIQUE INDEX block_group_edges_uidx ON block_group_edges(block_group_id, edge_id);

-INSERT INTO sequence (hash, sequence_type, sequence, name, file_path, "length") values ("start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", "OTHER", "start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", "", "", 64), ("end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", "OTHER", "end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", "", "", 64);
INSERT INTO gen_metadata (db_uuid) values (lower(
hex(randomblob(4)) || '-' || hex(randomblob(2)) || '-' || '4' ||
substr(hex( randomblob(2)), 2) || '-' ||
substr('AB89', 1 + (abs(random()) % 4) , 1) ||
substr(hex(randomblob(2)), 2) || '-' ||
hex(randomblob(6))
));
+INSERT INTO sequence (hash, sequence_type, sequence, name, file_path, "length") values ("start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", "OTHER", "start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", "", "", 64), ("end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", "OTHER", "end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", "", "", 64);
+INSERT INTO nodes (id, sequence_hash) values (1, "start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy");
+INSERT INTO nodes (id, sequence_hash) values (2, "end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz");
+UPDATE SQLITE_SEQUENCE SET seq = 2 WHERE name = 'nodes';
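
The net effect of the migration: node identity becomes a first-class row, and multiple nodes may share one sequence row. A short sketch of the new model, using the Node::create and Edge::create calls as they appear in this PR's Rust diffs below (paraphrased usage, not a test from the PR; assumes a database connection `conn` is in scope):

// One sequence row, two distinct nodes pointing at it.
let sequence = Sequence::new()
    .sequence_type("DNA")
    .sequence("A")
    .save(&conn);
let node1_id = Node::create(&conn, &sequence.hash);
let node2_id = Node::create(&conn, &sequence.hash); // same hash, different node

// Edges now reference node ids, not sequence hashes.
let edge = Edge::create(
    &conn,
    node1_id,
    1,               // source_coordinate: end of the one-base source node
    Strand::Forward,
    node2_id,
    0,               // target_coordinate: start of the target node
    Strand::Forward,
    0,               // chromosome_index
    0,               // phased
);

Note also the seed rows: the sentinel start/end sequences get node ids 1 and 2, and the UPDATE on SQLITE_SEQUENCE bumps the autoincrement counter past them so user-created nodes start at id 3.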
76 changes: 41 additions & 35 deletions src/exports/gfa.rs
@@ -10,9 +10,9 @@ use crate::models::{
block_group_edge::BlockGroupEdge,
collection::Collection,
edge::{Edge, GroupBlock},
+node::{PATH_END_NODE_ID, PATH_START_NODE_ID},
path::Path,
path_edge::PathEdge,
-sequence::Sequence,
strand::Strand,
};

@@ -36,9 +36,7 @@ pub fn export_gfa(conn: &Connection, collection_name: &str, filename: &PathBuf)

let mut terminal_block_ids = HashSet::new();
for block in &blocks {
-if block.sequence_hash == Sequence::PATH_START_HASH
-|| block.sequence_hash == Sequence::PATH_END_HASH
-{
+if block.node_id == PATH_START_NODE_ID || block.node_id == PATH_END_NODE_ID {
terminal_block_ids.insert(block.id);
continue;
}
@@ -109,7 +107,7 @@ fn link_line(
target_strand: Strand,
) -> String {
format!(
"L\t{}\t{}\t{}\t{}\t*\n",
"L\t{}\t{}\t{}\t{}\t0M\n",
source_index + 1,
source_strand,
target_index + 1,
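
With this change the exporter writes an explicit zero-length overlap (0M) in L records instead of the placeholder *, matching the 0M overlap already used in the fixture's P line. For instance, assuming Strand::Forward displays as + (as GFA requires; the Display impl is not shown in this diff), a link from the first segment to the second would be emitted as the string "L\t1\t+\t2\t+\t0M\n".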
@@ -125,21 +123,21 @@ fn link_line(
fn nodes_for_edges(
edge1: &Edge,
edge2: &Edge,
-blocks_by_hash_and_start: &HashMap<(&str, i32), GroupBlock>,
-blocks_by_hash_and_end: &HashMap<(&str, i32), GroupBlock>,
+blocks_by_node_and_start: &HashMap<(i32, i32), GroupBlock>,
+blocks_by_node_and_end: &HashMap<(i32, i32), GroupBlock>,
) -> Vec<i32> {
-let mut current_block = blocks_by_hash_and_start
-.get(&(edge1.target_hash.as_str(), edge1.target_coordinate))
+let mut current_block = blocks_by_node_and_start
+.get(&(edge1.target_node_id, edge1.target_coordinate))
.unwrap();
-let end_block = blocks_by_hash_and_end
-.get(&(edge2.source_hash.as_str(), edge2.source_coordinate))
+let end_block = blocks_by_node_and_end
+.get(&(edge2.source_node_id, edge2.source_coordinate))
.unwrap();
let mut node_ids = vec![];
#[allow(clippy::while_immutable_condition)]
while current_block.id != end_block.id {
node_ids.push(current_block.id);
-current_block = blocks_by_hash_and_start
-.get(&(current_block.sequence_hash.as_str(), current_block.end))
+current_block = blocks_by_node_and_start
+.get(&(current_block.node_id, current_block.end))
.unwrap();
}
node_ids.push(end_block.id);
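
The loop above chains blocks by coordinate within a node: starting at the block whose (node id, start) matches edge1's target, it repeatedly looks up the block that begins where the current one ends, stopping at the block whose (node id, end) matches edge2's source. A standalone toy version of the same walk (illustrative only; field names mirror GroupBlock, but this is not repo code):

use std::collections::HashMap;

#[derive(Clone)]
struct Block {
    id: i32,
    node_id: i32,
    start: i32,
    end: i32,
}

fn main() {
    // One node's sequence [0, 10) split into three blocks at 4 and 7.
    let blocks = vec![
        Block { id: 10, node_id: 1, start: 0, end: 4 },
        Block { id: 11, node_id: 1, start: 4, end: 7 },
        Block { id: 12, node_id: 1, start: 7, end: 10 },
    ];
    let by_start: HashMap<(i32, i32), Block> = blocks
        .iter()
        .map(|b| ((b.node_id, b.start), b.clone()))
        .collect();

    // Walk from coordinate 0 until we reach the block ending at 10.
    let mut current = &by_start[&(1, 0)];
    let mut ids = vec![current.id];
    while current.end != 10 {
        current = &by_start[&(1, current.end)];
        ids.push(current.id);
    }
    assert_eq!(ids, vec![10, 11, 12]);
}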
@@ -157,34 +155,34 @@ fn write_paths(
let edges_by_path_id =
PathEdge::edges_for_paths(conn, paths.iter().map(|path| path.id).collect());

-let blocks_by_hash_and_start = blocks
+let blocks_by_node_and_start = blocks
.iter()
-.map(|block| ((block.sequence_hash.as_str(), block.start), block.clone()))
-.collect::<HashMap<(&str, i32), GroupBlock>>();
-let blocks_by_hash_and_end = blocks
+.map(|block| ((block.node_id, block.start), block.clone()))
+.collect::<HashMap<(i32, i32), GroupBlock>>();
+let blocks_by_node_and_end = blocks
.iter()
-.map(|block| ((block.sequence_hash.as_str(), block.end), block.clone()))
-.collect::<HashMap<(&str, i32), GroupBlock>>();
+.map(|block| ((block.node_id, block.end), block.clone()))
+.collect::<HashMap<(i32, i32), GroupBlock>>();

for path in paths {
let edges_for_path = edges_by_path_id.get(&path.id).unwrap();
-let mut node_ids = vec![];
+let mut graph_node_ids = vec![];
let mut node_strands = vec![];
for (edge1, edge2) in edges_for_path.iter().tuple_windows() {
let current_node_ids = nodes_for_edges(
edge1,
edge2,
-&blocks_by_hash_and_start,
-&blocks_by_hash_and_end,
+&blocks_by_node_and_start,
+&blocks_by_node_and_end,
);
for node_id in &current_node_ids {
-node_ids.push(*node_id);
+graph_node_ids.push(*node_id);
node_strands.push(edge1.target_strand);
}
}

writer
-.write_all(&path_line(&path.name, &node_ids, &node_strands).into_bytes())
+.write_all(&path_line(&path.name, &graph_node_ids, &node_strands).into_bytes())
.unwrap_or_else(|_| panic!("Error writing path {} to GFA stream", path.name));
}
}
@@ -199,12 +197,16 @@ fn path_line(path_name: &str, node_ids: &[i32], node_strands: &[Strand]) -> Stri
format!("P\t{}\t{}\n", path_name, nodes)
}

#[cfg(test)]
mod tests {
// Note this useful idiom: importing names from outer (for mod tests) scope.
use super::*;

use crate::imports::gfa::import_gfa;
-use crate::models::{block_group::BlockGroup, collection::Collection};
+use crate::models::{
+block_group::BlockGroup, collection::Collection, node::Node, sequence::Sequence,
+};

use crate::test_helpers::{get_connection, setup_gen_dir};
use tempfile::tempdir;

@@ -232,57 +234,61 @@ mod tests {
.sequence_type("DNA")
.sequence("CCCC")
.save(&conn);
+let node1_id = Node::create(&conn, &sequence1.hash);
+let node2_id = Node::create(&conn, &sequence2.hash);
+let node3_id = Node::create(&conn, &sequence3.hash);
+let node4_id = Node::create(&conn, &sequence4.hash);

let edge1 = Edge::create(
&conn,
-Sequence::PATH_START_HASH.to_string(),
+PATH_START_NODE_ID,
0,
Strand::Forward,
-sequence1.hash.clone(),
+node1_id,
0,
Strand::Forward,
0,
0,
);
let edge2 = Edge::create(
&conn,
-sequence1.hash,
+node1_id,
4,
Strand::Forward,
-sequence2.hash.clone(),
+node2_id,
0,
Strand::Forward,
0,
0,
);
let edge3 = Edge::create(
&conn,
-sequence2.hash,
+node2_id,
4,
Strand::Forward,
-sequence3.hash.clone(),
+node3_id,
0,
Strand::Forward,
0,
0,
);
let edge4 = Edge::create(
&conn,
-sequence3.hash,
+node3_id,
4,
Strand::Forward,
-sequence4.hash.clone(),
+node4_id,
0,
Strand::Forward,
0,
0,
);
let edge5 = Edge::create(
&conn,
-sequence4.hash,
+node4_id,
4,
Strand::Forward,
-Sequence::PATH_END_HASH.to_string(),
+PATH_END_NODE_ID,
0,
Strand::Forward,
0,
20 changes: 14 additions & 6 deletions src/imports/fasta.rs
@@ -4,8 +4,15 @@ use std::str;
use crate::models::file_types::FileTypes;
use crate::models::operations::{FileAddition, Operation, OperationSummary};
use crate::models::{
-block_group::BlockGroup, block_group_edge::BlockGroupEdge, collection::Collection, edge::Edge,
-metadata, path::Path, sequence::Sequence, strand::Strand,
+block_group::BlockGroup,
+block_group_edge::BlockGroupEdge,
+collection::Collection,
+edge::Edge,
+metadata,
+node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID},
+path::Path,
+sequence::Sequence,
+strand::Strand,
};
use crate::operation_management;
use noodles::fasta;
@@ -57,24 +64,25 @@ pub fn import_fasta(
.sequence(&sequence)
.save(conn)
};
+let node_id = Node::create(conn, &seq.hash);
let block_group = BlockGroup::create(conn, &collection.name, None, &name);
let edge_into = Edge::create(
conn,
-Sequence::PATH_START_HASH.to_string(),
+PATH_START_NODE_ID,
0,
Strand::Forward,
-seq.hash.to_string(),
+node_id,
0,
Strand::Forward,
0,
0,
);
let edge_out_of = Edge::create(
conn,
-seq.hash.to_string(),
+node_id,
sequence_length,
Strand::Forward,
-Sequence::PATH_END_HASH.to_string(),
+PATH_END_NODE_ID,
0,
Strand::Forward,
0,
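
Taken together, importing one FASTA record now yields a three-node chain, with PATH_START_NODE_ID and PATH_END_NODE_ID presumably resolving to the two sentinel node rows (ids 1 and 2) seeded by the migration above. A sketch of the resulting topology, not literal output:

(start sentinel node) --edge_into--> [node_id: the record's sequence, 0..sequence_length] --edge_out_of--> (end sentinel node)

Both edges carry chromosome_index 0 and phased 0, and the surrounding BlockGroup::create call is untouched by this change.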