From 7078ed8ae1b8658dd33794d952e07e78fb823b6f Mon Sep 17 00:00:00 2001 From: hofer Date: Tue, 24 Sep 2024 15:11:19 -0400 Subject: [PATCH 01/16] Add node table and edge fields --- migrations/core/01-initial/up.sql | 9 ++++ src/exports/gfa.rs | 1 + src/imports/gfa.rs | 3 ++ src/models.rs | 1 + src/models/block_group.rs | 9 ++++ src/models/edge.rs | 75 +++++++++++++++++++++++-------- src/models/node.rs | 8 ++++ src/operation_management.rs | 23 +++++----- 8 files changed, 101 insertions(+), 28 deletions(-) create mode 100644 src/models/node.rs diff --git a/migrations/core/01-initial/up.sql b/migrations/core/01-initial/up.sql index 0d2a0dd..8d72ead 100644 --- a/migrations/core/01-initial/up.sql +++ b/migrations/core/01-initial/up.sql @@ -19,6 +19,12 @@ CREATE TABLE sequence ( length INTEGER NOT NULL ) STRICT; +CREATE TABLE nodes ( + id INTEGER PRIMARY KEY NOT NULL, + sequence_hash TEXT NOT NULL, + FOREIGN KEY(sequence_hash) REFERENCES sequence(hash) +) STRICT; + CREATE TABLE block_group ( id INTEGER PRIMARY KEY NOT NULL, collection_name TEXT NOT NULL, @@ -64,9 +70,11 @@ CREATE TABLE operation_summary ( CREATE TABLE edges ( id INTEGER PRIMARY KEY NOT NULL, source_hash TEXT NOT NULL, + source_node_id INTEGER, source_coordinate INTEGER NOT NULL, source_strand TEXT NOT NULL, target_hash TEXT NOT NULL, + target_node_id INTEGER, target_coordinate INTEGER NOT NULL, target_strand TEXT NOT NULL, chromosome_index INTEGER NOT NULL, @@ -76,6 +84,7 @@ CREATE TABLE edges ( constraint chk_phased check (phased in (0, 1)) ) STRICT; CREATE UNIQUE INDEX edge_uidx ON edges(source_hash, source_coordinate, source_strand, target_hash, target_coordinate, target_strand, chromosome_index, phased); +-- CREATE UNIQUE INDEX edge_uidx ON edges(source_node_id, source_coordinate, source_strand, target_node_id, target_coordinate, target_strand, chromosome_index, phased); CREATE TABLE path_edges ( id INTEGER PRIMARY KEY NOT NULL, diff --git a/src/exports/gfa.rs b/src/exports/gfa.rs index 723a9d5..a946f03 100644 --- a/src/exports/gfa.rs +++ b/src/exports/gfa.rs @@ -199,6 +199,7 @@ fn path_line(path_name: &str, node_ids: &[i32], node_strands: &[Strand]) -> Stri format!("P\t{}\t{}\n", path_name, nodes) } +#[cfg(test)] mod tests { // Note this useful idiom: importing names from outer (for mod tests) scope. use super::*; diff --git a/src/imports/gfa.rs b/src/imports/gfa.rs index 8b3616f..a1a875b 100644 --- a/src/imports/gfa.rs +++ b/src/imports/gfa.rs @@ -8,6 +8,7 @@ use crate::models::{ block_group_edge::BlockGroupEdge, collection::Collection, edge::{Edge, EdgeData}, + node::{BOGUS_SOURCE_NODE_ID, BOGUS_TARGET_NODE_ID}, path::Path, sequence::Sequence, strand::Strand, @@ -197,9 +198,11 @@ fn edge_data_from_fields( ) -> EdgeData { EdgeData { source_hash: source_hash.to_string(), + source_node_id: BOGUS_SOURCE_NODE_ID, source_coordinate, source_strand, target_hash: target_hash.to_string(), + target_node_id: BOGUS_TARGET_NODE_ID, target_coordinate: 0, target_strand, chromosome_index: 0, diff --git a/src/models.rs b/src/models.rs index 0be2805..5da2e5c 100644 --- a/src/models.rs +++ b/src/models.rs @@ -4,6 +4,7 @@ pub mod collection; pub mod edge; pub mod file_types; pub mod metadata; +pub mod node; pub mod operations; pub mod path; pub mod path_edge; diff --git a/src/models/block_group.rs b/src/models/block_group.rs index 329f96d..34a38a5 100644 --- a/src/models/block_group.rs +++ b/src/models/block_group.rs @@ -8,6 +8,7 @@ use serde::{Deserialize, Serialize}; use crate::graph::all_simple_paths; use crate::models::block_group_edge::BlockGroupEdge; use crate::models::edge::{Edge, EdgeData, GroupBlock}; +use crate::models::node::{BOGUS_SOURCE_NODE_ID, BOGUS_TARGET_NODE_ID}; use crate::models::path::{NewBlock, Path, PathData}; use crate::models::path_edge::PathEdge; use crate::models::sequence::Sequence; @@ -365,10 +366,12 @@ impl BlockGroup { // Deletion let new_edge = EdgeData { source_hash: start_block.sequence.hash.clone(), + source_node_id: BOGUS_SOURCE_NODE_ID, source_coordinate: change.start - start_block.path_start + start_block.sequence_start, source_strand: Strand::Forward, target_hash: end_block.sequence.hash.clone(), + target_node_id: BOGUS_TARGET_NODE_ID, target_coordinate: change.end - end_block.path_start + end_block.sequence_start, target_strand: Strand::Forward, chromosome_index: change.chromosome_index, @@ -382,9 +385,11 @@ impl BlockGroup { if change.start == 0 { let new_beginning_edge = EdgeData { source_hash: Sequence::PATH_START_HASH.to_string(), + source_node_id: BOGUS_SOURCE_NODE_ID, source_coordinate: 0, source_strand: Strand::Forward, target_hash: end_block.sequence.hash.clone(), + target_node_id: BOGUS_TARGET_NODE_ID, target_coordinate: change.end - end_block.path_start + end_block.sequence_start, target_strand: Strand::Forward, chromosome_index: change.chromosome_index, @@ -399,10 +404,12 @@ impl BlockGroup { // Insertion/replacement let new_start_edge = EdgeData { source_hash: start_block.sequence.hash.clone(), + source_node_id: BOGUS_SOURCE_NODE_ID, source_coordinate: change.start - start_block.path_start + start_block.sequence_start, source_strand: Strand::Forward, target_hash: change.block.sequence.hash.clone(), + target_node_id: BOGUS_TARGET_NODE_ID, target_coordinate: change.block.sequence_start, target_strand: Strand::Forward, chromosome_index: change.chromosome_index, @@ -410,9 +417,11 @@ impl BlockGroup { }; let new_end_edge = EdgeData { source_hash: change.block.sequence.hash.clone(), + source_node_id: BOGUS_SOURCE_NODE_ID, source_coordinate: change.block.sequence_end, source_strand: Strand::Forward, target_hash: end_block.sequence.hash.clone(), + target_node_id: BOGUS_TARGET_NODE_ID, target_coordinate: change.end - end_block.path_start + end_block.sequence_start, target_strand: Strand::Forward, chromosome_index: change.chromosome_index, diff --git a/src/models/edge.rs b/src/models/edge.rs index d53d731..0a9dfde 100644 --- a/src/models/edge.rs +++ b/src/models/edge.rs @@ -7,15 +7,18 @@ use rusqlite::types::Value; use rusqlite::{params_from_iter, Connection, Result as SQLResult, Row}; use serde::{Deserialize, Serialize}; +use crate::models::node::{BOGUS_SOURCE_NODE_ID, BOGUS_TARGET_NODE_ID}; use crate::models::{sequence::Sequence, strand::Strand}; #[derive(Clone, Debug, Eq, Hash, PartialEq, Deserialize, Serialize)] pub struct Edge { pub id: i32, pub source_hash: String, + pub source_node_id: i32, pub source_coordinate: i32, pub source_strand: Strand, pub target_hash: String, + pub target_node_id: i32, pub target_coordinate: i32, pub target_strand: Strand, pub chromosome_index: i32, @@ -25,9 +28,11 @@ pub struct Edge { #[derive(Clone, Debug, Eq, Hash, PartialEq)] pub struct EdgeData { pub source_hash: String, + pub source_node_id: i32, pub source_coordinate: i32, pub source_strand: Strand, pub target_hash: String, + pub target_node_id: i32, pub target_coordinate: i32, pub target_strand: Strand, pub chromosome_index: i32, @@ -38,9 +43,11 @@ impl From<&Edge> for EdgeData { fn from(item: &Edge) -> Self { EdgeData { source_hash: item.source_hash.clone(), + source_node_id: item.source_node_id, source_coordinate: item.source_coordinate, source_strand: item.source_strand, target_hash: item.target_hash.clone(), + target_node_id: item.target_node_id, target_coordinate: item.target_coordinate, target_strand: item.target_strand, chromosome_index: item.chromosome_index, @@ -77,13 +84,15 @@ impl Edge { chromosome_index: i32, phased: i32, ) -> Edge { - let query = "INSERT INTO edges (source_hash, source_coordinate, source_strand, target_hash, target_coordinate, target_strand, chromosome_index, phased) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8) RETURNING *"; + let query = "INSERT INTO edges (source_hash, source_node_id, source_coordinate, source_strand, target_hash, target_node_id, target_coordinate, target_strand, chromosome_index, phased) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10) RETURNING *"; let id_query = "select id from edges where source_hash = ?1 and source_coordinate = ?2 and source_strand = ?3 and target_hash = ?4 and target_coordinate = ?5 and target_strand = ?6 and chromosome_index = ?7 and phased = ?8"; let placeholders: Vec = vec![ source_hash.clone().into(), + BOGUS_SOURCE_NODE_ID.into(), source_coordinate.into(), source_strand.into(), target_hash.clone().into(), + BOGUS_TARGET_NODE_ID.into(), target_coordinate.into(), target_strand.into(), chromosome_index.into(), @@ -95,13 +104,15 @@ impl Edge { Ok(Edge { id: row.get(0)?, source_hash: row.get(1)?, - source_coordinate: row.get(2)?, - source_strand: row.get(3)?, - target_hash: row.get(4)?, - target_coordinate: row.get(5)?, - target_strand: row.get(6)?, - chromosome_index: row.get(7)?, - phased: row.get(8)?, + source_node_id: row.get(2)?, + source_coordinate: row.get(3)?, + source_strand: row.get(4)?, + target_hash: row.get(5)?, + target_node_id: row.get(6)?, + target_coordinate: row.get(7)?, + target_strand: row.get(8)?, + chromosome_index: row.get(9)?, + phased: row.get(10)?, }) }) { Ok(edge) => edge, @@ -113,9 +124,11 @@ impl Edge { .query_row(id_query, params_from_iter(&placeholders), |row| row.get(0)) .unwrap(), source_hash, + source_node_id: BOGUS_SOURCE_NODE_ID, source_coordinate, source_strand, target_hash, + target_node_id: BOGUS_TARGET_NODE_ID, target_coordinate, target_strand, chromosome_index, @@ -135,13 +148,15 @@ impl Edge { Ok(Edge { id: row.get(0)?, source_hash: row.get(1)?, - source_coordinate: row.get(2)?, - source_strand: row.get(3)?, - target_hash: row.get(4)?, - target_coordinate: row.get(5)?, - target_strand: row.get(6)?, - chromosome_index: row.get(7)?, - phased: row.get(8)?, + source_node_id: BOGUS_SOURCE_NODE_ID, + source_coordinate: row.get(3)?, + source_strand: row.get(4)?, + target_hash: row.get(5)?, + target_node_id: BOGUS_TARGET_NODE_ID, + target_coordinate: row.get(7)?, + target_strand: row.get(8)?, + chromosome_index: row.get(9)?, + phased: row.get(10)?, }) } @@ -151,7 +166,7 @@ impl Edge { .map(|edge_id| edge_id.to_string()) .collect::>() .join(","); - let query = format!("select id, source_hash, source_coordinate, source_strand, target_hash, target_coordinate, target_strand, chromosome_index, phased from edges where id in ({});", formatted_edge_ids); + let query = format!("select id, source_hash, source_node_id, source_coordinate, source_strand, target_hash, target_node_id, target_coordinate, target_strand, chromosome_index, phased from edges where id in ({});", formatted_edge_ids); Edge::query(conn, &query, vec![]) } @@ -213,11 +228,13 @@ impl Edge { let source_strand = format!("\"{0}\"", edge.source_strand); let target_strand = format!("\"{0}\"", edge.target_strand); let edge_row = format!( - "({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7})", + "({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9})", source_hash, + BOGUS_SOURCE_NODE_ID, edge.source_coordinate, source_strand, target_hash, + BOGUS_TARGET_NODE_ID, edge.target_coordinate, target_strand, edge.chromosome_index, @@ -230,7 +247,7 @@ impl Edge { for chunk in edge_rows_to_insert.chunks(100000) { let formatted_edge_rows_to_insert = chunk.join(", "); - let insert_statement = format!("INSERT INTO edges (source_hash, source_coordinate, source_strand, target_hash, target_coordinate, target_strand, chromosome_index, phased) VALUES {0} RETURNING *;", formatted_edge_rows_to_insert); + let insert_statement = format!("INSERT INTO edges (source_hash, source_node_id, source_coordinate, source_strand, target_hash, target_node_id, target_coordinate, target_strand, chromosome_index, phased) VALUES {0} RETURNING *;", formatted_edge_rows_to_insert); let mut stmt = conn.prepare(&insert_statement).unwrap(); let rows = stmt.query_map([], Edge::edge_from_row).unwrap(); for row in rows { @@ -248,9 +265,11 @@ impl Edge { pub fn to_data(edge: Edge) -> EdgeData { EdgeData { source_hash: edge.source_hash, + source_node_id: BOGUS_SOURCE_NODE_ID, source_coordinate: edge.source_coordinate, source_strand: edge.source_strand, target_hash: edge.target_hash, + target_node_id: BOGUS_TARGET_NODE_ID, target_coordinate: edge.target_coordinate, target_strand: edge.target_strand, chromosome_index: edge.chromosome_index, @@ -327,9 +346,11 @@ impl Edge { boundary_edges.push(Edge { id: -1, source_hash: hash.clone(), + source_node_id: BOGUS_SOURCE_NODE_ID, source_coordinate: *block_boundary, source_strand: Strand::Unknown, target_hash: hash.clone(), + target_node_id: BOGUS_TARGET_NODE_ID, target_coordinate: *block_boundary, target_strand: Strand::Unknown, chromosome_index: 0, @@ -488,9 +509,11 @@ mod tests { .save(conn); let edge1 = EdgeData { source_hash: Sequence::PATH_START_HASH.to_string(), + source_node_id: BOGUS_SOURCE_NODE_ID, source_coordinate: -1, source_strand: Strand::Forward, target_hash: sequence1.hash.clone(), + target_node_id: BOGUS_TARGET_NODE_ID, target_coordinate: 1, target_strand: Strand::Forward, chromosome_index: 0, @@ -502,9 +525,11 @@ mod tests { .save(conn); let edge2 = EdgeData { source_hash: sequence1.hash.clone(), + source_node_id: BOGUS_SOURCE_NODE_ID, source_coordinate: 2, source_strand: Strand::Forward, target_hash: sequence2.hash.clone(), + target_node_id: BOGUS_TARGET_NODE_ID, target_coordinate: 3, target_strand: Strand::Forward, chromosome_index: 0, @@ -512,9 +537,11 @@ mod tests { }; let edge3 = EdgeData { source_hash: sequence2.hash.clone(), + source_node_id: BOGUS_SOURCE_NODE_ID, source_coordinate: 4, source_strand: Strand::Forward, target_hash: Sequence::PATH_END_HASH.to_string(), + target_node_id: BOGUS_TARGET_NODE_ID, target_coordinate: -1, target_strand: Strand::Forward, chromosome_index: 0, @@ -555,9 +582,11 @@ mod tests { .save(conn); let edge1 = EdgeData { source_hash: Sequence::PATH_START_HASH.to_string(), + source_node_id: BOGUS_SOURCE_NODE_ID, source_coordinate: -1, source_strand: Strand::Forward, target_hash: sequence1.hash.clone(), + target_node_id: BOGUS_TARGET_NODE_ID, target_coordinate: 1, target_strand: Strand::Forward, chromosome_index: 0, @@ -569,9 +598,11 @@ mod tests { .save(conn); let edge2 = EdgeData { source_hash: sequence1.hash.clone(), + source_node_id: BOGUS_SOURCE_NODE_ID, source_coordinate: 2, source_strand: Strand::Forward, target_hash: sequence2.hash.clone(), + target_node_id: BOGUS_TARGET_NODE_ID, target_coordinate: 3, target_strand: Strand::Forward, chromosome_index: 0, @@ -579,9 +610,11 @@ mod tests { }; let edge3 = EdgeData { source_hash: sequence2.hash.clone(), + source_node_id: BOGUS_SOURCE_NODE_ID, source_coordinate: 4, source_strand: Strand::Forward, target_hash: Sequence::PATH_END_HASH.to_string(), + target_node_id: BOGUS_TARGET_NODE_ID, target_coordinate: -1, target_strand: Strand::Forward, chromosome_index: 0, @@ -646,8 +679,10 @@ mod tests { let edge1 = EdgeData { source_hash: Sequence::PATH_START_HASH.to_string(), source_coordinate: -1, + source_node_id: BOGUS_SOURCE_NODE_ID, source_strand: Strand::Forward, target_hash: sequence1.hash.clone(), + target_node_id: BOGUS_TARGET_NODE_ID, target_coordinate: 1, target_strand: Strand::Forward, chromosome_index: 0, @@ -659,9 +694,11 @@ mod tests { .save(conn); let edge2 = EdgeData { source_hash: sequence1.hash.clone(), + source_node_id: BOGUS_SOURCE_NODE_ID, source_coordinate: 2, source_strand: Strand::Forward, target_hash: sequence2.hash.clone(), + target_node_id: BOGUS_TARGET_NODE_ID, target_coordinate: 3, target_strand: Strand::Forward, chromosome_index: 0, @@ -669,9 +706,11 @@ mod tests { }; let edge3 = EdgeData { source_hash: sequence2.hash.clone(), + source_node_id: BOGUS_SOURCE_NODE_ID, source_coordinate: 4, source_strand: Strand::Forward, target_hash: Sequence::PATH_END_HASH.to_string(), + target_node_id: BOGUS_TARGET_NODE_ID, target_coordinate: -1, target_strand: Strand::Forward, chromosome_index: 0, diff --git a/src/models/node.rs b/src/models/node.rs new file mode 100644 index 0000000..e0e0988 --- /dev/null +++ b/src/models/node.rs @@ -0,0 +1,8 @@ +pub const BOGUS_SOURCE_NODE_ID: i32 = -1; +pub const BOGUS_TARGET_NODE_ID: i32 = -2; + +#[derive(Clone, Debug)] +pub struct Node { + pub id: i32, + pub sequence_hash: String, +} diff --git a/src/operation_management.rs b/src/operation_management.rs index b6d717c..342f65b 100644 --- a/src/operation_management.rs +++ b/src/operation_management.rs @@ -15,6 +15,7 @@ use crate::models::block_group::BlockGroup; use crate::models::block_group_edge::BlockGroupEdge; use crate::models::edge::{Edge, EdgeData}; use crate::models::file_types::FileTypes; +use crate::models::node::{BOGUS_SOURCE_NODE_ID, BOGUS_TARGET_NODE_ID}; use crate::models::operations::{ Branch, FileAddition, Operation, OperationState, OperationSummary, }; @@ -331,15 +332,17 @@ pub fn apply_changeset(conn: &Connection, operation: &Operation) { edge_pk, EdgeData { source_hash: item.new_value(1).unwrap().as_str().unwrap().to_string(), - source_coordinate: item.new_value(2).unwrap().as_i64().unwrap() as i32, - source_strand: Strand::column_result(item.new_value(3).unwrap()) + source_node_id: BOGUS_SOURCE_NODE_ID, + source_coordinate: item.new_value(3).unwrap().as_i64().unwrap() as i32, + source_strand: Strand::column_result(item.new_value(4).unwrap()) .unwrap(), - target_hash: item.new_value(4).unwrap().as_str().unwrap().to_string(), - target_coordinate: item.new_value(5).unwrap().as_i64().unwrap() as i32, - target_strand: Strand::column_result(item.new_value(6).unwrap()) + target_hash: item.new_value(5).unwrap().as_str().unwrap().to_string(), + target_node_id: BOGUS_TARGET_NODE_ID, + target_coordinate: item.new_value(7).unwrap().as_i64().unwrap() as i32, + target_strand: Strand::column_result(item.new_value(8).unwrap()) .unwrap(), - chromosome_index: item.new_value(7).unwrap().as_i64().unwrap() as i32, - phased: item.new_value(8).unwrap().as_i64().unwrap() as i32, + chromosome_index: item.new_value(9).unwrap().as_i64().unwrap() as i32, + phased: item.new_value(10).unwrap().as_i64().unwrap() as i32, }, ); } @@ -565,7 +568,7 @@ mod tests { setup_db(op_conn, &db_uuid); // create some stuff before we attach to our main session that will be required as extra information - let (bg_id, path_id) = setup_block_group(conn); + let (bg_id, _path_id) = setup_block_group(conn); let binding = BlockGroup::query( conn, "select * from block_group where id = ?1;", @@ -707,8 +710,8 @@ mod tests { operation_conn, ); - let branch_1 = Branch::create(operation_conn, &db_uuid, "branch-1"); - let branch_2 = Branch::create(operation_conn, &db_uuid, "branch-2"); + Branch::create(operation_conn, &db_uuid, "branch-1"); + Branch::create(operation_conn, &db_uuid, "branch-2"); checkout( conn, operation_conn, From 7598f47005bb3cb812789042cd9330387352b3a6 Mon Sep 17 00:00:00 2001 From: hofer Date: Tue, 24 Sep 2024 16:52:27 -0400 Subject: [PATCH 02/16] Fix test --- src/models/edge.rs | 2 +- src/operation_management.rs | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/models/edge.rs b/src/models/edge.rs index 0a9dfde..ad3b1d7 100644 --- a/src/models/edge.rs +++ b/src/models/edge.rs @@ -85,7 +85,7 @@ impl Edge { phased: i32, ) -> Edge { let query = "INSERT INTO edges (source_hash, source_node_id, source_coordinate, source_strand, target_hash, target_node_id, target_coordinate, target_strand, chromosome_index, phased) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10) RETURNING *"; - let id_query = "select id from edges where source_hash = ?1 and source_coordinate = ?2 and source_strand = ?3 and target_hash = ?4 and target_coordinate = ?5 and target_strand = ?6 and chromosome_index = ?7 and phased = ?8"; + let id_query = "select id from edges where source_hash = ?1 and source_node_id = ?2 and source_coordinate = ?3 and source_strand = ?4 and target_hash = ?5 and target_node_id = ?6 and target_coordinate = ?7 and target_strand = ?8 and chromosome_index = ?9 and phased = ?10"; let placeholders: Vec = vec![ source_hash.clone().into(), BOGUS_SOURCE_NODE_ID.into(), diff --git a/src/operation_management.rs b/src/operation_management.rs index 342f65b..e7440b4 100644 --- a/src/operation_management.rs +++ b/src/operation_management.rs @@ -114,7 +114,7 @@ pub fn get_changeset_dependencies(conn: &Connection, changes: &[u8]) -> Vec .unwrap() .to_string(); let target_hash = - str::from_utf8(item.new_value(4).unwrap().as_bytes().unwrap()) + str::from_utf8(item.new_value(5).unwrap().as_bytes().unwrap()) .unwrap() .to_string(); created_edges.insert(edge_pk); @@ -201,6 +201,7 @@ pub fn write_changeset(conn: &Connection, operation: &Operation, changes: &[u8]) let mut file = fs::File::create_new(&change_path) .unwrap_or_else(|_| panic!("Unable to open {change_path:?}")); + file.write_all(changes).unwrap() } From 6dffbee6581f0414657d37bd66deef1d8da0d0c5 Mon Sep 17 00:00:00 2001 From: hofer Date: Tue, 24 Sep 2024 19:05:02 -0400 Subject: [PATCH 03/16] Add AA unit test, add node IDs to most methods --- fixtures/aa.gfa | 5 +++ migrations/core/01-initial/up.sql | 23 +++++----- src/exports/gfa.rs | 17 ++++++- src/imports/fasta.rs | 16 ++++++- src/imports/gfa.rs | 74 +++++++++++++++++++++++++++++-- src/models/edge.rs | 30 ++++++++----- src/models/node.rs | 23 +++++++++- src/models/path.rs | 36 ++++++++++++++- src/operation_management.rs | 9 +++- src/test_helpers.rs | 11 +++++ 10 files changed, 211 insertions(+), 33 deletions(-) create mode 100644 fixtures/aa.gfa diff --git a/fixtures/aa.gfa b/fixtures/aa.gfa new file mode 100644 index 0000000..071f3e2 --- /dev/null +++ b/fixtures/aa.gfa @@ -0,0 +1,5 @@ +H VN:Z:1.2 +S 1 A SN:Z:123 SO:i:0 SR:i:0 +S 2 A SN:Z:123 SO:i:0 SR:i:0 +L 1 + 2 + * +P 124 1+,2+ 4M diff --git a/migrations/core/01-initial/up.sql b/migrations/core/01-initial/up.sql index 8d72ead..dabbc40 100644 --- a/migrations/core/01-initial/up.sql +++ b/migrations/core/01-initial/up.sql @@ -20,13 +20,13 @@ CREATE TABLE sequence ( ) STRICT; CREATE TABLE nodes ( - id INTEGER PRIMARY KEY NOT NULL, + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, sequence_hash TEXT NOT NULL, FOREIGN KEY(sequence_hash) REFERENCES sequence(hash) ) STRICT; CREATE TABLE block_group ( - id INTEGER PRIMARY KEY NOT NULL, + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, collection_name TEXT NOT NULL, sample_name TEXT, name TEXT NOT NULL, @@ -37,7 +37,7 @@ CREATE UNIQUE INDEX block_group_uidx ON block_group(collection_name, sample_name CREATE UNIQUE INDEX block_group_null_sample_uidx ON block_group(collection_name, name) WHERE sample_name is null; CREATE TABLE path ( - id INTEGER PRIMARY KEY NOT NULL, + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, block_group_id INTEGER NOT NULL, name TEXT NOT NULL, FOREIGN KEY(block_group_id) REFERENCES block_group(id) @@ -46,7 +46,7 @@ CREATE UNIQUE INDEX path_uidx ON path(block_group_id, name); -- an operation from a vcf can impact multiple paths and samples, so operation is not faceted on that CREATE TABLE operation ( - id INTEGER PRIMARY KEY NOT NULL, + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, parent_id INTEGER, collection_name TEXT NOT NULL, change_type TEXT NOT NULL, @@ -55,20 +55,20 @@ CREATE TABLE operation ( ) STRICT; CREATE TABLE file_addition ( - id INTEGER PRIMARY KEY NOT NULL, + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, file_path TEXT NOT NULL, file_type TEXT NOT NULL ) STRICT; CREATE TABLE operation_summary ( - id INTEGER PRIMARY KEY NOT NULL, + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, operation_id INTEGER NOT NULL, summary TEXT NOT NULL, FOREIGN KEY(operation_id) REFERENCES operation(id) ) STRICT; CREATE TABLE edges ( - id INTEGER PRIMARY KEY NOT NULL, + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, source_hash TEXT NOT NULL, source_node_id INTEGER, source_coordinate INTEGER NOT NULL, @@ -83,11 +83,11 @@ CREATE TABLE edges ( FOREIGN KEY(target_hash) REFERENCES sequence(hash), constraint chk_phased check (phased in (0, 1)) ) STRICT; -CREATE UNIQUE INDEX edge_uidx ON edges(source_hash, source_coordinate, source_strand, target_hash, target_coordinate, target_strand, chromosome_index, phased); +CREATE UNIQUE INDEX edge_uidx ON edges(source_hash, source_node_id, source_coordinate, source_strand, target_hash, target_node_id, target_coordinate, target_strand, chromosome_index, phased); -- CREATE UNIQUE INDEX edge_uidx ON edges(source_node_id, source_coordinate, source_strand, target_node_id, target_coordinate, target_strand, chromosome_index, phased); CREATE TABLE path_edges ( - id INTEGER PRIMARY KEY NOT NULL, + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, path_id INTEGER NOT NULL, index_in_path INTEGER NOT NULL, edge_id INTEGER NOT NULL, @@ -97,7 +97,7 @@ CREATE TABLE path_edges ( CREATE UNIQUE INDEX path_edges_uidx ON path_edges(path_id, edge_id, index_in_path); CREATE TABLE block_group_edges ( - id INTEGER PRIMARY KEY NOT NULL, + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, block_group_id INTEGER NOT NULL, edge_id INTEGER NOT NULL, FOREIGN KEY(block_group_id) REFERENCES block_group(id), @@ -113,3 +113,6 @@ INSERT INTO gen_metadata (db_uuid) values (lower( substr(hex(randomblob(2)), 2) || '-' || hex(randomblob(6)) )); +INSERT INTO nodes (id, sequence_hash) values (1, "start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"); +INSERT INTO nodes (id, sequence_hash) values (2, "end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"); +UPDATE SQLITE_SEQUENCE SET seq = 2 WHERE name = 'nodes'; diff --git a/src/exports/gfa.rs b/src/exports/gfa.rs index a946f03..6e3f7d0 100644 --- a/src/exports/gfa.rs +++ b/src/exports/gfa.rs @@ -205,7 +205,12 @@ mod tests { use super::*; use crate::imports::gfa::import_gfa; - use crate::models::{block_group::BlockGroup, collection::Collection}; + use crate::models::{ + block_group::BlockGroup, + collection::Collection, + node::{BOGUS_SOURCE_NODE_ID, BOGUS_TARGET_NODE_ID}, + }; + use crate::test_helpers::{get_connection, setup_gen_dir}; use tempfile::tempdir; @@ -237,9 +242,11 @@ mod tests { let edge1 = Edge::create( &conn, Sequence::PATH_START_HASH.to_string(), + BOGUS_SOURCE_NODE_ID, 0, Strand::Forward, sequence1.hash.clone(), + BOGUS_TARGET_NODE_ID, 0, Strand::Forward, 0, @@ -248,9 +255,11 @@ mod tests { let edge2 = Edge::create( &conn, sequence1.hash, + BOGUS_SOURCE_NODE_ID, 4, Strand::Forward, sequence2.hash.clone(), + BOGUS_TARGET_NODE_ID, 0, Strand::Forward, 0, @@ -259,9 +268,11 @@ mod tests { let edge3 = Edge::create( &conn, sequence2.hash, + BOGUS_SOURCE_NODE_ID, 4, Strand::Forward, sequence3.hash.clone(), + BOGUS_TARGET_NODE_ID, 0, Strand::Forward, 0, @@ -270,9 +281,11 @@ mod tests { let edge4 = Edge::create( &conn, sequence3.hash, + BOGUS_SOURCE_NODE_ID, 4, Strand::Forward, sequence4.hash.clone(), + BOGUS_TARGET_NODE_ID, 0, Strand::Forward, 0, @@ -281,9 +294,11 @@ mod tests { let edge5 = Edge::create( &conn, sequence4.hash, + BOGUS_SOURCE_NODE_ID, 4, Strand::Forward, Sequence::PATH_END_HASH.to_string(), + BOGUS_TARGET_NODE_ID, 0, Strand::Forward, 0, diff --git a/src/imports/fasta.rs b/src/imports/fasta.rs index 5507697..aafb33d 100644 --- a/src/imports/fasta.rs +++ b/src/imports/fasta.rs @@ -4,8 +4,15 @@ use std::str; use crate::models::file_types::FileTypes; use crate::models::operations::{FileAddition, Operation, OperationSummary}; use crate::models::{ - block_group::BlockGroup, block_group_edge::BlockGroupEdge, collection::Collection, edge::Edge, - metadata, path::Path, sequence::Sequence, strand::Strand, + block_group::BlockGroup, + block_group_edge::BlockGroupEdge, + collection::Collection, + edge::Edge, + metadata, + node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID}, + path::Path, + sequence::Sequence, + strand::Strand, }; use crate::operation_management; use noodles::fasta; @@ -57,13 +64,16 @@ pub fn import_fasta( .sequence(&sequence) .save(conn) }; + let node = Node::create(conn, &seq.hash); let block_group = BlockGroup::create(conn, &collection.name, None, &name); let edge_into = Edge::create( conn, Sequence::PATH_START_HASH.to_string(), + PATH_START_NODE_ID, 0, Strand::Forward, seq.hash.to_string(), + node.id, 0, Strand::Forward, 0, @@ -72,9 +82,11 @@ pub fn import_fasta( let edge_out_of = Edge::create( conn, seq.hash.to_string(), + node.id, sequence_length, Strand::Forward, Sequence::PATH_END_HASH.to_string(), + PATH_END_NODE_ID, 0, Strand::Forward, 0, diff --git a/src/imports/gfa.rs b/src/imports/gfa.rs index a1a875b..0c39723 100644 --- a/src/imports/gfa.rs +++ b/src/imports/gfa.rs @@ -8,7 +8,7 @@ use crate::models::{ block_group_edge::BlockGroupEdge, collection::Collection, edge::{Edge, EdgeData}, - node::{BOGUS_SOURCE_NODE_ID, BOGUS_TARGET_NODE_ID}, + node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID}, path::Path, sequence::Sequence, strand::Strand, @@ -27,6 +27,7 @@ pub fn import_gfa(gfa_path: &FilePath, collection_name: &str, conn: &Connection) let block_group = BlockGroup::create(conn, collection_name, None, ""); let gfa: Gfa = Gfa::parse_gfa_file(gfa_path.to_str().unwrap()); let mut sequences_by_segment_id: HashMap = HashMap::new(); + let mut node_ids_by_segment_id: HashMap = HashMap::new(); for segment in &gfa.segments { let input_sequence = segment.sequence.get_string(&gfa.sequence); @@ -34,72 +35,92 @@ pub fn import_gfa(gfa_path: &FilePath, collection_name: &str, conn: &Connection) .sequence_type("DNA") .sequence(input_sequence) .save(conn); - sequences_by_segment_id.insert(segment.id, sequence); + sequences_by_segment_id.insert(segment.id, sequence.clone()); + let node = Node::create(conn, &sequence.hash); + node_ids_by_segment_id.insert(segment.id, node.id); } let mut edges = HashSet::new(); for link in &gfa.links { let source = sequences_by_segment_id.get(&link.from).unwrap(); let target = sequences_by_segment_id.get(&link.to).unwrap(); + let source_node_id = *node_ids_by_segment_id.get(&link.from).unwrap(); + let target_node_id = *node_ids_by_segment_id.get(&link.to).unwrap(); edges.insert(edge_data_from_fields( &source.hash, + source_node_id, source.length, bool_to_strand(link.from_dir), &target.hash, + target_node_id, bool_to_strand(link.to_dir), )); } for input_path in &gfa.paths { let mut source_hash = Sequence::PATH_START_HASH; + let mut source_node_id = PATH_START_NODE_ID; let mut source_coordinate = 0; let mut source_strand = Strand::Forward; for (index, segment_id) in input_path.nodes.iter().enumerate() { let target = sequences_by_segment_id.get(segment_id).unwrap(); + let target_node_id = *node_ids_by_segment_id.get(segment_id).unwrap(); let target_strand = bool_to_strand(input_path.dir[index]); edges.insert(edge_data_from_fields( source_hash, + source_node_id, source_coordinate, source_strand, &target.hash, + target_node_id, target_strand, )); source_hash = &target.hash; + source_node_id = target_node_id; source_coordinate = target.length; source_strand = target_strand; } edges.insert(edge_data_from_fields( source_hash, + source_node_id, source_coordinate, source_strand, Sequence::PATH_END_HASH, + PATH_END_NODE_ID, Strand::Forward, )); } for input_walk in &gfa.walk { let mut source_hash = Sequence::PATH_START_HASH; + let mut source_node_id = PATH_START_NODE_ID; let mut source_coordinate = 0; let mut source_strand = Strand::Forward; for (index, segment_id) in input_walk.walk_id.iter().enumerate() { let target = sequences_by_segment_id.get(segment_id).unwrap(); + let target_node_id = *node_ids_by_segment_id.get(segment_id).unwrap(); let target_strand = bool_to_strand(input_walk.walk_dir[index]); edges.insert(edge_data_from_fields( source_hash, + source_node_id, source_coordinate, source_strand, &target.hash, + target_node_id, target_strand, )); source_hash = &target.hash; + source_node_id = target_node_id; source_coordinate = target.length; source_strand = target_strand; } edges.insert(edge_data_from_fields( source_hash, + source_node_id, source_coordinate, source_strand, Sequence::PATH_END_HASH, + PATH_END_NODE_ID, Strand::Forward, )); } @@ -112,9 +133,11 @@ pub fn import_gfa(gfa_path: &FilePath, collection_name: &str, conn: &Connection) for edge in saved_edges { let key = edge_data_from_fields( &edge.source_hash, + edge.source_node_id, edge.source_coordinate, edge.source_strand, &edge.target_hash, + edge.target_node_id, edge.target_strand, ); edge_ids_by_data.insert(key, edge.id); @@ -123,30 +146,37 @@ pub fn import_gfa(gfa_path: &FilePath, collection_name: &str, conn: &Connection) for input_path in &gfa.paths { let path_name = &input_path.name; let mut source_hash = Sequence::PATH_START_HASH; + let mut source_node_id = PATH_START_NODE_ID; let mut source_coordinate = 0; let mut source_strand = Strand::Forward; let mut path_edge_ids = vec![]; for (index, segment_id) in input_path.nodes.iter().enumerate() { let target = sequences_by_segment_id.get(segment_id).unwrap(); + let target_node_id = *node_ids_by_segment_id.get(segment_id).unwrap(); let target_strand = bool_to_strand(input_path.dir[index]); let key = edge_data_from_fields( source_hash, + source_node_id, source_coordinate, source_strand, &target.hash, + target_node_id, target_strand, ); let edge_id = *edge_ids_by_data.get(&key).unwrap(); path_edge_ids.push(edge_id); source_hash = &target.hash; + source_node_id = target_node_id; source_coordinate = target.length; source_strand = target_strand; } let key = edge_data_from_fields( source_hash, + source_node_id, source_coordinate, source_strand, Sequence::PATH_END_HASH, + PATH_END_NODE_ID, Strand::Forward, ); let edge_id = *edge_ids_by_data.get(&key).unwrap(); @@ -157,30 +187,37 @@ pub fn import_gfa(gfa_path: &FilePath, collection_name: &str, conn: &Connection) for input_walk in &gfa.walk { let path_name = &input_walk.sample_id; let mut source_hash = Sequence::PATH_START_HASH; + let mut source_node_id = PATH_START_NODE_ID; let mut source_coordinate = 0; let mut source_strand = Strand::Forward; let mut path_edge_ids = vec![]; for (index, segment_id) in input_walk.walk_id.iter().enumerate() { let target = sequences_by_segment_id.get(segment_id).unwrap(); + let target_node_id = *node_ids_by_segment_id.get(segment_id).unwrap(); let target_strand = bool_to_strand(input_walk.walk_dir[index]); let key = edge_data_from_fields( source_hash, + source_node_id, source_coordinate, source_strand, &target.hash, + target_node_id, target_strand, ); let edge_id = *edge_ids_by_data.get(&key).unwrap(); path_edge_ids.push(edge_id); source_hash = &target.hash; + source_node_id = target_node_id; source_coordinate = target.length; source_strand = target_strand; } let key = edge_data_from_fields( source_hash, + source_node_id, source_coordinate, source_strand, Sequence::PATH_END_HASH, + PATH_END_NODE_ID, Strand::Forward, ); let edge_id = *edge_ids_by_data.get(&key).unwrap(); @@ -191,18 +228,20 @@ pub fn import_gfa(gfa_path: &FilePath, collection_name: &str, conn: &Connection) fn edge_data_from_fields( source_hash: &str, + source_node_id: i32, source_coordinate: i32, source_strand: Strand, target_hash: &str, + target_node_id: i32, target_strand: Strand, ) -> EdgeData { EdgeData { source_hash: source_hash.to_string(), - source_node_id: BOGUS_SOURCE_NODE_ID, + source_node_id, source_coordinate, source_strand, target_hash: target_hash.to_string(), - target_node_id: BOGUS_TARGET_NODE_ID, + target_node_id, target_coordinate: 0, target_strand, chromosome_index: 0, @@ -331,4 +370,31 @@ mod tests { let expected_sequence = expected_sequence_parts.join(""); assert_eq!(result, expected_sequence); } + + #[test] + fn test_import_aa_gfa() { + setup_gen_dir(); + let mut gfa_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + gfa_path.push("fixtures/aa.gfa"); + let collection_name = "test".to_string(); + let conn = &get_connection(None); + import_gfa(&gfa_path, &collection_name, conn); + + let block_group_id = BlockGroup::get_id(conn, &collection_name, None, ""); + let path = Path::get_paths( + conn, + "select * from path where block_group_id = ?1 AND name = ?2", + vec![ + SQLValue::from(block_group_id), + SQLValue::from("124".to_string()), + ], + )[0] + .clone(); + + let result = Path::sequence(conn, path); + assert_eq!(result, "AA"); + + let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id); + assert_eq!(all_sequences, HashSet::from_iter(vec!["AA".to_string()])); + } } diff --git a/src/models/edge.rs b/src/models/edge.rs index ad3b1d7..2e08ea9 100644 --- a/src/models/edge.rs +++ b/src/models/edge.rs @@ -76,9 +76,11 @@ impl Edge { pub fn create( conn: &Connection, source_hash: String, + source_node_id: i32, source_coordinate: i32, source_strand: Strand, target_hash: String, + target_node_id: i32, target_coordinate: i32, target_strand: Strand, chromosome_index: i32, @@ -88,11 +90,11 @@ impl Edge { let id_query = "select id from edges where source_hash = ?1 and source_node_id = ?2 and source_coordinate = ?3 and source_strand = ?4 and target_hash = ?5 and target_node_id = ?6 and target_coordinate = ?7 and target_strand = ?8 and chromosome_index = ?9 and phased = ?10"; let placeholders: Vec = vec![ source_hash.clone().into(), - BOGUS_SOURCE_NODE_ID.into(), + source_node_id.into(), source_coordinate.into(), source_strand.into(), target_hash.clone().into(), - BOGUS_TARGET_NODE_ID.into(), + target_node_id.into(), target_coordinate.into(), target_strand.into(), chromosome_index.into(), @@ -124,11 +126,11 @@ impl Edge { .query_row(id_query, params_from_iter(&placeholders), |row| row.get(0)) .unwrap(), source_hash, - source_node_id: BOGUS_SOURCE_NODE_ID, + source_node_id, source_coordinate, source_strand, target_hash, - target_node_id: BOGUS_TARGET_NODE_ID, + target_node_id, target_coordinate, target_strand, chromosome_index, @@ -148,11 +150,11 @@ impl Edge { Ok(Edge { id: row.get(0)?, source_hash: row.get(1)?, - source_node_id: BOGUS_SOURCE_NODE_ID, + source_node_id: row.get(2)?, source_coordinate: row.get(3)?, source_strand: row.get(4)?, target_hash: row.get(5)?, - target_node_id: BOGUS_TARGET_NODE_ID, + target_node_id: row.get(6)?, target_coordinate: row.get(7)?, target_strand: row.get(8)?, chromosome_index: row.get(9)?, @@ -191,11 +193,13 @@ impl Edge { let target_hash = format!("\"{0}\"", edge.target_hash); let target_strand = format!("\"{0}\"", edge.target_strand); let edge_row = format!( - "({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7})", + "({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9})", source_hash, + edge.source_node_id, edge.source_coordinate, source_strand, target_hash, + edge.target_node_id, edge.target_coordinate, target_strand, edge.chromosome_index, @@ -205,7 +209,7 @@ impl Edge { } let formatted_edge_rows = edge_rows.join(", "); - let select_statement = format!("SELECT * FROM edges WHERE (source_hash, source_coordinate, source_strand, target_hash, target_coordinate, target_strand, chromosome_index, phased) in ({0});", formatted_edge_rows); + let select_statement = format!("SELECT * FROM edges WHERE (source_hash, source_node_id, source_coordinate, source_strand, target_hash, target_node_id, target_coordinate, target_strand, chromosome_index, phased) in ({0});", formatted_edge_rows); let existing_edges = Edge::query(conn, &select_statement, vec![]); for edge in existing_edges.iter() { edge_map.insert(EdgeData::from(edge), edge.id); @@ -230,11 +234,11 @@ impl Edge { let edge_row = format!( "({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9})", source_hash, - BOGUS_SOURCE_NODE_ID, + edge.source_node_id, edge.source_coordinate, source_strand, target_hash, - BOGUS_TARGET_NODE_ID, + edge.target_node_id, edge.target_coordinate, target_strand, edge.chromosome_index, @@ -265,11 +269,11 @@ impl Edge { pub fn to_data(edge: Edge) -> EdgeData { EdgeData { source_hash: edge.source_hash, - source_node_id: BOGUS_SOURCE_NODE_ID, + source_node_id: edge.source_node_id, source_coordinate: edge.source_coordinate, source_strand: edge.source_strand, target_hash: edge.target_hash, - target_node_id: BOGUS_TARGET_NODE_ID, + target_node_id: edge.target_node_id, target_coordinate: edge.target_coordinate, target_strand: edge.target_strand, chromosome_index: edge.chromosome_index, @@ -663,9 +667,11 @@ mod tests { let existing_edge = Edge::create( conn, Sequence::PATH_START_HASH.to_string(), + BOGUS_SOURCE_NODE_ID, -1, Strand::Forward, sequence1.hash.clone(), + BOGUS_TARGET_NODE_ID, 1, Strand::Forward, 0, diff --git a/src/models/node.rs b/src/models/node.rs index e0e0988..3ad2817 100644 --- a/src/models/node.rs +++ b/src/models/node.rs @@ -1,8 +1,27 @@ +use rusqlite::Connection; + pub const BOGUS_SOURCE_NODE_ID: i32 = -1; pub const BOGUS_TARGET_NODE_ID: i32 = -2; +pub const PATH_START_NODE_ID: i32 = 1; +pub const PATH_END_NODE_ID: i32 = 2; + #[derive(Clone, Debug)] -pub struct Node { +pub struct Node<'a> { pub id: i32, - pub sequence_hash: String, + pub sequence_hash: &'a str, +} + +impl Node<'_> { + pub fn create<'a>(conn: &'a Connection, sequence_hash: &'a str) -> Node<'a> { + let insert_statement = format!( + "INSERT INTO nodes (sequence_hash) VALUES ('{}');", + sequence_hash + ); + let _ = conn.execute(&insert_statement, ()); + Node { + id: conn.last_insert_rowid() as i32, + sequence_hash, + } + } } diff --git a/src/models/path.rs b/src/models/path.rs index 15746b8..1e5d145 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -294,7 +294,11 @@ mod tests { // Note this useful idiom: importing names from outer (for mod tests) scope. use super::*; - use crate::models::{block_group::BlockGroup, collection::Collection}; + use crate::models::{ + block_group::BlockGroup, + collection::Collection, + node::{BOGUS_SOURCE_NODE_ID, BOGUS_TARGET_NODE_ID}, + }; use crate::test_helpers::get_connection; #[test] @@ -309,9 +313,11 @@ mod tests { let edge1 = Edge::create( conn, Sequence::PATH_START_HASH.to_string(), + BOGUS_SOURCE_NODE_ID, -123, Strand::Forward, sequence1.hash.clone(), + BOGUS_TARGET_NODE_ID, 0, Strand::Forward, 0, @@ -324,9 +330,11 @@ mod tests { let edge2 = Edge::create( conn, sequence1.hash.clone(), + BOGUS_SOURCE_NODE_ID, 8, Strand::Forward, sequence2.hash.clone(), + BOGUS_TARGET_NODE_ID, 1, Strand::Forward, 0, @@ -339,9 +347,11 @@ mod tests { let edge3 = Edge::create( conn, sequence2.hash.clone(), + BOGUS_SOURCE_NODE_ID, 8, Strand::Forward, sequence3.hash.clone(), + BOGUS_TARGET_NODE_ID, 1, Strand::Forward, 0, @@ -354,9 +364,11 @@ mod tests { let edge4 = Edge::create( conn, sequence3.hash.clone(), + BOGUS_SOURCE_NODE_ID, 8, Strand::Forward, sequence4.hash.clone(), + BOGUS_TARGET_NODE_ID, 1, Strand::Forward, 0, @@ -365,9 +377,11 @@ mod tests { let edge5 = Edge::create( conn, sequence4.hash.clone(), + BOGUS_SOURCE_NODE_ID, 8, Strand::Forward, Sequence::PATH_END_HASH.to_string(), + BOGUS_TARGET_NODE_ID, -1, Strand::Forward, 0, @@ -395,9 +409,11 @@ mod tests { let edge5 = Edge::create( conn, sequence1.hash.clone(), + BOGUS_SOURCE_NODE_ID, 8, Strand::Reverse, Sequence::PATH_END_HASH.to_string(), + BOGUS_TARGET_NODE_ID, 0, Strand::Reverse, 0, @@ -410,9 +426,11 @@ mod tests { let edge4 = Edge::create( conn, sequence2.hash.clone(), + BOGUS_SOURCE_NODE_ID, 7, Strand::Reverse, sequence1.hash.clone(), + BOGUS_TARGET_NODE_ID, 0, Strand::Reverse, 0, @@ -425,9 +443,11 @@ mod tests { let edge3 = Edge::create( conn, sequence3.hash.clone(), + BOGUS_SOURCE_NODE_ID, 7, Strand::Reverse, sequence2.hash.clone(), + BOGUS_TARGET_NODE_ID, 0, Strand::Reverse, 0, @@ -440,9 +460,11 @@ mod tests { let edge2 = Edge::create( conn, sequence4.hash.clone(), + BOGUS_SOURCE_NODE_ID, 7, Strand::Reverse, sequence3.hash.clone(), + BOGUS_TARGET_NODE_ID, 0, Strand::Reverse, 0, @@ -451,9 +473,11 @@ mod tests { let edge1 = Edge::create( conn, Sequence::PATH_START_HASH.to_string(), + BOGUS_SOURCE_NODE_ID, -1, Strand::Reverse, sequence4.hash.clone(), + BOGUS_TARGET_NODE_ID, 0, Strand::Reverse, 0, @@ -488,9 +512,11 @@ mod tests { let edge1 = Edge::create( conn, Sequence::PATH_START_HASH.to_string(), + BOGUS_SOURCE_NODE_ID, -1, Strand::Forward, sequence1.hash.clone(), + BOGUS_TARGET_NODE_ID, 0, Strand::Forward, 0, @@ -503,9 +529,11 @@ mod tests { let edge2 = Edge::create( conn, sequence1.hash.clone(), + BOGUS_SOURCE_NODE_ID, 8, Strand::Forward, sequence2.hash.clone(), + BOGUS_TARGET_NODE_ID, 1, Strand::Forward, 0, @@ -518,9 +546,11 @@ mod tests { let edge3 = Edge::create( conn, sequence2.hash.clone(), + BOGUS_SOURCE_NODE_ID, 8, Strand::Forward, sequence3.hash.clone(), + BOGUS_TARGET_NODE_ID, 1, Strand::Forward, 0, @@ -533,9 +563,11 @@ mod tests { let edge4 = Edge::create( conn, sequence3.hash.clone(), + BOGUS_SOURCE_NODE_ID, 8, Strand::Forward, sequence4.hash.clone(), + BOGUS_TARGET_NODE_ID, 1, Strand::Forward, 0, @@ -544,9 +576,11 @@ mod tests { let edge5 = Edge::create( conn, sequence4.hash.clone(), + BOGUS_SOURCE_NODE_ID, 8, Strand::Forward, Sequence::PATH_END_HASH.to_string(), + BOGUS_TARGET_NODE_ID, -1, Strand::Forward, 0, diff --git a/src/operation_management.rs b/src/operation_management.rs index e7440b4..902607c 100644 --- a/src/operation_management.rs +++ b/src/operation_management.rs @@ -539,7 +539,12 @@ mod tests { use crate::imports::fasta::import_fasta; use crate::models::file_types::FileTypes; use crate::models::operations::{setup_db, Branch, FileAddition, Operation, OperationState}; - use crate::models::{edge::Edge, metadata, sample::Sample}; + use crate::models::{ + edge::Edge, + metadata, + node::{BOGUS_SOURCE_NODE_ID, BOGUS_TARGET_NODE_ID}, + sample::Sample, + }; use crate::test_helpers::{ get_connection, get_operation_connection, setup_block_group, setup_gen_dir, }; @@ -588,9 +593,11 @@ mod tests { let new_edge = Edge::create( conn, random_seq.hash.clone(), + BOGUS_SOURCE_NODE_ID, 0, Strand::Forward, existing_seq.hash.clone(), + BOGUS_TARGET_NODE_ID, 0, Strand::Forward, 0, diff --git a/src/test_helpers.rs b/src/test_helpers.rs index 2abebbb..81f65ef 100644 --- a/src/test_helpers.rs +++ b/src/test_helpers.rs @@ -9,6 +9,7 @@ use crate::models::block_group::BlockGroup; use crate::models::block_group_edge::BlockGroupEdge; use crate::models::collection::Collection; use crate::models::edge::Edge; +use crate::models::node::{BOGUS_SOURCE_NODE_ID, BOGUS_TARGET_NODE_ID}; use crate::models::path::Path; use crate::models::sequence::Sequence; use crate::models::strand::Strand; @@ -79,9 +80,11 @@ pub fn setup_block_group(conn: &Connection) -> (i32, Path) { let edge0 = Edge::create( conn, Sequence::PATH_START_HASH.to_string(), + BOGUS_SOURCE_NODE_ID, 0, Strand::Forward, a_seq.hash.clone(), + BOGUS_TARGET_NODE_ID, 0, Strand::Forward, 0, @@ -90,9 +93,11 @@ pub fn setup_block_group(conn: &Connection) -> (i32, Path) { let edge1 = Edge::create( conn, a_seq.hash, + BOGUS_SOURCE_NODE_ID, 10, Strand::Forward, t_seq.hash.clone(), + BOGUS_TARGET_NODE_ID, 0, Strand::Forward, 0, @@ -101,9 +106,11 @@ pub fn setup_block_group(conn: &Connection) -> (i32, Path) { let edge2 = Edge::create( conn, t_seq.hash, + BOGUS_SOURCE_NODE_ID, 10, Strand::Forward, c_seq.hash.clone(), + BOGUS_TARGET_NODE_ID, 0, Strand::Forward, 0, @@ -112,9 +119,11 @@ pub fn setup_block_group(conn: &Connection) -> (i32, Path) { let edge3 = Edge::create( conn, c_seq.hash, + BOGUS_SOURCE_NODE_ID, 10, Strand::Forward, g_seq.hash.clone(), + BOGUS_TARGET_NODE_ID, 0, Strand::Forward, 0, @@ -123,9 +132,11 @@ pub fn setup_block_group(conn: &Connection) -> (i32, Path) { let edge4 = Edge::create( conn, g_seq.hash, + BOGUS_SOURCE_NODE_ID, 10, Strand::Forward, Sequence::PATH_END_HASH.to_string(), + BOGUS_TARGET_NODE_ID, 0, Strand::Forward, 0, From 63e84d3480335f2549d2c40496b5673b034ebd4c Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 25 Sep 2024 11:18:20 -0400 Subject: [PATCH 04/16] Add new versions of methods using nodes instead of sequences --- src/imports/gfa.rs | 2 +- src/models/block_group.rs | 53 +++++++++ src/models/edge.rs | 230 +++++++++++++++++++++++++++++++++++++- src/models/node.rs | 57 +++++++++- 4 files changed, 334 insertions(+), 8 deletions(-) diff --git a/src/imports/gfa.rs b/src/imports/gfa.rs index 0c39723..276262b 100644 --- a/src/imports/gfa.rs +++ b/src/imports/gfa.rs @@ -394,7 +394,7 @@ mod tests { let result = Path::sequence(conn, path); assert_eq!(result, "AA"); - let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(conn, block_group_id); assert_eq!(all_sequences, HashSet::from_iter(vec!["AA".to_string()])); } } diff --git a/src/models/block_group.rs b/src/models/block_group.rs index 34a38a5..2e287e6 100644 --- a/src/models/block_group.rs +++ b/src/models/block_group.rs @@ -305,6 +305,59 @@ impl BlockGroup { sequences } + pub fn get_all_sequences_new(conn: &Connection, block_group_id: i32) -> HashSet { + let mut edges = BlockGroupEdge::edges_for_block_group(conn, block_group_id); + let (blocks, boundary_edges) = Edge::blocks_from_edges_new(conn, &edges); + edges.extend(boundary_edges.clone()); + let (graph, _) = Edge::build_graph_new(&edges, &blocks); + + let mut start_nodes = vec![]; + let mut end_nodes = vec![]; + for node in graph.nodes() { + let has_incoming = graph.neighbors_directed(node, Direction::Incoming).next(); + let has_outgoing = graph.neighbors_directed(node, Direction::Outgoing).next(); + if has_incoming.is_none() { + start_nodes.push(node); + } + if has_outgoing.is_none() { + end_nodes.push(node); + } + } + + let blocks_by_id = blocks + .clone() + .into_iter() + .map(|block| (block.id, block)) + .collect::>(); + let mut sequences = HashSet::::new(); + + for start_node in start_nodes { + for end_node in &end_nodes { + // TODO: maybe make all_simple_paths return a single path id where start == end + if start_node == *end_node { + let block = blocks_by_id.get(&start_node).unwrap(); + if block.sequence_hash != Sequence::PATH_START_HASH + && block.sequence_hash != Sequence::PATH_END_HASH + { + sequences.insert(block.sequence.clone()); + } + } else { + for path in all_simple_paths(&graph, start_node, *end_node) { + let mut current_sequence = "".to_string(); + for node in path { + let block = blocks_by_id.get(&node).unwrap(); + let block_sequence = block.sequence.clone(); + current_sequence.push_str(&block_sequence); + } + sequences.insert(current_sequence); + } + } + } + } + + sequences + } + pub fn insert_changes(conn: &Connection, changes: &Vec, cache: &PathCache) { let mut new_edges_by_block_group = HashMap::>::new(); for change in changes { diff --git a/src/models/edge.rs b/src/models/edge.rs index 2e08ea9..f5ca99c 100644 --- a/src/models/edge.rs +++ b/src/models/edge.rs @@ -7,7 +7,9 @@ use rusqlite::types::Value; use rusqlite::{params_from_iter, Connection, Result as SQLResult, Row}; use serde::{Deserialize, Serialize}; -use crate::models::node::{BOGUS_SOURCE_NODE_ID, BOGUS_TARGET_NODE_ID}; +use crate::models::node::{ + Node, BOGUS_SOURCE_NODE_ID, BOGUS_TARGET_NODE_ID, PATH_END_NODE_ID, PATH_START_NODE_ID, +}; use crate::models::{sequence::Sequence, strand::Strand}; #[derive(Clone, Debug, Eq, Hash, PartialEq, Deserialize, Serialize)] @@ -62,10 +64,17 @@ pub struct BlockKey { pub coordinate: i32, } +#[derive(Eq, Hash, PartialEq)] +pub struct BlockKeyNew { + pub node_id: i32, + pub coordinate: i32, +} + #[derive(Clone, Debug)] pub struct GroupBlock { pub id: i32, pub sequence_hash: String, + pub node_id: i32, pub sequence: String, pub start: i32, pub end: i32, @@ -369,6 +378,7 @@ impl Edge { let first_block = GroupBlock { id: block_index, sequence_hash: hash.clone(), + node_id: BOGUS_SOURCE_NODE_ID, sequence: block_sequence, start, end, @@ -380,6 +390,7 @@ impl Edge { let block = GroupBlock { id: block_index, sequence_hash: hash.clone(), + node_id: BOGUS_SOURCE_NODE_ID, sequence: block_sequence, start, end, @@ -393,6 +404,7 @@ impl Edge { let last_block = GroupBlock { id: block_index, sequence_hash: hash.clone(), + node_id: BOGUS_SOURCE_NODE_ID, sequence: block_sequence, start, end, @@ -403,6 +415,7 @@ impl Edge { blocks.push(GroupBlock { id: block_index, sequence_hash: hash.clone(), + node_id: BOGUS_SOURCE_NODE_ID, sequence: sequence.get_sequence(None, None), start: 0, end: sequence.length, @@ -419,6 +432,7 @@ impl Edge { let start_block = GroupBlock { id: block_index + 1, sequence_hash: start_sequence.hash.clone(), + node_id: BOGUS_SOURCE_NODE_ID, sequence: "".to_string(), start: 0, end: 0, @@ -428,6 +442,161 @@ impl Edge { let end_block = GroupBlock { id: block_index + 2, sequence_hash: end_sequence.hash.clone(), + node_id: BOGUS_SOURCE_NODE_ID, + sequence: "".to_string(), + start: 0, + end: 0, + }; + blocks.push(end_block); + (blocks, boundary_edges) + } + + pub fn blocks_from_edges_new( + conn: &Connection, + edges: &Vec, + ) -> (Vec, Vec) { + let mut node_ids = HashSet::new(); + let mut edges_by_source_node_id: HashMap> = HashMap::new(); + let mut edges_by_target_node_id: HashMap> = HashMap::new(); + for edge in edges { + if edge.source_node_id != PATH_START_NODE_ID { + node_ids.insert(edge.source_node_id); + edges_by_source_node_id + .entry(edge.source_node_id) + .and_modify(|edges| edges.push(edge)) + .or_default(); + } + if edge.target_node_id != PATH_END_NODE_ID { + node_ids.insert(edge.target_node_id); + edges_by_target_node_id + .entry(edge.target_node_id) + .and_modify(|edges| edges.push(edge)) + .or_default(); + } + } + + let nodes = Node::get_nodes(conn, node_ids.into_iter().collect::>()); + let sequence_hashes_by_node_id = nodes + .iter() + .map(|node| (node.id, node.sequence_hash.clone())) + .collect::>(); + let sequences_by_hash = Sequence::sequences_by_hash( + conn, + sequence_hashes_by_node_id + .values() + .map(|hash| hash.as_str()) + .collect::>(), + ); + let sequences_by_node_id = sequence_hashes_by_node_id + .clone() + .into_iter() + .map(|(node_id, sequence_hash)| { + ( + node_id, + sequences_by_hash.get(&sequence_hash).unwrap().clone(), + ) + }) + .collect::>(); + let mut blocks = vec![]; + let mut block_index = 0; + let mut boundary_edges = vec![]; + for (node_id, sequence) in sequences_by_node_id.into_iter() { + let hash = sequence_hashes_by_node_id.get(&node_id).unwrap(); + let block_boundaries = Edge::get_block_boundaries( + edges_by_source_node_id.get(&node_id), + edges_by_target_node_id.get(&node_id), + sequence.length, + ); + for block_boundary in &block_boundaries { + // NOTE: Most of this data is bogus, the Edge struct is just a convenient wrapper + // for the data we need to set up boundary edges in the block group graph + boundary_edges.push(Edge { + id: -1, + source_hash: hash.clone(), + source_node_id: node_id, + source_coordinate: *block_boundary, + source_strand: Strand::Unknown, + target_hash: hash.clone(), + target_node_id: node_id, + target_coordinate: *block_boundary, + target_strand: Strand::Unknown, + chromosome_index: 0, + phased: 0, + }); + } + + if !block_boundaries.is_empty() { + let start = 0; + let end = block_boundaries[0]; + let block_sequence = sequence.get_sequence(start, end).to_string(); + let first_block = GroupBlock { + id: block_index, + sequence_hash: hash.clone(), + node_id, + sequence: block_sequence, + start, + end, + }; + blocks.push(first_block); + block_index += 1; + for (start, end) in block_boundaries.clone().into_iter().tuple_windows() { + let block_sequence = sequence.get_sequence(start, end).to_string(); + let block = GroupBlock { + id: block_index, + sequence_hash: hash.clone(), + node_id, + sequence: block_sequence, + start, + end, + }; + blocks.push(block); + block_index += 1; + } + let start = block_boundaries[block_boundaries.len() - 1]; + let end = sequence.length; + let block_sequence = sequence.get_sequence(start, end).to_string(); + let last_block = GroupBlock { + id: block_index, + sequence_hash: hash.clone(), + node_id, + sequence: block_sequence, + start, + end, + }; + blocks.push(last_block); + block_index += 1; + } else { + blocks.push(GroupBlock { + id: block_index, + sequence_hash: hash.clone(), + node_id, + sequence: sequence.get_sequence(None, None), + start: 0, + end: sequence.length, + }); + block_index += 1; + } + } + + // NOTE: We need a dedicated start node and a dedicated end node for the graph formed by the + // block group, since different paths in the block group may start or end at different + // places on sequences. These two "start sequence" and "end sequence" blocks will serve + // that role. + let start_sequence = Sequence::sequence_from_hash(conn, Sequence::PATH_START_HASH).unwrap(); + let start_block = GroupBlock { + id: block_index + 1, + sequence_hash: start_sequence.hash.clone(), + node_id: PATH_START_NODE_ID, + sequence: "".to_string(), + start: 0, + end: 0, + }; + blocks.push(start_block); + let end_sequence = Sequence::sequence_from_hash(conn, Sequence::PATH_END_HASH).unwrap(); + let end_block = GroupBlock { + id: block_index + 2, + sequence_hash: end_sequence.hash.clone(), + node_id: PATH_END_NODE_ID, sequence: "".to_string(), start: 0, end: 0, @@ -494,6 +663,65 @@ impl Edge { (graph, edges_by_node_pair) } + + pub fn build_graph_new( + edges: &Vec, + blocks: &Vec, + ) -> (DiGraphMap, HashMap<(i32, i32), Edge>) { + let blocks_by_start = blocks + .clone() + .into_iter() + .map(|block| { + ( + BlockKeyNew { + node_id: block.node_id, + coordinate: block.start, + }, + block.id, + ) + }) + .collect::>(); + let blocks_by_end = blocks + .clone() + .into_iter() + .map(|block| { + ( + BlockKeyNew { + node_id: block.node_id, + coordinate: block.end, + }, + block.id, + ) + }) + .collect::>(); + + let mut graph: DiGraphMap = DiGraphMap::new(); + let mut edges_by_node_pair = HashMap::new(); + for block in blocks { + graph.add_node(block.id); + } + for edge in edges { + let source_key = BlockKeyNew { + node_id: edge.source_node_id, + coordinate: edge.source_coordinate, + }; + let source_id = blocks_by_end.get(&source_key); + let target_key = BlockKeyNew { + node_id: edge.target_node_id, + coordinate: edge.target_coordinate, + }; + let target_id = blocks_by_start.get(&target_key); + + if let Some(source_id_value) = source_id { + if let Some(target_id_value) = target_id { + graph.add_edge(*source_id_value, *target_id_value, ()); + edges_by_node_pair.insert((*source_id_value, *target_id_value), edge.clone()); + } + } + } + + (graph, edges_by_node_pair) + } } #[cfg(test)] diff --git a/src/models/node.rs b/src/models/node.rs index 3ad2817..6aa6633 100644 --- a/src/models/node.rs +++ b/src/models/node.rs @@ -1,4 +1,6 @@ -use rusqlite::Connection; +use rusqlite::{params_from_iter, types::Value as SQLValue, Connection}; + +use crate::models::sequence::Sequence; pub const BOGUS_SOURCE_NODE_ID: i32 = -1; pub const BOGUS_TARGET_NODE_ID: i32 = -2; @@ -7,13 +9,13 @@ pub const PATH_START_NODE_ID: i32 = 1; pub const PATH_END_NODE_ID: i32 = 2; #[derive(Clone, Debug)] -pub struct Node<'a> { +pub struct Node { pub id: i32, - pub sequence_hash: &'a str, + pub sequence_hash: String, } -impl Node<'_> { - pub fn create<'a>(conn: &'a Connection, sequence_hash: &'a str) -> Node<'a> { +impl Node { + pub fn create(conn: &Connection, sequence_hash: &str) -> Node { let insert_statement = format!( "INSERT INTO nodes (sequence_hash) VALUES ('{}');", sequence_hash @@ -21,7 +23,50 @@ impl Node<'_> { let _ = conn.execute(&insert_statement, ()); Node { id: conn.last_insert_rowid() as i32, - sequence_hash, + sequence_hash: sequence_hash.to_string(), + } + } + + pub fn query(conn: &Connection, query: &str, placeholders: Vec) -> Vec { + let mut stmt = conn.prepare(query).unwrap(); + let rows = stmt + .query_map(params_from_iter(placeholders), |row| { + Ok(Node { + id: row.get(0)?, + sequence_hash: row.get(1)?, + }) + }) + .unwrap(); + let mut objs = vec![]; + for row in rows { + objs.push(row.unwrap()); } + objs + } + + pub fn get_nodes(conn: &Connection, node_ids: Vec) -> Vec { + Node::query( + conn, + &format!( + "SELECT * FROM nodes WHERE id IN ({})", + node_ids.iter().map(|_| "?").collect::>().join(", ") + ), + node_ids + .iter() + .map(|id| SQLValue::Integer(*id as i64)) + .collect(), + ) + } + + pub fn sequences_from_node_ids(conn: &Connection, node_ids: Vec) -> Vec { + let nodes = Node::get_nodes(conn, node_ids); + let sequence_hashes = nodes + .iter() + .map(|node| node.sequence_hash.as_str()) + .collect::>(); + Sequence::sequences_by_hash(conn, sequence_hashes) + .values() + .cloned() + .collect() } } From d0cd891470b0ef130cac42d2ae7a8fc0203c3a2c Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 25 Sep 2024 11:52:52 -0400 Subject: [PATCH 05/16] Update gfa export --- src/exports/gfa.rs | 76 +++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/src/exports/gfa.rs b/src/exports/gfa.rs index 6e3f7d0..385856a 100644 --- a/src/exports/gfa.rs +++ b/src/exports/gfa.rs @@ -10,9 +10,9 @@ use crate::models::{ block_group_edge::BlockGroupEdge, collection::Collection, edge::{Edge, GroupBlock}, + node::{PATH_END_NODE_ID, PATH_START_NODE_ID}, path::Path, path_edge::PathEdge, - sequence::Sequence, strand::Strand, }; @@ -26,19 +26,17 @@ pub fn export_gfa(conn: &Connection, collection_name: &str, filename: &PathBuf) } let mut edges = edge_set.into_iter().collect(); - let (blocks, boundary_edges) = Edge::blocks_from_edges(conn, &edges); + let (blocks, boundary_edges) = Edge::blocks_from_edges_new(conn, &edges); edges.extend(boundary_edges.clone()); - let (graph, edges_by_node_pair) = Edge::build_graph(&edges, &blocks); + let (graph, edges_by_node_pair) = Edge::build_graph_new(&edges, &blocks); let file = File::create(filename).unwrap(); let mut writer = BufWriter::new(file); let mut terminal_block_ids = HashSet::new(); for block in &blocks { - if block.sequence_hash == Sequence::PATH_START_HASH - || block.sequence_hash == Sequence::PATH_END_HASH - { + if block.node_id == PATH_START_NODE_ID || block.node_id == PATH_END_NODE_ID { terminal_block_ids.insert(block.id); continue; } @@ -125,21 +123,21 @@ fn link_line( fn nodes_for_edges( edge1: &Edge, edge2: &Edge, - blocks_by_hash_and_start: &HashMap<(&str, i32), GroupBlock>, - blocks_by_hash_and_end: &HashMap<(&str, i32), GroupBlock>, + blocks_by_node_and_start: &HashMap<(i32, i32), GroupBlock>, + blocks_by_node_and_end: &HashMap<(i32, i32), GroupBlock>, ) -> Vec { - let mut current_block = blocks_by_hash_and_start - .get(&(edge1.target_hash.as_str(), edge1.target_coordinate)) + let mut current_block = blocks_by_node_and_start + .get(&(edge1.target_node_id, edge1.target_coordinate)) .unwrap(); - let end_block = blocks_by_hash_and_end - .get(&(edge2.source_hash.as_str(), edge2.source_coordinate)) + let end_block = blocks_by_node_and_end + .get(&(edge2.source_node_id, edge2.source_coordinate)) .unwrap(); let mut node_ids = vec![]; #[allow(clippy::while_immutable_condition)] while current_block.id != end_block.id { node_ids.push(current_block.id); - current_block = blocks_by_hash_and_start - .get(&(current_block.sequence_hash.as_str(), current_block.end)) + current_block = blocks_by_node_and_start + .get(&(current_block.node_id, current_block.end)) .unwrap(); } node_ids.push(end_block.id); @@ -157,34 +155,34 @@ fn write_paths( let edges_by_path_id = PathEdge::edges_for_paths(conn, paths.iter().map(|path| path.id).collect()); - let blocks_by_hash_and_start = blocks + let blocks_by_node_and_start = blocks .iter() - .map(|block| ((block.sequence_hash.as_str(), block.start), block.clone())) - .collect::>(); - let blocks_by_hash_and_end = blocks + .map(|block| ((block.node_id, block.start), block.clone())) + .collect::>(); + let blocks_by_node_and_end = blocks .iter() - .map(|block| ((block.sequence_hash.as_str(), block.end), block.clone())) - .collect::>(); + .map(|block| ((block.node_id, block.end), block.clone())) + .collect::>(); for path in paths { let edges_for_path = edges_by_path_id.get(&path.id).unwrap(); - let mut node_ids = vec![]; + let mut graph_node_ids = vec![]; let mut node_strands = vec![]; for (edge1, edge2) in edges_for_path.iter().tuple_windows() { let current_node_ids = nodes_for_edges( edge1, edge2, - &blocks_by_hash_and_start, - &blocks_by_hash_and_end, + &blocks_by_node_and_start, + &blocks_by_node_and_end, ); for node_id in ¤t_node_ids { - node_ids.push(*node_id); + graph_node_ids.push(*node_id); node_strands.push(edge1.target_strand); } } writer - .write_all(&path_line(&path.name, &node_ids, &node_strands).into_bytes()) + .write_all(&path_line(&path.name, &graph_node_ids, &node_strands).into_bytes()) .unwrap_or_else(|_| panic!("Error writing path {} to GFA stream", path.name)); } } @@ -206,9 +204,7 @@ mod tests { use crate::imports::gfa::import_gfa; use crate::models::{ - block_group::BlockGroup, - collection::Collection, - node::{BOGUS_SOURCE_NODE_ID, BOGUS_TARGET_NODE_ID}, + block_group::BlockGroup, collection::Collection, node::Node, sequence::Sequence, }; use crate::test_helpers::{get_connection, setup_gen_dir}; @@ -238,15 +234,19 @@ mod tests { .sequence_type("DNA") .sequence("CCCC") .save(&conn); + let node1 = Node::create(&conn, &sequence1.hash); + let node2 = Node::create(&conn, &sequence2.hash); + let node3 = Node::create(&conn, &sequence3.hash); + let node4 = Node::create(&conn, &sequence4.hash); let edge1 = Edge::create( &conn, Sequence::PATH_START_HASH.to_string(), - BOGUS_SOURCE_NODE_ID, + PATH_START_NODE_ID, 0, Strand::Forward, sequence1.hash.clone(), - BOGUS_TARGET_NODE_ID, + node1.id, 0, Strand::Forward, 0, @@ -255,11 +255,11 @@ mod tests { let edge2 = Edge::create( &conn, sequence1.hash, - BOGUS_SOURCE_NODE_ID, + node1.id, 4, Strand::Forward, sequence2.hash.clone(), - BOGUS_TARGET_NODE_ID, + node2.id, 0, Strand::Forward, 0, @@ -268,11 +268,11 @@ mod tests { let edge3 = Edge::create( &conn, sequence2.hash, - BOGUS_SOURCE_NODE_ID, + node2.id, 4, Strand::Forward, sequence3.hash.clone(), - BOGUS_TARGET_NODE_ID, + node3.id, 0, Strand::Forward, 0, @@ -281,11 +281,11 @@ mod tests { let edge4 = Edge::create( &conn, sequence3.hash, - BOGUS_SOURCE_NODE_ID, + node3.id, 4, Strand::Forward, sequence4.hash.clone(), - BOGUS_TARGET_NODE_ID, + node4.id, 0, Strand::Forward, 0, @@ -294,11 +294,11 @@ mod tests { let edge5 = Edge::create( &conn, sequence4.hash, - BOGUS_SOURCE_NODE_ID, + node4.id, 4, Strand::Forward, Sequence::PATH_END_HASH.to_string(), - BOGUS_TARGET_NODE_ID, + PATH_END_NODE_ID, 0, Strand::Forward, 0, From cfc0986035c05624104eccad65bee0315f4b918b Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 25 Sep 2024 15:03:49 -0400 Subject: [PATCH 06/16] More code changes, migrate edge and path unit tests --- migrations/core/01-initial/up.sql | 3 - src/exports/gfa.rs | 18 +- src/imports/gfa.rs | 2 +- src/models/block_group.rs | 182 +++++++++++++++++- src/models/edge.rs | 153 +++++++-------- src/models/node.rs | 35 +++- src/models/path.rs | 296 ++++++++++++++++++++++-------- 7 files changed, 508 insertions(+), 181 deletions(-) diff --git a/migrations/core/01-initial/up.sql b/migrations/core/01-initial/up.sql index dabbc40..84320d0 100644 --- a/migrations/core/01-initial/up.sql +++ b/migrations/core/01-initial/up.sql @@ -79,11 +79,8 @@ CREATE TABLE edges ( target_strand TEXT NOT NULL, chromosome_index INTEGER NOT NULL, phased INTEGER NOT NULL, - FOREIGN KEY(source_hash) REFERENCES sequence(hash), - FOREIGN KEY(target_hash) REFERENCES sequence(hash), constraint chk_phased check (phased in (0, 1)) ) STRICT; -CREATE UNIQUE INDEX edge_uidx ON edges(source_hash, source_node_id, source_coordinate, source_strand, target_hash, target_node_id, target_coordinate, target_strand, chromosome_index, phased); -- CREATE UNIQUE INDEX edge_uidx ON edges(source_node_id, source_coordinate, source_strand, target_node_id, target_coordinate, target_strand, chromosome_index, phased); CREATE TABLE path_edges ( diff --git a/src/exports/gfa.rs b/src/exports/gfa.rs index 385856a..f7600d5 100644 --- a/src/exports/gfa.rs +++ b/src/exports/gfa.rs @@ -107,7 +107,7 @@ fn link_line( target_strand: Strand, ) -> String { format!( - "L\t{}\t{}\t{}\t{}\t*\n", + "L\t{}\t{}\t{}\t{}\t0M\n", source_index + 1, source_strand, target_index + 1, @@ -318,7 +318,7 @@ mod tests { &[edge1.id, edge2.id, edge3.id, edge4.id, edge5.id], ); - let all_sequences = BlockGroup::get_all_sequences(&conn, block_group.id); + let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group.id); let temp_dir = tempdir().expect("Couldn't get handle to temp directory"); let mut gfa_path = PathBuf::from(temp_dir.path()); @@ -331,7 +331,7 @@ mod tests { let block_group2 = Collection::get_block_groups(&conn, "test collection 2") .pop() .unwrap(); - let all_sequences2 = BlockGroup::get_all_sequences(&conn, block_group2.id); + let all_sequences2 = BlockGroup::get_all_sequences_new(&conn, block_group2.id); assert_eq!(all_sequences, all_sequences2); @@ -350,7 +350,7 @@ mod tests { import_gfa(&gfa_path, &collection_name, conn); let block_group_id = BlockGroup::get_id(conn, &collection_name, None, ""); - let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(conn, block_group_id); let temp_dir = tempdir().expect("Couldn't get handle to temp directory"); let mut gfa_path = PathBuf::from(temp_dir.path()); @@ -362,7 +362,7 @@ mod tests { let block_group2 = Collection::get_block_groups(conn, "test collection 2") .pop() .unwrap(); - let all_sequences2 = BlockGroup::get_all_sequences(conn, block_group2.id); + let all_sequences2 = BlockGroup::get_all_sequences_new(conn, block_group2.id); assert_eq!(all_sequences, all_sequences2); } @@ -377,7 +377,7 @@ mod tests { import_gfa(&gfa_path, &collection_name, conn); let block_group_id = BlockGroup::get_id(conn, &collection_name, None, ""); - let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(conn, block_group_id); let temp_dir = tempdir().expect("Couldn't get handle to temp directory"); let mut gfa_path = PathBuf::from(temp_dir.path()); @@ -389,7 +389,7 @@ mod tests { let block_group2 = Collection::get_block_groups(conn, "anderson promoters 2") .pop() .unwrap(); - let all_sequences2 = BlockGroup::get_all_sequences(conn, block_group2.id); + let all_sequences2 = BlockGroup::get_all_sequences_new(conn, block_group2.id); assert_eq!(all_sequences, all_sequences2); } @@ -404,7 +404,7 @@ mod tests { import_gfa(&gfa_path, &collection_name, conn); let block_group_id = BlockGroup::get_id(conn, &collection_name, None, ""); - let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(conn, block_group_id); let temp_dir = tempdir().expect("Couldn't get handle to temp directory"); let mut gfa_path = PathBuf::from(temp_dir.path()); @@ -416,7 +416,7 @@ mod tests { let block_group2 = Collection::get_block_groups(conn, "test collection 2") .pop() .unwrap(); - let all_sequences2 = BlockGroup::get_all_sequences(conn, block_group2.id); + let all_sequences2 = BlockGroup::get_all_sequences_new(conn, block_group2.id); assert_eq!(all_sequences, all_sequences2); } diff --git a/src/imports/gfa.rs b/src/imports/gfa.rs index 276262b..bb939b8 100644 --- a/src/imports/gfa.rs +++ b/src/imports/gfa.rs @@ -289,7 +289,7 @@ mod tests { import_gfa(&gfa_path, &collection_name, conn); let block_group_id = BlockGroup::get_id(conn, &collection_name, None, ""); - let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec!["AAAATTTTGGGGCCCC".to_string()]) diff --git a/src/models/block_group.rs b/src/models/block_group.rs index 2e287e6..e13978b 100644 --- a/src/models/block_group.rs +++ b/src/models/block_group.rs @@ -8,8 +8,8 @@ use serde::{Deserialize, Serialize}; use crate::graph::all_simple_paths; use crate::models::block_group_edge::BlockGroupEdge; use crate::models::edge::{Edge, EdgeData, GroupBlock}; -use crate::models::node::{BOGUS_SOURCE_NODE_ID, BOGUS_TARGET_NODE_ID}; -use crate::models::path::{NewBlock, Path, PathData}; +use crate::models::node::{BOGUS_SOURCE_NODE_ID, BOGUS_TARGET_NODE_ID, PATH_START_NODE_ID}; +use crate::models::path::{NewBlock, Path, PathBlock, PathData}; use crate::models::path_edge::PathEdge; use crate::models::sequence::Sequence; use crate::models::strand::Strand; @@ -40,6 +40,17 @@ pub struct PathChange { pub phased: i32, } +#[derive(Clone, Debug)] +pub struct PathChangeNew { + pub block_group_id: i32, + pub path: Path, + pub start: i32, + pub end: i32, + pub block: PathBlock, + pub chromosome_index: i32, + pub phased: i32, +} + pub struct PathCache<'a> { pub cache: HashMap, pub intervaltree_cache: HashMap>, @@ -86,6 +97,52 @@ impl PathCache<'_> { } } +pub struct PathCacheNew<'a> { + pub cache: HashMap, + pub intervaltree_cache: HashMap>, + pub conn: &'a Connection, +} + +impl PathCacheNew<'_> { + pub fn new(conn: &Connection) -> PathCacheNew { + PathCacheNew { + cache: HashMap::::new(), + intervaltree_cache: HashMap::>::new(), + conn, + } + } + + pub fn lookup(path_cache: &mut PathCacheNew, block_group_id: i32, name: String) -> Path { + let path_key = PathData { + name: name.clone(), + block_group_id, + }; + let path_lookup = path_cache.cache.get(&path_key); + if let Some(path) = path_lookup { + path.clone() + } else { + let new_path = Path::get_paths( + path_cache.conn, + "select * from path where block_group_id = ?1 AND name = ?2", + vec![SQLValue::from(block_group_id), SQLValue::from(name)], + )[0] + .clone(); + + path_cache.cache.insert(path_key, new_path.clone()); + let tree = Path::intervaltree_for_new(path_cache.conn, &new_path); + path_cache.intervaltree_cache.insert(new_path.clone(), tree); + new_path + } + } + + pub fn get_intervaltree<'a>( + path_cache: &'a PathCacheNew<'a>, + path: &'a Path, + ) -> Option<&'a IntervalTree> { + path_cache.intervaltree_cache.get(path) + } +} + impl BlockGroup { pub fn create( conn: &Connection, @@ -375,6 +432,27 @@ impl BlockGroup { } } + pub fn insert_changes_new( + conn: &Connection, + changes: &Vec, + cache: &PathCacheNew, + ) { + let mut new_edges_by_block_group = HashMap::>::new(); + for change in changes { + let tree = PathCacheNew::get_intervaltree(cache, &change.path).unwrap(); + let new_edges = BlockGroup::set_up_new_edges_new(change, tree); + new_edges_by_block_group + .entry(change.block_group_id) + .and_modify(|new_edge_data| new_edge_data.extend(new_edges.clone())) + .or_insert_with(|| new_edges.clone()); + } + + for (block_group_id, new_edges) in new_edges_by_block_group { + let edge_ids = Edge::bulk_create(conn, new_edges); + BlockGroupEdge::bulk_create(conn, block_group_id, &edge_ids); + } + } + #[allow(clippy::ptr_arg)] #[allow(clippy::needless_late_init)] pub fn insert_change( @@ -486,6 +564,106 @@ impl BlockGroup { new_edges } + + pub fn set_up_new_edges_new( + change: &PathChangeNew, + tree: &IntervalTree, + ) -> Vec { + let start_blocks: Vec<&PathBlock> = + tree.query_point(change.start).map(|x| &x.value).collect(); + assert_eq!(start_blocks.len(), 1); + // NOTE: This may not be used but needs to be initialized here instead of inside the if + // statement that uses it, so that the borrow checker is happy + let previous_start_blocks: Vec<&PathBlock> = tree + .query_point(change.start - 1) + .map(|x| &x.value) + .collect(); + assert_eq!(previous_start_blocks.len(), 1); + let start_block = if start_blocks[0].path_start == change.start { + // First part of this block will be replaced/deleted, need to get previous block to add + // edge including it + previous_start_blocks[0] + } else { + start_blocks[0] + }; + + let end_blocks: Vec<&PathBlock> = tree.query_point(change.end).map(|x| &x.value).collect(); + assert_eq!(end_blocks.len(), 1); + let end_block = end_blocks[0]; + + let mut new_edges = vec![]; + + if change.block.sequence_start == change.block.sequence_end { + // Deletion + let new_edge = EdgeData { + source_hash: "".to_string(), + source_node_id: start_block.node_id, + source_coordinate: change.start - start_block.path_start + + start_block.sequence_start, + source_strand: Strand::Forward, + target_hash: "".to_string(), + target_node_id: end_block.node_id, + target_coordinate: change.end - end_block.path_start + end_block.sequence_start, + target_strand: Strand::Forward, + chromosome_index: change.chromosome_index, + phased: change.phased, + }; + new_edges.push(new_edge); + + // NOTE: If the deletion is happening at the very beginning of a path, we need to add + // an edge from the dedicated start node to the end of the deletion, to indicate it's + // another start point in the block group DAG. + if change.start == 0 { + let new_beginning_edge = EdgeData { + source_hash: "".to_string(), + source_node_id: PATH_START_NODE_ID, + source_coordinate: 0, + source_strand: Strand::Forward, + target_hash: "".to_string(), + target_node_id: end_block.node_id, + target_coordinate: change.end - end_block.path_start + end_block.sequence_start, + target_strand: Strand::Forward, + chromosome_index: change.chromosome_index, + phased: change.phased, + }; + new_edges.push(new_beginning_edge); + } + // NOTE: If the deletion is happening at the very end of a path, we might add an edge + // from the beginning of the deletion to the dedicated end node, but in practice it + // doesn't affect sequence readouts, so it may not be worth it. + } else { + // Insertion/replacement + let new_start_edge = EdgeData { + source_hash: "".to_string(), + source_node_id: start_block.node_id, + source_coordinate: change.start - start_block.path_start + + start_block.sequence_start, + source_strand: Strand::Forward, + target_hash: "".to_string(), + target_node_id: change.block.node_id, + target_coordinate: change.block.sequence_start, + target_strand: Strand::Forward, + chromosome_index: change.chromosome_index, + phased: change.phased, + }; + let new_end_edge = EdgeData { + source_hash: "".to_string(), + source_node_id: change.block.node_id, + source_coordinate: change.block.sequence_end, + source_strand: Strand::Forward, + target_hash: "".to_string(), + target_node_id: end_block.node_id, + target_coordinate: change.end - end_block.path_start + end_block.sequence_start, + target_strand: Strand::Forward, + chromosome_index: change.chromosome_index, + phased: change.phased, + }; + new_edges.push(new_start_edge); + new_edges.push(new_end_edge); + } + + new_edges + } } #[cfg(test)] diff --git a/src/models/edge.rs b/src/models/edge.rs index f5ca99c..7a71f8e 100644 --- a/src/models/edge.rs +++ b/src/models/edge.rs @@ -475,33 +475,14 @@ impl Edge { } } - let nodes = Node::get_nodes(conn, node_ids.into_iter().collect::>()); - let sequence_hashes_by_node_id = nodes - .iter() - .map(|node| (node.id, node.sequence_hash.clone())) - .collect::>(); - let sequences_by_hash = Sequence::sequences_by_hash( - conn, - sequence_hashes_by_node_id - .values() - .map(|hash| hash.as_str()) - .collect::>(), - ); - let sequences_by_node_id = sequence_hashes_by_node_id - .clone() - .into_iter() - .map(|(node_id, sequence_hash)| { - ( - node_id, - sequences_by_hash.get(&sequence_hash).unwrap().clone(), - ) - }) - .collect::>(); + let sequences_by_node_id = + Node::get_sequences_by_node_ids(conn, node_ids.into_iter().collect::>()); + let mut blocks = vec![]; let mut block_index = 0; let mut boundary_edges = vec![]; for (node_id, sequence) in sequences_by_node_id.into_iter() { - let hash = sequence_hashes_by_node_id.get(&node_id).unwrap(); + let hash = sequence.hash.clone(); let block_boundaries = Edge::get_block_boundaries( edges_by_source_node_id.get(&node_id), edges_by_target_node_id.get(&node_id), @@ -739,13 +720,14 @@ mod tests { .sequence_type("DNA") .sequence("ATCGATCG") .save(conn); + let node1 = Node::create(conn, sequence1.hash.as_str()); let edge1 = EdgeData { - source_hash: Sequence::PATH_START_HASH.to_string(), - source_node_id: BOGUS_SOURCE_NODE_ID, + source_hash: "".to_string(), + source_node_id: PATH_START_NODE_ID, source_coordinate: -1, source_strand: Strand::Forward, - target_hash: sequence1.hash.clone(), - target_node_id: BOGUS_TARGET_NODE_ID, + target_hash: "".to_string(), + target_node_id: node1.id, target_coordinate: 1, target_strand: Strand::Forward, chromosome_index: 0, @@ -755,25 +737,26 @@ mod tests { .sequence_type("DNA") .sequence("AAAAAAAA") .save(conn); + let node2 = Node::create(conn, sequence2.hash.as_str()); let edge2 = EdgeData { - source_hash: sequence1.hash.clone(), - source_node_id: BOGUS_SOURCE_NODE_ID, + source_hash: "".to_string(), + source_node_id: node1.id, source_coordinate: 2, source_strand: Strand::Forward, - target_hash: sequence2.hash.clone(), - target_node_id: BOGUS_TARGET_NODE_ID, + target_hash: "".to_string(), + target_node_id: node2.id, target_coordinate: 3, target_strand: Strand::Forward, chromosome_index: 0, phased: 0, }; let edge3 = EdgeData { - source_hash: sequence2.hash.clone(), - source_node_id: BOGUS_SOURCE_NODE_ID, + source_hash: "".to_string(), + source_node_id: node2.id, source_coordinate: 4, source_strand: Strand::Forward, - target_hash: Sequence::PATH_END_HASH.to_string(), - target_node_id: BOGUS_TARGET_NODE_ID, + target_hash: "".to_string(), + target_node_id: PATH_END_NODE_ID, target_coordinate: -1, target_strand: Strand::Forward, chromosome_index: 0, @@ -785,22 +768,22 @@ mod tests { let edges = Edge::bulk_load(conn, &edge_ids); assert_eq!(edges.len(), 3); - let edges_by_source_hash = edges + let edges_by_source_node_id = edges .into_iter() - .map(|edge| (edge.source_hash.clone(), edge)) - .collect::>(); + .map(|edge| (edge.source_node_id, edge)) + .collect::>(); - let edge_result1 = edges_by_source_hash.get(Sequence::PATH_START_HASH).unwrap(); + let edge_result1 = edges_by_source_node_id.get(&PATH_START_NODE_ID).unwrap(); assert_eq!(edge_result1.source_coordinate, -1); - assert_eq!(edge_result1.target_hash, sequence1.hash); + assert_eq!(edge_result1.target_node_id, node1.id); assert_eq!(edge_result1.target_coordinate, 1); - let edge_result2 = edges_by_source_hash.get(&sequence1.hash).unwrap(); + let edge_result2 = edges_by_source_node_id.get(&node1.id).unwrap(); assert_eq!(edge_result2.source_coordinate, 2); - assert_eq!(edge_result2.target_hash, sequence2.hash); + assert_eq!(edge_result2.target_node_id, node2.id); assert_eq!(edge_result2.target_coordinate, 3); - let edge_result3 = edges_by_source_hash.get(&sequence2.hash).unwrap(); + let edge_result3 = edges_by_source_node_id.get(&node2.id).unwrap(); assert_eq!(edge_result3.source_coordinate, 4); - assert_eq!(edge_result3.target_hash, Sequence::PATH_END_HASH); + assert_eq!(edge_result3.target_node_id, PATH_END_NODE_ID); assert_eq!(edge_result3.target_coordinate, -1); } @@ -812,13 +795,14 @@ mod tests { .sequence_type("DNA") .sequence("ATCGATCG") .save(conn); + let node1 = Node::create(conn, sequence1.hash.as_str()); let edge1 = EdgeData { - source_hash: Sequence::PATH_START_HASH.to_string(), - source_node_id: BOGUS_SOURCE_NODE_ID, + source_hash: "".to_string(), + source_node_id: PATH_START_NODE_ID, source_coordinate: -1, source_strand: Strand::Forward, - target_hash: sequence1.hash.clone(), - target_node_id: BOGUS_TARGET_NODE_ID, + target_hash: "".to_string(), + target_node_id: node1.id, target_coordinate: 1, target_strand: Strand::Forward, chromosome_index: 0, @@ -828,25 +812,26 @@ mod tests { .sequence_type("DNA") .sequence("AAAAAAAA") .save(conn); + let node2 = Node::create(conn, sequence2.hash.as_str()); let edge2 = EdgeData { - source_hash: sequence1.hash.clone(), - source_node_id: BOGUS_SOURCE_NODE_ID, + source_hash: "".to_string(), + source_node_id: node1.id, source_coordinate: 2, source_strand: Strand::Forward, - target_hash: sequence2.hash.clone(), - target_node_id: BOGUS_TARGET_NODE_ID, + target_hash: "".to_string(), + target_node_id: node2.id, target_coordinate: 3, target_strand: Strand::Forward, chromosome_index: 0, phased: 0, }; let edge3 = EdgeData { - source_hash: sequence2.hash.clone(), - source_node_id: BOGUS_SOURCE_NODE_ID, + source_hash: "".to_string(), + source_node_id: node2.id, source_coordinate: 4, source_strand: Strand::Forward, - target_hash: Sequence::PATH_END_HASH.to_string(), - target_node_id: BOGUS_TARGET_NODE_ID, + target_hash: "".to_string(), + target_node_id: PATH_END_NODE_ID, target_coordinate: -1, target_strand: Strand::Forward, chromosome_index: 0, @@ -891,32 +876,33 @@ mod tests { .sequence_type("DNA") .sequence("ATCGATCG") .save(conn); + let node1 = Node::create(conn, sequence1.hash.as_str()); // NOTE: Create one edge ahead of time to confirm an existing row ID gets returned in the bulk create let existing_edge = Edge::create( conn, - Sequence::PATH_START_HASH.to_string(), - BOGUS_SOURCE_NODE_ID, + "".to_string(), + PATH_START_NODE_ID, -1, Strand::Forward, - sequence1.hash.clone(), - BOGUS_TARGET_NODE_ID, + "".to_string(), + node1.id, 1, Strand::Forward, 0, 0, ); - assert_eq!(existing_edge.source_hash, Sequence::PATH_START_HASH); + assert_eq!(existing_edge.source_node_id, PATH_START_NODE_ID); assert_eq!(existing_edge.source_coordinate, -1); - assert_eq!(existing_edge.target_hash, sequence1.hash); + assert_eq!(existing_edge.target_node_id, node1.id); assert_eq!(existing_edge.target_coordinate, 1); let edge1 = EdgeData { - source_hash: Sequence::PATH_START_HASH.to_string(), + source_hash: "".to_string(), source_coordinate: -1, - source_node_id: BOGUS_SOURCE_NODE_ID, + source_node_id: PATH_START_NODE_ID, source_strand: Strand::Forward, - target_hash: sequence1.hash.clone(), - target_node_id: BOGUS_TARGET_NODE_ID, + target_hash: "".to_string(), + target_node_id: node1.id, target_coordinate: 1, target_strand: Strand::Forward, chromosome_index: 0, @@ -926,25 +912,26 @@ mod tests { .sequence_type("DNA") .sequence("AAAAAAAA") .save(conn); + let node2 = Node::create(conn, sequence2.hash.as_str()); let edge2 = EdgeData { - source_hash: sequence1.hash.clone(), - source_node_id: BOGUS_SOURCE_NODE_ID, + source_hash: "".to_string(), + source_node_id: node1.id, source_coordinate: 2, source_strand: Strand::Forward, - target_hash: sequence2.hash.clone(), - target_node_id: BOGUS_TARGET_NODE_ID, + target_hash: "".to_string(), + target_node_id: node2.id, target_coordinate: 3, target_strand: Strand::Forward, chromosome_index: 0, phased: 0, }; let edge3 = EdgeData { - source_hash: sequence2.hash.clone(), - source_node_id: BOGUS_SOURCE_NODE_ID, + source_hash: "".to_string(), + source_node_id: node2.id, source_coordinate: 4, source_strand: Strand::Forward, - target_hash: Sequence::PATH_END_HASH.to_string(), - target_node_id: BOGUS_TARGET_NODE_ID, + target_hash: "".to_string(), + target_node_id: PATH_END_NODE_ID, target_coordinate: -1, target_strand: Strand::Forward, chromosome_index: 0, @@ -956,25 +943,25 @@ mod tests { let edges = Edge::bulk_load(conn, &edge_ids); assert_eq!(edges.len(), 3); - let edges_by_source_hash = edges + let edges_by_source_node_id = edges .into_iter() - .map(|edge| (edge.source_hash.clone(), edge)) - .collect::>(); + .map(|edge| (edge.source_node_id, edge)) + .collect::>(); - let edge_result1 = edges_by_source_hash.get(Sequence::PATH_START_HASH).unwrap(); + let edge_result1 = edges_by_source_node_id.get(&PATH_START_NODE_ID).unwrap(); assert_eq!(edge_result1.id, existing_edge.id); assert_eq!(edge_result1.source_coordinate, -1); - assert_eq!(edge_result1.target_hash, sequence1.hash); + assert_eq!(edge_result1.target_node_id, node1.id); assert_eq!(edge_result1.target_coordinate, 1); - let edge_result2 = edges_by_source_hash.get(&sequence1.hash).unwrap(); + let edge_result2 = edges_by_source_node_id.get(&node1.id).unwrap(); assert_eq!(edge_result2.source_coordinate, 2); - assert_eq!(edge_result2.target_hash, sequence2.hash); + assert_eq!(edge_result2.target_node_id, node2.id); assert_eq!(edge_result2.target_coordinate, 3); - let edge_result3 = edges_by_source_hash.get(&sequence2.hash).unwrap(); + let edge_result3 = edges_by_source_node_id.get(&node2.id).unwrap(); assert_eq!(edge_result3.source_coordinate, 4); - assert_eq!(edge_result3.target_hash, Sequence::PATH_END_HASH); + assert_eq!(edge_result3.target_node_id, PATH_END_NODE_ID); assert_eq!(edge_result3.target_coordinate, -1); } } diff --git a/src/models/node.rs b/src/models/node.rs index 6aa6633..1219ba0 100644 --- a/src/models/node.rs +++ b/src/models/node.rs @@ -1,4 +1,5 @@ use rusqlite::{params_from_iter, types::Value as SQLValue, Connection}; +use std::collections::HashMap; use crate::models::sequence::Sequence; @@ -58,15 +59,31 @@ impl Node { ) } - pub fn sequences_from_node_ids(conn: &Connection, node_ids: Vec) -> Vec { - let nodes = Node::get_nodes(conn, node_ids); - let sequence_hashes = nodes + pub fn get_sequences_by_node_ids( + conn: &Connection, + node_ids: Vec, + ) -> HashMap { + let nodes = Node::get_nodes(conn, node_ids.into_iter().collect::>()); + let sequence_hashes_by_node_id = nodes .iter() - .map(|node| node.sequence_hash.as_str()) - .collect::>(); - Sequence::sequences_by_hash(conn, sequence_hashes) - .values() - .cloned() - .collect() + .map(|node| (node.id, node.sequence_hash.clone())) + .collect::>(); + let sequences_by_hash = Sequence::sequences_by_hash( + conn, + sequence_hashes_by_node_id + .values() + .map(|hash| hash.as_str()) + .collect::>(), + ); + sequence_hashes_by_node_id + .clone() + .into_iter() + .map(|(node_id, sequence_hash)| { + ( + node_id, + sequences_by_hash.get(&sequence_hash).unwrap().clone(), + ) + }) + .collect::>() } } diff --git a/src/models/path.rs b/src/models/path.rs index 1e5d145..1ba460f 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -6,7 +6,13 @@ use rusqlite::types::Value; use rusqlite::{params_from_iter, Connection}; use serde::{Deserialize, Serialize}; -use crate::models::{edge::Edge, path_edge::PathEdge, sequence::Sequence, strand::Strand}; +use crate::models::{ + edge::Edge, + node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID}, + path_edge::PathEdge, + sequence::Sequence, + strand::Strand, +}; #[derive(Clone, Debug, Eq, Hash, PartialEq, Deserialize, Serialize)] pub struct Path { @@ -66,6 +72,18 @@ pub struct NewBlock { pub strand: Strand, } +#[derive(Clone, Debug)] +pub struct PathBlock { + pub id: i32, + pub node_id: i32, + pub block_sequence: String, + pub sequence_start: i32, + pub sequence_end: i32, + pub path_start: i32, + pub path_end: i32, + pub strand: Strand, +} + impl Path { pub fn create(conn: &Connection, name: &str, block_group_id: i32, edge_ids: &[i32]) -> Path { // TODO: Should we do something if edge_ids don't match here? Suppose we have a path @@ -158,7 +176,7 @@ impl Path { } pub fn sequence(conn: &Connection, path: Path) -> String { - let blocks = Path::blocks_for(conn, &path); + let blocks = Path::blocks_for_new(conn, &path); blocks .into_iter() .map(|block| block.block_sequence) @@ -287,6 +305,128 @@ impl Path { .collect(); tree } + + pub fn edge_pairs_to_block_new( + block_id: i32, + path: &Path, + into: Edge, + out_of: Edge, + sequences_by_node_id: &HashMap, + current_path_length: i32, + ) -> PathBlock { + if into.target_node_id != out_of.source_node_id { + panic!( + "Consecutive edges in path {0} don't share the same sequence", + path.id + ); + } + + let sequence = sequences_by_node_id.get(&into.target_node_id).unwrap(); + let start = into.target_coordinate; + let end = out_of.source_coordinate; + + let strand; + let block_sequence_length; + + if into.target_strand == out_of.source_strand { + strand = into.target_strand; + block_sequence_length = end - start; + } else { + panic!( + "Edge pair with target_strand/source_strand mismatch for path {}", + path.id + ); + } + + let block_sequence = if strand == Strand::Reverse { + revcomp(&sequence.get_sequence(start, end)) + } else { + sequence.get_sequence(start, end) + }; + + PathBlock { + id: block_id, + node_id: into.target_node_id, + block_sequence, + sequence_start: start, + sequence_end: end, + path_start: current_path_length, + path_end: current_path_length + block_sequence_length, + strand, + } + } + + pub fn blocks_for_new(conn: &Connection, path: &Path) -> Vec { + let edges = PathEdge::edges_for_path(conn, path.id); + let mut sequence_node_ids = HashSet::new(); + for edge in &edges { + if edge.source_node_id != PATH_START_NODE_ID { + sequence_node_ids.insert(edge.source_node_id); + } + if edge.target_node_id != PATH_END_NODE_ID { + sequence_node_ids.insert(edge.target_node_id); + } + } + let sequences_by_node_id = Node::get_sequences_by_node_ids( + conn, + sequence_node_ids.into_iter().collect::>(), + ); + + let mut blocks = vec![]; + let mut path_length = 0; + + // NOTE: Adding a "start block" for the dedicated start sequence with a range from i32::MIN + // to 0 makes interval tree lookups work better. If the point being looked up is -1 (or + // below), it will return this block. + blocks.push(PathBlock { + id: -1, + node_id: PATH_START_NODE_ID, + block_sequence: "".to_string(), + sequence_start: 0, + sequence_end: 0, + path_start: i32::MIN + 1, + path_end: 0, + strand: Strand::Forward, + }); + + for (index, (into, out_of)) in edges.into_iter().tuple_windows().enumerate() { + let block = Path::edge_pairs_to_block_new( + index as i32, + path, + into, + out_of, + &sequences_by_node_id, + path_length, + ); + path_length += block.block_sequence.len() as i32; + blocks.push(block); + } + + // NOTE: Adding an "end block" for the dedicated end sequence with a range from the path + // length to i32::MAX makes interval tree lookups work better. If the point being looked up + // is the path length (or higher), it will return this block. + blocks.push(PathBlock { + id: -2, + node_id: PATH_END_NODE_ID, + block_sequence: "".to_string(), + sequence_start: 0, + sequence_end: 0, + path_start: path_length, + path_end: i32::MAX - 1, + strand: Strand::Forward, + }); + + blocks + } + + pub fn intervaltree_for_new(conn: &Connection, path: &Path) -> IntervalTree { + let blocks = Path::blocks_for_new(conn, path); + let tree: IntervalTree = blocks + .into_iter() + .map(|block| (block.path_start..block.path_end, block)) + .collect(); + tree + } } #[cfg(test)] @@ -294,11 +434,7 @@ mod tests { // Note this useful idiom: importing names from outer (for mod tests) scope. use super::*; - use crate::models::{ - block_group::BlockGroup, - collection::Collection, - node::{BOGUS_SOURCE_NODE_ID, BOGUS_TARGET_NODE_ID}, - }; + use crate::models::{block_group::BlockGroup, collection::Collection}; use crate::test_helpers::get_connection; #[test] @@ -310,14 +446,15 @@ mod tests { .sequence_type("DNA") .sequence("ATCGATCG") .save(conn); + let node1 = Node::create(conn, sequence1.hash.as_str()); let edge1 = Edge::create( conn, - Sequence::PATH_START_HASH.to_string(), - BOGUS_SOURCE_NODE_ID, + "".to_string(), + PATH_START_NODE_ID, -123, Strand::Forward, - sequence1.hash.clone(), - BOGUS_TARGET_NODE_ID, + "".to_string(), + node1.id, 0, Strand::Forward, 0, @@ -327,14 +464,15 @@ mod tests { .sequence_type("DNA") .sequence("AAAAAAAA") .save(conn); + let node2 = Node::create(conn, sequence2.hash.as_str()); let edge2 = Edge::create( conn, - sequence1.hash.clone(), - BOGUS_SOURCE_NODE_ID, + "".to_string(), + node1.id, 8, Strand::Forward, - sequence2.hash.clone(), - BOGUS_TARGET_NODE_ID, + "".to_string(), + node2.id, 1, Strand::Forward, 0, @@ -344,14 +482,15 @@ mod tests { .sequence_type("DNA") .sequence("CCCCCCCC") .save(conn); + let node3 = Node::create(conn, sequence3.hash.as_str()); let edge3 = Edge::create( conn, - sequence2.hash.clone(), - BOGUS_SOURCE_NODE_ID, + "".to_string(), + node2.id, 8, Strand::Forward, - sequence3.hash.clone(), - BOGUS_TARGET_NODE_ID, + "".to_string(), + node3.id, 1, Strand::Forward, 0, @@ -361,14 +500,15 @@ mod tests { .sequence_type("DNA") .sequence("GGGGGGGG") .save(conn); + let node4 = Node::create(conn, sequence4.hash.as_str()); let edge4 = Edge::create( conn, - sequence3.hash.clone(), - BOGUS_SOURCE_NODE_ID, + "".to_string(), + node3.id, 8, Strand::Forward, - sequence4.hash.clone(), - BOGUS_TARGET_NODE_ID, + "".to_string(), + node4.id, 1, Strand::Forward, 0, @@ -376,12 +516,12 @@ mod tests { ); let edge5 = Edge::create( conn, - sequence4.hash.clone(), - BOGUS_SOURCE_NODE_ID, + "".to_string(), + node4.id, 8, Strand::Forward, - Sequence::PATH_END_HASH.to_string(), - BOGUS_TARGET_NODE_ID, + "".to_string(), + PATH_END_NODE_ID, -1, Strand::Forward, 0, @@ -406,14 +546,15 @@ mod tests { .sequence_type("DNA") .sequence("ATCGATCG") .save(conn); + let node1 = Node::create(conn, sequence1.hash.as_str()); let edge5 = Edge::create( conn, - sequence1.hash.clone(), - BOGUS_SOURCE_NODE_ID, + "".to_string(), + node1.id, 8, Strand::Reverse, - Sequence::PATH_END_HASH.to_string(), - BOGUS_TARGET_NODE_ID, + "".to_string(), + PATH_END_NODE_ID, 0, Strand::Reverse, 0, @@ -423,14 +564,15 @@ mod tests { .sequence_type("DNA") .sequence("AAAAAAAA") .save(conn); + let node2 = Node::create(conn, sequence2.hash.as_str()); let edge4 = Edge::create( conn, - sequence2.hash.clone(), - BOGUS_SOURCE_NODE_ID, + "".to_string(), + node2.id, 7, Strand::Reverse, - sequence1.hash.clone(), - BOGUS_TARGET_NODE_ID, + "".to_string(), + node1.id, 0, Strand::Reverse, 0, @@ -440,14 +582,15 @@ mod tests { .sequence_type("DNA") .sequence("CCCCCCCC") .save(conn); + let node3 = Node::create(conn, sequence3.hash.as_str()); let edge3 = Edge::create( conn, - sequence3.hash.clone(), - BOGUS_SOURCE_NODE_ID, + "".to_string(), + node3.id, 7, Strand::Reverse, - sequence2.hash.clone(), - BOGUS_TARGET_NODE_ID, + "".to_string(), + node2.id, 0, Strand::Reverse, 0, @@ -457,14 +600,15 @@ mod tests { .sequence_type("DNA") .sequence("GGGGGGGG") .save(conn); + let node4 = Node::create(conn, sequence4.hash.as_str()); let edge2 = Edge::create( conn, - sequence4.hash.clone(), - BOGUS_SOURCE_NODE_ID, + "".to_string(), + node4.id, 7, Strand::Reverse, - sequence3.hash.clone(), - BOGUS_TARGET_NODE_ID, + "".to_string(), + node3.id, 0, Strand::Reverse, 0, @@ -472,12 +616,12 @@ mod tests { ); let edge1 = Edge::create( conn, - Sequence::PATH_START_HASH.to_string(), - BOGUS_SOURCE_NODE_ID, + "".to_string(), + PATH_START_NODE_ID, -1, Strand::Reverse, - sequence4.hash.clone(), - BOGUS_TARGET_NODE_ID, + "".to_string(), + node4.id, 0, Strand::Reverse, 0, @@ -509,14 +653,15 @@ mod tests { .sequence_type("DNA") .sequence("ATCGATCG") .save(conn); + let node1 = Node::create(conn, sequence1.hash.as_str()); let edge1 = Edge::create( conn, - Sequence::PATH_START_HASH.to_string(), - BOGUS_SOURCE_NODE_ID, + "".to_string(), + PATH_START_NODE_ID, -1, Strand::Forward, - sequence1.hash.clone(), - BOGUS_TARGET_NODE_ID, + "".to_string(), + node1.id, 0, Strand::Forward, 0, @@ -526,14 +671,15 @@ mod tests { .sequence_type("DNA") .sequence("AAAAAAAA") .save(conn); + let node2 = Node::create(conn, sequence2.hash.as_str()); let edge2 = Edge::create( conn, - sequence1.hash.clone(), - BOGUS_SOURCE_NODE_ID, + "".to_string(), + node1.id, 8, Strand::Forward, - sequence2.hash.clone(), - BOGUS_TARGET_NODE_ID, + "".to_string(), + node2.id, 1, Strand::Forward, 0, @@ -543,14 +689,15 @@ mod tests { .sequence_type("DNA") .sequence("CCCCCCCC") .save(conn); + let node3 = Node::create(conn, sequence3.hash.as_str()); let edge3 = Edge::create( conn, - sequence2.hash.clone(), - BOGUS_SOURCE_NODE_ID, + "".to_string(), + node2.id, 8, Strand::Forward, - sequence3.hash.clone(), - BOGUS_TARGET_NODE_ID, + "".to_string(), + node3.id, 1, Strand::Forward, 0, @@ -560,14 +707,15 @@ mod tests { .sequence_type("DNA") .sequence("GGGGGGGG") .save(conn); + let node4 = Node::create(conn, sequence4.hash.as_str()); let edge4 = Edge::create( conn, - sequence3.hash.clone(), - BOGUS_SOURCE_NODE_ID, + "".to_string(), + node3.id, 8, Strand::Forward, - sequence4.hash.clone(), - BOGUS_TARGET_NODE_ID, + "".to_string(), + node4.id, 1, Strand::Forward, 0, @@ -575,12 +723,12 @@ mod tests { ); let edge5 = Edge::create( conn, - sequence4.hash.clone(), - BOGUS_SOURCE_NODE_ID, + "".to_string(), + node4.id, 8, Strand::Forward, - Sequence::PATH_END_HASH.to_string(), - BOGUS_TARGET_NODE_ID, + "".to_string(), + PATH_END_NODE_ID, -1, Strand::Forward, 0, @@ -593,31 +741,31 @@ mod tests { block_group.id, &[edge1.id, edge2.id, edge3.id, edge4.id, edge5.id], ); - let tree = Path::intervaltree_for(conn, &path); - let blocks1: Vec<_> = tree.query_point(2).map(|x| x.value.clone()).collect(); + let tree = Path::intervaltree_for_new(conn, &path); + let blocks1: Vec = tree.query_point(2).map(|x| x.value.clone()).collect(); assert_eq!(blocks1.len(), 1); let block1 = &blocks1[0]; - assert_eq!(block1.sequence.hash, sequence1.hash); + assert_eq!(block1.node_id, node1.id); assert_eq!(block1.sequence_start, 0); assert_eq!(block1.sequence_end, 8); assert_eq!(block1.path_start, 0); assert_eq!(block1.path_end, 8); assert_eq!(block1.strand, Strand::Forward); - let blocks2: Vec<_> = tree.query_point(12).map(|x| x.value.clone()).collect(); + let blocks2: Vec = tree.query_point(12).map(|x| x.value.clone()).collect(); assert_eq!(blocks2.len(), 1); let block2 = &blocks2[0]; - assert_eq!(block2.sequence.hash, sequence2.hash); + assert_eq!(block2.node_id, node2.id); assert_eq!(block2.sequence_start, 1); assert_eq!(block2.sequence_end, 8); assert_eq!(block2.path_start, 8); assert_eq!(block2.path_end, 15); assert_eq!(block2.strand, Strand::Forward); - let blocks4: Vec<_> = tree.query_point(25).map(|x| x.value.clone()).collect(); + let blocks4: Vec = tree.query_point(25).map(|x| x.value.clone()).collect(); assert_eq!(blocks4.len(), 1); let block4 = &blocks4[0]; - assert_eq!(block4.sequence.hash, sequence4.hash); + assert_eq!(block4.node_id, node4.id); assert_eq!(block4.sequence_start, 1); assert_eq!(block4.sequence_end, 8); assert_eq!(block4.path_start, 22); From a66f5df31943927280eaa42edb081d5f5b25143d Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 25 Sep 2024 15:41:25 -0400 Subject: [PATCH 07/16] Block group tests --- src/models/block_group.rs | 320 +++++++++++++++++++++----------------- src/test_helpers.rs | 106 ++++++++++++- 2 files changed, 281 insertions(+), 145 deletions(-) diff --git a/src/models/block_group.rs b/src/models/block_group.rs index e13978b..720b5c8 100644 --- a/src/models/block_group.rs +++ b/src/models/block_group.rs @@ -465,6 +465,18 @@ impl BlockGroup { BlockGroupEdge::bulk_create(conn, change.block_group_id, &edge_ids); } + #[allow(clippy::ptr_arg)] + #[allow(clippy::needless_late_init)] + pub fn insert_change_new( + conn: &Connection, + change: &PathChangeNew, + tree: &IntervalTree, + ) { + let new_edges = BlockGroup::set_up_new_edges_new(change, tree); + let edge_ids = Edge::bulk_create(conn, new_edges); + BlockGroupEdge::bulk_create(conn, change.block_group_id, &edge_ids); + } + pub fn set_up_new_edges( change: &PathChange, tree: &IntervalTree, @@ -669,8 +681,8 @@ impl BlockGroup { #[cfg(test)] mod tests { use super::*; - use crate::models::{collection::Collection, sample::Sample}; - use crate::test_helpers::{get_connection, setup_block_group}; + use crate::models::{collection::Collection, node::Node, sample::Sample}; + use crate::test_helpers::{get_connection, setup_block_group_new}; #[test] fn test_blockgroup_create() { @@ -705,14 +717,15 @@ mod tests { #[test] fn insert_and_deletion_get_all() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group(&conn); + let (block_group_id, path) = setup_block_group_new(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node.id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -720,7 +733,7 @@ mod tests { path_end: 15, strand: Strand::Forward, }; - let change = PathChange { + let change = PathChangeNew { block_group_id, path: path.clone(), start: 7, @@ -729,10 +742,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); - BlockGroup::insert_change(&conn, &change, &tree); + let tree = Path::intervaltree_for_new(&conn, &path); + BlockGroup::insert_change_new(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -745,9 +758,10 @@ mod tests { .sequence_type("DNA") .sequence("") .save(&conn); - let deletion = NewBlock { + let deletion_node = Node::create(&conn, deletion_sequence.hash.as_str()); + let deletion = PathBlock { id: 0, - sequence: deletion_sequence.clone(), + node_id: deletion_node.id, block_sequence: deletion_sequence.get_sequence(None, None), sequence_start: 0, sequence_end: 0, @@ -756,7 +770,7 @@ mod tests { strand: Strand::Forward, }; - let change = PathChange { + let change = PathChangeNew { block_group_id, path: path.clone(), start: 19, @@ -766,9 +780,9 @@ mod tests { phased: 0, }; // take out an entire block. - let tree = Path::intervaltree_for(&conn, &path); - BlockGroup::insert_change(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + let tree = Path::intervaltree_for_new(&conn, &path); + BlockGroup::insert_change_new(&conn, &change, &tree); + let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -783,14 +797,15 @@ mod tests { #[test] fn simple_insert_get_all() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group(&conn); + let (block_group_id, path) = setup_block_group_new(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node.id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -798,7 +813,7 @@ mod tests { path_end: 15, strand: Strand::Forward, }; - let change = PathChange { + let change = PathChangeNew { block_group_id, path: path.clone(), start: 7, @@ -807,10 +822,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); - BlockGroup::insert_change(&conn, &change, &tree); + let tree = Path::intervaltree_for_new(&conn, &path); + BlockGroup::insert_change_new(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -823,14 +838,15 @@ mod tests { #[test] fn insert_on_block_boundary_middle() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group(&conn); + let (block_group_id, path) = setup_block_group_new(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node.id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -838,7 +854,7 @@ mod tests { path_end: 15, strand: Strand::Forward, }; - let change = PathChange { + let change = PathChangeNew { block_group_id, path: path.clone(), start: 15, @@ -847,10 +863,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); - BlockGroup::insert_change(&conn, &change, &tree); + let tree = Path::intervaltree_for_new(&conn, &path); + BlockGroup::insert_change_new(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -863,14 +879,15 @@ mod tests { #[test] fn insert_within_block() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group(&conn); + let (block_group_id, path) = setup_block_group_new(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node.id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -878,7 +895,7 @@ mod tests { path_end: 17, strand: Strand::Forward, }; - let change = PathChange { + let change = PathChangeNew { block_group_id, path: path.clone(), start: 12, @@ -887,10 +904,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); - BlockGroup::insert_change(&conn, &change, &tree); + let tree = Path::intervaltree_for_new(&conn, &path); + BlockGroup::insert_change_new(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -903,14 +920,15 @@ mod tests { #[test] fn insert_on_block_boundary_start() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group(&conn); + let (block_group_id, path) = setup_block_group_new(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node.id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -918,7 +936,7 @@ mod tests { path_end: 10, strand: Strand::Forward, }; - let change = PathChange { + let change = PathChangeNew { block_group_id, path: path.clone(), start: 10, @@ -927,10 +945,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); - BlockGroup::insert_change(&conn, &change, &tree); + let tree = Path::intervaltree_for_new(&conn, &path); + BlockGroup::insert_change_new(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -943,14 +961,15 @@ mod tests { #[test] fn insert_on_block_boundary_end() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group(&conn); + let (block_group_id, path) = setup_block_group_new(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node.id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -958,7 +977,7 @@ mod tests { path_end: 9, strand: Strand::Forward, }; - let change = PathChange { + let change = PathChangeNew { block_group_id, path: path.clone(), start: 9, @@ -967,10 +986,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); - BlockGroup::insert_change(&conn, &change, &tree); + let tree = Path::intervaltree_for_new(&conn, &path); + BlockGroup::insert_change_new(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -983,14 +1002,15 @@ mod tests { #[test] fn insert_across_entire_block_boundary() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group(&conn); + let (block_group_id, path) = setup_block_group_new(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node.id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -998,7 +1018,7 @@ mod tests { path_end: 20, strand: Strand::Forward, }; - let change = PathChange { + let change = PathChangeNew { block_group_id, path: path.clone(), start: 10, @@ -1007,10 +1027,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); - BlockGroup::insert_change(&conn, &change, &tree); + let tree = Path::intervaltree_for_new(&conn, &path); + BlockGroup::insert_change_new(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1023,14 +1043,15 @@ mod tests { #[test] fn insert_across_two_blocks() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group(&conn); + let (block_group_id, path) = setup_block_group_new(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node.id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -1038,7 +1059,7 @@ mod tests { path_end: 25, strand: Strand::Forward, }; - let change = PathChange { + let change = PathChangeNew { block_group_id, path: path.clone(), start: 15, @@ -1047,10 +1068,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); - BlockGroup::insert_change(&conn, &change, &tree); + let tree = Path::intervaltree_for_new(&conn, &path); + BlockGroup::insert_change_new(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1063,14 +1084,15 @@ mod tests { #[test] fn insert_spanning_blocks() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group(&conn); + let (block_group_id, path) = setup_block_group_new(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node.id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -1078,7 +1100,7 @@ mod tests { path_end: 35, strand: Strand::Forward, }; - let change = PathChange { + let change = PathChangeNew { block_group_id, path: path.clone(), start: 5, @@ -1087,10 +1109,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); - BlockGroup::insert_change(&conn, &change, &tree); + let tree = Path::intervaltree_for_new(&conn, &path); + BlockGroup::insert_change_new(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1103,14 +1125,15 @@ mod tests { #[test] fn simple_deletion() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group(&conn); + let (block_group_id, path) = setup_block_group_new(&conn); let deletion_sequence = Sequence::new() .sequence_type("DNA") .sequence("") .save(&conn); - let deletion = NewBlock { + let deletion_node = Node::create(&conn, deletion_sequence.hash.as_str()); + let deletion = PathBlock { id: 0, - sequence: deletion_sequence.clone(), + node_id: deletion_node.id, block_sequence: deletion_sequence.get_sequence(None, None), sequence_start: 0, sequence_end: 0, @@ -1119,7 +1142,7 @@ mod tests { strand: Strand::Forward, }; - let change = PathChange { + let change = PathChangeNew { block_group_id, path: path.clone(), start: 19, @@ -1130,9 +1153,9 @@ mod tests { }; // take out an entire block. - let tree = Path::intervaltree_for(&conn, &path); - BlockGroup::insert_change(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + let tree = Path::intervaltree_for_new(&conn, &path); + BlockGroup::insert_change_new(&conn, &change, &tree); + let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1145,14 +1168,15 @@ mod tests { #[test] fn doesnt_apply_same_insert_twice() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group(&conn); + let (block_group_id, path) = setup_block_group_new(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node.id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -1160,7 +1184,7 @@ mod tests { path_end: 15, strand: Strand::Forward, }; - let change = PathChange { + let change = PathChangeNew { block_group_id, path: path.clone(), start: 7, @@ -1169,10 +1193,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); - BlockGroup::insert_change(&conn, &change, &tree); + let tree = Path::intervaltree_for_new(&conn, &path); + BlockGroup::insert_change_new(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1181,10 +1205,10 @@ mod tests { ]) ); - let tree = Path::intervaltree_for(&conn, &path); - BlockGroup::insert_change(&conn, &change, &tree); + let tree = Path::intervaltree_for_new(&conn, &path); + BlockGroup::insert_change_new(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1197,14 +1221,15 @@ mod tests { #[test] fn insert_at_beginning_of_path() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group(&conn); + let (block_group_id, path) = setup_block_group_new(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node.id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -1212,7 +1237,7 @@ mod tests { path_end: 0, strand: Strand::Forward, }; - let change = PathChange { + let change = PathChangeNew { block_group_id, path: path.clone(), start: 0, @@ -1221,10 +1246,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); - BlockGroup::insert_change(&conn, &change, &tree); + let tree = Path::intervaltree_for_new(&conn, &path); + BlockGroup::insert_change_new(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1237,15 +1262,16 @@ mod tests { #[test] fn insert_at_end_of_path() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group(&conn); + let (block_group_id, path) = setup_block_group_new(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node.id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -1253,7 +1279,7 @@ mod tests { path_end: 40, strand: Strand::Forward, }; - let change = PathChange { + let change = PathChangeNew { block_group_id, path: path.clone(), start: 40, @@ -1262,10 +1288,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); - BlockGroup::insert_change(&conn, &change, &tree); + let tree = Path::intervaltree_for_new(&conn, &path); + BlockGroup::insert_change_new(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1278,14 +1304,15 @@ mod tests { #[test] fn insert_at_one_bp_into_block() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group(&conn); + let (block_group_id, path) = setup_block_group_new(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node.id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -1293,7 +1320,7 @@ mod tests { path_end: 11, strand: Strand::Forward, }; - let change = PathChange { + let change = PathChangeNew { block_group_id, path: path.clone(), start: 10, @@ -1302,10 +1329,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); - BlockGroup::insert_change(&conn, &change, &tree); + let tree = Path::intervaltree_for_new(&conn, &path); + BlockGroup::insert_change_new(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1318,14 +1345,15 @@ mod tests { #[test] fn insert_at_one_bp_from_end_of_block() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group(&conn); + let (block_group_id, path) = setup_block_group_new(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert = NewBlock { + let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert = PathBlock { id: 0, - sequence: insert_sequence.clone(), + node_id: insert_node.id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -1333,7 +1361,7 @@ mod tests { path_end: 20, strand: Strand::Forward, }; - let change = PathChange { + let change = PathChangeNew { block_group_id, path: path.clone(), start: 19, @@ -1342,10 +1370,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); - BlockGroup::insert_change(&conn, &change, &tree); + let tree = Path::intervaltree_for_new(&conn, &path); + BlockGroup::insert_change_new(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1358,14 +1386,15 @@ mod tests { #[test] fn delete_at_beginning_of_path() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group(&conn); + let (block_group_id, path) = setup_block_group_new(&conn); let deletion_sequence = Sequence::new() .sequence_type("DNA") .sequence("") .save(&conn); - let deletion = NewBlock { + let deletion_node = Node::create(&conn, deletion_sequence.hash.as_str()); + let deletion = PathBlock { id: 0, - sequence: deletion_sequence.clone(), + node_id: deletion_node.id, block_sequence: deletion_sequence.get_sequence(None, None), sequence_start: 0, sequence_end: 0, @@ -1373,7 +1402,7 @@ mod tests { path_end: 1, strand: Strand::Forward, }; - let change = PathChange { + let change = PathChangeNew { block_group_id, path: path.clone(), start: 0, @@ -1382,10 +1411,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); - BlockGroup::insert_change(&conn, &change, &tree); + let tree = Path::intervaltree_for_new(&conn, &path); + BlockGroup::insert_change_new(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1398,14 +1427,15 @@ mod tests { #[test] fn delete_at_end_of_path() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group(&conn); + let (block_group_id, path) = setup_block_group_new(&conn); let deletion_sequence = Sequence::new() .sequence_type("DNA") .sequence("") .save(&conn); - let deletion = NewBlock { + let deletion_node = Node::create(&conn, deletion_sequence.hash.as_str()); + let deletion = PathBlock { id: 0, - sequence: deletion_sequence.clone(), + node_id: deletion_node.id, block_sequence: deletion_sequence.get_sequence(None, None), sequence_start: 0, sequence_end: 0, @@ -1413,7 +1443,7 @@ mod tests { path_end: 40, strand: Strand::Forward, }; - let change = PathChange { + let change = PathChangeNew { block_group_id, path: path.clone(), start: 35, @@ -1422,10 +1452,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); - BlockGroup::insert_change(&conn, &change, &tree); + let tree = Path::intervaltree_for_new(&conn, &path); + BlockGroup::insert_change_new(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1438,14 +1468,15 @@ mod tests { #[test] fn deletion_starting_at_block_boundary() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group(&conn); + let (block_group_id, path) = setup_block_group_new(&conn); let deletion_sequence = Sequence::new() .sequence_type("DNA") .sequence("") .save(&conn); - let deletion = NewBlock { + let deletion_node = Node::create(&conn, deletion_sequence.hash.as_str()); + let deletion = PathBlock { id: 0, - sequence: deletion_sequence.clone(), + node_id: deletion_node.id, block_sequence: deletion_sequence.get_sequence(None, None), sequence_start: 0, sequence_end: 0, @@ -1453,7 +1484,7 @@ mod tests { path_end: 12, strand: Strand::Forward, }; - let change = PathChange { + let change = PathChangeNew { block_group_id, path: path.clone(), start: 10, @@ -1462,10 +1493,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); - BlockGroup::insert_change(&conn, &change, &tree); + let tree = Path::intervaltree_for_new(&conn, &path); + BlockGroup::insert_change_new(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1478,14 +1509,15 @@ mod tests { #[test] fn deletion_ending_at_block_boundary() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group(&conn); + let (block_group_id, path) = setup_block_group_new(&conn); let deletion_sequence = Sequence::new() .sequence_type("DNA") .sequence("") .save(&conn); - let deletion = NewBlock { + let deletion_node = Node::create(&conn, deletion_sequence.hash.as_str()); + let deletion = PathBlock { id: 0, - sequence: deletion_sequence.clone(), + node_id: deletion_node.id, block_sequence: deletion_sequence.get_sequence(None, None), sequence_start: 0, sequence_end: 0, @@ -1493,7 +1525,7 @@ mod tests { path_end: 20, strand: Strand::Forward, }; - let change = PathChange { + let change = PathChangeNew { block_group_id, path: path.clone(), start: 18, @@ -1502,10 +1534,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for(&conn, &path); - BlockGroup::insert_change(&conn, &change, &tree); + let tree = Path::intervaltree_for_new(&conn, &path); + BlockGroup::insert_change_new(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ diff --git a/src/test_helpers.rs b/src/test_helpers.rs index 81f65ef..48f29d3 100644 --- a/src/test_helpers.rs +++ b/src/test_helpers.rs @@ -9,7 +9,9 @@ use crate::models::block_group::BlockGroup; use crate::models::block_group_edge::BlockGroupEdge; use crate::models::collection::Collection; use crate::models::edge::Edge; -use crate::models::node::{BOGUS_SOURCE_NODE_ID, BOGUS_TARGET_NODE_ID}; +use crate::models::node::{ + Node, BOGUS_SOURCE_NODE_ID, BOGUS_TARGET_NODE_ID, PATH_END_NODE_ID, PATH_START_NODE_ID, +}; use crate::models::path::Path; use crate::models::sequence::Sequence; use crate::models::strand::Strand; @@ -155,3 +157,105 @@ pub fn setup_block_group(conn: &Connection) -> (i32, Path) { ); (block_group.id, path) } + +pub fn setup_block_group_new(conn: &Connection) -> (i32, Path) { + let a_seq = Sequence::new() + .sequence_type("DNA") + .sequence("AAAAAAAAAA") + .save(conn); + let a_node = Node::create(conn, a_seq.hash.as_str()); + let t_seq = Sequence::new() + .sequence_type("DNA") + .sequence("TTTTTTTTTT") + .save(conn); + let t_node = Node::create(conn, t_seq.hash.as_str()); + let c_seq = Sequence::new() + .sequence_type("DNA") + .sequence("CCCCCCCCCC") + .save(conn); + let c_node = Node::create(conn, c_seq.hash.as_str()); + let g_seq = Sequence::new() + .sequence_type("DNA") + .sequence("GGGGGGGGGG") + .save(conn); + let g_node = Node::create(conn, g_seq.hash.as_str()); + let _collection = Collection::create(conn, "test"); + let block_group = BlockGroup::create(conn, "test", None, "hg19"); + let edge0 = Edge::create( + conn, + "".to_string(), + PATH_START_NODE_ID, + 0, + Strand::Forward, + "".to_string(), + a_node.id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge1 = Edge::create( + conn, + "".to_string(), + a_node.id, + 10, + Strand::Forward, + "".to_string(), + t_node.id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge2 = Edge::create( + conn, + "".to_string(), + t_node.id, + 10, + Strand::Forward, + "".to_string(), + c_node.id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge3 = Edge::create( + conn, + "".to_string(), + c_node.id, + 10, + Strand::Forward, + "".to_string(), + g_node.id, + 0, + Strand::Forward, + 0, + 0, + ); + let edge4 = Edge::create( + conn, + "".to_string(), + g_node.id, + 10, + Strand::Forward, + "".to_string(), + PATH_END_NODE_ID, + 0, + Strand::Forward, + 0, + 0, + ); + BlockGroupEdge::bulk_create( + conn, + block_group.id, + &[edge0.id, edge1.id, edge2.id, edge3.id, edge4.id], + ); + let path = Path::create( + conn, + "chr1", + block_group.id, + &[edge0.id, edge1.id, edge2.id, edge3.id, edge4.id], + ); + (block_group.id, path) +} From 4a5898cbb2401e53b6e0768feea12619ed340b50 Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 25 Sep 2024 15:56:25 -0400 Subject: [PATCH 08/16] Update vcf code --- src/imports/fasta.rs | 2 +- src/updates/vcf.rs | 47 +++++++++++++++++++++++++------------------- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/src/imports/fasta.rs b/src/imports/fasta.rs index aafb33d..f984a67 100644 --- a/src/imports/fasta.rs +++ b/src/imports/fasta.rs @@ -138,7 +138,7 @@ mod tests { op_conn, ); assert_eq!( - BlockGroup::get_all_sequences(&conn, 1), + BlockGroup::get_all_sequences_new(&conn, 1), HashSet::from_iter(vec!["ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string()]) ); diff --git a/src/updates/vcf.rs b/src/updates/vcf.rs index 7bca403..2b441f3 100644 --- a/src/updates/vcf.rs +++ b/src/updates/vcf.rs @@ -3,11 +3,12 @@ use std::fmt::Debug; use std::{io, str}; use crate::models::{ - block_group::{BlockGroup, BlockGroupData, PathCache, PathChange}, + block_group::{BlockGroup, BlockGroupData, PathCacheNew, PathChangeNew}, file_types::FileTypes, metadata, + node::Node, operations::{FileAddition, Operation, OperationSummary}, - path::{NewBlock, Path}, + path::{Path, PathBlock}, sample::Sample, sequence::Sequence, strand::Strand, @@ -118,20 +119,22 @@ fn prepare_change( ref_end: i32, chromosome_index: i32, phased: i32, - sequence: Sequence, -) -> PathChange { + block_sequence: String, + sequence_length: i32, + node_id: i32, +) -> PathChangeNew { // TODO: new sequence may not be real and be or some sort. Handle these. - let new_block = NewBlock { + let new_block = PathBlock { id: 0, - sequence: sequence.clone(), - block_sequence: sequence.get_sequence(None, None), + node_id, + block_sequence, sequence_start: 0, - sequence_end: sequence.length, + sequence_end: sequence_length, path_start: ref_start, path_end: ref_end, strand: Strand::Forward, }; - PathChange { + PathChangeNew { block_group_id: sample_bg_id, path: sample_path.clone(), start: ref_start, @@ -191,10 +194,10 @@ pub fn update_with_vcf( // Cache a bunch of data ahead of making changes let mut block_group_cache = BlockGroupCache::new(conn); - let mut path_cache = PathCache::new(conn); + let mut path_cache = PathCacheNew::new(conn); let mut sequence_cache = SequenceCache::new(conn); - let mut changes: HashMap<(Path, String), Vec> = HashMap::new(); + let mut changes: HashMap<(Path, String), Vec> = HashMap::new(); for result in reader.records() { let record = result.unwrap(); @@ -213,7 +216,7 @@ pub fn update_with_vcf( &fixed_sample, seq_name.clone(), ); - let sample_path = PathCache::lookup(&mut path_cache, sample_bg_id, seq_name.clone()); + let sample_path = PathCacheNew::lookup(&mut path_cache, sample_bg_id, seq_name.clone()); for (chromosome_index, genotype) in genotype.iter().enumerate() { if let Some(gt) = genotype { @@ -243,7 +246,7 @@ pub fn update_with_vcf( seq_name.clone(), ); let sample_path = - PathCache::lookup(&mut path_cache, sample_bg_id, seq_name.clone()); + PathCacheNew::lookup(&mut path_cache, sample_bg_id, seq_name.clone()); let genotype = sample.get(&header, "GT"); if genotype.is_some() { @@ -277,6 +280,8 @@ pub fn update_with_vcf( for vcf_entry in vcf_entries { let sequence = SequenceCache::lookup(&mut sequence_cache, "DNA", vcf_entry.alt_seq.to_string()); + let sequence_string = sequence.get_sequence(None, None); + let node = Node::create(conn, sequence.hash.as_str()); let change = prepare_change( vcf_entry.block_group_id, &vcf_entry.path, @@ -284,7 +289,9 @@ pub fn update_with_vcf( ref_end as i32, vcf_entry.chromosome_index, vcf_entry.phased, - sequence, + sequence_string.clone(), + sequence_string.len() as i32, + node.id, ); changes .entry((vcf_entry.path, vcf_entry.sample_name)) @@ -294,7 +301,7 @@ pub fn update_with_vcf( } let mut summary: HashMap> = HashMap::new(); for ((path, sample_name), path_changes) in changes { - BlockGroup::insert_changes(conn, &path_changes, &path_cache); + BlockGroup::insert_changes_new(conn, &path_changes, &path_cache); summary .entry(sample_name) .or_default() @@ -355,7 +362,7 @@ mod tests { op_conn, ); assert_eq!( - BlockGroup::get_all_sequences(conn, 1), + BlockGroup::get_all_sequences_new(conn, 1), HashSet::from_iter(vec!["ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string()]) ); // A homozygous set of variants should only return 1 sequence @@ -366,12 +373,12 @@ mod tests { // ); // Blockgroup 3 belongs to the `G1` genotype and has no changes assert_eq!( - BlockGroup::get_all_sequences(conn, 3), + BlockGroup::get_all_sequences_new(conn, 3), HashSet::from_iter(vec!["ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string()]) ); // This individual is homozygous for the first variant and does not contain the second assert_eq!( - BlockGroup::get_all_sequences(conn, 4), + BlockGroup::get_all_sequences_new(conn, 4), HashSet::from_iter(vec![ "ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string(), "ATCATCGATCGATCGATCGGGAACACACAGAGA".to_string(), @@ -410,11 +417,11 @@ mod tests { op_conn, ); assert_eq!( - BlockGroup::get_all_sequences(conn, 1), + BlockGroup::get_all_sequences_new(conn, 1), HashSet::from_iter(vec!["ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string()]) ); assert_eq!( - BlockGroup::get_all_sequences(conn, 2), + BlockGroup::get_all_sequences_new(conn, 2), HashSet::from_iter( [ "ATCGATCGATAGAGATCGATCGGGAACACACAGAGA", From 1332a570657efa7e82489503597d706c5ea3ca9d Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 25 Sep 2024 16:09:41 -0400 Subject: [PATCH 09/16] Remove old unused code --- src/exports/gfa.rs | 20 +- src/imports/fasta.rs | 2 +- src/imports/gfa.rs | 4 +- src/models/block_group.rs | 419 ++++++++------------------------------ src/models/edge.rs | 196 +----------------- src/models/path.rs | 134 +----------- src/updates/vcf.rs | 26 +-- 7 files changed, 121 insertions(+), 680 deletions(-) diff --git a/src/exports/gfa.rs b/src/exports/gfa.rs index f7600d5..0839bce 100644 --- a/src/exports/gfa.rs +++ b/src/exports/gfa.rs @@ -26,10 +26,10 @@ pub fn export_gfa(conn: &Connection, collection_name: &str, filename: &PathBuf) } let mut edges = edge_set.into_iter().collect(); - let (blocks, boundary_edges) = Edge::blocks_from_edges_new(conn, &edges); + let (blocks, boundary_edges) = Edge::blocks_from_edges(conn, &edges); edges.extend(boundary_edges.clone()); - let (graph, edges_by_node_pair) = Edge::build_graph_new(&edges, &blocks); + let (graph, edges_by_node_pair) = Edge::build_graph(&edges, &blocks); let file = File::create(filename).unwrap(); let mut writer = BufWriter::new(file); @@ -318,7 +318,7 @@ mod tests { &[edge1.id, edge2.id, edge3.id, edge4.id, edge5.id], ); - let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group.id); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group.id); let temp_dir = tempdir().expect("Couldn't get handle to temp directory"); let mut gfa_path = PathBuf::from(temp_dir.path()); @@ -331,7 +331,7 @@ mod tests { let block_group2 = Collection::get_block_groups(&conn, "test collection 2") .pop() .unwrap(); - let all_sequences2 = BlockGroup::get_all_sequences_new(&conn, block_group2.id); + let all_sequences2 = BlockGroup::get_all_sequences(&conn, block_group2.id); assert_eq!(all_sequences, all_sequences2); @@ -350,7 +350,7 @@ mod tests { import_gfa(&gfa_path, &collection_name, conn); let block_group_id = BlockGroup::get_id(conn, &collection_name, None, ""); - let all_sequences = BlockGroup::get_all_sequences_new(conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id); let temp_dir = tempdir().expect("Couldn't get handle to temp directory"); let mut gfa_path = PathBuf::from(temp_dir.path()); @@ -362,7 +362,7 @@ mod tests { let block_group2 = Collection::get_block_groups(conn, "test collection 2") .pop() .unwrap(); - let all_sequences2 = BlockGroup::get_all_sequences_new(conn, block_group2.id); + let all_sequences2 = BlockGroup::get_all_sequences(conn, block_group2.id); assert_eq!(all_sequences, all_sequences2); } @@ -377,7 +377,7 @@ mod tests { import_gfa(&gfa_path, &collection_name, conn); let block_group_id = BlockGroup::get_id(conn, &collection_name, None, ""); - let all_sequences = BlockGroup::get_all_sequences_new(conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id); let temp_dir = tempdir().expect("Couldn't get handle to temp directory"); let mut gfa_path = PathBuf::from(temp_dir.path()); @@ -389,7 +389,7 @@ mod tests { let block_group2 = Collection::get_block_groups(conn, "anderson promoters 2") .pop() .unwrap(); - let all_sequences2 = BlockGroup::get_all_sequences_new(conn, block_group2.id); + let all_sequences2 = BlockGroup::get_all_sequences(conn, block_group2.id); assert_eq!(all_sequences, all_sequences2); } @@ -404,7 +404,7 @@ mod tests { import_gfa(&gfa_path, &collection_name, conn); let block_group_id = BlockGroup::get_id(conn, &collection_name, None, ""); - let all_sequences = BlockGroup::get_all_sequences_new(conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id); let temp_dir = tempdir().expect("Couldn't get handle to temp directory"); let mut gfa_path = PathBuf::from(temp_dir.path()); @@ -416,7 +416,7 @@ mod tests { let block_group2 = Collection::get_block_groups(conn, "test collection 2") .pop() .unwrap(); - let all_sequences2 = BlockGroup::get_all_sequences_new(conn, block_group2.id); + let all_sequences2 = BlockGroup::get_all_sequences(conn, block_group2.id); assert_eq!(all_sequences, all_sequences2); } diff --git a/src/imports/fasta.rs b/src/imports/fasta.rs index f984a67..aafb33d 100644 --- a/src/imports/fasta.rs +++ b/src/imports/fasta.rs @@ -138,7 +138,7 @@ mod tests { op_conn, ); assert_eq!( - BlockGroup::get_all_sequences_new(&conn, 1), + BlockGroup::get_all_sequences(&conn, 1), HashSet::from_iter(vec!["ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string()]) ); diff --git a/src/imports/gfa.rs b/src/imports/gfa.rs index bb939b8..0c39723 100644 --- a/src/imports/gfa.rs +++ b/src/imports/gfa.rs @@ -289,7 +289,7 @@ mod tests { import_gfa(&gfa_path, &collection_name, conn); let block_group_id = BlockGroup::get_id(conn, &collection_name, None, ""); - let all_sequences = BlockGroup::get_all_sequences_new(conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec!["AAAATTTTGGGGCCCC".to_string()]) @@ -394,7 +394,7 @@ mod tests { let result = Path::sequence(conn, path); assert_eq!(result, "AA"); - let all_sequences = BlockGroup::get_all_sequences_new(conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id); assert_eq!(all_sequences, HashSet::from_iter(vec!["AA".to_string()])); } } diff --git a/src/models/block_group.rs b/src/models/block_group.rs index 720b5c8..97e2370 100644 --- a/src/models/block_group.rs +++ b/src/models/block_group.rs @@ -8,8 +8,8 @@ use serde::{Deserialize, Serialize}; use crate::graph::all_simple_paths; use crate::models::block_group_edge::BlockGroupEdge; use crate::models::edge::{Edge, EdgeData, GroupBlock}; -use crate::models::node::{BOGUS_SOURCE_NODE_ID, BOGUS_TARGET_NODE_ID, PATH_START_NODE_ID}; -use crate::models::path::{NewBlock, Path, PathBlock, PathData}; +use crate::models::node::PATH_START_NODE_ID; +use crate::models::path::{Path, PathBlock, PathData}; use crate::models::path_edge::PathEdge; use crate::models::sequence::Sequence; use crate::models::strand::Strand; @@ -31,17 +31,6 @@ pub struct BlockGroupData<'a> { #[derive(Clone, Debug)] pub struct PathChange { - pub block_group_id: i32, - pub path: Path, - pub start: i32, - pub end: i32, - pub block: NewBlock, - pub chromosome_index: i32, - pub phased: i32, -} - -#[derive(Clone, Debug)] -pub struct PathChangeNew { pub block_group_id: i32, pub path: Path, pub start: i32, @@ -53,7 +42,7 @@ pub struct PathChangeNew { pub struct PathCache<'a> { pub cache: HashMap, - pub intervaltree_cache: HashMap>, + pub intervaltree_cache: HashMap>, pub conn: &'a Connection, } @@ -61,7 +50,7 @@ impl PathCache<'_> { pub fn new(conn: &Connection) -> PathCache { PathCache { cache: HashMap::::new(), - intervaltree_cache: HashMap::>::new(), + intervaltree_cache: HashMap::>::new(), conn, } } @@ -92,52 +81,6 @@ impl PathCache<'_> { pub fn get_intervaltree<'a>( path_cache: &'a PathCache<'a>, path: &'a Path, - ) -> Option<&'a IntervalTree> { - path_cache.intervaltree_cache.get(path) - } -} - -pub struct PathCacheNew<'a> { - pub cache: HashMap, - pub intervaltree_cache: HashMap>, - pub conn: &'a Connection, -} - -impl PathCacheNew<'_> { - pub fn new(conn: &Connection) -> PathCacheNew { - PathCacheNew { - cache: HashMap::::new(), - intervaltree_cache: HashMap::>::new(), - conn, - } - } - - pub fn lookup(path_cache: &mut PathCacheNew, block_group_id: i32, name: String) -> Path { - let path_key = PathData { - name: name.clone(), - block_group_id, - }; - let path_lookup = path_cache.cache.get(&path_key); - if let Some(path) = path_lookup { - path.clone() - } else { - let new_path = Path::get_paths( - path_cache.conn, - "select * from path where block_group_id = ?1 AND name = ?2", - vec![SQLValue::from(block_group_id), SQLValue::from(name)], - )[0] - .clone(); - - path_cache.cache.insert(path_key, new_path.clone()); - let tree = Path::intervaltree_for_new(path_cache.conn, &new_path); - path_cache.intervaltree_cache.insert(new_path.clone(), tree); - new_path - } - } - - pub fn get_intervaltree<'a>( - path_cache: &'a PathCacheNew<'a>, - path: &'a Path, ) -> Option<&'a IntervalTree> { path_cache.intervaltree_cache.get(path) } @@ -362,59 +305,6 @@ impl BlockGroup { sequences } - pub fn get_all_sequences_new(conn: &Connection, block_group_id: i32) -> HashSet { - let mut edges = BlockGroupEdge::edges_for_block_group(conn, block_group_id); - let (blocks, boundary_edges) = Edge::blocks_from_edges_new(conn, &edges); - edges.extend(boundary_edges.clone()); - let (graph, _) = Edge::build_graph_new(&edges, &blocks); - - let mut start_nodes = vec![]; - let mut end_nodes = vec![]; - for node in graph.nodes() { - let has_incoming = graph.neighbors_directed(node, Direction::Incoming).next(); - let has_outgoing = graph.neighbors_directed(node, Direction::Outgoing).next(); - if has_incoming.is_none() { - start_nodes.push(node); - } - if has_outgoing.is_none() { - end_nodes.push(node); - } - } - - let blocks_by_id = blocks - .clone() - .into_iter() - .map(|block| (block.id, block)) - .collect::>(); - let mut sequences = HashSet::::new(); - - for start_node in start_nodes { - for end_node in &end_nodes { - // TODO: maybe make all_simple_paths return a single path id where start == end - if start_node == *end_node { - let block = blocks_by_id.get(&start_node).unwrap(); - if block.sequence_hash != Sequence::PATH_START_HASH - && block.sequence_hash != Sequence::PATH_END_HASH - { - sequences.insert(block.sequence.clone()); - } - } else { - for path in all_simple_paths(&graph, start_node, *end_node) { - let mut current_sequence = "".to_string(); - for node in path { - let block = blocks_by_id.get(&node).unwrap(); - let block_sequence = block.sequence.clone(); - current_sequence.push_str(&block_sequence); - } - sequences.insert(current_sequence); - } - } - } - } - - sequences - } - pub fn insert_changes(conn: &Connection, changes: &Vec, cache: &PathCache) { let mut new_edges_by_block_group = HashMap::>::new(); for change in changes { @@ -432,153 +322,20 @@ impl BlockGroup { } } - pub fn insert_changes_new( - conn: &Connection, - changes: &Vec, - cache: &PathCacheNew, - ) { - let mut new_edges_by_block_group = HashMap::>::new(); - for change in changes { - let tree = PathCacheNew::get_intervaltree(cache, &change.path).unwrap(); - let new_edges = BlockGroup::set_up_new_edges_new(change, tree); - new_edges_by_block_group - .entry(change.block_group_id) - .and_modify(|new_edge_data| new_edge_data.extend(new_edges.clone())) - .or_insert_with(|| new_edges.clone()); - } - - for (block_group_id, new_edges) in new_edges_by_block_group { - let edge_ids = Edge::bulk_create(conn, new_edges); - BlockGroupEdge::bulk_create(conn, block_group_id, &edge_ids); - } - } - #[allow(clippy::ptr_arg)] #[allow(clippy::needless_late_init)] pub fn insert_change( conn: &Connection, change: &PathChange, - tree: &IntervalTree, - ) { - let new_edges = BlockGroup::set_up_new_edges(change, tree); - let edge_ids = Edge::bulk_create(conn, new_edges); - BlockGroupEdge::bulk_create(conn, change.block_group_id, &edge_ids); - } - - #[allow(clippy::ptr_arg)] - #[allow(clippy::needless_late_init)] - pub fn insert_change_new( - conn: &Connection, - change: &PathChangeNew, tree: &IntervalTree, ) { - let new_edges = BlockGroup::set_up_new_edges_new(change, tree); + let new_edges = BlockGroup::set_up_new_edges(change, tree); let edge_ids = Edge::bulk_create(conn, new_edges); BlockGroupEdge::bulk_create(conn, change.block_group_id, &edge_ids); } pub fn set_up_new_edges( change: &PathChange, - tree: &IntervalTree, - ) -> Vec { - let start_blocks: Vec<&NewBlock> = - tree.query_point(change.start).map(|x| &x.value).collect(); - assert_eq!(start_blocks.len(), 1); - // NOTE: This may not be used but needs to be initialized here instead of inside the if - // statement that uses it, so that the borrow checker is happy - let previous_start_blocks: Vec<&NewBlock> = tree - .query_point(change.start - 1) - .map(|x| &x.value) - .collect(); - assert_eq!(previous_start_blocks.len(), 1); - let start_block = if start_blocks[0].path_start == change.start { - // First part of this block will be replaced/deleted, need to get previous block to add - // edge including it - previous_start_blocks[0] - } else { - start_blocks[0] - }; - - let end_blocks: Vec<&NewBlock> = tree.query_point(change.end).map(|x| &x.value).collect(); - assert_eq!(end_blocks.len(), 1); - let end_block = end_blocks[0]; - - let mut new_edges = vec![]; - - if change.block.sequence_start == change.block.sequence_end { - // Deletion - let new_edge = EdgeData { - source_hash: start_block.sequence.hash.clone(), - source_node_id: BOGUS_SOURCE_NODE_ID, - source_coordinate: change.start - start_block.path_start - + start_block.sequence_start, - source_strand: Strand::Forward, - target_hash: end_block.sequence.hash.clone(), - target_node_id: BOGUS_TARGET_NODE_ID, - target_coordinate: change.end - end_block.path_start + end_block.sequence_start, - target_strand: Strand::Forward, - chromosome_index: change.chromosome_index, - phased: change.phased, - }; - new_edges.push(new_edge); - - // NOTE: If the deletion is happening at the very beginning of a path, we need to add - // an edge from the dedicated start node to the end of the deletion, to indicate it's - // another start point in the block group DAG. - if change.start == 0 { - let new_beginning_edge = EdgeData { - source_hash: Sequence::PATH_START_HASH.to_string(), - source_node_id: BOGUS_SOURCE_NODE_ID, - source_coordinate: 0, - source_strand: Strand::Forward, - target_hash: end_block.sequence.hash.clone(), - target_node_id: BOGUS_TARGET_NODE_ID, - target_coordinate: change.end - end_block.path_start + end_block.sequence_start, - target_strand: Strand::Forward, - chromosome_index: change.chromosome_index, - phased: change.phased, - }; - new_edges.push(new_beginning_edge); - } - // NOTE: If the deletion is happening at the very end of a path, we might add an edge - // from the beginning of the deletion to the dedicated end node, but in practice it - // doesn't affect sequence readouts, so it may not be worth it. - } else { - // Insertion/replacement - let new_start_edge = EdgeData { - source_hash: start_block.sequence.hash.clone(), - source_node_id: BOGUS_SOURCE_NODE_ID, - source_coordinate: change.start - start_block.path_start - + start_block.sequence_start, - source_strand: Strand::Forward, - target_hash: change.block.sequence.hash.clone(), - target_node_id: BOGUS_TARGET_NODE_ID, - target_coordinate: change.block.sequence_start, - target_strand: Strand::Forward, - chromosome_index: change.chromosome_index, - phased: change.phased, - }; - let new_end_edge = EdgeData { - source_hash: change.block.sequence.hash.clone(), - source_node_id: BOGUS_SOURCE_NODE_ID, - source_coordinate: change.block.sequence_end, - source_strand: Strand::Forward, - target_hash: end_block.sequence.hash.clone(), - target_node_id: BOGUS_TARGET_NODE_ID, - target_coordinate: change.end - end_block.path_start + end_block.sequence_start, - target_strand: Strand::Forward, - chromosome_index: change.chromosome_index, - phased: change.phased, - }; - new_edges.push(new_start_edge); - new_edges.push(new_end_edge); - } - - new_edges - } - - pub fn set_up_new_edges_new( - change: &PathChangeNew, tree: &IntervalTree, ) -> Vec { let start_blocks: Vec<&PathBlock> = @@ -733,7 +490,7 @@ mod tests { path_end: 15, strand: Strand::Forward, }; - let change = PathChangeNew { + let change = PathChange { block_group_id, path: path.clone(), start: 7, @@ -742,10 +499,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for_new(&conn, &path); - BlockGroup::insert_change_new(&conn, &change, &tree); + let tree = Path::intervaltree_for(&conn, &path); + BlockGroup::insert_change(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -770,7 +527,7 @@ mod tests { strand: Strand::Forward, }; - let change = PathChangeNew { + let change = PathChange { block_group_id, path: path.clone(), start: 19, @@ -780,9 +537,9 @@ mod tests { phased: 0, }; // take out an entire block. - let tree = Path::intervaltree_for_new(&conn, &path); - BlockGroup::insert_change_new(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); + let tree = Path::intervaltree_for(&conn, &path); + BlockGroup::insert_change(&conn, &change, &tree); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -813,7 +570,7 @@ mod tests { path_end: 15, strand: Strand::Forward, }; - let change = PathChangeNew { + let change = PathChange { block_group_id, path: path.clone(), start: 7, @@ -822,10 +579,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for_new(&conn, &path); - BlockGroup::insert_change_new(&conn, &change, &tree); + let tree = Path::intervaltree_for(&conn, &path); + BlockGroup::insert_change(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -854,7 +611,7 @@ mod tests { path_end: 15, strand: Strand::Forward, }; - let change = PathChangeNew { + let change = PathChange { block_group_id, path: path.clone(), start: 15, @@ -863,10 +620,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for_new(&conn, &path); - BlockGroup::insert_change_new(&conn, &change, &tree); + let tree = Path::intervaltree_for(&conn, &path); + BlockGroup::insert_change(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -895,7 +652,7 @@ mod tests { path_end: 17, strand: Strand::Forward, }; - let change = PathChangeNew { + let change = PathChange { block_group_id, path: path.clone(), start: 12, @@ -904,10 +661,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for_new(&conn, &path); - BlockGroup::insert_change_new(&conn, &change, &tree); + let tree = Path::intervaltree_for(&conn, &path); + BlockGroup::insert_change(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -936,7 +693,7 @@ mod tests { path_end: 10, strand: Strand::Forward, }; - let change = PathChangeNew { + let change = PathChange { block_group_id, path: path.clone(), start: 10, @@ -945,10 +702,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for_new(&conn, &path); - BlockGroup::insert_change_new(&conn, &change, &tree); + let tree = Path::intervaltree_for(&conn, &path); + BlockGroup::insert_change(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -977,7 +734,7 @@ mod tests { path_end: 9, strand: Strand::Forward, }; - let change = PathChangeNew { + let change = PathChange { block_group_id, path: path.clone(), start: 9, @@ -986,10 +743,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for_new(&conn, &path); - BlockGroup::insert_change_new(&conn, &change, &tree); + let tree = Path::intervaltree_for(&conn, &path); + BlockGroup::insert_change(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1018,7 +775,7 @@ mod tests { path_end: 20, strand: Strand::Forward, }; - let change = PathChangeNew { + let change = PathChange { block_group_id, path: path.clone(), start: 10, @@ -1027,10 +784,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for_new(&conn, &path); - BlockGroup::insert_change_new(&conn, &change, &tree); + let tree = Path::intervaltree_for(&conn, &path); + BlockGroup::insert_change(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1059,7 +816,7 @@ mod tests { path_end: 25, strand: Strand::Forward, }; - let change = PathChangeNew { + let change = PathChange { block_group_id, path: path.clone(), start: 15, @@ -1068,10 +825,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for_new(&conn, &path); - BlockGroup::insert_change_new(&conn, &change, &tree); + let tree = Path::intervaltree_for(&conn, &path); + BlockGroup::insert_change(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1100,7 +857,7 @@ mod tests { path_end: 35, strand: Strand::Forward, }; - let change = PathChangeNew { + let change = PathChange { block_group_id, path: path.clone(), start: 5, @@ -1109,10 +866,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for_new(&conn, &path); - BlockGroup::insert_change_new(&conn, &change, &tree); + let tree = Path::intervaltree_for(&conn, &path); + BlockGroup::insert_change(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1142,7 +899,7 @@ mod tests { strand: Strand::Forward, }; - let change = PathChangeNew { + let change = PathChange { block_group_id, path: path.clone(), start: 19, @@ -1153,9 +910,9 @@ mod tests { }; // take out an entire block. - let tree = Path::intervaltree_for_new(&conn, &path); - BlockGroup::insert_change_new(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); + let tree = Path::intervaltree_for(&conn, &path); + BlockGroup::insert_change(&conn, &change, &tree); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1184,7 +941,7 @@ mod tests { path_end: 15, strand: Strand::Forward, }; - let change = PathChangeNew { + let change = PathChange { block_group_id, path: path.clone(), start: 7, @@ -1193,10 +950,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for_new(&conn, &path); - BlockGroup::insert_change_new(&conn, &change, &tree); + let tree = Path::intervaltree_for(&conn, &path); + BlockGroup::insert_change(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1205,10 +962,10 @@ mod tests { ]) ); - let tree = Path::intervaltree_for_new(&conn, &path); - BlockGroup::insert_change_new(&conn, &change, &tree); + let tree = Path::intervaltree_for(&conn, &path); + BlockGroup::insert_change(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1237,7 +994,7 @@ mod tests { path_end: 0, strand: Strand::Forward, }; - let change = PathChangeNew { + let change = PathChange { block_group_id, path: path.clone(), start: 0, @@ -1246,10 +1003,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for_new(&conn, &path); - BlockGroup::insert_change_new(&conn, &change, &tree); + let tree = Path::intervaltree_for(&conn, &path); + BlockGroup::insert_change(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1279,7 +1036,7 @@ mod tests { path_end: 40, strand: Strand::Forward, }; - let change = PathChangeNew { + let change = PathChange { block_group_id, path: path.clone(), start: 40, @@ -1288,10 +1045,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for_new(&conn, &path); - BlockGroup::insert_change_new(&conn, &change, &tree); + let tree = Path::intervaltree_for(&conn, &path); + BlockGroup::insert_change(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1320,7 +1077,7 @@ mod tests { path_end: 11, strand: Strand::Forward, }; - let change = PathChangeNew { + let change = PathChange { block_group_id, path: path.clone(), start: 10, @@ -1329,10 +1086,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for_new(&conn, &path); - BlockGroup::insert_change_new(&conn, &change, &tree); + let tree = Path::intervaltree_for(&conn, &path); + BlockGroup::insert_change(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1361,7 +1118,7 @@ mod tests { path_end: 20, strand: Strand::Forward, }; - let change = PathChangeNew { + let change = PathChange { block_group_id, path: path.clone(), start: 19, @@ -1370,10 +1127,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for_new(&conn, &path); - BlockGroup::insert_change_new(&conn, &change, &tree); + let tree = Path::intervaltree_for(&conn, &path); + BlockGroup::insert_change(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1402,7 +1159,7 @@ mod tests { path_end: 1, strand: Strand::Forward, }; - let change = PathChangeNew { + let change = PathChange { block_group_id, path: path.clone(), start: 0, @@ -1411,10 +1168,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for_new(&conn, &path); - BlockGroup::insert_change_new(&conn, &change, &tree); + let tree = Path::intervaltree_for(&conn, &path); + BlockGroup::insert_change(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1443,7 +1200,7 @@ mod tests { path_end: 40, strand: Strand::Forward, }; - let change = PathChangeNew { + let change = PathChange { block_group_id, path: path.clone(), start: 35, @@ -1452,10 +1209,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for_new(&conn, &path); - BlockGroup::insert_change_new(&conn, &change, &tree); + let tree = Path::intervaltree_for(&conn, &path); + BlockGroup::insert_change(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1484,7 +1241,7 @@ mod tests { path_end: 12, strand: Strand::Forward, }; - let change = PathChangeNew { + let change = PathChange { block_group_id, path: path.clone(), start: 10, @@ -1493,10 +1250,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for_new(&conn, &path); - BlockGroup::insert_change_new(&conn, &change, &tree); + let tree = Path::intervaltree_for(&conn, &path); + BlockGroup::insert_change(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ @@ -1525,7 +1282,7 @@ mod tests { path_end: 20, strand: Strand::Forward, }; - let change = PathChangeNew { + let change = PathChange { block_group_id, path: path.clone(), start: 18, @@ -1534,10 +1291,10 @@ mod tests { chromosome_index: 1, phased: 0, }; - let tree = Path::intervaltree_for_new(&conn, &path); - BlockGroup::insert_change_new(&conn, &change, &tree); + let tree = Path::intervaltree_for(&conn, &path); + BlockGroup::insert_change(&conn, &change, &tree); - let all_sequences = BlockGroup::get_all_sequences_new(&conn, block_group_id); + let all_sequences = BlockGroup::get_all_sequences(&conn, block_group_id); assert_eq!( all_sequences, HashSet::from_iter(vec![ diff --git a/src/models/edge.rs b/src/models/edge.rs index 7a71f8e..f70ad9a 100644 --- a/src/models/edge.rs +++ b/src/models/edge.rs @@ -7,9 +7,7 @@ use rusqlite::types::Value; use rusqlite::{params_from_iter, Connection, Result as SQLResult, Row}; use serde::{Deserialize, Serialize}; -use crate::models::node::{ - Node, BOGUS_SOURCE_NODE_ID, BOGUS_TARGET_NODE_ID, PATH_END_NODE_ID, PATH_START_NODE_ID, -}; +use crate::models::node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID}; use crate::models::{sequence::Sequence, strand::Strand}; #[derive(Clone, Debug, Eq, Hash, PartialEq, Deserialize, Serialize)] @@ -322,139 +320,6 @@ impl Edge { } pub fn blocks_from_edges(conn: &Connection, edges: &Vec) -> (Vec, Vec) { - let mut sequence_hashes = HashSet::new(); - let mut edges_by_source_hash: HashMap<&str, Vec<&Edge>> = HashMap::new(); - let mut edges_by_target_hash: HashMap<&str, Vec<&Edge>> = HashMap::new(); - for edge in edges { - if edge.source_hash != Sequence::PATH_START_HASH { - sequence_hashes.insert(edge.source_hash.as_str()); - edges_by_source_hash - .entry(&edge.source_hash) - .and_modify(|edges| edges.push(edge)) - .or_default(); - } - if edge.target_hash != Sequence::PATH_END_HASH { - sequence_hashes.insert(edge.target_hash.as_str()); - edges_by_target_hash - .entry(&edge.target_hash) - .and_modify(|edges| edges.push(edge)) - .or_default(); - } - } - - let sequences_by_hash = - Sequence::sequences_by_hash(conn, sequence_hashes.into_iter().collect::>()); - let mut blocks = vec![]; - let mut block_index = 0; - let mut boundary_edges = vec![]; - for (hash, sequence) in sequences_by_hash.into_iter() { - let block_boundaries = Edge::get_block_boundaries( - edges_by_source_hash.get(hash.as_str()), - edges_by_target_hash.get(hash.as_str()), - sequence.length, - ); - for block_boundary in &block_boundaries { - // NOTE: Most of this data is bogus, the Edge struct is just a convenient wrapper - // for the data we need to set up boundary edges in the block group graph - boundary_edges.push(Edge { - id: -1, - source_hash: hash.clone(), - source_node_id: BOGUS_SOURCE_NODE_ID, - source_coordinate: *block_boundary, - source_strand: Strand::Unknown, - target_hash: hash.clone(), - target_node_id: BOGUS_TARGET_NODE_ID, - target_coordinate: *block_boundary, - target_strand: Strand::Unknown, - chromosome_index: 0, - phased: 0, - }); - } - - if !block_boundaries.is_empty() { - let start = 0; - let end = block_boundaries[0]; - let block_sequence = sequence.get_sequence(start, end).to_string(); - let first_block = GroupBlock { - id: block_index, - sequence_hash: hash.clone(), - node_id: BOGUS_SOURCE_NODE_ID, - sequence: block_sequence, - start, - end, - }; - blocks.push(first_block); - block_index += 1; - for (start, end) in block_boundaries.clone().into_iter().tuple_windows() { - let block_sequence = sequence.get_sequence(start, end).to_string(); - let block = GroupBlock { - id: block_index, - sequence_hash: hash.clone(), - node_id: BOGUS_SOURCE_NODE_ID, - sequence: block_sequence, - start, - end, - }; - blocks.push(block); - block_index += 1; - } - let start = block_boundaries[block_boundaries.len() - 1]; - let end = sequence.length; - let block_sequence = sequence.get_sequence(start, end).to_string(); - let last_block = GroupBlock { - id: block_index, - sequence_hash: hash.clone(), - node_id: BOGUS_SOURCE_NODE_ID, - sequence: block_sequence, - start, - end, - }; - blocks.push(last_block); - block_index += 1; - } else { - blocks.push(GroupBlock { - id: block_index, - sequence_hash: hash.clone(), - node_id: BOGUS_SOURCE_NODE_ID, - sequence: sequence.get_sequence(None, None), - start: 0, - end: sequence.length, - }); - block_index += 1; - } - } - - // NOTE: We need a dedicated start node and a dedicated end node for the graph formed by the - // block group, since different paths in the block group may start or end at different - // places on sequences. These two "start sequence" and "end sequence" blocks will serve - // that role. - let start_sequence = Sequence::sequence_from_hash(conn, Sequence::PATH_START_HASH).unwrap(); - let start_block = GroupBlock { - id: block_index + 1, - sequence_hash: start_sequence.hash.clone(), - node_id: BOGUS_SOURCE_NODE_ID, - sequence: "".to_string(), - start: 0, - end: 0, - }; - blocks.push(start_block); - let end_sequence = Sequence::sequence_from_hash(conn, Sequence::PATH_END_HASH).unwrap(); - let end_block = GroupBlock { - id: block_index + 2, - sequence_hash: end_sequence.hash.clone(), - node_id: BOGUS_SOURCE_NODE_ID, - sequence: "".to_string(), - start: 0, - end: 0, - }; - blocks.push(end_block); - (blocks, boundary_edges) - } - - pub fn blocks_from_edges_new( - conn: &Connection, - edges: &Vec, - ) -> (Vec, Vec) { let mut node_ids = HashSet::new(); let mut edges_by_source_node_id: HashMap> = HashMap::new(); let mut edges_by_target_node_id: HashMap> = HashMap::new(); @@ -589,65 +454,6 @@ impl Edge { pub fn build_graph( edges: &Vec, blocks: &Vec, - ) -> (DiGraphMap, HashMap<(i32, i32), Edge>) { - let blocks_by_start = blocks - .clone() - .into_iter() - .map(|block| { - ( - BlockKey { - sequence_hash: block.sequence_hash, - coordinate: block.start, - }, - block.id, - ) - }) - .collect::>(); - let blocks_by_end = blocks - .clone() - .into_iter() - .map(|block| { - ( - BlockKey { - sequence_hash: block.sequence_hash, - coordinate: block.end, - }, - block.id, - ) - }) - .collect::>(); - - let mut graph: DiGraphMap = DiGraphMap::new(); - let mut edges_by_node_pair = HashMap::new(); - for block in blocks { - graph.add_node(block.id); - } - for edge in edges { - let source_key = BlockKey { - sequence_hash: edge.source_hash.clone(), - coordinate: edge.source_coordinate, - }; - let source_id = blocks_by_end.get(&source_key); - let target_key = BlockKey { - sequence_hash: edge.target_hash.clone(), - coordinate: edge.target_coordinate, - }; - let target_id = blocks_by_start.get(&target_key); - - if let Some(source_id_value) = source_id { - if let Some(target_id_value) = target_id { - graph.add_edge(*source_id_value, *target_id_value, ()); - edges_by_node_pair.insert((*source_id_value, *target_id_value), edge.clone()); - } - } - } - - (graph, edges_by_node_pair) - } - - pub fn build_graph_new( - edges: &Vec, - blocks: &Vec, ) -> (DiGraphMap, HashMap<(i32, i32), Edge>) { let blocks_by_start = blocks .clone() diff --git a/src/models/path.rs b/src/models/path.rs index 1ba460f..5def5d6 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -176,7 +176,7 @@ impl Path { } pub fn sequence(conn: &Connection, path: Path) -> String { - let blocks = Path::blocks_for_new(conn, &path); + let blocks = Path::blocks_for(conn, &path); blocks .into_iter() .map(|block| block.block_sequence) @@ -185,128 +185,6 @@ impl Path { } pub fn edge_pairs_to_block( - block_id: i32, - path: &Path, - into: Edge, - out_of: Edge, - sequences_by_hash: &HashMap, - current_path_length: i32, - ) -> NewBlock { - if into.target_hash != out_of.source_hash { - panic!( - "Consecutive edges in path {0} don't share the same sequence", - path.id - ); - } - - let sequence = sequences_by_hash.get(&into.target_hash).unwrap(); - let start = into.target_coordinate; - let end = out_of.source_coordinate; - - let strand; - let block_sequence_length; - - if into.target_strand == out_of.source_strand { - strand = into.target_strand; - block_sequence_length = end - start; - } else { - panic!( - "Edge pair with target_strand/source_strand mismatch for path {}", - path.id - ); - } - - let block_sequence = if strand == Strand::Reverse { - revcomp(&sequence.get_sequence(start, end)) - } else { - sequence.get_sequence(start, end) - }; - - NewBlock { - id: block_id, - sequence: sequence.clone(), - block_sequence, - sequence_start: start, - sequence_end: end, - path_start: current_path_length, - path_end: current_path_length + block_sequence_length, - strand, - } - } - - pub fn blocks_for(conn: &Connection, path: &Path) -> Vec { - let edges = PathEdge::edges_for_path(conn, path.id); - let mut sequence_hashes = HashSet::new(); - for edge in &edges { - if edge.source_hash != Sequence::PATH_START_HASH { - sequence_hashes.insert(edge.source_hash.as_str()); - } - if edge.target_hash != Sequence::PATH_END_HASH { - sequence_hashes.insert(edge.target_hash.as_str()); - } - } - let sequences_by_hash = - Sequence::sequences_by_hash(conn, sequence_hashes.into_iter().collect::>()); - - let mut blocks = vec![]; - let mut path_length = 0; - - // NOTE: Adding a "start block" for the dedicated start sequence with a range from i32::MIN - // to 0 makes interval tree lookups work better. If the point being looked up is -1 (or - // below), it will return this block. - let start_sequence = Sequence::sequence_from_hash(conn, Sequence::PATH_START_HASH).unwrap(); - blocks.push(NewBlock { - id: -1, - sequence: start_sequence, - block_sequence: "".to_string(), - sequence_start: 0, - sequence_end: 0, - path_start: i32::MIN + 1, - path_end: 0, - strand: Strand::Forward, - }); - - for (index, (into, out_of)) in edges.into_iter().tuple_windows().enumerate() { - let block = Path::edge_pairs_to_block( - index as i32, - path, - into, - out_of, - &sequences_by_hash, - path_length, - ); - path_length += block.block_sequence.len() as i32; - blocks.push(block); - } - - // NOTE: Adding an "end block" for the dedicated end sequence with a range from the path - // length to i32::MAX makes interval tree lookups work better. If the point being looked up - // is the path length (or higher), it will return this block. - let end_sequence = Sequence::sequence_from_hash(conn, Sequence::PATH_END_HASH).unwrap(); - blocks.push(NewBlock { - id: -2, - sequence: end_sequence, - block_sequence: "".to_string(), - sequence_start: 0, - sequence_end: 0, - path_start: path_length, - path_end: i32::MAX - 1, - strand: Strand::Forward, - }); - - blocks - } - - pub fn intervaltree_for(conn: &Connection, path: &Path) -> IntervalTree { - let blocks = Path::blocks_for(conn, path); - let tree: IntervalTree = blocks - .into_iter() - .map(|block| (block.path_start..block.path_end, block)) - .collect(); - tree - } - - pub fn edge_pairs_to_block_new( block_id: i32, path: &Path, into: Edge, @@ -356,7 +234,7 @@ impl Path { } } - pub fn blocks_for_new(conn: &Connection, path: &Path) -> Vec { + pub fn blocks_for(conn: &Connection, path: &Path) -> Vec { let edges = PathEdge::edges_for_path(conn, path.id); let mut sequence_node_ids = HashSet::new(); for edge in &edges { @@ -390,7 +268,7 @@ impl Path { }); for (index, (into, out_of)) in edges.into_iter().tuple_windows().enumerate() { - let block = Path::edge_pairs_to_block_new( + let block = Path::edge_pairs_to_block( index as i32, path, into, @@ -419,8 +297,8 @@ impl Path { blocks } - pub fn intervaltree_for_new(conn: &Connection, path: &Path) -> IntervalTree { - let blocks = Path::blocks_for_new(conn, path); + pub fn intervaltree_for(conn: &Connection, path: &Path) -> IntervalTree { + let blocks = Path::blocks_for(conn, path); let tree: IntervalTree = blocks .into_iter() .map(|block| (block.path_start..block.path_end, block)) @@ -741,7 +619,7 @@ mod tests { block_group.id, &[edge1.id, edge2.id, edge3.id, edge4.id, edge5.id], ); - let tree = Path::intervaltree_for_new(conn, &path); + let tree = Path::intervaltree_for(conn, &path); let blocks1: Vec = tree.query_point(2).map(|x| x.value.clone()).collect(); assert_eq!(blocks1.len(), 1); let block1 = &blocks1[0]; diff --git a/src/updates/vcf.rs b/src/updates/vcf.rs index 2b441f3..61d8bf5 100644 --- a/src/updates/vcf.rs +++ b/src/updates/vcf.rs @@ -3,7 +3,7 @@ use std::fmt::Debug; use std::{io, str}; use crate::models::{ - block_group::{BlockGroup, BlockGroupData, PathCacheNew, PathChangeNew}, + block_group::{BlockGroup, BlockGroupData, PathCache, PathChange}, file_types::FileTypes, metadata, node::Node, @@ -122,7 +122,7 @@ fn prepare_change( block_sequence: String, sequence_length: i32, node_id: i32, -) -> PathChangeNew { +) -> PathChange { // TODO: new sequence may not be real and be or some sort. Handle these. let new_block = PathBlock { id: 0, @@ -134,7 +134,7 @@ fn prepare_change( path_end: ref_end, strand: Strand::Forward, }; - PathChangeNew { + PathChange { block_group_id: sample_bg_id, path: sample_path.clone(), start: ref_start, @@ -194,10 +194,10 @@ pub fn update_with_vcf( // Cache a bunch of data ahead of making changes let mut block_group_cache = BlockGroupCache::new(conn); - let mut path_cache = PathCacheNew::new(conn); + let mut path_cache = PathCache::new(conn); let mut sequence_cache = SequenceCache::new(conn); - let mut changes: HashMap<(Path, String), Vec> = HashMap::new(); + let mut changes: HashMap<(Path, String), Vec> = HashMap::new(); for result in reader.records() { let record = result.unwrap(); @@ -216,7 +216,7 @@ pub fn update_with_vcf( &fixed_sample, seq_name.clone(), ); - let sample_path = PathCacheNew::lookup(&mut path_cache, sample_bg_id, seq_name.clone()); + let sample_path = PathCache::lookup(&mut path_cache, sample_bg_id, seq_name.clone()); for (chromosome_index, genotype) in genotype.iter().enumerate() { if let Some(gt) = genotype { @@ -246,7 +246,7 @@ pub fn update_with_vcf( seq_name.clone(), ); let sample_path = - PathCacheNew::lookup(&mut path_cache, sample_bg_id, seq_name.clone()); + PathCache::lookup(&mut path_cache, sample_bg_id, seq_name.clone()); let genotype = sample.get(&header, "GT"); if genotype.is_some() { @@ -301,7 +301,7 @@ pub fn update_with_vcf( } let mut summary: HashMap> = HashMap::new(); for ((path, sample_name), path_changes) in changes { - BlockGroup::insert_changes_new(conn, &path_changes, &path_cache); + BlockGroup::insert_changes(conn, &path_changes, &path_cache); summary .entry(sample_name) .or_default() @@ -362,7 +362,7 @@ mod tests { op_conn, ); assert_eq!( - BlockGroup::get_all_sequences_new(conn, 1), + BlockGroup::get_all_sequences(conn, 1), HashSet::from_iter(vec!["ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string()]) ); // A homozygous set of variants should only return 1 sequence @@ -373,12 +373,12 @@ mod tests { // ); // Blockgroup 3 belongs to the `G1` genotype and has no changes assert_eq!( - BlockGroup::get_all_sequences_new(conn, 3), + BlockGroup::get_all_sequences(conn, 3), HashSet::from_iter(vec!["ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string()]) ); // This individual is homozygous for the first variant and does not contain the second assert_eq!( - BlockGroup::get_all_sequences_new(conn, 4), + BlockGroup::get_all_sequences(conn, 4), HashSet::from_iter(vec![ "ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string(), "ATCATCGATCGATCGATCGGGAACACACAGAGA".to_string(), @@ -417,11 +417,11 @@ mod tests { op_conn, ); assert_eq!( - BlockGroup::get_all_sequences_new(conn, 1), + BlockGroup::get_all_sequences(conn, 1), HashSet::from_iter(vec!["ATCGATCGATCGATCGATCGGGAACACACAGAGA".to_string()]) ); assert_eq!( - BlockGroup::get_all_sequences_new(conn, 2), + BlockGroup::get_all_sequences(conn, 2), HashSet::from_iter( [ "ATCGATCGATAGAGATCGATCGGGAACACACAGAGA", From 56e51412089b24fa19789bef198e6e777ab5edaf Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 25 Sep 2024 19:28:12 -0400 Subject: [PATCH 10/16] Update operation management, part 1 --- src/models/block_group.rs | 40 +++++++------- src/models/node.rs | 3 -- src/operation_management.rs | 32 ++++++----- src/test_helpers.rs | 102 +----------------------------------- 4 files changed, 36 insertions(+), 141 deletions(-) diff --git a/src/models/block_group.rs b/src/models/block_group.rs index 97e2370..12d997d 100644 --- a/src/models/block_group.rs +++ b/src/models/block_group.rs @@ -439,7 +439,7 @@ impl BlockGroup { mod tests { use super::*; use crate::models::{collection::Collection, node::Node, sample::Sample}; - use crate::test_helpers::{get_connection, setup_block_group_new}; + use crate::test_helpers::{get_connection, setup_block_group}; #[test] fn test_blockgroup_create() { @@ -474,7 +474,7 @@ mod tests { #[test] fn insert_and_deletion_get_all() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group_new(&conn); + let (block_group_id, path) = setup_block_group(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") @@ -554,7 +554,7 @@ mod tests { #[test] fn simple_insert_get_all() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group_new(&conn); + let (block_group_id, path) = setup_block_group(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") @@ -595,7 +595,7 @@ mod tests { #[test] fn insert_on_block_boundary_middle() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group_new(&conn); + let (block_group_id, path) = setup_block_group(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") @@ -636,7 +636,7 @@ mod tests { #[test] fn insert_within_block() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group_new(&conn); + let (block_group_id, path) = setup_block_group(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") @@ -677,7 +677,7 @@ mod tests { #[test] fn insert_on_block_boundary_start() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group_new(&conn); + let (block_group_id, path) = setup_block_group(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") @@ -718,7 +718,7 @@ mod tests { #[test] fn insert_on_block_boundary_end() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group_new(&conn); + let (block_group_id, path) = setup_block_group(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") @@ -759,7 +759,7 @@ mod tests { #[test] fn insert_across_entire_block_boundary() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group_new(&conn); + let (block_group_id, path) = setup_block_group(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") @@ -800,7 +800,7 @@ mod tests { #[test] fn insert_across_two_blocks() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group_new(&conn); + let (block_group_id, path) = setup_block_group(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") @@ -841,7 +841,7 @@ mod tests { #[test] fn insert_spanning_blocks() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group_new(&conn); + let (block_group_id, path) = setup_block_group(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") @@ -882,7 +882,7 @@ mod tests { #[test] fn simple_deletion() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group_new(&conn); + let (block_group_id, path) = setup_block_group(&conn); let deletion_sequence = Sequence::new() .sequence_type("DNA") .sequence("") @@ -925,7 +925,7 @@ mod tests { #[test] fn doesnt_apply_same_insert_twice() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group_new(&conn); + let (block_group_id, path) = setup_block_group(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") @@ -978,7 +978,7 @@ mod tests { #[test] fn insert_at_beginning_of_path() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group_new(&conn); + let (block_group_id, path) = setup_block_group(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") @@ -1019,7 +1019,7 @@ mod tests { #[test] fn insert_at_end_of_path() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group_new(&conn); + let (block_group_id, path) = setup_block_group(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") @@ -1061,7 +1061,7 @@ mod tests { #[test] fn insert_at_one_bp_into_block() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group_new(&conn); + let (block_group_id, path) = setup_block_group(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") @@ -1102,7 +1102,7 @@ mod tests { #[test] fn insert_at_one_bp_from_end_of_block() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group_new(&conn); + let (block_group_id, path) = setup_block_group(&conn); let insert_sequence = Sequence::new() .sequence_type("DNA") .sequence("NNNN") @@ -1143,7 +1143,7 @@ mod tests { #[test] fn delete_at_beginning_of_path() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group_new(&conn); + let (block_group_id, path) = setup_block_group(&conn); let deletion_sequence = Sequence::new() .sequence_type("DNA") .sequence("") @@ -1184,7 +1184,7 @@ mod tests { #[test] fn delete_at_end_of_path() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group_new(&conn); + let (block_group_id, path) = setup_block_group(&conn); let deletion_sequence = Sequence::new() .sequence_type("DNA") .sequence("") @@ -1225,7 +1225,7 @@ mod tests { #[test] fn deletion_starting_at_block_boundary() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group_new(&conn); + let (block_group_id, path) = setup_block_group(&conn); let deletion_sequence = Sequence::new() .sequence_type("DNA") .sequence("") @@ -1266,7 +1266,7 @@ mod tests { #[test] fn deletion_ending_at_block_boundary() { let conn = get_connection(None); - let (block_group_id, path) = setup_block_group_new(&conn); + let (block_group_id, path) = setup_block_group(&conn); let deletion_sequence = Sequence::new() .sequence_type("DNA") .sequence("") diff --git a/src/models/node.rs b/src/models/node.rs index 1219ba0..7c56887 100644 --- a/src/models/node.rs +++ b/src/models/node.rs @@ -3,9 +3,6 @@ use std::collections::HashMap; use crate::models::sequence::Sequence; -pub const BOGUS_SOURCE_NODE_ID: i32 = -1; -pub const BOGUS_TARGET_NODE_ID: i32 = -2; - pub const PATH_START_NODE_ID: i32 = 1; pub const PATH_END_NODE_ID: i32 = 2; diff --git a/src/operation_management.rs b/src/operation_management.rs index 902607c..1d3ad63 100644 --- a/src/operation_management.rs +++ b/src/operation_management.rs @@ -15,7 +15,6 @@ use crate::models::block_group::BlockGroup; use crate::models::block_group_edge::BlockGroupEdge; use crate::models::edge::{Edge, EdgeData}; use crate::models::file_types::FileTypes; -use crate::models::node::{BOGUS_SOURCE_NODE_ID, BOGUS_TARGET_NODE_ID}; use crate::models::operations::{ Branch, FileAddition, Operation, OperationState, OperationSummary, }; @@ -333,12 +332,12 @@ pub fn apply_changeset(conn: &Connection, operation: &Operation) { edge_pk, EdgeData { source_hash: item.new_value(1).unwrap().as_str().unwrap().to_string(), - source_node_id: BOGUS_SOURCE_NODE_ID, + source_node_id: item.new_value(2).unwrap().as_i64().unwrap() as i32, source_coordinate: item.new_value(3).unwrap().as_i64().unwrap() as i32, source_strand: Strand::column_result(item.new_value(4).unwrap()) .unwrap(), target_hash: item.new_value(5).unwrap().as_str().unwrap().to_string(), - target_node_id: BOGUS_TARGET_NODE_ID, + target_node_id: item.new_value(6).unwrap().as_i64().unwrap() as i32, target_coordinate: item.new_value(7).unwrap().as_i64().unwrap() as i32, target_strand: Strand::column_result(item.new_value(8).unwrap()) .unwrap(), @@ -539,12 +538,7 @@ mod tests { use crate::imports::fasta::import_fasta; use crate::models::file_types::FileTypes; use crate::models::operations::{setup_db, Branch, FileAddition, Operation, OperationState}; - use crate::models::{ - edge::Edge, - metadata, - node::{BOGUS_SOURCE_NODE_ID, BOGUS_TARGET_NODE_ID}, - sample::Sample, - }; + use crate::models::{edge::Edge, metadata, node::Node, sample::Sample}; use crate::test_helpers::{ get_connection, get_operation_connection, setup_block_group, setup_gen_dir, }; @@ -589,15 +583,17 @@ mod tests { .sequence("ATCG") .save(conn); let existing_seq = Sequence::sequence_from_hash(conn, Sequence::PATH_END_HASH).unwrap(); + let random_node = Node::create(conn, random_seq.hash.as_str()); + let existing_node = Node::create(conn, existing_seq.hash.as_str()); let new_edge = Edge::create( conn, - random_seq.hash.clone(), - BOGUS_SOURCE_NODE_ID, + "".to_string(), + random_node.id, 0, Strand::Forward, - existing_seq.hash.clone(), - BOGUS_TARGET_NODE_ID, + "".to_string(), + existing_node.id, 0, Strand::Forward, 0, @@ -614,6 +610,8 @@ mod tests { get_changeset_path(&operation).join(format!("{op_id}.dep", op_id = operation.id)); let dependencies: DependencyModels = serde_json::from_reader(fs::File::open(dependency_path).unwrap()).unwrap(); + println!("here1"); + println!("{:?}", dependencies); assert_eq!(dependencies.sequences.len(), 1); assert_eq!( dependencies.block_group[0].collection_name, @@ -661,7 +659,7 @@ mod tests { let sample_count = Sample::query(conn, "select * from sample", vec![]).len() as i32; let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len() as i32; - assert_eq!(edge_count, 10); + assert_eq!(edge_count, 14); assert_eq!(sample_count, 3); assert_eq!(op_count, 2); @@ -693,7 +691,7 @@ mod tests { let sample_count = Sample::query(conn, "select * from sample", vec![]).len() as i32; let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len() as i32; - assert_eq!(edge_count, 10); + assert_eq!(edge_count, 14); assert_eq!(sample_count, 3); assert_eq!(op_count, 2); } @@ -874,7 +872,7 @@ mod tests { let sample_count = Sample::query(conn, "select * from sample", vec![]).len() as i32; let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len() as i32; - assert_eq!(edge_count, 10); + assert_eq!(edge_count, 14); assert_eq!(sample_count, 3); assert_eq!(op_count, 2); @@ -935,7 +933,7 @@ mod tests { let sample_count = Sample::query(conn, "select * from sample", vec![]).len() as i32; let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len() as i32; - assert_eq!(edge_count, 10); + assert_eq!(edge_count, 14); assert_eq!(sample_count, 3); assert_eq!(op_count, 3); } diff --git a/src/test_helpers.rs b/src/test_helpers.rs index 48f29d3..ab3936f 100644 --- a/src/test_helpers.rs +++ b/src/test_helpers.rs @@ -9,9 +9,7 @@ use crate::models::block_group::BlockGroup; use crate::models::block_group_edge::BlockGroupEdge; use crate::models::collection::Collection; use crate::models::edge::Edge; -use crate::models::node::{ - Node, BOGUS_SOURCE_NODE_ID, BOGUS_TARGET_NODE_ID, PATH_END_NODE_ID, PATH_START_NODE_ID, -}; +use crate::models::node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID}; use crate::models::path::Path; use crate::models::sequence::Sequence; use crate::models::strand::Strand; @@ -61,104 +59,6 @@ pub fn setup_gen_dir() { } pub fn setup_block_group(conn: &Connection) -> (i32, Path) { - let a_seq = Sequence::new() - .sequence_type("DNA") - .sequence("AAAAAAAAAA") - .save(conn); - let t_seq = Sequence::new() - .sequence_type("DNA") - .sequence("TTTTTTTTTT") - .save(conn); - let c_seq = Sequence::new() - .sequence_type("DNA") - .sequence("CCCCCCCCCC") - .save(conn); - let g_seq = Sequence::new() - .sequence_type("DNA") - .sequence("GGGGGGGGGG") - .save(conn); - let _collection = Collection::create(conn, "test"); - let block_group = BlockGroup::create(conn, "test", None, "hg19"); - let edge0 = Edge::create( - conn, - Sequence::PATH_START_HASH.to_string(), - BOGUS_SOURCE_NODE_ID, - 0, - Strand::Forward, - a_seq.hash.clone(), - BOGUS_TARGET_NODE_ID, - 0, - Strand::Forward, - 0, - 0, - ); - let edge1 = Edge::create( - conn, - a_seq.hash, - BOGUS_SOURCE_NODE_ID, - 10, - Strand::Forward, - t_seq.hash.clone(), - BOGUS_TARGET_NODE_ID, - 0, - Strand::Forward, - 0, - 0, - ); - let edge2 = Edge::create( - conn, - t_seq.hash, - BOGUS_SOURCE_NODE_ID, - 10, - Strand::Forward, - c_seq.hash.clone(), - BOGUS_TARGET_NODE_ID, - 0, - Strand::Forward, - 0, - 0, - ); - let edge3 = Edge::create( - conn, - c_seq.hash, - BOGUS_SOURCE_NODE_ID, - 10, - Strand::Forward, - g_seq.hash.clone(), - BOGUS_TARGET_NODE_ID, - 0, - Strand::Forward, - 0, - 0, - ); - let edge4 = Edge::create( - conn, - g_seq.hash, - BOGUS_SOURCE_NODE_ID, - 10, - Strand::Forward, - Sequence::PATH_END_HASH.to_string(), - BOGUS_TARGET_NODE_ID, - 0, - Strand::Forward, - 0, - 0, - ); - BlockGroupEdge::bulk_create( - conn, - block_group.id, - &[edge0.id, edge1.id, edge2.id, edge3.id, edge4.id], - ); - let path = Path::create( - conn, - "chr1", - block_group.id, - &[edge0.id, edge1.id, edge2.id, edge3.id, edge4.id], - ); - (block_group.id, path) -} - -pub fn setup_block_group_new(conn: &Connection) -> (i32, Path) { let a_seq = Sequence::new() .sequence_type("DNA") .sequence("AAAAAAAAAA") From ce3ae1bae8378e36e2e6f8324537471cffadfad9 Mon Sep 17 00:00:00 2001 From: hofer Date: Wed, 25 Sep 2024 19:51:19 -0400 Subject: [PATCH 11/16] Fix all tests, further cleanup of old unused fields --- migrations/core/01-initial/up.sql | 6 +- src/exports/gfa.rs | 10 --- src/imports/fasta.rs | 4 - src/imports/gfa.rs | 33 -------- src/models/block_group.rs | 17 +--- src/models/edge.rs | 126 ++++++++---------------------- src/models/path.rs | 30 ------- src/operation_management.rs | 34 ++++---- src/test_helpers.rs | 10 --- 9 files changed, 51 insertions(+), 219 deletions(-) diff --git a/migrations/core/01-initial/up.sql b/migrations/core/01-initial/up.sql index 84320d0..d0a8a42 100644 --- a/migrations/core/01-initial/up.sql +++ b/migrations/core/01-initial/up.sql @@ -69,19 +69,19 @@ CREATE TABLE operation_summary ( CREATE TABLE edges ( id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, - source_hash TEXT NOT NULL, source_node_id INTEGER, source_coordinate INTEGER NOT NULL, source_strand TEXT NOT NULL, - target_hash TEXT NOT NULL, target_node_id INTEGER, target_coordinate INTEGER NOT NULL, target_strand TEXT NOT NULL, chromosome_index INTEGER NOT NULL, phased INTEGER NOT NULL, + FOREIGN KEY(source_node_id) REFERENCES nodes(id), + FOREIGN KEY(target_node_id) REFERENCES nodes(id), constraint chk_phased check (phased in (0, 1)) ) STRICT; --- CREATE UNIQUE INDEX edge_uidx ON edges(source_node_id, source_coordinate, source_strand, target_node_id, target_coordinate, target_strand, chromosome_index, phased); +CREATE UNIQUE INDEX edge_uidx ON edges(source_node_id, source_coordinate, source_strand, target_node_id, target_coordinate, target_strand, chromosome_index, phased); CREATE TABLE path_edges ( id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, diff --git a/src/exports/gfa.rs b/src/exports/gfa.rs index 0839bce..457cbf9 100644 --- a/src/exports/gfa.rs +++ b/src/exports/gfa.rs @@ -241,11 +241,9 @@ mod tests { let edge1 = Edge::create( &conn, - Sequence::PATH_START_HASH.to_string(), PATH_START_NODE_ID, 0, Strand::Forward, - sequence1.hash.clone(), node1.id, 0, Strand::Forward, @@ -254,11 +252,9 @@ mod tests { ); let edge2 = Edge::create( &conn, - sequence1.hash, node1.id, 4, Strand::Forward, - sequence2.hash.clone(), node2.id, 0, Strand::Forward, @@ -267,11 +263,9 @@ mod tests { ); let edge3 = Edge::create( &conn, - sequence2.hash, node2.id, 4, Strand::Forward, - sequence3.hash.clone(), node3.id, 0, Strand::Forward, @@ -280,11 +274,9 @@ mod tests { ); let edge4 = Edge::create( &conn, - sequence3.hash, node3.id, 4, Strand::Forward, - sequence4.hash.clone(), node4.id, 0, Strand::Forward, @@ -293,11 +285,9 @@ mod tests { ); let edge5 = Edge::create( &conn, - sequence4.hash, node4.id, 4, Strand::Forward, - Sequence::PATH_END_HASH.to_string(), PATH_END_NODE_ID, 0, Strand::Forward, diff --git a/src/imports/fasta.rs b/src/imports/fasta.rs index aafb33d..821759d 100644 --- a/src/imports/fasta.rs +++ b/src/imports/fasta.rs @@ -68,11 +68,9 @@ pub fn import_fasta( let block_group = BlockGroup::create(conn, &collection.name, None, &name); let edge_into = Edge::create( conn, - Sequence::PATH_START_HASH.to_string(), PATH_START_NODE_ID, 0, Strand::Forward, - seq.hash.to_string(), node.id, 0, Strand::Forward, @@ -81,11 +79,9 @@ pub fn import_fasta( ); let edge_out_of = Edge::create( conn, - seq.hash.to_string(), node.id, sequence_length, Strand::Forward, - Sequence::PATH_END_HASH.to_string(), PATH_END_NODE_ID, 0, Strand::Forward, diff --git a/src/imports/gfa.rs b/src/imports/gfa.rs index 0c39723..101c252 100644 --- a/src/imports/gfa.rs +++ b/src/imports/gfa.rs @@ -43,22 +43,18 @@ pub fn import_gfa(gfa_path: &FilePath, collection_name: &str, conn: &Connection) let mut edges = HashSet::new(); for link in &gfa.links { let source = sequences_by_segment_id.get(&link.from).unwrap(); - let target = sequences_by_segment_id.get(&link.to).unwrap(); let source_node_id = *node_ids_by_segment_id.get(&link.from).unwrap(); let target_node_id = *node_ids_by_segment_id.get(&link.to).unwrap(); edges.insert(edge_data_from_fields( - &source.hash, source_node_id, source.length, bool_to_strand(link.from_dir), - &target.hash, target_node_id, bool_to_strand(link.to_dir), )); } for input_path in &gfa.paths { - let mut source_hash = Sequence::PATH_START_HASH; let mut source_node_id = PATH_START_NODE_ID; let mut source_coordinate = 0; let mut source_strand = Strand::Forward; @@ -67,32 +63,26 @@ pub fn import_gfa(gfa_path: &FilePath, collection_name: &str, conn: &Connection) let target_node_id = *node_ids_by_segment_id.get(segment_id).unwrap(); let target_strand = bool_to_strand(input_path.dir[index]); edges.insert(edge_data_from_fields( - source_hash, source_node_id, source_coordinate, source_strand, - &target.hash, target_node_id, target_strand, )); - source_hash = &target.hash; source_node_id = target_node_id; source_coordinate = target.length; source_strand = target_strand; } edges.insert(edge_data_from_fields( - source_hash, source_node_id, source_coordinate, source_strand, - Sequence::PATH_END_HASH, PATH_END_NODE_ID, Strand::Forward, )); } for input_walk in &gfa.walk { - let mut source_hash = Sequence::PATH_START_HASH; let mut source_node_id = PATH_START_NODE_ID; let mut source_coordinate = 0; let mut source_strand = Strand::Forward; @@ -101,25 +91,20 @@ pub fn import_gfa(gfa_path: &FilePath, collection_name: &str, conn: &Connection) let target_node_id = *node_ids_by_segment_id.get(segment_id).unwrap(); let target_strand = bool_to_strand(input_walk.walk_dir[index]); edges.insert(edge_data_from_fields( - source_hash, source_node_id, source_coordinate, source_strand, - &target.hash, target_node_id, target_strand, )); - source_hash = &target.hash; source_node_id = target_node_id; source_coordinate = target.length; source_strand = target_strand; } edges.insert(edge_data_from_fields( - source_hash, source_node_id, source_coordinate, source_strand, - Sequence::PATH_END_HASH, PATH_END_NODE_ID, Strand::Forward, )); @@ -132,11 +117,9 @@ pub fn import_gfa(gfa_path: &FilePath, collection_name: &str, conn: &Connection) let mut edge_ids_by_data = HashMap::new(); for edge in saved_edges { let key = edge_data_from_fields( - &edge.source_hash, edge.source_node_id, edge.source_coordinate, edge.source_strand, - &edge.target_hash, edge.target_node_id, edge.target_strand, ); @@ -145,7 +128,6 @@ pub fn import_gfa(gfa_path: &FilePath, collection_name: &str, conn: &Connection) for input_path in &gfa.paths { let path_name = &input_path.name; - let mut source_hash = Sequence::PATH_START_HASH; let mut source_node_id = PATH_START_NODE_ID; let mut source_coordinate = 0; let mut source_strand = Strand::Forward; @@ -155,27 +137,22 @@ pub fn import_gfa(gfa_path: &FilePath, collection_name: &str, conn: &Connection) let target_node_id = *node_ids_by_segment_id.get(segment_id).unwrap(); let target_strand = bool_to_strand(input_path.dir[index]); let key = edge_data_from_fields( - source_hash, source_node_id, source_coordinate, source_strand, - &target.hash, target_node_id, target_strand, ); let edge_id = *edge_ids_by_data.get(&key).unwrap(); path_edge_ids.push(edge_id); - source_hash = &target.hash; source_node_id = target_node_id; source_coordinate = target.length; source_strand = target_strand; } let key = edge_data_from_fields( - source_hash, source_node_id, source_coordinate, source_strand, - Sequence::PATH_END_HASH, PATH_END_NODE_ID, Strand::Forward, ); @@ -186,7 +163,6 @@ pub fn import_gfa(gfa_path: &FilePath, collection_name: &str, conn: &Connection) for input_walk in &gfa.walk { let path_name = &input_walk.sample_id; - let mut source_hash = Sequence::PATH_START_HASH; let mut source_node_id = PATH_START_NODE_ID; let mut source_coordinate = 0; let mut source_strand = Strand::Forward; @@ -196,27 +172,22 @@ pub fn import_gfa(gfa_path: &FilePath, collection_name: &str, conn: &Connection) let target_node_id = *node_ids_by_segment_id.get(segment_id).unwrap(); let target_strand = bool_to_strand(input_walk.walk_dir[index]); let key = edge_data_from_fields( - source_hash, source_node_id, source_coordinate, source_strand, - &target.hash, target_node_id, target_strand, ); let edge_id = *edge_ids_by_data.get(&key).unwrap(); path_edge_ids.push(edge_id); - source_hash = &target.hash; source_node_id = target_node_id; source_coordinate = target.length; source_strand = target_strand; } let key = edge_data_from_fields( - source_hash, source_node_id, source_coordinate, source_strand, - Sequence::PATH_END_HASH, PATH_END_NODE_ID, Strand::Forward, ); @@ -227,20 +198,16 @@ pub fn import_gfa(gfa_path: &FilePath, collection_name: &str, conn: &Connection) } fn edge_data_from_fields( - source_hash: &str, source_node_id: i32, source_coordinate: i32, source_strand: Strand, - target_hash: &str, target_node_id: i32, target_strand: Strand, ) -> EdgeData { EdgeData { - source_hash: source_hash.to_string(), source_node_id, source_coordinate, source_strand, - target_hash: target_hash.to_string(), target_node_id, target_coordinate: 0, target_strand, diff --git a/src/models/block_group.rs b/src/models/block_group.rs index 12d997d..dc6996a 100644 --- a/src/models/block_group.rs +++ b/src/models/block_group.rs @@ -8,10 +8,9 @@ use serde::{Deserialize, Serialize}; use crate::graph::all_simple_paths; use crate::models::block_group_edge::BlockGroupEdge; use crate::models::edge::{Edge, EdgeData, GroupBlock}; -use crate::models::node::PATH_START_NODE_ID; +use crate::models::node::{PATH_END_NODE_ID, PATH_START_NODE_ID}; use crate::models::path::{Path, PathBlock, PathData}; use crate::models::path_edge::PathEdge; -use crate::models::sequence::Sequence; use crate::models::strand::Strand; #[derive(Debug, Deserialize, Serialize)] @@ -283,9 +282,7 @@ impl BlockGroup { // TODO: maybe make all_simple_paths return a single path id where start == end if start_node == *end_node { let block = blocks_by_id.get(&start_node).unwrap(); - if block.sequence_hash != Sequence::PATH_START_HASH - && block.sequence_hash != Sequence::PATH_END_HASH - { + if block.node_id != PATH_START_NODE_ID && block.node_id != PATH_END_NODE_ID { sequences.insert(block.sequence.clone()); } } else { @@ -365,12 +362,10 @@ impl BlockGroup { if change.block.sequence_start == change.block.sequence_end { // Deletion let new_edge = EdgeData { - source_hash: "".to_string(), source_node_id: start_block.node_id, source_coordinate: change.start - start_block.path_start + start_block.sequence_start, source_strand: Strand::Forward, - target_hash: "".to_string(), target_node_id: end_block.node_id, target_coordinate: change.end - end_block.path_start + end_block.sequence_start, target_strand: Strand::Forward, @@ -384,11 +379,9 @@ impl BlockGroup { // another start point in the block group DAG. if change.start == 0 { let new_beginning_edge = EdgeData { - source_hash: "".to_string(), source_node_id: PATH_START_NODE_ID, source_coordinate: 0, source_strand: Strand::Forward, - target_hash: "".to_string(), target_node_id: end_block.node_id, target_coordinate: change.end - end_block.path_start + end_block.sequence_start, target_strand: Strand::Forward, @@ -403,12 +396,10 @@ impl BlockGroup { } else { // Insertion/replacement let new_start_edge = EdgeData { - source_hash: "".to_string(), source_node_id: start_block.node_id, source_coordinate: change.start - start_block.path_start + start_block.sequence_start, source_strand: Strand::Forward, - target_hash: "".to_string(), target_node_id: change.block.node_id, target_coordinate: change.block.sequence_start, target_strand: Strand::Forward, @@ -416,11 +407,9 @@ impl BlockGroup { phased: change.phased, }; let new_end_edge = EdgeData { - source_hash: "".to_string(), source_node_id: change.block.node_id, source_coordinate: change.block.sequence_end, source_strand: Strand::Forward, - target_hash: "".to_string(), target_node_id: end_block.node_id, target_coordinate: change.end - end_block.path_start + end_block.sequence_start, target_strand: Strand::Forward, @@ -438,7 +427,7 @@ impl BlockGroup { #[cfg(test)] mod tests { use super::*; - use crate::models::{collection::Collection, node::Node, sample::Sample}; + use crate::models::{collection::Collection, node::Node, sample::Sample, sequence::Sequence}; use crate::test_helpers::{get_connection, setup_block_group}; #[test] diff --git a/src/models/edge.rs b/src/models/edge.rs index f70ad9a..3baacfe 100644 --- a/src/models/edge.rs +++ b/src/models/edge.rs @@ -8,16 +8,14 @@ use rusqlite::{params_from_iter, Connection, Result as SQLResult, Row}; use serde::{Deserialize, Serialize}; use crate::models::node::{Node, PATH_END_NODE_ID, PATH_START_NODE_ID}; -use crate::models::{sequence::Sequence, strand::Strand}; +use crate::models::strand::Strand; #[derive(Clone, Debug, Eq, Hash, PartialEq, Deserialize, Serialize)] pub struct Edge { pub id: i32, - pub source_hash: String, pub source_node_id: i32, pub source_coordinate: i32, pub source_strand: Strand, - pub target_hash: String, pub target_node_id: i32, pub target_coordinate: i32, pub target_strand: Strand, @@ -27,11 +25,9 @@ pub struct Edge { #[derive(Clone, Debug, Eq, Hash, PartialEq)] pub struct EdgeData { - pub source_hash: String, pub source_node_id: i32, pub source_coordinate: i32, pub source_strand: Strand, - pub target_hash: String, pub target_node_id: i32, pub target_coordinate: i32, pub target_strand: Strand, @@ -42,11 +38,9 @@ pub struct EdgeData { impl From<&Edge> for EdgeData { fn from(item: &Edge) -> Self { EdgeData { - source_hash: item.source_hash.clone(), source_node_id: item.source_node_id, source_coordinate: item.source_coordinate, source_strand: item.source_strand, - target_hash: item.target_hash.clone(), target_node_id: item.target_node_id, target_coordinate: item.target_coordinate, target_strand: item.target_strand, @@ -58,12 +52,6 @@ impl From<&Edge> for EdgeData { #[derive(Eq, Hash, PartialEq)] pub struct BlockKey { - pub sequence_hash: String, - pub coordinate: i32, -} - -#[derive(Eq, Hash, PartialEq)] -pub struct BlockKeyNew { pub node_id: i32, pub coordinate: i32, } @@ -71,7 +59,6 @@ pub struct BlockKeyNew { #[derive(Clone, Debug)] pub struct GroupBlock { pub id: i32, - pub sequence_hash: String, pub node_id: i32, pub sequence: String, pub start: i32, @@ -82,25 +69,21 @@ impl Edge { #[allow(clippy::too_many_arguments)] pub fn create( conn: &Connection, - source_hash: String, source_node_id: i32, source_coordinate: i32, source_strand: Strand, - target_hash: String, target_node_id: i32, target_coordinate: i32, target_strand: Strand, chromosome_index: i32, phased: i32, ) -> Edge { - let query = "INSERT INTO edges (source_hash, source_node_id, source_coordinate, source_strand, target_hash, target_node_id, target_coordinate, target_strand, chromosome_index, phased) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10) RETURNING *"; - let id_query = "select id from edges where source_hash = ?1 and source_node_id = ?2 and source_coordinate = ?3 and source_strand = ?4 and target_hash = ?5 and target_node_id = ?6 and target_coordinate = ?7 and target_strand = ?8 and chromosome_index = ?9 and phased = ?10"; + let query = "INSERT INTO edges (source_node_id, source_coordinate, source_strand, target_node_id, target_coordinate, target_strand, chromosome_index, phased) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8) RETURNING *"; + let id_query = "select id from edges where and source_node_id = ?1 and source_coordinate = ?2 and source_strand = ?3 and target_node_id = ?4 and target_coordinate = ?5 and target_strand = ?6 and chromosome_index = ?7 and phased = ?8"; let placeholders: Vec = vec![ - source_hash.clone().into(), source_node_id.into(), source_coordinate.into(), source_strand.into(), - target_hash.clone().into(), target_node_id.into(), target_coordinate.into(), target_strand.into(), @@ -112,16 +95,14 @@ impl Edge { match stmt.query_row(params_from_iter(&placeholders), |row| { Ok(Edge { id: row.get(0)?, - source_hash: row.get(1)?, - source_node_id: row.get(2)?, - source_coordinate: row.get(3)?, - source_strand: row.get(4)?, - target_hash: row.get(5)?, - target_node_id: row.get(6)?, - target_coordinate: row.get(7)?, - target_strand: row.get(8)?, - chromosome_index: row.get(9)?, - phased: row.get(10)?, + source_node_id: row.get(1)?, + source_coordinate: row.get(2)?, + source_strand: row.get(3)?, + target_node_id: row.get(4)?, + target_coordinate: row.get(5)?, + target_strand: row.get(6)?, + chromosome_index: row.get(7)?, + phased: row.get(8)?, }) }) { Ok(edge) => edge, @@ -132,11 +113,9 @@ impl Edge { id: conn .query_row(id_query, params_from_iter(&placeholders), |row| row.get(0)) .unwrap(), - source_hash, source_node_id, source_coordinate, source_strand, - target_hash, target_node_id, target_coordinate, target_strand, @@ -156,16 +135,14 @@ impl Edge { fn edge_from_row(row: &Row) -> SQLResult { Ok(Edge { id: row.get(0)?, - source_hash: row.get(1)?, - source_node_id: row.get(2)?, - source_coordinate: row.get(3)?, - source_strand: row.get(4)?, - target_hash: row.get(5)?, - target_node_id: row.get(6)?, - target_coordinate: row.get(7)?, - target_strand: row.get(8)?, - chromosome_index: row.get(9)?, - phased: row.get(10)?, + source_node_id: row.get(1)?, + source_coordinate: row.get(2)?, + source_strand: row.get(3)?, + target_node_id: row.get(4)?, + target_coordinate: row.get(5)?, + target_strand: row.get(6)?, + chromosome_index: row.get(7)?, + phased: row.get(8)?, }) } @@ -175,7 +152,7 @@ impl Edge { .map(|edge_id| edge_id.to_string()) .collect::>() .join(","); - let query = format!("select id, source_hash, source_node_id, source_coordinate, source_strand, target_hash, target_node_id, target_coordinate, target_strand, chromosome_index, phased from edges where id in ({});", formatted_edge_ids); + let query = format!("select id, source_node_id, source_coordinate, source_strand, target_node_id, target_coordinate, target_strand, chromosome_index, phased from edges where id in ({});", formatted_edge_ids); Edge::query(conn, &query, vec![]) } @@ -195,17 +172,13 @@ impl Edge { let mut edge_rows = vec![]; let mut edge_map: HashMap = HashMap::new(); for edge in &edges { - let source_hash = format!("\"{0}\"", edge.source_hash); let source_strand = format!("\"{0}\"", edge.source_strand); - let target_hash = format!("\"{0}\"", edge.target_hash); let target_strand = format!("\"{0}\"", edge.target_strand); let edge_row = format!( - "({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9})", - source_hash, + "({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7})", edge.source_node_id, edge.source_coordinate, source_strand, - target_hash, edge.target_node_id, edge.target_coordinate, target_strand, @@ -216,7 +189,7 @@ impl Edge { } let formatted_edge_rows = edge_rows.join(", "); - let select_statement = format!("SELECT * FROM edges WHERE (source_hash, source_node_id, source_coordinate, source_strand, target_hash, target_node_id, target_coordinate, target_strand, chromosome_index, phased) in ({0});", formatted_edge_rows); + let select_statement = format!("SELECT * FROM edges WHERE (source_node_id, source_coordinate, source_strand, target_node_id, target_coordinate, target_strand, chromosome_index, phased) in ({0});", formatted_edge_rows); let existing_edges = Edge::query(conn, &select_statement, vec![]); for edge in existing_edges.iter() { edge_map.insert(EdgeData::from(edge), edge.id); @@ -234,17 +207,13 @@ impl Edge { let mut edge_rows_to_insert = vec![]; for edge in edges_to_insert { - let source_hash = format!("\"{0}\"", edge.source_hash); - let target_hash = format!("\"{0}\"", edge.target_hash); let source_strand = format!("\"{0}\"", edge.source_strand); let target_strand = format!("\"{0}\"", edge.target_strand); let edge_row = format!( - "({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9})", - source_hash, + "({0}, {1}, {2}, {3}, {4}, {5}, {6}, {7})", edge.source_node_id, edge.source_coordinate, source_strand, - target_hash, edge.target_node_id, edge.target_coordinate, target_strand, @@ -258,7 +227,7 @@ impl Edge { for chunk in edge_rows_to_insert.chunks(100000) { let formatted_edge_rows_to_insert = chunk.join(", "); - let insert_statement = format!("INSERT INTO edges (source_hash, source_node_id, source_coordinate, source_strand, target_hash, target_node_id, target_coordinate, target_strand, chromosome_index, phased) VALUES {0} RETURNING *;", formatted_edge_rows_to_insert); + let insert_statement = format!("INSERT INTO edges (source_node_id, source_coordinate, source_strand, target_node_id, target_coordinate, target_strand, chromosome_index, phased) VALUES {0} RETURNING *;", formatted_edge_rows_to_insert); let mut stmt = conn.prepare(&insert_statement).unwrap(); let rows = stmt.query_map([], Edge::edge_from_row).unwrap(); for row in rows { @@ -275,11 +244,9 @@ impl Edge { pub fn to_data(edge: Edge) -> EdgeData { EdgeData { - source_hash: edge.source_hash, source_node_id: edge.source_node_id, source_coordinate: edge.source_coordinate, source_strand: edge.source_strand, - target_hash: edge.target_hash, target_node_id: edge.target_node_id, target_coordinate: edge.target_coordinate, target_strand: edge.target_strand, @@ -347,7 +314,6 @@ impl Edge { let mut block_index = 0; let mut boundary_edges = vec![]; for (node_id, sequence) in sequences_by_node_id.into_iter() { - let hash = sequence.hash.clone(); let block_boundaries = Edge::get_block_boundaries( edges_by_source_node_id.get(&node_id), edges_by_target_node_id.get(&node_id), @@ -358,11 +324,9 @@ impl Edge { // for the data we need to set up boundary edges in the block group graph boundary_edges.push(Edge { id: -1, - source_hash: hash.clone(), source_node_id: node_id, source_coordinate: *block_boundary, source_strand: Strand::Unknown, - target_hash: hash.clone(), target_node_id: node_id, target_coordinate: *block_boundary, target_strand: Strand::Unknown, @@ -377,7 +341,6 @@ impl Edge { let block_sequence = sequence.get_sequence(start, end).to_string(); let first_block = GroupBlock { id: block_index, - sequence_hash: hash.clone(), node_id, sequence: block_sequence, start, @@ -389,7 +352,6 @@ impl Edge { let block_sequence = sequence.get_sequence(start, end).to_string(); let block = GroupBlock { id: block_index, - sequence_hash: hash.clone(), node_id, sequence: block_sequence, start, @@ -403,7 +365,6 @@ impl Edge { let block_sequence = sequence.get_sequence(start, end).to_string(); let last_block = GroupBlock { id: block_index, - sequence_hash: hash.clone(), node_id, sequence: block_sequence, start, @@ -414,7 +375,6 @@ impl Edge { } else { blocks.push(GroupBlock { id: block_index, - sequence_hash: hash.clone(), node_id, sequence: sequence.get_sequence(None, None), start: 0, @@ -428,20 +388,16 @@ impl Edge { // block group, since different paths in the block group may start or end at different // places on sequences. These two "start sequence" and "end sequence" blocks will serve // that role. - let start_sequence = Sequence::sequence_from_hash(conn, Sequence::PATH_START_HASH).unwrap(); let start_block = GroupBlock { id: block_index + 1, - sequence_hash: start_sequence.hash.clone(), node_id: PATH_START_NODE_ID, sequence: "".to_string(), start: 0, end: 0, }; blocks.push(start_block); - let end_sequence = Sequence::sequence_from_hash(conn, Sequence::PATH_END_HASH).unwrap(); let end_block = GroupBlock { id: block_index + 2, - sequence_hash: end_sequence.hash.clone(), node_id: PATH_END_NODE_ID, sequence: "".to_string(), start: 0, @@ -460,27 +416,27 @@ impl Edge { .into_iter() .map(|block| { ( - BlockKeyNew { + BlockKey { node_id: block.node_id, coordinate: block.start, }, block.id, ) }) - .collect::>(); + .collect::>(); let blocks_by_end = blocks .clone() .into_iter() .map(|block| { ( - BlockKeyNew { + BlockKey { node_id: block.node_id, coordinate: block.end, }, block.id, ) }) - .collect::>(); + .collect::>(); let mut graph: DiGraphMap = DiGraphMap::new(); let mut edges_by_node_pair = HashMap::new(); @@ -488,12 +444,12 @@ impl Edge { graph.add_node(block.id); } for edge in edges { - let source_key = BlockKeyNew { + let source_key = BlockKey { node_id: edge.source_node_id, coordinate: edge.source_coordinate, }; let source_id = blocks_by_end.get(&source_key); - let target_key = BlockKeyNew { + let target_key = BlockKey { node_id: edge.target_node_id, coordinate: edge.target_coordinate, }; @@ -515,7 +471,7 @@ impl Edge { mod tests { // Note this useful idiom: importing names from outer (for mod tests) scope. use super::*; - use crate::models::collection::Collection; + use crate::models::{collection::Collection, sequence::Sequence}; use crate::test_helpers::get_connection; #[test] @@ -528,11 +484,9 @@ mod tests { .save(conn); let node1 = Node::create(conn, sequence1.hash.as_str()); let edge1 = EdgeData { - source_hash: "".to_string(), source_node_id: PATH_START_NODE_ID, source_coordinate: -1, source_strand: Strand::Forward, - target_hash: "".to_string(), target_node_id: node1.id, target_coordinate: 1, target_strand: Strand::Forward, @@ -545,11 +499,9 @@ mod tests { .save(conn); let node2 = Node::create(conn, sequence2.hash.as_str()); let edge2 = EdgeData { - source_hash: "".to_string(), source_node_id: node1.id, source_coordinate: 2, source_strand: Strand::Forward, - target_hash: "".to_string(), target_node_id: node2.id, target_coordinate: 3, target_strand: Strand::Forward, @@ -557,11 +509,9 @@ mod tests { phased: 0, }; let edge3 = EdgeData { - source_hash: "".to_string(), source_node_id: node2.id, source_coordinate: 4, source_strand: Strand::Forward, - target_hash: "".to_string(), target_node_id: PATH_END_NODE_ID, target_coordinate: -1, target_strand: Strand::Forward, @@ -603,11 +553,9 @@ mod tests { .save(conn); let node1 = Node::create(conn, sequence1.hash.as_str()); let edge1 = EdgeData { - source_hash: "".to_string(), source_node_id: PATH_START_NODE_ID, source_coordinate: -1, source_strand: Strand::Forward, - target_hash: "".to_string(), target_node_id: node1.id, target_coordinate: 1, target_strand: Strand::Forward, @@ -620,11 +568,9 @@ mod tests { .save(conn); let node2 = Node::create(conn, sequence2.hash.as_str()); let edge2 = EdgeData { - source_hash: "".to_string(), source_node_id: node1.id, source_coordinate: 2, source_strand: Strand::Forward, - target_hash: "".to_string(), target_node_id: node2.id, target_coordinate: 3, target_strand: Strand::Forward, @@ -632,11 +578,9 @@ mod tests { phased: 0, }; let edge3 = EdgeData { - source_hash: "".to_string(), source_node_id: node2.id, source_coordinate: 4, source_strand: Strand::Forward, - target_hash: "".to_string(), target_node_id: PATH_END_NODE_ID, target_coordinate: -1, target_strand: Strand::Forward, @@ -686,11 +630,9 @@ mod tests { // NOTE: Create one edge ahead of time to confirm an existing row ID gets returned in the bulk create let existing_edge = Edge::create( conn, - "".to_string(), PATH_START_NODE_ID, -1, Strand::Forward, - "".to_string(), node1.id, 1, Strand::Forward, @@ -703,11 +645,9 @@ mod tests { assert_eq!(existing_edge.target_coordinate, 1); let edge1 = EdgeData { - source_hash: "".to_string(), source_coordinate: -1, source_node_id: PATH_START_NODE_ID, source_strand: Strand::Forward, - target_hash: "".to_string(), target_node_id: node1.id, target_coordinate: 1, target_strand: Strand::Forward, @@ -720,11 +660,9 @@ mod tests { .save(conn); let node2 = Node::create(conn, sequence2.hash.as_str()); let edge2 = EdgeData { - source_hash: "".to_string(), source_node_id: node1.id, source_coordinate: 2, source_strand: Strand::Forward, - target_hash: "".to_string(), target_node_id: node2.id, target_coordinate: 3, target_strand: Strand::Forward, @@ -732,11 +670,9 @@ mod tests { phased: 0, }; let edge3 = EdgeData { - source_hash: "".to_string(), source_node_id: node2.id, source_coordinate: 4, source_strand: Strand::Forward, - target_hash: "".to_string(), target_node_id: PATH_END_NODE_ID, target_coordinate: -1, target_strand: Strand::Forward, diff --git a/src/models/path.rs b/src/models/path.rs index 5def5d6..a302d4d 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -327,11 +327,9 @@ mod tests { let node1 = Node::create(conn, sequence1.hash.as_str()); let edge1 = Edge::create( conn, - "".to_string(), PATH_START_NODE_ID, -123, Strand::Forward, - "".to_string(), node1.id, 0, Strand::Forward, @@ -345,11 +343,9 @@ mod tests { let node2 = Node::create(conn, sequence2.hash.as_str()); let edge2 = Edge::create( conn, - "".to_string(), node1.id, 8, Strand::Forward, - "".to_string(), node2.id, 1, Strand::Forward, @@ -363,11 +359,9 @@ mod tests { let node3 = Node::create(conn, sequence3.hash.as_str()); let edge3 = Edge::create( conn, - "".to_string(), node2.id, 8, Strand::Forward, - "".to_string(), node3.id, 1, Strand::Forward, @@ -381,11 +375,9 @@ mod tests { let node4 = Node::create(conn, sequence4.hash.as_str()); let edge4 = Edge::create( conn, - "".to_string(), node3.id, 8, Strand::Forward, - "".to_string(), node4.id, 1, Strand::Forward, @@ -394,11 +386,9 @@ mod tests { ); let edge5 = Edge::create( conn, - "".to_string(), node4.id, 8, Strand::Forward, - "".to_string(), PATH_END_NODE_ID, -1, Strand::Forward, @@ -427,11 +417,9 @@ mod tests { let node1 = Node::create(conn, sequence1.hash.as_str()); let edge5 = Edge::create( conn, - "".to_string(), node1.id, 8, Strand::Reverse, - "".to_string(), PATH_END_NODE_ID, 0, Strand::Reverse, @@ -445,11 +433,9 @@ mod tests { let node2 = Node::create(conn, sequence2.hash.as_str()); let edge4 = Edge::create( conn, - "".to_string(), node2.id, 7, Strand::Reverse, - "".to_string(), node1.id, 0, Strand::Reverse, @@ -463,11 +449,9 @@ mod tests { let node3 = Node::create(conn, sequence3.hash.as_str()); let edge3 = Edge::create( conn, - "".to_string(), node3.id, 7, Strand::Reverse, - "".to_string(), node2.id, 0, Strand::Reverse, @@ -481,11 +465,9 @@ mod tests { let node4 = Node::create(conn, sequence4.hash.as_str()); let edge2 = Edge::create( conn, - "".to_string(), node4.id, 7, Strand::Reverse, - "".to_string(), node3.id, 0, Strand::Reverse, @@ -494,11 +476,9 @@ mod tests { ); let edge1 = Edge::create( conn, - "".to_string(), PATH_START_NODE_ID, -1, Strand::Reverse, - "".to_string(), node4.id, 0, Strand::Reverse, @@ -534,11 +514,9 @@ mod tests { let node1 = Node::create(conn, sequence1.hash.as_str()); let edge1 = Edge::create( conn, - "".to_string(), PATH_START_NODE_ID, -1, Strand::Forward, - "".to_string(), node1.id, 0, Strand::Forward, @@ -552,11 +530,9 @@ mod tests { let node2 = Node::create(conn, sequence2.hash.as_str()); let edge2 = Edge::create( conn, - "".to_string(), node1.id, 8, Strand::Forward, - "".to_string(), node2.id, 1, Strand::Forward, @@ -570,11 +546,9 @@ mod tests { let node3 = Node::create(conn, sequence3.hash.as_str()); let edge3 = Edge::create( conn, - "".to_string(), node2.id, 8, Strand::Forward, - "".to_string(), node3.id, 1, Strand::Forward, @@ -588,11 +562,9 @@ mod tests { let node4 = Node::create(conn, sequence4.hash.as_str()); let edge4 = Edge::create( conn, - "".to_string(), node3.id, 8, Strand::Forward, - "".to_string(), node4.id, 1, Strand::Forward, @@ -601,11 +573,9 @@ mod tests { ); let edge5 = Edge::create( conn, - "".to_string(), node4.id, 8, Strand::Forward, - "".to_string(), PATH_END_NODE_ID, -1, Strand::Forward, diff --git a/src/operation_management.rs b/src/operation_management.rs index 1d3ad63..f186a8f 100644 --- a/src/operation_management.rs +++ b/src/operation_management.rs @@ -15,6 +15,7 @@ use crate::models::block_group::BlockGroup; use crate::models::block_group_edge::BlockGroupEdge; use crate::models::edge::{Edge, EdgeData}; use crate::models::file_types::FileTypes; +use crate::models::node::Node; use crate::models::operations::{ Branch, FileAddition, Operation, OperationState, OperationSummary, }; @@ -108,18 +109,15 @@ pub fn get_changeset_dependencies(conn: &Connection, changes: &[u8]) -> Vec } "edges" => { let edge_pk = item.new_value(pk_column).unwrap().as_i64().unwrap() as i32; - let source_hash = - str::from_utf8(item.new_value(1).unwrap().as_bytes().unwrap()) - .unwrap() - .to_string(); - let target_hash = - str::from_utf8(item.new_value(5).unwrap().as_bytes().unwrap()) - .unwrap() - .to_string(); + let source_node_id = item.new_value(1).unwrap().as_i64().unwrap() as i32; + let target_node_id = item.new_value(4).unwrap().as_i64().unwrap() as i32; created_edges.insert(edge_pk); + let nodes = Node::get_nodes(conn, vec![source_node_id, target_node_id]); + let source_hash = nodes[0].sequence_hash.clone(); if !created_sequences.contains(&source_hash) { previous_sequences.insert(source_hash); } + let target_hash = nodes[1].sequence_hash.clone(); if !created_sequences.contains(&target_hash) { previous_sequences.insert(target_hash); } @@ -331,18 +329,16 @@ pub fn apply_changeset(conn: &Connection, operation: &Operation) { edge_map.insert( edge_pk, EdgeData { - source_hash: item.new_value(1).unwrap().as_str().unwrap().to_string(), - source_node_id: item.new_value(2).unwrap().as_i64().unwrap() as i32, - source_coordinate: item.new_value(3).unwrap().as_i64().unwrap() as i32, - source_strand: Strand::column_result(item.new_value(4).unwrap()) + source_node_id: item.new_value(1).unwrap().as_i64().unwrap() as i32, + source_coordinate: item.new_value(2).unwrap().as_i64().unwrap() as i32, + source_strand: Strand::column_result(item.new_value(3).unwrap()) .unwrap(), - target_hash: item.new_value(5).unwrap().as_str().unwrap().to_string(), - target_node_id: item.new_value(6).unwrap().as_i64().unwrap() as i32, - target_coordinate: item.new_value(7).unwrap().as_i64().unwrap() as i32, - target_strand: Strand::column_result(item.new_value(8).unwrap()) + target_node_id: item.new_value(4).unwrap().as_i64().unwrap() as i32, + target_coordinate: item.new_value(5).unwrap().as_i64().unwrap() as i32, + target_strand: Strand::column_result(item.new_value(6).unwrap()) .unwrap(), - chromosome_index: item.new_value(9).unwrap().as_i64().unwrap() as i32, - phased: item.new_value(10).unwrap().as_i64().unwrap() as i32, + chromosome_index: item.new_value(7).unwrap().as_i64().unwrap() as i32, + phased: item.new_value(8).unwrap().as_i64().unwrap() as i32, }, ); } @@ -588,11 +584,9 @@ mod tests { let new_edge = Edge::create( conn, - "".to_string(), random_node.id, 0, Strand::Forward, - "".to_string(), existing_node.id, 0, Strand::Forward, diff --git a/src/test_helpers.rs b/src/test_helpers.rs index ab3936f..b749aef 100644 --- a/src/test_helpers.rs +++ b/src/test_helpers.rs @@ -83,11 +83,9 @@ pub fn setup_block_group(conn: &Connection) -> (i32, Path) { let block_group = BlockGroup::create(conn, "test", None, "hg19"); let edge0 = Edge::create( conn, - "".to_string(), PATH_START_NODE_ID, 0, Strand::Forward, - "".to_string(), a_node.id, 0, Strand::Forward, @@ -96,11 +94,9 @@ pub fn setup_block_group(conn: &Connection) -> (i32, Path) { ); let edge1 = Edge::create( conn, - "".to_string(), a_node.id, 10, Strand::Forward, - "".to_string(), t_node.id, 0, Strand::Forward, @@ -109,11 +105,9 @@ pub fn setup_block_group(conn: &Connection) -> (i32, Path) { ); let edge2 = Edge::create( conn, - "".to_string(), t_node.id, 10, Strand::Forward, - "".to_string(), c_node.id, 0, Strand::Forward, @@ -122,11 +116,9 @@ pub fn setup_block_group(conn: &Connection) -> (i32, Path) { ); let edge3 = Edge::create( conn, - "".to_string(), c_node.id, 10, Strand::Forward, - "".to_string(), g_node.id, 0, Strand::Forward, @@ -135,11 +127,9 @@ pub fn setup_block_group(conn: &Connection) -> (i32, Path) { ); let edge4 = Edge::create( conn, - "".to_string(), g_node.id, 10, Strand::Forward, - "".to_string(), PATH_END_NODE_ID, 0, Strand::Forward, From 59d9859cbfad443e84d605354a1869fbf0a46a28 Mon Sep 17 00:00:00 2001 From: hofer Date: Thu, 26 Sep 2024 11:09:49 -0400 Subject: [PATCH 12/16] More cleanup --- fixtures/aa.gfa | 2 +- migrations/core/01-initial/up.sql | 2 +- src/models/path.rs | 12 ------------ src/models/sequence.rs | 18 +----------------- src/operation_management.rs | 11 +++++++---- 5 files changed, 10 insertions(+), 35 deletions(-) diff --git a/fixtures/aa.gfa b/fixtures/aa.gfa index 071f3e2..2e83010 100644 --- a/fixtures/aa.gfa +++ b/fixtures/aa.gfa @@ -2,4 +2,4 @@ H VN:Z:1.2 S 1 A SN:Z:123 SO:i:0 SR:i:0 S 2 A SN:Z:123 SO:i:0 SR:i:0 L 1 + 2 + * -P 124 1+,2+ 4M +P 124 1+,2+ 0M diff --git a/migrations/core/01-initial/up.sql b/migrations/core/01-initial/up.sql index d0a8a42..bcd95a0 100644 --- a/migrations/core/01-initial/up.sql +++ b/migrations/core/01-initial/up.sql @@ -102,7 +102,6 @@ CREATE TABLE block_group_edges ( ) STRICT; CREATE UNIQUE INDEX block_group_edges_uidx ON block_group_edges(block_group_id, edge_id); -INSERT INTO sequence (hash, sequence_type, sequence, name, file_path, "length") values ("start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", "OTHER", "start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", "", "", 64), ("end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", "OTHER", "end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", "", "", 64); INSERT INTO gen_metadata (db_uuid) values (lower( hex(randomblob(4)) || '-' || hex(randomblob(2)) || '-' || '4' || substr(hex( randomblob(2)), 2) || '-' || @@ -110,6 +109,7 @@ INSERT INTO gen_metadata (db_uuid) values (lower( substr(hex(randomblob(2)), 2) || '-' || hex(randomblob(6)) )); +INSERT INTO sequence (hash, sequence_type, sequence, name, file_path, "length") values ("start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", "OTHER", "start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy", "", "", 64), ("end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", "OTHER", "end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", "", "", 64); INSERT INTO nodes (id, sequence_hash) values (1, "start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"); INSERT INTO nodes (id, sequence_hash) values (2, "end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"); UPDATE SQLITE_SEQUENCE SET seq = 2 WHERE name = 'nodes'; diff --git a/src/models/path.rs b/src/models/path.rs index a302d4d..3c877b2 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -60,18 +60,6 @@ pub fn revcomp(seq: &str) -> String { .unwrap() } -#[derive(Clone, Debug)] -pub struct NewBlock { - pub id: i32, - pub sequence: Sequence, - pub block_sequence: String, - pub sequence_start: i32, - pub sequence_end: i32, - pub path_start: i32, - pub path_end: i32, - pub strand: Strand, -} - #[derive(Clone, Debug)] pub struct PathBlock { pub id: i32, diff --git a/src/models/sequence.rs b/src/models/sequence.rs index 6b69770..8706a4d 100644 --- a/src/models/sequence.rs +++ b/src/models/sequence.rs @@ -189,10 +189,6 @@ impl<'a> NewSequence<'a> { } impl Sequence { - pub const PATH_START_HASH: &'static str = - "start-node-yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"; - pub const PATH_END_HASH: &'static str = - "end-node-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"; #[allow(clippy::new_ret_no_self)] pub fn new() -> NewSequence<'static> { NewSequence::new() @@ -265,10 +261,6 @@ impl Sequence { self.sequence[start as usize..end as usize].to_string() } - fn is_delimiter_hash(hash: &str) -> bool { - hash == Self::PATH_START_HASH || hash == Self::PATH_END_HASH - } - pub fn sequences(conn: &Connection, query: &str, placeholders: Vec) -> Vec { let mut stmt = conn.prepare_cached(query).unwrap(); let rows = stmt @@ -279,15 +271,7 @@ impl Sequence { external_sequence = true; } let hash: String = row.get(0).unwrap(); - // NOTE: "Delimiter" sequences are present to point to the actual start or end of a - // path or node in a block group. They are stored with a non-empty sequence in the - // database in order to satisfy foreign key constraints, so we must make them empty - // here. - let sequence: String = if Sequence::is_delimiter_hash(&hash) { - "".to_string() - } else { - row.get(2).unwrap() - }; + let sequence = row.get(2).unwrap(); Ok(Sequence { hash, sequence_type: row.get(1).unwrap(), diff --git a/src/operation_management.rs b/src/operation_management.rs index f186a8f..fef5c51 100644 --- a/src/operation_management.rs +++ b/src/operation_management.rs @@ -571,6 +571,13 @@ mod tests { vec![Value::from(bg_id)], ); let dep_bg = binding.first().unwrap(); + + let existing_seq = Sequence::new() + .sequence_type("DNA") + .sequence("AAAATTTT") + .save(conn); + let existing_node = Node::create(conn, existing_seq.hash.as_str()); + let mut session = Session::new(conn).unwrap(); attach_session(&mut session); @@ -578,9 +585,7 @@ mod tests { .sequence_type("DNA") .sequence("ATCG") .save(conn); - let existing_seq = Sequence::sequence_from_hash(conn, Sequence::PATH_END_HASH).unwrap(); let random_node = Node::create(conn, random_seq.hash.as_str()); - let existing_node = Node::create(conn, existing_seq.hash.as_str()); let new_edge = Edge::create( conn, @@ -604,8 +609,6 @@ mod tests { get_changeset_path(&operation).join(format!("{op_id}.dep", op_id = operation.id)); let dependencies: DependencyModels = serde_json::from_reader(fs::File::open(dependency_path).unwrap()).unwrap(); - println!("here1"); - println!("{:?}", dependencies); assert_eq!(dependencies.sequences.len(), 1); assert_eq!( dependencies.block_group[0].collection_name, From 4844c735703ca79b22f3ac5a0af383207967ccab Mon Sep 17 00:00:00 2001 From: hofer Date: Thu, 26 Sep 2024 14:03:23 -0400 Subject: [PATCH 13/16] Better anderson test --- src/imports/gfa.rs | 77 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 2 deletions(-) diff --git a/src/imports/gfa.rs b/src/imports/gfa.rs index 101c252..fc60100 100644 --- a/src/imports/gfa.rs +++ b/src/imports/gfa.rs @@ -332,10 +332,83 @@ mod tests { .clone(); let result = Path::sequence(conn, path); - let expected_sequence_parts = vec!["T", "T", "G", "A", "C", "G", "GCTAGCTCAG", "T", "CCT", "A", "GG", "T", "A", "C", "A", "G", - "TGCTAGCTACTAGTGAAAGAGGAGAAATACTAGATGGCTTCCTCCGAAGACGTTATCAAAGAGTTCATGCGTTTCAAAGTTCGTATGGAAGGTTCCGTTAACGGTCACGAGTTCGAAATCGAAGGTGAAGGTGAAGGTCGTCCGTACGAAGGTACCCAGACCGCTAAACTGAAAGTTACCAAAGGTGGTCCGCTGCCGTTCGCTTGGGACATCCTGTCCCCGCAGTTCCAGTACGGTTCCAAAGCTTACGTTAAACACCCGGCTGACATCCCGGACTACCTGAAACTGTCCTTCCCGGAAGGTTTCAAATGGGAACGTGTTATGAACTTCGAAGACGGTGGTGTTGTTACCGTTACCCAGGACTCCTCCCTGCAAGACGGTGAGTTCATCTACAAAGTTAAACTGCGTGGTACCAACTTCCCGTCCGACGGTCCGGTTATGCAGAAAAAAACCATGGGTTGGGAAGCTTCCACCGAACGTATGTACCCGGAAGACGGTGCTCTGAAAGGTGAAATCAAAATGCGTCTGAAACTGAAAGACGGTGGTCACTACGACGCTGAAGTTAAAACCACCTACATGGCTAAAAAACCGGTTCAGCTGCCGGGTGCTTACAAAACCGACATCAAACTGGACATCACCTCCCACAACGAAGACTACACCATCGTTGAACAGTACGAACGTGCTGAAGGTCGTCACTCCACCGGTGCTTAATAACGCTGATAGTGCTAGTGTAGATCGCTACTAGAGCCAGGCATCAAATAAAACGAAAGGCTCAGTCGAAAGACTGGGCCTTTCGTTTTATCTGTTGTTTGTCGGTGAACGCTCTCTACTAGAGTCACACTGGCTCACCTTCGGGTGGGCCTTTCTGCGTTTATATACTAGAAGCGGCCGCTGCAGGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATACGGTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATGTGAGCAAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGGACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACCTGACGTCTAAGAAACCATTATTATCATGACATTAACCTATAAAAATAGGCGTATCACGAGGCAGAATTTCAGATAAAAAAAATCCTTAGCTTTCGCTAAGGATGATTTCTGGAATTCGCGGCCGCATCTAGAG"]; + let big_part = "TGCTAGCTACTAGTGAAAGAGGAGAAATACTAGATGGCTTCCTCCGAAGACGTTATCAAAGAGTTCATGCGTTTCAAAGTTCGTATGGAAGGTTCCGTTAACGGTCACGAGTTCGAAATCGAAGGTGAAGGTGAAGGTCGTCCGTACGAAGGTACCCAGACCGCTAAACTGAAAGTTACCAAAGGTGGTCCGCTGCCGTTCGCTTGGGACATCCTGTCCCCGCAGTTCCAGTACGGTTCCAAAGCTTACGTTAAACACCCGGCTGACATCCCGGACTACCTGAAACTGTCCTTCCCGGAAGGTTTCAAATGGGAACGTGTTATGAACTTCGAAGACGGTGGTGTTGTTACCGTTACCCAGGACTCCTCCCTGCAAGACGGTGAGTTCATCTACAAAGTTAAACTGCGTGGTACCAACTTCCCGTCCGACGGTCCGGTTATGCAGAAAAAAACCATGGGTTGGGAAGCTTCCACCGAACGTATGTACCCGGAAGACGGTGCTCTGAAAGGTGAAATCAAAATGCGTCTGAAACTGAAAGACGGTGGTCACTACGACGCTGAAGTTAAAACCACCTACATGGCTAAAAAACCGGTTCAGCTGCCGGGTGCTTACAAAACCGACATCAAACTGGACATCACCTCCCACAACGAAGACTACACCATCGTTGAACAGTACGAACGTGCTGAAGGTCGTCACTCCACCGGTGCTTAATAACGCTGATAGTGCTAGTGTAGATCGCTACTAGAGCCAGGCATCAAATAAAACGAAAGGCTCAGTCGAAAGACTGGGCCTTTCGTTTTATCTGTTGTTTGTCGGTGAACGCTCTCTACTAGAGTCACACTGGCTCACCTTCGGGTGGGCCTTTCTGCGTTTATATACTAGAAGCGGCCGCTGCAGGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATACGGTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATGTGAGCAAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGGACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACCTGACGTCTAAGAAACCATTATTATCATGACATTAACCTATAAAAATAGGCGTATCACGAGGCAGAATTTCAGATAAAAAAAATCCTTAGCTTTCGCTAAGGATGATTTCTGGAATTCGCGGCCGCATCTAGAG"; + let expected_sequence_parts = vec![ + "T", + "T", + "G", + "A", + "C", + "G", + "GCTAGCTCAG", + "T", + "CCT", + "A", + "GG", + "T", + "A", + "C", + "A", + "G", + big_part, + ]; + let expected_sequence = expected_sequence_parts.join(""); assert_eq!(result, expected_sequence); + + let part1 = "T"; + let part3 = "T"; + let part4_5 = vec!["G", "T"]; + let part6 = "A"; + let part7_8 = vec!["C", "T"]; + let part9_10 = vec!["A", "G"]; + let part11 = "GCTAGCTCAG"; + let part12_13 = vec!["T", "C"]; + let part14 = "CCT"; + let part15_16 = vec!["A", "T"]; + let part17 = "GG"; + let part18_19 = vec!["T", "G"]; + let part20 = "A"; + let part21_22 = vec!["T", "C"]; + let part23_24 = vec!["A", "T"]; + let part25_26 = vec!["A", "G"]; + + let mut expected_sequences = HashSet::new(); + for part_a in &part4_5 { + for part_b in &part7_8 { + for part_c in &part9_10 { + for part_d in &part12_13 { + for part_e in &part15_16 { + for part_f in &part18_19 { + for part_g in &part21_22 { + for part_h in &part23_24 { + for part_i in &part25_26 { + let expected_sequence_parts1 = vec![ + part1, part3, part_a, part6, part_b, part_c, + part11, part_d, part14, part_e, part17, part_f, + part20, part_g, part_h, part_i, big_part, + ]; + let temp_sequence1 = expected_sequence_parts1.join(""); + let expected_sequence_parts2 = vec![ + part3, part_a, part6, part_b, part_c, part11, + part_d, part14, part_e, part17, part_f, part20, + part_g, part_h, part_i, big_part, + ]; + let temp_sequence2 = expected_sequence_parts2.join(""); + expected_sequences.insert(temp_sequence1); + expected_sequences.insert(temp_sequence2); + } + } + } + } + } + } + } + } + } + let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id); + assert_eq!(all_sequences.len(), 1024); + assert_eq!(all_sequences, expected_sequences); } #[test] From b5dd0ae13354cf6ae686df849c502e00815fbd4b Mon Sep 17 00:00:00 2001 From: hofer Date: Fri, 27 Sep 2024 18:31:53 -0400 Subject: [PATCH 14/16] Add nodes to change management --- src/models/node.rs | 3 +- src/operation_management.rs | 113 +++++++++++++++++++++++++++++++++--- 2 files changed, 107 insertions(+), 9 deletions(-) diff --git a/src/models/node.rs b/src/models/node.rs index 7c56887..b457eaa 100644 --- a/src/models/node.rs +++ b/src/models/node.rs @@ -1,4 +1,5 @@ use rusqlite::{params_from_iter, types::Value as SQLValue, Connection}; +use serde::{Deserialize, Serialize}; use std::collections::HashMap; use crate::models::sequence::Sequence; @@ -6,7 +7,7 @@ use crate::models::sequence::Sequence; pub const PATH_START_NODE_ID: i32 = 1; pub const PATH_END_NODE_ID: i32 = 2; -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct Node { pub id: i32, pub sequence_hash: String, diff --git a/src/operation_management.rs b/src/operation_management.rs index fef5c51..6ff3000 100644 --- a/src/operation_management.rs +++ b/src/operation_management.rs @@ -41,6 +41,7 @@ pub enum FileMode { struct DependencyModels { sequences: Vec, block_group: Vec, + nodes: Vec, edges: Vec, paths: Vec, } @@ -75,6 +76,7 @@ pub fn get_changeset_dependencies(conn: &Connection, changes: &[u8]) -> Vec let mut created_block_groups = HashSet::new(); let mut created_paths = HashSet::new(); let mut created_edges = HashSet::new(); + let mut created_nodes = HashSet::new(); let mut created_sequences: HashSet = HashSet::new(); while let Some(item) = iter.next().unwrap() { let op = item.op().unwrap(); @@ -107,19 +109,28 @@ pub fn get_changeset_dependencies(conn: &Connection, changes: &[u8]) -> Vec previous_block_groups.insert(bg_id); } } + "nodes" => { + created_nodes + .insert(item.new_value(pk_column).unwrap().as_i64().unwrap() as i32); + let sequence_hash = + str::from_utf8(item.new_value(1).unwrap().as_bytes().unwrap()) + .unwrap() + .to_string(); + if !created_sequences.contains(&sequence_hash) { + previous_sequences.insert(sequence_hash); + } + } "edges" => { let edge_pk = item.new_value(pk_column).unwrap().as_i64().unwrap() as i32; let source_node_id = item.new_value(1).unwrap().as_i64().unwrap() as i32; let target_node_id = item.new_value(4).unwrap().as_i64().unwrap() as i32; created_edges.insert(edge_pk); let nodes = Node::get_nodes(conn, vec![source_node_id, target_node_id]); - let source_hash = nodes[0].sequence_hash.clone(); - if !created_sequences.contains(&source_hash) { - previous_sequences.insert(source_hash); + if !created_nodes.contains(&source_node_id) { + previous_sequences.insert(nodes[0].sequence_hash.clone()); } - let target_hash = nodes[1].sequence_hash.clone(); - if !created_sequences.contains(&target_hash) { - previous_sequences.insert(target_hash); + if !created_nodes.contains(&target_node_id) { + previous_sequences.insert(nodes[1].sequence_hash.clone()); } } "path_edges" => { @@ -164,6 +175,7 @@ pub fn get_changeset_dependencies(conn: &Connection, changes: &[u8]) -> Vec ), vec![], ), + nodes: vec![], edges: Edge::query( conn, &format!( @@ -220,6 +232,12 @@ pub fn apply_changeset(conn: &Connection, operation: &Operation) { dep_bg_map.insert(&bg.id, new_bg.id); } + let mut dep_node_map = HashMap::new(); + for node in dependencies.nodes.iter() { + let new_node = Node::create(conn, &node.sequence_hash.clone()); + dep_node_map.insert(&node.id, new_node.id); + } + let mut dep_edge_map = HashMap::new(); let new_edges = Edge::bulk_create( conn, @@ -254,6 +272,7 @@ pub fn apply_changeset(conn: &Connection, operation: &Operation) { let mut blockgroup_map: HashMap = HashMap::new(); let mut edge_map: HashMap = HashMap::new(); + let mut node_map: HashMap = HashMap::new(); let mut path_edges: HashMap> = HashMap::new(); let mut insert_paths = vec![]; let mut insert_block_group_edges = vec![]; @@ -324,6 +343,15 @@ pub fn apply_changeset(conn: &Connection, operation: &Operation) { .to_string(), }); } + "nodes" => { + let node_pk = item.new_value(pk_column).unwrap().as_i64().unwrap() as i32; + node_map.insert( + node_pk, + str::from_utf8(item.new_value(1).unwrap().as_bytes().unwrap()) + .unwrap() + .to_string(), + ); + } "edges" => { let edge_pk = item.new_value(pk_column).unwrap().as_i64().unwrap() as i32; edge_map.insert( @@ -365,12 +393,49 @@ pub fn apply_changeset(conn: &Connection, operation: &Operation) { } } - let sorted_edge_ids = edge_map.keys().copied().sorted().collect::>(); + let mut node_id_map: HashMap = HashMap::new(); + for (node_id, sequence_hash) in node_map { + let new_node = Node::create(conn, &sequence_hash); + node_id_map.insert(node_id, new_node.id); + } + + let mut updated_edge_map = HashMap::new(); + for (edge_id, edge) in edge_map { + let updated_source_node_id = dep_node_map.get(&edge.source_node_id).unwrap_or( + node_id_map + .get(&edge.source_node_id) + .unwrap_or(&edge.source_node_id), + ); + let updated_target_node_id = dep_node_map.get(&edge.target_node_id).unwrap_or( + node_id_map + .get(&edge.target_node_id) + .unwrap_or(&edge.target_node_id), + ); + updated_edge_map.insert( + edge_id, + EdgeData { + source_node_id: *updated_source_node_id, + source_coordinate: edge.source_coordinate, + source_strand: edge.source_strand, + target_node_id: *updated_target_node_id, + target_coordinate: edge.target_coordinate, + target_strand: edge.target_strand, + chromosome_index: edge.chromosome_index, + phased: edge.phased, + }, + ); + } + + let sorted_edge_ids = updated_edge_map + .keys() + .copied() + .sorted() + .collect::>(); let created_edges = Edge::bulk_create( conn, sorted_edge_ids .iter() - .map(|id| edge_map[id].clone()) + .map(|id| updated_edge_map[id].clone()) .collect::>(), ); let mut edge_id_map: HashMap = HashMap::new(); @@ -490,6 +555,7 @@ pub fn attach_session(session: &mut session::Session) { "sequence", "block_group", "path", + "nodes", "edges", "path_edges", "block_group_edges", @@ -638,10 +704,12 @@ mod tests { operation_conn, ); let edge_count = Edge::query(conn, "select * from edges", vec![]).len() as i32; + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; let sample_count = Sample::query(conn, "select * from sample", vec![]).len() as i32; let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len() as i32; assert_eq!(edge_count, 2); + assert_eq!(node_count, 3); assert_eq!(sample_count, 0); assert_eq!(op_count, 1); update_with_vcf( @@ -653,10 +721,25 @@ mod tests { operation_conn, ); let edge_count = Edge::query(conn, "select * from edges", vec![]).len() as i32; + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; let sample_count = Sample::query(conn, "select * from sample", vec![]).len() as i32; let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len() as i32; + // NOTE: The edge count is 14 because of the following: + // * 1 edge from the source node to the node created by the fasta import + // * 1 edge from the node created by the fasta import to the sink node + // * 8 edges to and from nodes representing the first alt sequence. Topologically there are + // just 2 edges, but there is redundancy because of phasing. There is further redundancy + // because there are 2 non-reference samples, causing 2 nodes to be created for each alt + // sequence. + // * 4 edges to and from nodes representing the second alt sequence. (One sample uses the + // reference part instead of the alt sequence in this case.) assert_eq!(edge_count, 14); + // NOTE: The node count is 9: + // * 2 source and sink nodes + // * 1 node created by the initial fasta import + // * 6 nodes created by the VCF update. See above explanation of edge count for more details. + assert_eq!(node_count, 9); assert_eq!(sample_count, 3); assert_eq!(op_count, 2); @@ -670,10 +753,12 @@ mod tests { ); let edge_count = Edge::query(conn, "select * from edges", vec![]).len() as i32; + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; let sample_count = Sample::query(conn, "select * from sample", vec![]).len() as i32; let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len() as i32; assert_eq!(edge_count, 2); + assert_eq!(node_count, 3); assert_eq!(sample_count, 0); assert_eq!(op_count, 2); @@ -685,10 +770,12 @@ mod tests { ), ); let edge_count = Edge::query(conn, "select * from edges", vec![]).len() as i32; + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; let sample_count = Sample::query(conn, "select * from sample", vec![]).len() as i32; let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len() as i32; assert_eq!(edge_count, 14); + assert_eq!(node_count, 9); assert_eq!(sample_count, 3); assert_eq!(op_count, 2); } @@ -840,10 +927,12 @@ mod tests { operation_conn, ); let edge_count = Edge::query(conn, "select * from edges", vec![]).len() as i32; + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; let sample_count = Sample::query(conn, "select * from sample", vec![]).len() as i32; let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len() as i32; assert_eq!(edge_count, 2); + assert_eq!(node_count, 3); assert_eq!(sample_count, 0); assert_eq!(op_count, 1); @@ -866,10 +955,12 @@ mod tests { operation_conn, ); let edge_count = Edge::query(conn, "select * from edges", vec![]).len() as i32; + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; let sample_count = Sample::query(conn, "select * from sample", vec![]).len() as i32; let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len() as i32; assert_eq!(edge_count, 14); + assert_eq!(node_count, 9); assert_eq!(sample_count, 3); assert_eq!(op_count, 2); @@ -889,10 +980,12 @@ mod tests { // ensure branch 1 operations have been undone let edge_count = Edge::query(conn, "select * from edges", vec![]).len() as i32; + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; let sample_count = Sample::query(conn, "select * from sample", vec![]).len() as i32; let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len() as i32; assert_eq!(edge_count, 2); + assert_eq!(node_count, 3); assert_eq!(sample_count, 0); assert_eq!(op_count, 2); @@ -906,10 +999,12 @@ mod tests { operation_conn, ); let edge_count = Edge::query(conn, "select * from edges", vec![]).len() as i32; + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; let sample_count = Sample::query(conn, "select * from sample", vec![]).len() as i32; let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len() as i32; assert_eq!(edge_count, 6); + assert_eq!(node_count, 5); assert_eq!(sample_count, 1); assert_eq!(op_count, 3); @@ -927,10 +1022,12 @@ mod tests { ); let edge_count = Edge::query(conn, "select * from edges", vec![]).len() as i32; + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; let sample_count = Sample::query(conn, "select * from sample", vec![]).len() as i32; let op_count = Operation::query(operation_conn, "select * from operation", vec![]).len() as i32; assert_eq!(edge_count, 14); + assert_eq!(node_count, 9); assert_eq!(sample_count, 3); assert_eq!(op_count, 3); } From d0877b15c59f89beaaced21d04846c6318fce27b Mon Sep 17 00:00:00 2001 From: hofer Date: Fri, 27 Sep 2024 18:37:00 -0400 Subject: [PATCH 15/16] Check node count in GFA import --- src/imports/gfa.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/imports/gfa.rs b/src/imports/gfa.rs index fc60100..dcc25a8 100644 --- a/src/imports/gfa.rs +++ b/src/imports/gfa.rs @@ -245,6 +245,9 @@ mod tests { let result = Path::sequence(conn, path); assert_eq!(result, "ATGGCATATTCGCAGCT"); + + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; + assert_eq!(node_count, 6); } #[test] @@ -261,6 +264,9 @@ mod tests { all_sequences, HashSet::from_iter(vec!["AAAATTTTGGGGCCCC".to_string()]) ); + + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; + assert_eq!(node_count, 6); } #[test] @@ -284,6 +290,9 @@ mod tests { let result = Path::sequence(conn, path); assert_eq!(result, "ACCTACAAATTCAAAC"); + + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; + assert_eq!(node_count, 6); } #[test] @@ -307,6 +316,9 @@ mod tests { let result = Path::sequence(conn, path); assert_eq!(result, "TATGCCAGCTGCGAATA"); + + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; + assert_eq!(node_count, 6); } #[test] @@ -409,6 +421,9 @@ mod tests { let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id); assert_eq!(all_sequences.len(), 1024); assert_eq!(all_sequences, expected_sequences); + + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; + assert_eq!(node_count, 28); } #[test] @@ -436,5 +451,8 @@ mod tests { let all_sequences = BlockGroup::get_all_sequences(conn, block_group_id); assert_eq!(all_sequences, HashSet::from_iter(vec!["AA".to_string()])); + + let node_count = Node::query(conn, "select * from nodes", vec![]).len() as i32; + assert_eq!(node_count, 4); } } From 283dad8a49d01a588157698d90054c946a4c35ea Mon Sep 17 00:00:00 2001 From: hofer Date: Fri, 27 Sep 2024 18:50:24 -0400 Subject: [PATCH 16/16] Just return ID from Node::create --- src/exports/gfa.rs | 24 +++++------ src/imports/fasta.rs | 6 +-- src/imports/gfa.rs | 4 +- src/models/block_group.rs | 80 ++++++++++++++++++------------------- src/models/edge.rs | 56 +++++++++++++------------- src/models/node.rs | 7 +--- src/models/path.rs | 78 ++++++++++++++++++------------------ src/operation_management.rs | 16 ++++---- src/test_helpers.rs | 24 +++++------ src/updates/vcf.rs | 4 +- 10 files changed, 148 insertions(+), 151 deletions(-) diff --git a/src/exports/gfa.rs b/src/exports/gfa.rs index 457cbf9..26f01ce 100644 --- a/src/exports/gfa.rs +++ b/src/exports/gfa.rs @@ -234,17 +234,17 @@ mod tests { .sequence_type("DNA") .sequence("CCCC") .save(&conn); - let node1 = Node::create(&conn, &sequence1.hash); - let node2 = Node::create(&conn, &sequence2.hash); - let node3 = Node::create(&conn, &sequence3.hash); - let node4 = Node::create(&conn, &sequence4.hash); + let node1_id = Node::create(&conn, &sequence1.hash); + let node2_id = Node::create(&conn, &sequence2.hash); + let node3_id = Node::create(&conn, &sequence3.hash); + let node4_id = Node::create(&conn, &sequence4.hash); let edge1 = Edge::create( &conn, PATH_START_NODE_ID, 0, Strand::Forward, - node1.id, + node1_id, 0, Strand::Forward, 0, @@ -252,10 +252,10 @@ mod tests { ); let edge2 = Edge::create( &conn, - node1.id, + node1_id, 4, Strand::Forward, - node2.id, + node2_id, 0, Strand::Forward, 0, @@ -263,10 +263,10 @@ mod tests { ); let edge3 = Edge::create( &conn, - node2.id, + node2_id, 4, Strand::Forward, - node3.id, + node3_id, 0, Strand::Forward, 0, @@ -274,10 +274,10 @@ mod tests { ); let edge4 = Edge::create( &conn, - node3.id, + node3_id, 4, Strand::Forward, - node4.id, + node4_id, 0, Strand::Forward, 0, @@ -285,7 +285,7 @@ mod tests { ); let edge5 = Edge::create( &conn, - node4.id, + node4_id, 4, Strand::Forward, PATH_END_NODE_ID, diff --git a/src/imports/fasta.rs b/src/imports/fasta.rs index 821759d..6aeed28 100644 --- a/src/imports/fasta.rs +++ b/src/imports/fasta.rs @@ -64,14 +64,14 @@ pub fn import_fasta( .sequence(&sequence) .save(conn) }; - let node = Node::create(conn, &seq.hash); + let node_id = Node::create(conn, &seq.hash); let block_group = BlockGroup::create(conn, &collection.name, None, &name); let edge_into = Edge::create( conn, PATH_START_NODE_ID, 0, Strand::Forward, - node.id, + node_id, 0, Strand::Forward, 0, @@ -79,7 +79,7 @@ pub fn import_fasta( ); let edge_out_of = Edge::create( conn, - node.id, + node_id, sequence_length, Strand::Forward, PATH_END_NODE_ID, diff --git a/src/imports/gfa.rs b/src/imports/gfa.rs index dcc25a8..aaba44c 100644 --- a/src/imports/gfa.rs +++ b/src/imports/gfa.rs @@ -36,8 +36,8 @@ pub fn import_gfa(gfa_path: &FilePath, collection_name: &str, conn: &Connection) .sequence(input_sequence) .save(conn); sequences_by_segment_id.insert(segment.id, sequence.clone()); - let node = Node::create(conn, &sequence.hash); - node_ids_by_segment_id.insert(segment.id, node.id); + let node_id = Node::create(conn, &sequence.hash); + node_ids_by_segment_id.insert(segment.id, node_id); } let mut edges = HashSet::new(); diff --git a/src/models/block_group.rs b/src/models/block_group.rs index dc6996a..4174fde 100644 --- a/src/models/block_group.rs +++ b/src/models/block_group.rs @@ -468,10 +468,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); let insert = PathBlock { id: 0, - node_id: insert_node.id, + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -504,10 +504,10 @@ mod tests { .sequence_type("DNA") .sequence("") .save(&conn); - let deletion_node = Node::create(&conn, deletion_sequence.hash.as_str()); + let deletion_node_id = Node::create(&conn, deletion_sequence.hash.as_str()); let deletion = PathBlock { id: 0, - node_id: deletion_node.id, + node_id: deletion_node_id, block_sequence: deletion_sequence.get_sequence(None, None), sequence_start: 0, sequence_end: 0, @@ -548,10 +548,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); let insert = PathBlock { id: 0, - node_id: insert_node.id, + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -589,10 +589,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); let insert = PathBlock { id: 0, - node_id: insert_node.id, + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -630,10 +630,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); let insert = PathBlock { id: 0, - node_id: insert_node.id, + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -671,10 +671,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); let insert = PathBlock { id: 0, - node_id: insert_node.id, + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -712,10 +712,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); let insert = PathBlock { id: 0, - node_id: insert_node.id, + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -753,10 +753,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); let insert = PathBlock { id: 0, - node_id: insert_node.id, + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -794,10 +794,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); let insert = PathBlock { id: 0, - node_id: insert_node.id, + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -835,10 +835,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); let insert = PathBlock { id: 0, - node_id: insert_node.id, + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -876,10 +876,10 @@ mod tests { .sequence_type("DNA") .sequence("") .save(&conn); - let deletion_node = Node::create(&conn, deletion_sequence.hash.as_str()); + let deletion_node_id = Node::create(&conn, deletion_sequence.hash.as_str()); let deletion = PathBlock { id: 0, - node_id: deletion_node.id, + node_id: deletion_node_id, block_sequence: deletion_sequence.get_sequence(None, None), sequence_start: 0, sequence_end: 0, @@ -919,10 +919,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); let insert = PathBlock { id: 0, - node_id: insert_node.id, + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -972,10 +972,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); let insert = PathBlock { id: 0, - node_id: insert_node.id, + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -1014,10 +1014,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); let insert = PathBlock { id: 0, - node_id: insert_node.id, + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -1055,10 +1055,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); let insert = PathBlock { id: 0, - node_id: insert_node.id, + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -1096,10 +1096,10 @@ mod tests { .sequence_type("DNA") .sequence("NNNN") .save(&conn); - let insert_node = Node::create(&conn, insert_sequence.hash.as_str()); + let insert_node_id = Node::create(&conn, insert_sequence.hash.as_str()); let insert = PathBlock { id: 0, - node_id: insert_node.id, + node_id: insert_node_id, block_sequence: insert_sequence.get_sequence(0, 4).to_string(), sequence_start: 0, sequence_end: 4, @@ -1137,10 +1137,10 @@ mod tests { .sequence_type("DNA") .sequence("") .save(&conn); - let deletion_node = Node::create(&conn, deletion_sequence.hash.as_str()); + let deletion_node_id = Node::create(&conn, deletion_sequence.hash.as_str()); let deletion = PathBlock { id: 0, - node_id: deletion_node.id, + node_id: deletion_node_id, block_sequence: deletion_sequence.get_sequence(None, None), sequence_start: 0, sequence_end: 0, @@ -1178,10 +1178,10 @@ mod tests { .sequence_type("DNA") .sequence("") .save(&conn); - let deletion_node = Node::create(&conn, deletion_sequence.hash.as_str()); + let deletion_node_id = Node::create(&conn, deletion_sequence.hash.as_str()); let deletion = PathBlock { id: 0, - node_id: deletion_node.id, + node_id: deletion_node_id, block_sequence: deletion_sequence.get_sequence(None, None), sequence_start: 0, sequence_end: 0, @@ -1219,10 +1219,10 @@ mod tests { .sequence_type("DNA") .sequence("") .save(&conn); - let deletion_node = Node::create(&conn, deletion_sequence.hash.as_str()); + let deletion_node_id = Node::create(&conn, deletion_sequence.hash.as_str()); let deletion = PathBlock { id: 0, - node_id: deletion_node.id, + node_id: deletion_node_id, block_sequence: deletion_sequence.get_sequence(None, None), sequence_start: 0, sequence_end: 0, @@ -1260,10 +1260,10 @@ mod tests { .sequence_type("DNA") .sequence("") .save(&conn); - let deletion_node = Node::create(&conn, deletion_sequence.hash.as_str()); + let deletion_node_id = Node::create(&conn, deletion_sequence.hash.as_str()); let deletion = PathBlock { id: 0, - node_id: deletion_node.id, + node_id: deletion_node_id, block_sequence: deletion_sequence.get_sequence(None, None), sequence_start: 0, sequence_end: 0, diff --git a/src/models/edge.rs b/src/models/edge.rs index 3baacfe..edac91d 100644 --- a/src/models/edge.rs +++ b/src/models/edge.rs @@ -482,12 +482,12 @@ mod tests { .sequence_type("DNA") .sequence("ATCGATCG") .save(conn); - let node1 = Node::create(conn, sequence1.hash.as_str()); + let node1_id = Node::create(conn, sequence1.hash.as_str()); let edge1 = EdgeData { source_node_id: PATH_START_NODE_ID, source_coordinate: -1, source_strand: Strand::Forward, - target_node_id: node1.id, + target_node_id: node1_id, target_coordinate: 1, target_strand: Strand::Forward, chromosome_index: 0, @@ -497,19 +497,19 @@ mod tests { .sequence_type("DNA") .sequence("AAAAAAAA") .save(conn); - let node2 = Node::create(conn, sequence2.hash.as_str()); + let node2_id = Node::create(conn, sequence2.hash.as_str()); let edge2 = EdgeData { - source_node_id: node1.id, + source_node_id: node1_id, source_coordinate: 2, source_strand: Strand::Forward, - target_node_id: node2.id, + target_node_id: node2_id, target_coordinate: 3, target_strand: Strand::Forward, chromosome_index: 0, phased: 0, }; let edge3 = EdgeData { - source_node_id: node2.id, + source_node_id: node2_id, source_coordinate: 4, source_strand: Strand::Forward, target_node_id: PATH_END_NODE_ID, @@ -531,13 +531,13 @@ mod tests { let edge_result1 = edges_by_source_node_id.get(&PATH_START_NODE_ID).unwrap(); assert_eq!(edge_result1.source_coordinate, -1); - assert_eq!(edge_result1.target_node_id, node1.id); + assert_eq!(edge_result1.target_node_id, node1_id); assert_eq!(edge_result1.target_coordinate, 1); - let edge_result2 = edges_by_source_node_id.get(&node1.id).unwrap(); + let edge_result2 = edges_by_source_node_id.get(&node1_id).unwrap(); assert_eq!(edge_result2.source_coordinate, 2); - assert_eq!(edge_result2.target_node_id, node2.id); + assert_eq!(edge_result2.target_node_id, node2_id); assert_eq!(edge_result2.target_coordinate, 3); - let edge_result3 = edges_by_source_node_id.get(&node2.id).unwrap(); + let edge_result3 = edges_by_source_node_id.get(&node2_id).unwrap(); assert_eq!(edge_result3.source_coordinate, 4); assert_eq!(edge_result3.target_node_id, PATH_END_NODE_ID); assert_eq!(edge_result3.target_coordinate, -1); @@ -551,12 +551,12 @@ mod tests { .sequence_type("DNA") .sequence("ATCGATCG") .save(conn); - let node1 = Node::create(conn, sequence1.hash.as_str()); + let node1_id = Node::create(conn, sequence1.hash.as_str()); let edge1 = EdgeData { source_node_id: PATH_START_NODE_ID, source_coordinate: -1, source_strand: Strand::Forward, - target_node_id: node1.id, + target_node_id: node1_id, target_coordinate: 1, target_strand: Strand::Forward, chromosome_index: 0, @@ -566,19 +566,19 @@ mod tests { .sequence_type("DNA") .sequence("AAAAAAAA") .save(conn); - let node2 = Node::create(conn, sequence2.hash.as_str()); + let node2_id = Node::create(conn, sequence2.hash.as_str()); let edge2 = EdgeData { - source_node_id: node1.id, + source_node_id: node1_id, source_coordinate: 2, source_strand: Strand::Forward, - target_node_id: node2.id, + target_node_id: node2_id, target_coordinate: 3, target_strand: Strand::Forward, chromosome_index: 0, phased: 0, }; let edge3 = EdgeData { - source_node_id: node2.id, + source_node_id: node2_id, source_coordinate: 4, source_strand: Strand::Forward, target_node_id: PATH_END_NODE_ID, @@ -626,14 +626,14 @@ mod tests { .sequence_type("DNA") .sequence("ATCGATCG") .save(conn); - let node1 = Node::create(conn, sequence1.hash.as_str()); + let node1_id = Node::create(conn, sequence1.hash.as_str()); // NOTE: Create one edge ahead of time to confirm an existing row ID gets returned in the bulk create let existing_edge = Edge::create( conn, PATH_START_NODE_ID, -1, Strand::Forward, - node1.id, + node1_id, 1, Strand::Forward, 0, @@ -641,14 +641,14 @@ mod tests { ); assert_eq!(existing_edge.source_node_id, PATH_START_NODE_ID); assert_eq!(existing_edge.source_coordinate, -1); - assert_eq!(existing_edge.target_node_id, node1.id); + assert_eq!(existing_edge.target_node_id, node1_id); assert_eq!(existing_edge.target_coordinate, 1); let edge1 = EdgeData { source_coordinate: -1, source_node_id: PATH_START_NODE_ID, source_strand: Strand::Forward, - target_node_id: node1.id, + target_node_id: node1_id, target_coordinate: 1, target_strand: Strand::Forward, chromosome_index: 0, @@ -658,19 +658,19 @@ mod tests { .sequence_type("DNA") .sequence("AAAAAAAA") .save(conn); - let node2 = Node::create(conn, sequence2.hash.as_str()); + let node2_id = Node::create(conn, sequence2.hash.as_str()); let edge2 = EdgeData { - source_node_id: node1.id, + source_node_id: node1_id, source_coordinate: 2, source_strand: Strand::Forward, - target_node_id: node2.id, + target_node_id: node2_id, target_coordinate: 3, target_strand: Strand::Forward, chromosome_index: 0, phased: 0, }; let edge3 = EdgeData { - source_node_id: node2.id, + source_node_id: node2_id, source_coordinate: 4, source_strand: Strand::Forward, target_node_id: PATH_END_NODE_ID, @@ -695,13 +695,13 @@ mod tests { assert_eq!(edge_result1.id, existing_edge.id); assert_eq!(edge_result1.source_coordinate, -1); - assert_eq!(edge_result1.target_node_id, node1.id); + assert_eq!(edge_result1.target_node_id, node1_id); assert_eq!(edge_result1.target_coordinate, 1); - let edge_result2 = edges_by_source_node_id.get(&node1.id).unwrap(); + let edge_result2 = edges_by_source_node_id.get(&node1_id).unwrap(); assert_eq!(edge_result2.source_coordinate, 2); - assert_eq!(edge_result2.target_node_id, node2.id); + assert_eq!(edge_result2.target_node_id, node2_id); assert_eq!(edge_result2.target_coordinate, 3); - let edge_result3 = edges_by_source_node_id.get(&node2.id).unwrap(); + let edge_result3 = edges_by_source_node_id.get(&node2_id).unwrap(); assert_eq!(edge_result3.source_coordinate, 4); assert_eq!(edge_result3.target_node_id, PATH_END_NODE_ID); assert_eq!(edge_result3.target_coordinate, -1); diff --git a/src/models/node.rs b/src/models/node.rs index b457eaa..b1b9a8f 100644 --- a/src/models/node.rs +++ b/src/models/node.rs @@ -14,16 +14,13 @@ pub struct Node { } impl Node { - pub fn create(conn: &Connection, sequence_hash: &str) -> Node { + pub fn create(conn: &Connection, sequence_hash: &str) -> i32 { let insert_statement = format!( "INSERT INTO nodes (sequence_hash) VALUES ('{}');", sequence_hash ); let _ = conn.execute(&insert_statement, ()); - Node { - id: conn.last_insert_rowid() as i32, - sequence_hash: sequence_hash.to_string(), - } + conn.last_insert_rowid() as i32 } pub fn query(conn: &Connection, query: &str, placeholders: Vec) -> Vec { diff --git a/src/models/path.rs b/src/models/path.rs index 3c877b2..c067974 100644 --- a/src/models/path.rs +++ b/src/models/path.rs @@ -312,13 +312,13 @@ mod tests { .sequence_type("DNA") .sequence("ATCGATCG") .save(conn); - let node1 = Node::create(conn, sequence1.hash.as_str()); + let node1_id = Node::create(conn, sequence1.hash.as_str()); let edge1 = Edge::create( conn, PATH_START_NODE_ID, -123, Strand::Forward, - node1.id, + node1_id, 0, Strand::Forward, 0, @@ -328,13 +328,13 @@ mod tests { .sequence_type("DNA") .sequence("AAAAAAAA") .save(conn); - let node2 = Node::create(conn, sequence2.hash.as_str()); + let node2_id = Node::create(conn, sequence2.hash.as_str()); let edge2 = Edge::create( conn, - node1.id, + node1_id, 8, Strand::Forward, - node2.id, + node2_id, 1, Strand::Forward, 0, @@ -344,13 +344,13 @@ mod tests { .sequence_type("DNA") .sequence("CCCCCCCC") .save(conn); - let node3 = Node::create(conn, sequence3.hash.as_str()); + let node3_id = Node::create(conn, sequence3.hash.as_str()); let edge3 = Edge::create( conn, - node2.id, + node2_id, 8, Strand::Forward, - node3.id, + node3_id, 1, Strand::Forward, 0, @@ -360,13 +360,13 @@ mod tests { .sequence_type("DNA") .sequence("GGGGGGGG") .save(conn); - let node4 = Node::create(conn, sequence4.hash.as_str()); + let node4_id = Node::create(conn, sequence4.hash.as_str()); let edge4 = Edge::create( conn, - node3.id, + node3_id, 8, Strand::Forward, - node4.id, + node4_id, 1, Strand::Forward, 0, @@ -374,7 +374,7 @@ mod tests { ); let edge5 = Edge::create( conn, - node4.id, + node4_id, 8, Strand::Forward, PATH_END_NODE_ID, @@ -402,10 +402,10 @@ mod tests { .sequence_type("DNA") .sequence("ATCGATCG") .save(conn); - let node1 = Node::create(conn, sequence1.hash.as_str()); + let node1_id = Node::create(conn, sequence1.hash.as_str()); let edge5 = Edge::create( conn, - node1.id, + node1_id, 8, Strand::Reverse, PATH_END_NODE_ID, @@ -418,13 +418,13 @@ mod tests { .sequence_type("DNA") .sequence("AAAAAAAA") .save(conn); - let node2 = Node::create(conn, sequence2.hash.as_str()); + let node2_id = Node::create(conn, sequence2.hash.as_str()); let edge4 = Edge::create( conn, - node2.id, + node2_id, 7, Strand::Reverse, - node1.id, + node1_id, 0, Strand::Reverse, 0, @@ -434,13 +434,13 @@ mod tests { .sequence_type("DNA") .sequence("CCCCCCCC") .save(conn); - let node3 = Node::create(conn, sequence3.hash.as_str()); + let node3_id = Node::create(conn, sequence3.hash.as_str()); let edge3 = Edge::create( conn, - node3.id, + node3_id, 7, Strand::Reverse, - node2.id, + node2_id, 0, Strand::Reverse, 0, @@ -450,13 +450,13 @@ mod tests { .sequence_type("DNA") .sequence("GGGGGGGG") .save(conn); - let node4 = Node::create(conn, sequence4.hash.as_str()); + let node4_id = Node::create(conn, sequence4.hash.as_str()); let edge2 = Edge::create( conn, - node4.id, + node4_id, 7, Strand::Reverse, - node3.id, + node3_id, 0, Strand::Reverse, 0, @@ -467,7 +467,7 @@ mod tests { PATH_START_NODE_ID, -1, Strand::Reverse, - node4.id, + node4_id, 0, Strand::Reverse, 0, @@ -499,13 +499,13 @@ mod tests { .sequence_type("DNA") .sequence("ATCGATCG") .save(conn); - let node1 = Node::create(conn, sequence1.hash.as_str()); + let node1_id = Node::create(conn, sequence1.hash.as_str()); let edge1 = Edge::create( conn, PATH_START_NODE_ID, -1, Strand::Forward, - node1.id, + node1_id, 0, Strand::Forward, 0, @@ -515,13 +515,13 @@ mod tests { .sequence_type("DNA") .sequence("AAAAAAAA") .save(conn); - let node2 = Node::create(conn, sequence2.hash.as_str()); + let node2_id = Node::create(conn, sequence2.hash.as_str()); let edge2 = Edge::create( conn, - node1.id, + node1_id, 8, Strand::Forward, - node2.id, + node2_id, 1, Strand::Forward, 0, @@ -531,13 +531,13 @@ mod tests { .sequence_type("DNA") .sequence("CCCCCCCC") .save(conn); - let node3 = Node::create(conn, sequence3.hash.as_str()); + let node3_id = Node::create(conn, sequence3.hash.as_str()); let edge3 = Edge::create( conn, - node2.id, + node2_id, 8, Strand::Forward, - node3.id, + node3_id, 1, Strand::Forward, 0, @@ -547,13 +547,13 @@ mod tests { .sequence_type("DNA") .sequence("GGGGGGGG") .save(conn); - let node4 = Node::create(conn, sequence4.hash.as_str()); + let node4_id = Node::create(conn, sequence4.hash.as_str()); let edge4 = Edge::create( conn, - node3.id, + node3_id, 8, Strand::Forward, - node4.id, + node4_id, 1, Strand::Forward, 0, @@ -561,7 +561,7 @@ mod tests { ); let edge5 = Edge::create( conn, - node4.id, + node4_id, 8, Strand::Forward, PATH_END_NODE_ID, @@ -581,7 +581,7 @@ mod tests { let blocks1: Vec = tree.query_point(2).map(|x| x.value.clone()).collect(); assert_eq!(blocks1.len(), 1); let block1 = &blocks1[0]; - assert_eq!(block1.node_id, node1.id); + assert_eq!(block1.node_id, node1_id); assert_eq!(block1.sequence_start, 0); assert_eq!(block1.sequence_end, 8); assert_eq!(block1.path_start, 0); @@ -591,7 +591,7 @@ mod tests { let blocks2: Vec = tree.query_point(12).map(|x| x.value.clone()).collect(); assert_eq!(blocks2.len(), 1); let block2 = &blocks2[0]; - assert_eq!(block2.node_id, node2.id); + assert_eq!(block2.node_id, node2_id); assert_eq!(block2.sequence_start, 1); assert_eq!(block2.sequence_end, 8); assert_eq!(block2.path_start, 8); @@ -601,7 +601,7 @@ mod tests { let blocks4: Vec = tree.query_point(25).map(|x| x.value.clone()).collect(); assert_eq!(blocks4.len(), 1); let block4 = &blocks4[0]; - assert_eq!(block4.node_id, node4.id); + assert_eq!(block4.node_id, node4_id); assert_eq!(block4.sequence_start, 1); assert_eq!(block4.sequence_end, 8); assert_eq!(block4.path_start, 22); diff --git a/src/operation_management.rs b/src/operation_management.rs index 6ff3000..a0fba1e 100644 --- a/src/operation_management.rs +++ b/src/operation_management.rs @@ -234,8 +234,8 @@ pub fn apply_changeset(conn: &Connection, operation: &Operation) { let mut dep_node_map = HashMap::new(); for node in dependencies.nodes.iter() { - let new_node = Node::create(conn, &node.sequence_hash.clone()); - dep_node_map.insert(&node.id, new_node.id); + let new_node_id = Node::create(conn, &node.sequence_hash.clone()); + dep_node_map.insert(&node.id, new_node_id); } let mut dep_edge_map = HashMap::new(); @@ -395,8 +395,8 @@ pub fn apply_changeset(conn: &Connection, operation: &Operation) { let mut node_id_map: HashMap = HashMap::new(); for (node_id, sequence_hash) in node_map { - let new_node = Node::create(conn, &sequence_hash); - node_id_map.insert(node_id, new_node.id); + let new_node_id = Node::create(conn, &sequence_hash); + node_id_map.insert(node_id, new_node_id); } let mut updated_edge_map = HashMap::new(); @@ -642,7 +642,7 @@ mod tests { .sequence_type("DNA") .sequence("AAAATTTT") .save(conn); - let existing_node = Node::create(conn, existing_seq.hash.as_str()); + let existing_node_id = Node::create(conn, existing_seq.hash.as_str()); let mut session = Session::new(conn).unwrap(); attach_session(&mut session); @@ -651,14 +651,14 @@ mod tests { .sequence_type("DNA") .sequence("ATCG") .save(conn); - let random_node = Node::create(conn, random_seq.hash.as_str()); + let random_node_id = Node::create(conn, random_seq.hash.as_str()); let new_edge = Edge::create( conn, - random_node.id, + random_node_id, 0, Strand::Forward, - existing_node.id, + existing_node_id, 0, Strand::Forward, 0, diff --git a/src/test_helpers.rs b/src/test_helpers.rs index b749aef..79b9684 100644 --- a/src/test_helpers.rs +++ b/src/test_helpers.rs @@ -63,22 +63,22 @@ pub fn setup_block_group(conn: &Connection) -> (i32, Path) { .sequence_type("DNA") .sequence("AAAAAAAAAA") .save(conn); - let a_node = Node::create(conn, a_seq.hash.as_str()); + let a_node_id = Node::create(conn, a_seq.hash.as_str()); let t_seq = Sequence::new() .sequence_type("DNA") .sequence("TTTTTTTTTT") .save(conn); - let t_node = Node::create(conn, t_seq.hash.as_str()); + let t_node_id = Node::create(conn, t_seq.hash.as_str()); let c_seq = Sequence::new() .sequence_type("DNA") .sequence("CCCCCCCCCC") .save(conn); - let c_node = Node::create(conn, c_seq.hash.as_str()); + let c_node_id = Node::create(conn, c_seq.hash.as_str()); let g_seq = Sequence::new() .sequence_type("DNA") .sequence("GGGGGGGGGG") .save(conn); - let g_node = Node::create(conn, g_seq.hash.as_str()); + let g_node_id = Node::create(conn, g_seq.hash.as_str()); let _collection = Collection::create(conn, "test"); let block_group = BlockGroup::create(conn, "test", None, "hg19"); let edge0 = Edge::create( @@ -86,7 +86,7 @@ pub fn setup_block_group(conn: &Connection) -> (i32, Path) { PATH_START_NODE_ID, 0, Strand::Forward, - a_node.id, + a_node_id, 0, Strand::Forward, 0, @@ -94,10 +94,10 @@ pub fn setup_block_group(conn: &Connection) -> (i32, Path) { ); let edge1 = Edge::create( conn, - a_node.id, + a_node_id, 10, Strand::Forward, - t_node.id, + t_node_id, 0, Strand::Forward, 0, @@ -105,10 +105,10 @@ pub fn setup_block_group(conn: &Connection) -> (i32, Path) { ); let edge2 = Edge::create( conn, - t_node.id, + t_node_id, 10, Strand::Forward, - c_node.id, + c_node_id, 0, Strand::Forward, 0, @@ -116,10 +116,10 @@ pub fn setup_block_group(conn: &Connection) -> (i32, Path) { ); let edge3 = Edge::create( conn, - c_node.id, + c_node_id, 10, Strand::Forward, - g_node.id, + g_node_id, 0, Strand::Forward, 0, @@ -127,7 +127,7 @@ pub fn setup_block_group(conn: &Connection) -> (i32, Path) { ); let edge4 = Edge::create( conn, - g_node.id, + g_node_id, 10, Strand::Forward, PATH_END_NODE_ID, diff --git a/src/updates/vcf.rs b/src/updates/vcf.rs index 61d8bf5..ae53fee 100644 --- a/src/updates/vcf.rs +++ b/src/updates/vcf.rs @@ -281,7 +281,7 @@ pub fn update_with_vcf( let sequence = SequenceCache::lookup(&mut sequence_cache, "DNA", vcf_entry.alt_seq.to_string()); let sequence_string = sequence.get_sequence(None, None); - let node = Node::create(conn, sequence.hash.as_str()); + let node_id = Node::create(conn, sequence.hash.as_str()); let change = prepare_change( vcf_entry.block_group_id, &vcf_entry.path, @@ -291,7 +291,7 @@ pub fn update_with_vcf( vcf_entry.phased, sequence_string.clone(), sequence_string.len() as i32, - node.id, + node_id, ); changes .entry((vcf_entry.path, vcf_entry.sample_name))