Skip to content

Commit

Permalink
Merge pull request #535 from uclahs-cds/czhu-fix-call-variant
Browse files Browse the repository at this point in the history
Fix callVariant with issues with multiple frameshifting mutations
  • Loading branch information
zhuchcn authored Jul 25, 2022
2 parents ea6e037 + ea175ba commit a15c4d4
Show file tree
Hide file tree
Showing 8 changed files with 142 additions and 10 deletions.
18 changes: 14 additions & 4 deletions moPepGen/svgraph/PeptideVariantGraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -806,6 +806,9 @@ def call_and_stage_known_orf_in_cds(self, cursor:PVGCursor,
node_copy.truncated = True

additional_variants = start_gain + cursor.cleavage_gain
upstream_indels = target_node.upstream_indel_map.get(cursor.in_node)
if upstream_indels:
additional_variants += upstream_indels

traversal.pool.add_miscleaved_sequences(
node=node_copy,
Expand All @@ -828,12 +831,20 @@ def call_and_stage_known_orf_in_cds(self, cursor:PVGCursor,
cur_start_gain = copy.copy(start_gain)
if not cur_start_gain:
new_start_gain = set()

for variant in out_node.variants:
if variant.variant.is_frameshifting():
new_start_gain.add(variant.variant)

for variant in target_node.variants:
if variant.variant.is_frameshifting():
new_start_gain.add(variant.variant)

upstream_indels = target_node.upstream_indel_map.get(cursor.in_node)
if upstream_indels:
for variant in upstream_indels:
new_start_gain.add(variant)

stop_index = self.known_orf[1]
stop_lost = target_node.get_stop_lost_variants(stop_index)
new_start_gain.update(stop_lost)
Expand All @@ -844,8 +855,8 @@ def call_and_stage_known_orf_in_cds(self, cursor:PVGCursor,
else:
cur_start_gain = []
cur_cleavage_gain = None
cur = PVGCursor(target_node, out_node, in_cds, orf,
cur_start_gain, cur_cleavage_gain)
cur = PVGCursor(target_node, out_node, in_cds, orf, cur_start_gain,
cur_cleavage_gain)
traversal.stage(target_node, out_node, cur)

def call_and_stage_known_orf_not_in_cds(self, cursor:PVGCursor,
Expand Down Expand Up @@ -1148,8 +1159,7 @@ def cmp_unknown_orf_check_orf(x:PVGCursor, y:PVGCursor) -> bool:
return -1 if sorted(x.start_gain)[0] > sorted(y.start_gain)[0] else 1
return -1

def stage(self, in_node:PVGNode, out_node:PVGNode,
cursor:PVGCursor):
def stage(self, in_node:PVGNode, out_node:PVGNode, cursor:PVGCursor):
""" When a node is visited through a particular edge during the variant
peptide finding graph traversal, it is staged until all inbond edges
are visited. """
Expand Down
14 changes: 9 additions & 5 deletions moPepGen/svgraph/TVGNode.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def __init__(self, seq:DNASeqRecordWithCoordinates,
variants:List[seqvar.VariantRecordWithCoordinate]=None,
branch:bool=False, orf:List[int]=None, reading_frame_index:int=None,
subgraph_id:str=None, global_variant:seqvar.VariantRecord=None,
level:int=0):
level:int=0, was_bridge:bool=False):
""" Constructor for TVGNode.
Args:
Expand All @@ -46,12 +46,13 @@ def __init__(self, seq:DNASeqRecordWithCoordinates,
self.subgraph_id = subgraph_id
self.global_variant = global_variant
self.level = level
self.was_bridge = was_bridge

def create_node(self, seq:DNASeqRecordWithCoordinates,
variants:List[seqvar.VariantRecordWithCoordinate]=None,
branch:bool=False, orf:List[int]=None, reading_frame_index:int=None,
subgraph_id:str=None, global_variant:seqvar.VariantRecord=None,
level:int=None):
level:int=None, was_bridge:bool=False):
""" Constructor for TVGNode.
Args:
Expand All @@ -67,7 +68,8 @@ def create_node(self, seq:DNASeqRecordWithCoordinates,
reading_frame_index=reading_frame_index,
subgraph_id=subgraph_id,
global_variant=global_variant or self.global_variant,
level=level or self.level
level=level or self.level,
was_bridge=was_bridge
)

def __hash__(self):
Expand Down Expand Up @@ -279,7 +281,8 @@ def copy(self) -> TVGNode:
reading_frame_index=self.reading_frame_index,
subgraph_id=self.subgraph_id,
global_variant=self.global_variant,
level=self.level
level=self.level,
was_bridge=self.was_bridge
)

def deepcopy(self, subgraph_id:str=None, level_increment:int=None) -> TVGNode:
Expand Down Expand Up @@ -434,7 +437,8 @@ def truncate_right(self, i:int) -> TVGNode:
orf=self.orf,
reading_frame_index=self.reading_frame_index,
subgraph_id=self.subgraph_id,
level=self.level
level=self.level,
was_bridge=self.was_bridge
)

self.seq = self.seq[:i]
Expand Down
6 changes: 5 additions & 1 deletion moPepGen/svgraph/ThreeFrameTVG.py
Original file line number Diff line number Diff line change
Expand Up @@ -1088,7 +1088,7 @@ def find_bridge_nodes_between(start:TVGNode, end=TVGNode
subgraph_out.add(cur)
continue
for e in cur.in_edges:
if e.in_node.reading_frame_index != this_id:
if e.in_node.reading_frame_index != this_id or e.in_node.was_bridge:
bridge_in.add(e.in_node)
elif e.in_node.subgraph_id != cur.subgraph_id:
subgraph_in.add(e.in_node)
Expand Down Expand Up @@ -1255,6 +1255,7 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]:
The original input node.
"""
start_node = node
rf_index = node.reading_frame_index
end_node = self.find_farthest_node_with_overlap(node)
end_nodes = {end_node}
if end_node.is_subgraph_end():
Expand Down Expand Up @@ -1332,7 +1333,10 @@ def align_variants(self, node:TVGNode) -> Tuple[TVGNode, TVGNode]:

# create new node with the combined sequence
new_node = cur.copy()
was_bridge = new_node.reading_frame_index != rf_index \
and any(x.reading_frame_index != rf_index for x in out_node.get_out_nodes())
new_node.append_right(out_node)
new_node.was_bridge = was_bridge

for edge in copy.copy(out_node.out_edges):
edge_type = 'reference' \
Expand Down
37 changes: 37 additions & 0 deletions test/files/fuzz/15/brute_force.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
APAGGPGCWVFVF
APKSSLKFSPYYTR
APKSSLKSLSR
AWDQLGQGGGGADPPLGNSSPGWR
AWDQLGQGPTRPLVTR
AWLLGLCLLGSSFSQEGLSGSCEGR
DRSLSRSR
DTLQNCCPLPARISTPK
FSPYYTR
FSPYYTRDR
FSPYYTRDRSLSR
GPIRDTLQNCCPLPARISTPK
HPPPQPAAPPGSWVGHLAR
HPPPQPAAPPGSWVGHLARHVGK
HRPTNSWGHFNTAQSHFATRPMPPN
HVGKAWDQLGQGPTRPLVTR
ISTPKTQEIVVETPTLAAER
ISTPKTQEIVVETPTLAAERER
LSGYMSEQAPQVCDPAR
LSGYMSEQAPQVCDPARR
LSGYMSEQAPQVCDPARRGD
MTAEKLSGYMSEQAPQVCDPAR
MTAEKLSGYMSEQAPQVCDPARR
SLSRSRSRPQSR
SRPQSLSWTQSQPPIK
SRPQSRSR
SRPQSRSRSRPQSLSWTQSQPPIK
SRPQSRSRSRPQSR
SRSRPQSLSWTQSQPPIK
SRSRPQSR
SRSRPQSRSR
SSLKFSPYYTR
SSLKFSPYYTRDR
SSLKSLSR
SSLKSLSRSR
TQEIVVETPTLAAER
TQEIVVETPTLAAERER
21 changes: 21 additions & 0 deletions test/files/fuzz/15/fake_variants.gvf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
##fileformat=VCFv4.2
##mopepgen_version=0.9.0
##parser=
##reference_index=
##genome_fasta=
##annotation_gtf=
##source=
##CHROM=<Description='Gene ID'>
##INFO=<ID=TRANSCRIPT_ID,Number=1,Type=String,Description="Transcript ID">
##INFO=<ID=GENE_SYMBOL,Number=1,Type=String,Description="Gene Symbol">
##INFO=<ID=GENOMIC_POSITION,Number=1,Type=String,Description="Genomic Position">
#CHROM POS ID REF ALT QUAL FILTER INFO
ENSG00000158062.20 36502 ENSG00000158062.20-36501-ACTCATTCTTAG-A ACTCATTCTTAG A . . TRANSCRIPT_ID=ENST00000314675.11;GENOMIC_POSITION=chr1-534:524;GENE_SYMBOL=UBXN11
ENSG00000158062.20 34143 ENSG00000158062.20-34142-T-TGTCGGGTTACATG T TGTCGGGTTACATG . . TRANSCRIPT_ID=ENST00000314675.11;GENOMIC_POSITION=chr1-2893:2894;GENE_SYMBOL=UBXN11
ENSG00000158062.20 35943 ENSG00000158062.20-35942-TTCAGTCCTGGT-T TTCAGTCCTGGT T . . TRANSCRIPT_ID=ENST00000314675.11;GENOMIC_POSITION=chr1-1093:1083;GENE_SYMBOL=UBXN11
ENSG00000158062.20 35453 ENSG00000158062.20-35452-C-CAGTACTCCCAAAACA C CAGTACTCCCAAAACA . . TRANSCRIPT_ID=ENST00000314675.11;GENOMIC_POSITION=chr1-1583:1584;GENE_SYMBOL=UBXN11
ENSG00000158062.20 36758 ENSG00000158062.20-36757-AGAGCAGTCTCAGCACAG-A AGAGCAGTCTCAGCACAG A . . TRANSCRIPT_ID=ENST00000314675.11;GENOMIC_POSITION=chr1-278:262;GENE_SYMBOL=UBXN11
ENSG00000158062.20 36159 ENSG00000158062.20-36158-GAGGGGGCGGT-G GAGGGGGCGGT G . . TRANSCRIPT_ID=ENST00000314675.11;GENOMIC_POSITION=chr1-877:868;GENE_SYMBOL=UBXN11
ENSG00000158062.20 35951 ENSG00000158062.20-35950-T-TTATTACACGAGAGATA T TTATTACACGAGAGATA . . TRANSCRIPT_ID=ENST00000314675.11;GENOMIC_POSITION=chr1-1085:1086;GENE_SYMBOL=UBXN11
ENSG00000158062.20 36361 ENSG00000158062.20-36360-GTGCAGCCAGCA-G GTGCAGCCAGCA G . . TRANSCRIPT_ID=ENST00000314675.11;GENOMIC_POSITION=chr1-675:665;GENE_SYMBOL=UBXN11
ENSG00000158062.20 15456 ENSG00000158062.20-15455-ACAGCTGGATTCTAGAAGT-A ACAGCTGGATTCTAGAAGT A . . TRANSCRIPT_ID=ENST00000314675.11;GENOMIC_POSITION=chr1-21580:21563;GENE_SYMBOL=UBXN11
20 changes: 20 additions & 0 deletions test/files/fuzz/16/brute_force.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
CPCESAR
CPCESARGLIGAP
GIRIYGDEDEVDMLSDGCGSEAGR
GTHWRPLR
GTHWRPLRWTLTGCWPACR
GTHWRPLRWTLTGCWPACRILVSWW
HRCPCESAR
HRCPCESARGLIGAP
ISVPSCYGGR
ISVPSCYGGRQGDSLAPPEVDFDR
ISVRHRCPCESAR
ISVRSAR
ISVRSARGLIGAP
IYGDEDEVDMLSDGCGSEAGR
IYGDEDEVDMLSDGCGSEALPAMAA
IYGDEDEVDMLSDGCGSEEKISVR
RGIRIYGDEDEVDMLSDGCGSEAGR
SARGLIGAP
WTLTGCWPACR
WTLTGCWPACRILVSWW
15 changes: 15 additions & 0 deletions test/files/fuzz/16/fake_variants.gvf
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
##fileformat=VCFv4.2
##mopepgen_version=0.9.0
##parser=
##reference_index=
##genome_fasta=
##annotation_gtf=
##source=
##CHROM=<Description='Gene ID'>
##INFO=<ID=TRANSCRIPT_ID,Number=1,Type=String,Description="Transcript ID">
##INFO=<ID=GENE_SYMBOL,Number=1,Type=String,Description="Gene Symbol">
##INFO=<ID=GENOMIC_POSITION,Number=1,Type=String,Description="Genomic Position">
#CHROM POS ID REF ALT QUAL FILTER INFO
ENSG00000158062.20 17381 ENSG00000158062.20-17380-GAAAAGATCTCAGT-G GAAAAGATCTCAGT G . . TRANSCRIPT_ID=ENST00000314675.11;GENOMIC_POSITION=chr1-19655:19643;GENE_SYMBOL=UBXN11
ENSG00000158062.20 17396 ENSG00000158062.20-17395-CCTTCCTGCTATGGC-C CCTTCCTGCTATGGC C . . TRANSCRIPT_ID=ENST00000314675.11;GENOMIC_POSITION=chr1-19640:19627;GENE_SYMBOL=UBXN11
ENSG00000158062.20 17412 ENSG00000158062.20-17411-GCATAGGTGCCCCTGTGAG-G GCATAGGTGCCCCTGTGAG G . . TRANSCRIPT_ID=ENST00000314675.11;GENOMIC_POSITION=chr1-19624:19607;GENE_SYMBOL=UBXN11
21 changes: 21 additions & 0 deletions test/integration/test_call_variant_peptides.py
Original file line number Diff line number Diff line change
Expand Up @@ -672,3 +672,24 @@ def test_call_variant_peptide_case42(self):
expected = self.data_dir/'fuzz/14/brute_force.txt'
reference = self.data_dir/'downsampled_reference/ENST00000452737.5'
self.default_test_case(gvf, reference, expected)

def test_call_variant_peptide_case43(self):
""" Test case from fuzz test that indel variants after collapsing
not considered. #533 """
gvf = [
self.data_dir/'fuzz/15/fake_variants.gvf'
]
expected = self.data_dir/'fuzz/15/brute_force.txt'
reference = self.data_dir/'downsampled_reference/ENST00000314675.11'
self.default_test_case(gvf, reference, expected)

def test_call_variant_peptide_case44(self):
""" Test case from fuzz test that nodes are missed when there are
multiple frameshifting mutation that makes it go back to the original
reading frame. #534 """
gvf = [
self.data_dir/'fuzz/16/fake_variants.gvf'
]
expected = self.data_dir/'fuzz/16/brute_force.txt'
reference = self.data_dir/'downsampled_reference/ENST00000314675.11'
self.default_test_case(gvf, reference, expected)

0 comments on commit a15c4d4

Please sign in to comment.