Skip to content

Commit

Permalink
Merge pull request #590 from uclahs-cds/czhu-fuzz-test
Browse files Browse the repository at this point in the history
Refactor to better handle circRNA to ensure the same variant must appear or not appear in all loops
  • Loading branch information
zhuchcn authored Oct 18, 2022
2 parents f5d06ed + 52e9a75 commit f076b5d
Show file tree
Hide file tree
Showing 14 changed files with 313 additions and 126 deletions.
4 changes: 3 additions & 1 deletion moPepGen/cli/call_variant_peptide.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,8 @@ def call_peptide_circ_rna(record:circ.CircRNAModel, ref:params.ReferenceData,
gene_seq = gene_seqs[gene_id]
record.fragments.sort()
circ_seq = record.get_circ_rna_sequence(gene_seq)
for loc in circ_seq.locations:
loc.ref.seqname = record.id

# Alternative splicing should not be included. Alternative splicing
# represented as Insertion, Deletion or Substitution.
Expand Down Expand Up @@ -478,4 +480,4 @@ def call_peptide_circ_rna(record:circ.CircRNAModel, ref:params.ReferenceData,
cgraph.fit_into_codons()
pgraph = cgraph.translate()
pgraph.create_cleavage_graph()
return pgraph.call_variant_peptides(blacklist=ref.canonical_peptides)
return pgraph.call_variant_peptides(blacklist=ref.canonical_peptides, circ_rna=record)
9 changes: 6 additions & 3 deletions moPepGen/seqvar/VariantRecordWithCoordinate.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ def shift(self, index:int) -> VariantRecordWithCoordinate:
variant=self.variant,
location=FeatureLocation(
start=max(self.location.start + index, 0),
end=self.location.end + index
end=self.location.end + index,
seqname=self.location.seqname
),
is_stop_altering=self.is_stop_altering
)
Expand All @@ -34,7 +35,7 @@ def to_protein_coordinates(self) -> VariantRecordWithCoordinate:
end = math.ceil(self.location.end / 3)
return seqvar.VariantRecordWithCoordinate(
variant=self.variant,
location=FeatureLocation(start=start, end=end),
location=FeatureLocation(start=start, end=end, seqname=self.location.seqname),
is_stop_altering=self.is_stop_altering
)

Expand All @@ -43,7 +44,9 @@ def __getitem__(self, index) -> VariantRecordWithCoordinate:
start, stop, _ = index.indices(self.location.end)
start_index = max([start, self.location.start])
end_index = min([stop, self.location.end])
location = FeatureLocation(start=start_index, end=end_index)
location = FeatureLocation(
start=start_index, end=end_index, seqname=self.location.seqname
)
return VariantRecordWithCoordinate(
variant=self.variant,
location=location
Expand Down
61 changes: 60 additions & 1 deletion moPepGen/svgraph/PVGNode.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@
from __future__ import annotations
import copy
from functools import cmp_to_key
import math
from typing import Dict, List, Set
from moPepGen import aa, seqvar
from moPepGen import aa, circ, seqvar
from moPepGen.SeqFeature import FeatureLocation
from moPepGen.seqvar.VariantRecord import VariantRecord
from moPepGen.svgraph.SubgraphTree import SubgraphTree


class PVGNode():
Expand Down Expand Up @@ -149,6 +151,53 @@ def is_subgraph_start(self) -> bool:
return any(x.subgraph_id != self.subgraph_id for x in self.in_nodes) and \
all(x.subgraph_id == self.subgraph_id for x in self.out_nodes)

def is_at_least_one_loop_downstream(self, other, subgraphs:SubgraphTree,
circ_rna:circ.CircRNAModel) -> bool:
""" Checks if is at least one loop downstream to the other node. """
if self.seq.locations:
i = self.seq.locations[-1].ref.start
x_level = subgraphs[self.seq.locations[-1].ref.seqname].level
else:
for v in reversed(self.variants):
if not v.variant.is_circ_rna():
i = math.floor(v.variant.location.start / 3)
x_level = subgraphs[v.location.seqname].level
break
else:
raise ValueError("Failed to find a non circRNA variant.")

if other.seq.locations:
j = other.seq.locations[0].ref.start
y_level = subgraphs[other.seq.locations[0].ref.seqname].level
else:
for v in other.variants:
if not v.variant.is_circ_rna():
j = math.floor(v.variant.location.start)
y_level = subgraphs[v.location.seqname].level
break
else:
raise ValueError("Failed to find a non circRNA variant.")

if x_level - y_level > 1:
return True

if x_level == y_level:
return False

if x_level - y_level == 1:
for fragment in circ_rna.fragments:
frag = FeatureLocation(
start=math.floor((fragment.location.start - 3) / 3),
end=math.floor(fragment.location.end / 3)
)
if i in frag and j in frag:
return i >= j
if i in frag:
return False
if j in frag:
return True
raise ValueError('Locations not found from the fragments of the circRNA.')

def has_any_in_bridge(self) -> None:
""" Check if it has any incoming node that is bridge """
return any(node.is_bridge() for node in self.in_nodes)
Expand Down Expand Up @@ -546,6 +595,16 @@ def has_variant_at(self, start:int, end:int) -> bool:
return True
return False

def get_subgraph_id_at(self, i:int) -> str:
""" Get the subgraph ID at the ith amino acid of the node's sequence. """
for loc in self.seq.locations:
if i in loc.query:
return loc.ref.seqname
for v in self.variants:
if i in v.location:
return v.location.seqname
raise ValueError("Failed to get the subgraph ID.")

def is_missing_variant(self, variant:seqvar.VariantRecord) -> bool:
""" Checks if in the coordinates of the node if any variant from a
given set is missing. This is used for circRNA to ensure that the
Expand Down
21 changes: 18 additions & 3 deletions moPepGen/svgraph/PVGOrf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,25 @@
from __future__ import annotations
from typing import Set, List
import copy
from moPepGen import seqvar
from moPepGen import circ, seqvar
from moPepGen.svgraph.PVGNode import PVGNode
from moPepGen.svgraph.SubgraphTree import SubgraphTree


class PVGOrf():
""" Helper class for an ORF and its corresponding start gain variants. """
def __init__(self, orf:List[int,int]=None,
start_gain:Set[seqvar.VariantRecord]=None):
start_gain:Set[seqvar.VariantRecord]=None, start_node:PVGNode=None):
""" constructor """
self.orf = orf or None
self.start_gain = start_gain or set()
self.start_node = start_node

def copy(self) -> PVGOrf:
""" copy """
orf = self.orf
start_gain = copy.copy(self.start_gain)
return self.__class__(orf, start_gain)
return self.__class__(orf, start_gain, self.start_node)

def __eq__(self, other:PVGOrf) -> bool:
""" equal """
Expand Down Expand Up @@ -60,3 +63,15 @@ def __le__(self, other:PVGOrf) -> bool:
def __hash__(self):
""" hash """
return hash((tuple(self.orf), frozenset(self.start_gain)))

def is_valid_orf(self, node:PVGNode, subgraphs:SubgraphTree,
circ_rna:circ.CircRNAModel) -> bool:
""" Checks if it is a valid orf of a downstream node. """
if not node.is_at_least_one_loop_downstream(self.start_node, subgraphs, circ_rna):
return True
start_gain = {x for x in self.start_gain if not x.is_circ_rna()}
start_gain.update(x.variant for x in self.start_node.variants
if not x.variant.is_circ_rna())
variants = {x.variant for x in node.variants if not x.variant.is_circ_rna()}
return not any(node.is_missing_variant(v) for v in start_gain) \
and not any(x not in start_gain for x in variants)
Loading

0 comments on commit f076b5d

Please sign in to comment.