Skip to content

Commit

Permalink
Merge pull request #673 from uclahs-cds/czhu-fix-vpi
Browse files Browse the repository at this point in the history
Noncoding peptide headers not parsed correctly
  • Loading branch information
zhuchcn authored Feb 2, 2023
2 parents a6e5e87 + ca405b7 commit a8482f3
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 28 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

## [Unreleased]

## [0.11.2] - 2023-2-3

### Fixed

- Noncoding peptide headers not parsed successfully by summarizeFasta #672

## [0.11.1] - 2023-1-31

### Fixed
Expand Down
71 changes: 49 additions & 22 deletions moPepGen/aa/VariantPeptideIdentifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,29 +68,36 @@ def parse_variant_peptide_id(label:str) -> List[VariantPeptideIdentifier]:
"""
variant_ids = []
for it in label.split(VARIANT_PEPTIDE_SOURCE_DELIMITER):
x_id, *var_ids, index = it.split('|')

orf_id = None
if len(var_ids) > 0 and var_ids[0].startswith('ORF'):
orf_id = var_ids.pop(0)

if x_id.startswith('FUSION-'):
first_variants:List[str] = []
second_variants:List[str] = []
for var_id in var_ids:
which_gene, var_id = var_id.split('-', 1)
if int(which_gene) == 1:
first_variants.append(var_id)
elif int(which_gene) == 2:
second_variants.append(var_id)
else:
raise ValueError('Variant is not valid')
variant_id = FusionVariantPeptideIdentifier(x_id, first_variants,
second_variants, orf_id, index)
elif x_id.startswith('CIRC-') or x_id.startswith('CI-'):
variant_id = CircRNAVariantPeptideIdentifier(x_id, var_ids, orf_id, index)
fields = it.split('|')
if fields[-2].startswith('ORF'):
# Noncoding peptide
tx_id, gene_id, orf_id, index = fields
variant_id = NoncodingPeptideIdentifier(tx_id, gene_id, orf_id, int(index))

else:
variant_id = BaseVariantPeptideIdentifier(x_id, var_ids, orf_id, index)
x_id, *var_ids, index = fields

orf_id = None
if len(var_ids) > 0 and var_ids[0].startswith('ORF'):
orf_id = var_ids.pop(0)

if x_id.startswith('FUSION-'):
first_variants:List[str] = []
second_variants:List[str] = []
for var_id in var_ids:
which_gene, var_id = var_id.split('-', 1)
if int(which_gene) == 1:
first_variants.append(var_id)
elif int(which_gene) == 2:
second_variants.append(var_id)
else:
raise ValueError('Variant is not valid')
variant_id = FusionVariantPeptideIdentifier(x_id, first_variants,
second_variants, orf_id, index)
elif x_id.startswith('CIRC-') or x_id.startswith('CI-'):
variant_id = CircRNAVariantPeptideIdentifier(x_id, var_ids, orf_id, index)
else:
variant_id = BaseVariantPeptideIdentifier(x_id, var_ids, orf_id, index)

variant_ids.append(variant_id)
return variant_ids
Expand Down Expand Up @@ -185,3 +192,23 @@ def second_tx_id(self):
""" get second transcript id """
_,_,second = self.fusion_id.split('-')
return second.split(':')[0]

class NoncodingPeptideIdentifier(VariantPeptideIdentifier):
    """ Identifier of a noncoding peptide.

    The string form joins the available fields with '|' in the order
    transcript ID, gene ID, ORF ID, peptide index, for example
    'ENST0001|ENSG0001|ORF1|1'.
    """
    def __init__(self, transcript_id:str, gene_id:str, orf_id:str=None, index:int=None):
        """ Constructor.

        Args:
            transcript_id (str): First field of the peptide label.
            gene_id (str): Second field of the peptide label.
            orf_id (str): ORF field of the label (e.g. 'ORF1'), or None.
            index (int): Trailing index field of the label, or None.
        """
        self.transcript_id = transcript_id
        self.gene_id = gene_id
        self.orf_id = orf_id
        self.index = index

    def __str__(self) -> str:
        """ Rebuild the '|' delimited label from the stored fields. Empty or
        None string fields and a None index are left out. """
        fields = [self.transcript_id]
        if self.gene_id:
            fields.append(self.gene_id)
        if self.orf_id:
            fields.append(self.orf_id)
        # Compare with None instead of relying on truthiness, otherwise an
        # index of 0 would be silently dropped from the label.
        if self.index is not None:
            fields.append(str(self.index))
        return '|'.join(fields)
13 changes: 11 additions & 2 deletions moPepGen/aa/VariantPeptideLabel.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,11 @@ def from_variant_peptide_minimal(peptide:AminoAcidSeqRecord
info_list:List[VariantPeptideInfo] = []
variant_ids = pi.parse_variant_peptide_id(peptide.description)
for variant_id in variant_ids:
if isinstance(variant_id, pi.CircRNAVariantPeptideIdentifier):
if isinstance(variant_id, pi.NoncodingPeptideIdentifier):
gene_ids = [variant_id.gene_id]
var_ids = {}

elif isinstance(variant_id, pi.CircRNAVariantPeptideIdentifier):
gene_ids = None
var_ids = {}

Expand Down Expand Up @@ -156,7 +160,12 @@ def from_variant_peptide(peptide:AminoAcidSeqRecord,
info_list = []
variant_ids = pi.parse_variant_peptide_id(peptide.description)
for variant_id in variant_ids:
if isinstance(variant_id, pi.CircRNAVariantPeptideIdentifier):
if isinstance(variant_id, pi.NoncodingPeptideIdentifier):
tx_id = variant_id.transcript_id
gene_ids = [variant_id.gene_id]
var_ids = {}

elif isinstance(variant_id, pi.CircRNAVariantPeptideIdentifier):
circ_rna_id = variant_id.circ_rna_id
tx_id = circ_rna_id.split('-', 2)[1]
gene_ids = [anno.transcripts[tx_id].transcript.gene_id]
Expand Down
2 changes: 1 addition & 1 deletion test/unit/test_peptide_pool_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ def test_from_variant_peptide_noncoding(self):
label_map_data = {'ENSG0001': {'SNV-1157-G-A': 'sSNV'}}
label_map = LabelSourceMapping(label_map_data)

peptide = create_aa_record('KHIRJ','ENST0004|ORF1|1')
peptide = create_aa_record('KHIRJ','ENST0004|ENSG0004|ORF1|1')
infos = VariantPeptideInfo.from_variant_peptide(peptide, anno, label_map)
self.assertIn('Noncoding', infos[0].sources)

Expand Down
6 changes: 3 additions & 3 deletions test/unit/test_variant_peptide_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,11 +208,11 @@ def test_parse_variant_id_circ_rna_and_snv(self):

def test_parse_variant_id_orf(self):
""" parse variant with orf """
label = 'ENST0001|ORF1|1'
label = 'ENST0001|ENSG0001|ORF1|1'
peptide_ids = aa.parse_variant_peptide_id(label)
self.assertEqual(len(peptide_ids), 1)
self.assertIsInstance(peptide_ids[0], pi.BaseVariantPeptideIdentifier)
peptide_ids:List[pi.BaseVariantPeptideIdentifier]
self.assertIsInstance(peptide_ids[0], pi.NoncodingPeptideIdentifier)
peptide_ids:List[pi.NoncodingPeptideIdentifier]
self.assertEqual(peptide_ids[0].transcript_id, 'ENST0001')
self.assertEqual(peptide_ids[0].orf_id, 'ORF1')
self.assertEqual(str(peptide_ids[0]), label)

0 comments on commit a8482f3

Please sign in to comment.