Merge pull request #559 from uclahs-cds/czhu-fix-call-noncoding

Fix the problem of X in peptide sequence
uclahs-cds · Aug 27, 2022 · c56358a · c56358a
2 parents 2d4c108 + ca9b71a
commit c56358a
Show file tree

Hide file tree

Showing 4 changed files with 20 additions and 6 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,7 +10,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 ## [Unreleased]
 
-## [0.9.3] - 2022-08-05
+## [0.9.3] - 2022-08-26
 
 ### Fixed
 
@@ -22,6 +22,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 - Fixed issue that the `cpop_collapsed` attribute was not retained after merging, so peptides that don't end with cleavage sites were yielded. #554
 
+- Fixed a problem caused by `N` in the reference DNA sequence, which led to `X` in peptide sequences. #556
+
 ---
 
 ## [0.9.2] - 2022-07-29

diff --git a/moPepGen/svgraph/VariantPeptideDict.py b/moPepGen/svgraph/VariantPeptideDict.py
@@ -153,9 +153,10 @@ def is_valid_seq(self, seq:Seq, blacklist:Set[str]) -> bool:
         max_length = self.cleavage_params.max_length
         min_mw = self.cleavage_params.min_mw
 
-        return seq not in blacklist and \
-            min_length <= len(seq) <= max_length and \
-            SeqUtils.molecular_weight(seq, 'protein') >= min_mw
+        return seq not in blacklist \
+            and min_length <= len(seq) <= max_length \
+            and 'X' not in seq \
+            and SeqUtils.molecular_weight(seq, 'protein') >= min_mw
 
     def join_miscleaved_peptides(self, check_variants:bool,
             additional_variants:List[VariantRecord], blacklist:Set[str],

diff --git a/moPepGen/util/downsample_reference.py b/moPepGen/util/downsample_reference.py
@@ -241,7 +241,7 @@ def get_noncoding_translate(tx_id:str, anno:gtf.GenomicAnnotation,
         orf = f"ORF{start}:{end}"
         alt_protein_id = f"{protein_id}-{orf}"
         alt_tx_id = f"{tx_id}-{orf}"
-        description = f"{alt_protein_id}|{alt_tx_id}|{gene_id}"
+        description = f"{alt_protein_id}|{alt_tx_id}|{gene_id}|-"
         aa_seq.id = alt_protein_id
         aa_seq.protein_id = alt_protein_id
         aa_seq.transcript_id = alt_tx_id

diff --git a/test/unit/test_variant_peptide_dict.py b/test/unit/test_variant_peptide_dict.py
@@ -1,7 +1,8 @@
 """ Test Module for VariantPeptideDict """
 import unittest
 from test.unit import create_aa_record, create_variants
-from moPepGen.svgraph.VariantPeptideDict import VariantPeptideDict, \
+from moPepGen import params
+from moPepGen.svgraph.VariantPeptideDict import MiscleavedNodes, VariantPeptideDict, \
     VariantPeptideMetadata
 import moPepGen.aa.VariantPeptideIdentifier as vpi
 
@@ -65,3 +66,13 @@ def test_get_peptide_sequences_circ_rna(self):
         seqs = pool.get_peptide_sequences()
         self.assertEqual({str(x.seq) for x in seqs}, {'SSSSSSSSSR'})
         self.assertEqual(list(seqs)[0].description, 'CIRC-ENST0001-E1-E2-E3|1')
+
+
+class TestCaseMiscleavedNodes(unittest.TestCase):
+    """ Test cases for MiscleavedNodes """
+    def test_is_valid_x(self):
+        """ Test that when X is found in the sequence, it is recognized as an
+        invalid sequence. """
+        cleavage_params = params.CleavageParams(enzyme='trypsin')
+        misc_nodes = MiscleavedNodes([], cleavage_params)
+        self.assertFalse(misc_nodes.is_valid_seq('AAAAXAAA', set()))