diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index 1e41220f358..340bd911b32 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -1,3 +1,4 @@ +import re from typing import Callable, Dict, List, Optional, Tuple from thinc.api import Model @@ -7,6 +8,7 @@ from ...symbols import POS from ...tokens import Token from ...vocab import Vocab +from ..char_classes import COMBINING_DIACRITICS PUNCT_RULES = {"«": '"', "»": '"'} @@ -51,9 +53,10 @@ def __init__( super().__init__( vocab, model, name, mode=mode, overwrite=overwrite, scorer=scorer ) + self._diacritics_re = re.compile(f"[{COMBINING_DIACRITICS}]") def _pymorphy_lemmatize(self, token: Token) -> List[str]: - string = token.text + string = self._diacritics_re.sub("", token.text) univ_pos = token.pos_ morphology = token.morph.to_dict() if univ_pos == "PUNCT": diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py index 66aa7e3a6b4..77e0ac86e12 100644 --- a/spacy/tests/lang/ru/test_lemmatizer.py +++ b/spacy/tests/lang/ru/test_lemmatizer.py @@ -65,6 +65,8 @@ def test_ru_lemmatizer_works_with_different_pos_homonyms( ("гвоздики", "Gender=Masc", "гвоздик"), ("вина", "Gender=Fem", "вина"), ("вина", "Gender=Neut", "вино"), + ("жену", "Gender=Fem", "жена"), + ("жену́", "Gender=Fem", "жена"), ], ) def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morph, lemma):