# Copyright 2022 Max Schmaltz: @maxschmaltz
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ************************************************************************
import os, sys
ROOT = os.path.dirname(__file__)
depth = 0
for _ in range(depth): ROOT = os.path.dirname(ROOT)
sys.path.append(ROOT)

# import required modules
# (os is already imported above; Union is needed for ADJInflector.__call__)
import json
import re
from typing import Union
# import spaCy
import spacy

# in DERBI we need a compound splitter; we use dtuggener/CharSplit.
# to access it, we first need to do some manipulations:
# clone the repo into folder 'CharSplit'
# from git import Repo
# if not os.path.exists('./CharSplit'):
#     Repo.clone_from('https://github.com/dtuggener/CharSplit', 'CharSplit')
# in folder 'CharSplit' create an empty file '__init__.py'
# for Python to recognize the folder as a package
# filepath = os.path.join('CharSplit', '__init__.py')
# with open(filepath, 'w') as i:
#     i.write('')
# in file 'CharSplit/charsplit/__init__.py' we delete all the text,
# as Python throws an exception when running it
# filepath = os.path.join('CharSplit/charsplit', '__init__.py')
# with open(filepath, 'w') as i:
#     i.write('')
# finally import
from CharSplit.charsplit.splitter import Splitter
splitter = Splitter()

# import required scripts
# from DERBI import Tools
import Tools
'''
Each POS has its own inflector (the POS-to-inflector mapping can be found at
https://github.com/maxschmaltz/DERBI/blob/main/Router.json).
All of them are organized in generally the same way though:
    1. Search in lexicon.
        If the desired form of the input token is
        some kind of exception, it is obtained via the lexicon.
        Usually the whole word is inflected at once;
        in some cases, however, only the stem is altered, and the ending
        is then obtained with the regular model.
    2. Apply regular model.
        After the input has gone through the lexicon, the output is
        either returned as-is or inflected with the regular model.
'''
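
# for reference, the rules consumed below have roughly the following shape
# (an illustrative sketch inferred from how the rules are accessed in the code,
# not the literal .lexc/.fa file format):
# a lexicon rule:    {'rule': {'Degree': ['Cmp', 'Sup']}, 'output': 'besser'}
# an automaton rule: {'rule': {'Number': ['Plur']}, 'pattern': '$', 'to_sub': 'en'}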

# Basic Parent Class
class BasicInflector:

    def __init__(self, fa_path: str=None, lexc_path: str=None):
        self.auto_rules, self.lexc_rules = None, None
        # obtain rules
        if fa_path is not None:
            self.auto_rules = Tools.StateMachine(fa_path).rules
        if lexc_path is not None:
            self.lexc_rules = Tools.Lexicon(lexc_path).rules

    def search_in_lexicon(self, lemma: str, target_tags: str) -> tuple:
        if (self.lexc_rules is None) or (self.lexc_rules.get(lemma) is None):
            return lemma, Tools.split_tags(target_tags)
        tags_dict = Tools.split_tags(target_tags)
        curr_rules = self.lexc_rules[lemma]
        for rule in curr_rules:
            # we require only a partial match here
            rule_is_applicable = set([((rule['rule'].get(cat) is None) or (feat in rule['rule'][cat]))
                                      for cat, feat in tags_dict.items()]) == {True}
            if rule_is_applicable:
                # return the output together with the unmatched features (for further inflection)
                return rule['output'], {cat: feat for cat, feat in tags_dict.items() if rule['rule'].get(cat) is None}
        # no rule matched: fall back to the lemma and the full tag set
        return lemma, Tools.split_tags(target_tags)

    def automata(self, token: str, tags_dict: dict) -> str:
        if self.auto_rules is None:
            return token
        for rule in self.auto_rules:
            # here, though, we require a full match
            rule_is_applicable = (set([tags_dict.get(cat, '') in feats
                                       for cat, feats in rule['rule'].items()]) == {True}) or (rule['rule'] == {})
            if rule_is_applicable:
                token = re.sub(rule['pattern'], rule['to_sub'], token)
                # print(rule, token)
        return token

    def __call__(self, token: spacy.tokens.token.Token, target_tags: str) -> str:
        # the common way:
        # 1. search in lexicon
        output, remaining_tags = self.search_in_lexicon(token.lemma_.lower(), target_tags)
        if not len(remaining_tags):
            return output
        # 2. apply regular model
        return self.automata(output, remaining_tags)
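
# a minimal usage sketch (hypothetical paths and model name; within DERBI
# the inflectors are instantiated and dispatched via Router.json):
# nlp = spacy.load('de_core_news_sm')
# doc = nlp('Haus')
# inflector = BasicInflector('./meta/automata/NOUN.fa', './meta/lexicons/NOUN.lexc')
# inflector(doc[0], 'Case=Dat|Number=Plur')   # e.g. 'häusern'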

# CCONJ, INTJ, NUM, PART, SCONJ, X
class Uninflected(BasicInflector):

    def search_in_lexicon(self, *args):
        pass

    def automata(self, *args):
        pass

    # drop tags
    def __call__(self, token: spacy.tokens.token.Token, _):
        return token.norm_

# ADJ, ADV
class ADJInflector(BasicInflector):

    # most one-syllable ADJs and ADVs
    # take an umlaut in the comparative and superlative,
    # e.g. groß -> größer
    def umlaut(self, token: str) -> str:
        if token.count('#') != 1:
            return token.replace('#', '')
        token = re.sub('#a', 'ä', token)
        token = re.sub('#o', 'ö', token)
        token = re.sub('#u', 'ü', token)
        # if not applicable, there is no umlaut
        return token.replace('#', '')

    def __call__(self, token: Union[spacy.tokens.token.Token, str], target_tags: str) -> str:
        # from AUX and VERB we can receive <str> tokens
        # (when Verbform=Part),
        # so in that case we skip the lemma corrections below
        if not isinstance(token, str):
            # for some reason spaCy adds 'en' to the lemma of ADVs in Degree=Pos,
            # e.g. 'schnell'.lemma_ == 'schnellen' but 'schneller'.lemma_ == 'schnell'
            if (token.pos_ == 'ADV') and (token.text.lower() + 'en' == token.lemma_):
                token.lemma_ = token.text
            # likewise, spaCy keeps 'e'/'en'/... on the lemma of ADJs in some forms,
            # e.g. 'rote'.lemma_ == 'rote' but 'roten'.lemma_ == 'rot'
            if (token.pos_ == 'ADJ') and (token.text.lower() == token.lemma_) and (len(Tools.split_tags(target_tags)) > 1):
                token.lemma_ = re.sub('e[mnrs]{0,1}$', '', token.lemma_)
            output, remaining_tags = self.search_in_lexicon(token.lemma_.lower(), target_tags)
            if not len(remaining_tags):
                return output
        else:
            output = token
            remaining_tags = Tools.split_tags(target_tags)
        auto_output = self.automata(output, remaining_tags)
        # apply the umlaut, if marked
        return self.umlaut(auto_output)
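
# illustration of the '#' umlaut marker (assuming the lexicon yields a marked
# stem like 'gr#oß' for 'groß' in Degree=Cmp): the automaton appends the ending,
# and umlaut() then turns 'gr#oßer' into 'größer'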

# ADP
class ADPInflector(BasicInflector):

    def automata(self, *args):
        pass

    def __call__(self, token: spacy.tokens.token.Token, target_tags: str) -> str:
        output, remaining_tags = self.search_in_lexicon(token.lemma_.lower(), target_tags)
        if not len(remaining_tags):
            return output
        # if an adposition came through the lexicon and returned with
        # remaining tags, it is simply not there (as ADP.lexc defines
        # all the features); so we are trying to inflect the adposition
        # into a form it cannot have
        raise ValueError('Features "' + target_tags +
                         '" are not available for word "' + token.norm_ + '".')

# AUX
class AUXInflector(BasicInflector):

    def __init__(self, fa_path: str=None, lexc_path: str=None):
        super().__init__(fa_path, lexc_path)
        # ADJInflector for participles
        self.adj_inflector = ADJInflector('./meta/automata/ADJ.fa')

    # strong German verbs take an umlaut
    # when Mood=Sub,
    # e.g. war -> wäre
    def umlaut(self, token: str) -> str:
        if '&' not in token:
            return token
        past_stem = token[:token.index('&')]
        sub_stem = re.sub('a', 'ä', past_stem)
        sub_stem = re.sub('o', 'ö', sub_stem)
        sub_stem = re.sub('u', 'ü', sub_stem)
        return re.sub('^' + past_stem + '&', sub_stem, token)
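
    # illustration (assuming the rules mark the Subjunctive stem boundary with '&'):
    # for 'sein', an intermediate form 'war&e' has past_stem 'war', which is
    # umlauted to 'wär', so umlaut('war&e') returns 'wäre'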

    def __call__(self, token: spacy.tokens.token.Token, target_tags: str) -> str:
        # restrict imperative formation for modal verbs
        if ((token.lemma_.lower() in ['dürfen', 'können', 'mögen', 'müssen', 'sollen', 'wollen'])
                and ('Mood=Imp' in target_tags)):
            raise ValueError('No Imperative forms available for modal verbs.')
        if token.lemma_ == 'habe':
            token.lemma_ = 'haben'
        if 'Verbform=Part' in target_tags:
            target_tags = re.sub('Verbform=Part', 'Tense=Past|Verbform=Part', target_tags)
        output, remaining_tags = self.search_in_lexicon(token.lemma_.lower(), target_tags)
        if not len(remaining_tags):
            return output
        output = self.automata(output, remaining_tags)
        # apply the umlaut if applicable
        output = self.umlaut(output)
        # use ADJInflector for participles,
        # as they inflect the same way as adjectives
        if 'Verbform=Part' in target_tags:
            return self.adj_inflector(output, re.sub(r'Tense=Past\|', '', target_tags) + '|Degree=Pos')
        return output

# DET
class DETInflector(BasicInflector):

    def parse_poss_dets(self, token: str) -> str:
        # 'euer' is distinct, as it has an epenthetic vowel
        euer_pattern = re.compile('eue{0,1}r')
        poss_pattern = re.compile('[dms]ein|unser|ihr')
        if euer_pattern.search(token) is not None:
            match = 'euer'
        elif poss_pattern.search(token) is not None:
            match = poss_pattern.search(token)[0]
        else:
            match = 'mein'
        return match
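
    # illustration: parse_poss_dets('unserem') -> 'unser',
    # parse_poss_dets('eures') -> 'euer' (via the 'eue{0,1}r' pattern),
    # and anything unrecognized falls back to 'mein'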

    def __call__(self, token: spacy.tokens.token.Token, target_tags: str) -> str:
        # restrict plural formation for 'ein'
        if (re.search('^ein(e[mnrs]{0,1}){0,1}', token.lemma_.lower()) is not None) and ('Number=Plur' in target_tags):
            raise ValueError('Article "ein" has only Singular forms.')
        # detect possessive pronouns
        input_lemma = token.lemma_.lower() if 'Poss=Yes' not in target_tags else self.parse_poss_dets(token.text.lower())
        output, remaining_tags = self.search_in_lexicon(input_lemma, target_tags)
        if not len(remaining_tags):
            return output
        return self.automata(output, remaining_tags)

# NOUN
class NOUNInflector(BasicInflector):

    def __init__(self, fa_path: str=None, lexc_path: str=None):
        super().__init__(fa_path, lexc_path)
        # ADJInflector for nouns with adjectival declension
        self.adj_inflector = ADJInflector('./meta/automata/ADJ.fa')

    def __call__(self, token: spacy.tokens.token.Token, target_tags: str) -> str:
        # nouns with adjectival declension
        if 'Declination=' in target_tags:
            if re.search('e[mnrs]{0,1}$', token.norm_) is None:
                raise ValueError('Could not decline word "' + token.norm_ + '" as an ADJ.')
            token.lemma_ = re.sub('e[mnrs]{0,1}$', '', token.lemma_.lower())
            return self.adj_inflector(token, target_tags + '|Degree=Pos')
        # primary search in lexicon
        output, remaining_tags = self.search_in_lexicon(token.lemma_.lower(), target_tags)
        if not len(remaining_tags):
            return output
        # if that fails, try to split the noun as a compound and search once again
        splitted = splitter.split_compound(output)[0]
        # a zero score means it is not a compound
        if splitted[0] == 0:
            return self.automata(output, remaining_tags)
        splitted = splitted[1:]
        # search once again, now for the compound head
        output, remaining_tags = self.search_in_lexicon(splitted[1].lower(), target_tags)
        if not len(remaining_tags):
            return output
        # restore the compound
        return splitted[0].lower() + self.automata(output, remaining_tags)
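
# illustration (hypothetical score; CharSplit returns (score, modifier, head)
# candidates): splitter.split_compound('Bundesland')[0] might be
# (0.9, 'Bundes', 'Land'), so 'bundesland' is inflected by inflecting the
# head 'land' and re-attaching 'bundes' in front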

# PRON
class PRONInflector(BasicInflector):

    def __call__(self, token: spacy.tokens.token.Token, target_tags: str) -> str:
        # drop Prontype=Prs for reflexives so as not to confuse the state machine:
        # every reflexive pronoun carries both 'Reflex=Yes' and 'Prontype=Prs',
        # but we need only Reflex=Yes
        if 'Reflex=Yes' in target_tags:
            target_tags = target_tags.replace('Prontype=Prs|', '')
        # assert lemma 'ich' for personal pronouns
        # (for some reason their lemmas vary)
        if 'Prontype=Prs' in target_tags:
            token.lemma_ = 'ich'
        output, remaining_tags = self.search_in_lexicon(token.lemma_.lower(), target_tags)
        if not len(remaining_tags):
            return output
        return self.automata(output, remaining_tags)

# PROPN
class PROPNInflector(BasicInflector):

    def search_in_lexicon(self, *args):
        pass

    def __call__(self, token: spacy.tokens.token.Token, target_tags: str) -> str:
        tags_dict = Tools.split_tags(target_tags)
        return self.automata(token.lemma_.lower(), tags_dict)

# VERB
class VERBInflector(AUXInflector):

    def __init__(self, fa_path: str=None, lexc_path: str=None):
        super().__init__(fa_path, lexc_path)
        # we need to distinguish between separable and inseparable prefixes
        with open('./meta/lexicons/verb_prefixes.json') as j:
            self.prefixes = json.load(j)

    # split a verb into its prefixes and the prefixless part
    def sep_prefixes(self, token: str) -> tuple:
        # first we have to strip the infinitive ending
        stem = re.sub('(en$|(?<=[lr])n$|(?<=tu)n$|(?<=sei)n$)', '', token)
        # second we substitute all the diphthongs with one-symbol characters
        # to know the exact syllable count (so that the function does not
        # separate a prefix and leave a syllable-less stem)
        dis = {
            'ei': 'E',
            'ie': 'I',
            'eu': 'U',
            'äu': 'Y'
        }
        di_pattern = re.compile('(ei|ie|eu|äu)')
        while di_pattern.search(stem) is not None:
            di = di_pattern.search(stem)[0]
            stem = re.sub(di, dis[di], stem)
        syls = lambda x: len(re.findall('[aeiouyäöüEIUY]', x))
        prefs = []
        # detect and separate prefixes (a verb may stack several)
        prefixes_pattern = re.compile('^' + '|^'.join(self.prefixes['sep'] + self.prefixes['insep']))
        while prefixes_pattern.search(stem) is not None:
            pref = prefixes_pattern.search(stem)[0]
            if syls(re.sub(pref, '', stem)) == 0:
                break
            prefs.append(pref)
            stem = re.sub(pref, '', stem, count=1)
        if not len(prefs):
            return ('', False, token)
        # restore diphthongs
        restore_dis = lambda x, matches: x if not len(matches) else restore_dis(re.sub(matches[-1],
                      {v: k for k, v in dis.items()}[matches[-1]], x), matches[:-1])
        # we also need to know whether the prefix complex is separable or inseparable
        insep = ((prefs[0] in self.prefixes['insep']) or (prefs[-1] in self.prefixes['insep']))
        prefs = restore_dis(''.join(prefs), re.findall('|'.join({v: k for k, v in dis.items()}.keys()), ''.join(prefs)))
        return prefs, insep, re.sub('^' + prefs, '', token)
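
    # illustration (assuming 'auf' is listed under 'sep' and 'ver' under 'insep'
    # in verb_prefixes.json):
    # sep_prefixes('aufstehen') -> ('auf', False, 'stehen')
    # sep_prefixes('verstehen') -> ('ver', True, 'stehen')
    # sep_prefixes('gehen')     -> ('', False, 'gehen')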

    # restore the separated prefixes
    def add_prefixes(self, prefixes: str, insep: bool, token: str, part: bool) -> str:
        if not len(prefixes):
            return re.sub('#', 'ge', token)
        # inseparable prefixes are joined at the beginning in any case
        # (and suppress the participial 'ge-')
        if insep:
            return prefixes + re.sub('#', '', token)
        # separable prefixes are joined in participles (with 'ge-' in between)
        if part:
            return prefixes + re.sub('#', 'ge', token)
        # separable prefixes are split off in finite and imperative forms
        return '(' + token + ' , ' + prefixes + ') '
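
    # illustration (assuming the automaton marks the 'ge-' slot with '#'):
    # add_prefixes('', False, '#standen', True)    -> 'gestanden'
    # add_prefixes('ver', True, '#standen', True)  -> 'verstanden'
    # add_prefixes('auf', False, '#standen', True) -> 'aufgestanden'
    # add_prefixes('auf', False, 'stehe', False)   -> '(stehe , auf) '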

    def __call__(self, token: spacy.tokens.token.Token, target_tags: str) -> str:
        # restrict imperative formation for modal verbs
        if ((token.lemma_.lower() in ['dürfen', 'können', 'mögen', 'müssen', 'sollen', 'wollen'])
                and ('Mood=Imp' in target_tags)):
            raise ValueError('No Imperative forms available for modal verbs.')
        if token.lemma_ == 'habe':
            token.lemma_ = 'haben'
        if target_tags == 'Verbform=Inf':
            return token.lemma_
        part = False
        if target_tags == 'Verbform=Part':
            part = True
            target_tags = 'Tense=Past|Verbform=Part'
        # separate the prefixes
        prefixes, insep, stem = self.sep_prefixes(token.lemma_.lower())
        # NB! in the lexicon we search only for the prefixless part
        output, remaining_tags = self.search_in_lexicon(stem, target_tags)
        output = self.automata(output, remaining_tags)
        # remove the '#' marker for -ieren verbs (their participle takes no 'ge-')
        output = re.sub(r'#(?=\w+iert$)', '', output)
        # apply the umlaut if applicable
        output = self.umlaut(output)
        # restore the prefixes:
        # inseparable prefixes, and separable ones in participles, are joined
        # at the beginning; otherwise the prefix is split off
        output = self.add_prefixes(prefixes, insep, output, part)
        # use ADJInflector for participles,
        # as they inflect the same way as adjectives
        if 'Verbform=Part' in target_tags:
            return self.adj_inflector(output, re.sub(r'Tense=Past\|', '', target_tags) + '|Degree=Pos')
        return output
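
# end-to-end illustration (hypothetical; in DERBI the inflectors are wired up
# via Router.json): a VERBInflector applied to a token with lemma 'aufstehen'
# and target_tags 'Verbform=Part' would be expected to return 'aufgestanden'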