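'''
linguistic.py

Linguistic complexity indices for Chinese texts: lexical, clausal,
collocation-based, bigram/dependency-based, and construction-based
measures. Entry point: getLinguisticIndices(text_dict).
'''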
import re
from math import sqrt

import numpy as np

from utils.hsk import *
from utils.clausal import *
from utils.coll import *
from utils.bidep import *
from utils.const import *


def get_lexical_indices(text_wordlist, text_level_dict):
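    '''
    Lexical diversity and sophistication.

    LEXICAL_RTTR: root type-token ratio (types / sqrt(tokens)).
    LEXICAL_SOP2: tokens at HSK levels 5-7 / sqrt(tokens).
    '''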
lexical_indices = {}
sophistication2 = (text_level_dict[5] + text_level_dict[6] + text_level_dict[7]) / sqrt(len(text_wordlist))
lexical_indices['LEXICAL_RTTR'] = len(set(text_wordlist)) / sqrt(len(text_wordlist))
lexical_indices['LEXICAL_SOP2'] = sophistication2
return lexical_indices


def get_clausal_indices(sent_length_list, clause_length_list, T_unit_length_list, max_depth_list):
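    '''
    Clausal complexity indices.

    MLS / MLC / MLTU: mean length of sentence / clause / T-unit.
    NCPS / NTPS: number of clauses / T-units per sentence.
    MEAN_TREE_DEPTH / MAX_TREE_DEPTH: parse-tree depth statistics.
    '''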
clausal_indices = {}
# length based
clausal_indices['MLS'] = np.mean(sent_length_list)
clausal_indices['MLC'] = np.mean(clause_length_list)
clausal_indices['MLTU'] = np.mean(T_unit_length_list)
# unit based
clausal_indices['NCPS'] = len(clause_length_list) / len(sent_length_list)
clausal_indices['NTPS'] = len(T_unit_length_list) / len(sent_length_list)
# tree based
clausal_indices['MEAN_TREE_DEPTH'] = np.mean(max_depth_list)
clausal_indices['MAX_TREE_DEPTH'] = max(max_depth_list)
return clausal_indices


def get_coll_indices(collocation_list):
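    '''
    Collocation diversity and sophistication: RTTR (types / sqrt(tokens))
    over all, unique, and general collocations, plus ratio and RTTR per
    collocation type (VO, SP, AN, ...).
    '''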
coll_indices = {}
coll_num = len(collocation_list)
sqrt_coll_num = sqrt(coll_num)
# coll_RTTR
set_colls = set(collocation_list)
coll_indices['COLL_RTTR'] = len(set_colls) / sqrt_coll_num
# unique_RTTR
unique_colls = [coll for coll in collocation_list if isUniqueColl(coll)]
set_unique_colls = set(unique_colls)
coll_indices['UNIQUE_RTTR'] = len(set_unique_colls) / (sqrt(len(unique_colls)) + 1)
# general_RTTR
    general_colls = [coll for coll in collocation_list if coll not in set_unique_colls]
    set_general_colls = set(general_colls)
    # guard against division by zero when every collocation is unique
    general_coll_diversity = len(set_general_colls) / sqrt(len(general_colls)) if general_colls else 0
    coll_indices['GENERAL_RTTR'] = general_coll_diversity
# unique_RATIO
# coll_indices['UNIQUE_RATIO2'] = len(unique_colls) / coll_num
coll_indices['UNIQUE_RATIO2'] = len(unique_colls) / sqrt_coll_num
# lowfreq_RATIO
    lowfreq_colls = [coll for coll in collocation_list if isLowFreqColl(coll)]
    # coll_indices['LOWFREQ_RATIO1'] = len(lowfreq_colls) / coll_num
    coll_indices['LOWFREQ_RATIO2'] = len(lowfreq_colls) / sqrt_coll_num
# coll indices based on different types
coll_types = ['VO', 'SP', 'AN', 'AP', 'CN*', 'PP*', 'PV*', 'PC*']
coll_type_dict = {k: [] for k in coll_types}
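    # a collocation string is tab-separated; its last field keys into
    # coll_dict (imported from utils) to give its type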
for coll in collocation_list:
typ = coll_dict[coll.split('\t')[-1]]
coll_type_dict[typ].append(coll)
for ct, v in coll_type_dict.items():
coll_indices[ct + '_RATIO'] = len(v) / len(collocation_list)
coll_indices[ct + '_RTTR'] = 0
if v:
coll_indices[ct + '_RTTR'] = len(set(v)) / sqrt(len(v))
return coll_indices


def get_ngram_indices(text_bigrams, text_trigrams):
    '''
    Bigram and dependency-trigram indices.

    param:
        - text_bigrams: list of bigrams
        - text_trigrams: dict mapping dependency label -> list of trigrams
    return:
        - ngram_indices: dict with
            * BIGRAM_RTTR, BIGRAM_SOP2
            * DEP_RTTR, DEP_SOP2
            * RTTR and RATIO for each dependency label
    '''
ngram_indices = {}
trigram_list = []
for trigrams in text_trigrams.values():
trigram_list.extend(trigrams)
# diversity
ngram_indices['BIGRAM_RTTR'] = len(set(text_bigrams)) / sqrt(len(text_bigrams))
ngram_indices['DEP_RTTR'] = len(set(trigram_list)) / sqrt(len(trigram_list))
# sophistication
sophis_bi = [b for b in text_bigrams if is_sophis_bigram(b)]
sophis_tri = [t for t in trigram_list if is_sophis_trigram(t)]
# bi_sop1 = len(sophis_bi) / len(text_bigrams)
ngram_indices['BIGRAM_SOP2'] = len(sophis_bi) / sqrt(len(text_bigrams))
# tri_sop1 = len(sophis_tri) / len(trigram_list)
ngram_indices['DEP_SOP2'] = len(sophis_tri) / sqrt(len(trigram_list))
# rttr and ratio for each dep label
deplabels = ["HED", "COO", "SBV", "ADV", "ATT", "VOB", "FOB", "POB", "IOB", "DBL", "RAD", "CMP", "LAD"]
for label in deplabels:
ngram_indices[label + '_RTTR'] = 0
ngram_indices[label + '_RATIO'] = 0
if label in text_trigrams:
trigrams = text_trigrams[label]
ngram_indices[label + '_RTTR'] = len(set(trigrams)) / sqrt(len(trigrams))
ngram_indices[label + '_RATIO'] = len(trigrams) / len(trigram_list)
return ngram_indices


def get_dep_distance(depdist):
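    '''
    Mean dependency distance per dependency label, plus the overall
    mean over all labelled dependencies.
    '''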
dist_indices = {}
sum_dist, num_dist = 0, 0
deplabels = ["COO", "SBV", "ADV", "ATT", "VOB", "FOB", "POB", "IOB", "DBL", "RAD", "CMP", "LAD"]
for label in deplabels:
dist_indices[label + '_DIST'] = 0
if label in depdist:
distances = depdist[label]
dist_indices[label + '_DIST'] = np.mean(distances)
sum_dist += sum(distances)
num_dist += len(distances)
    dist_indices['MEAN_DIST'] = sum_dist / num_dist if num_dist else 0
return dist_indices


def get_construction_indices(constructions, text_length, const_num):
    '''
    Construction ratio and density indices for levels 1 to 5, plus
    combined low (levels 1&2) and high (levels 4&5) bands.
    '''
const_indices = {'CONST_DENSITY': const_num / text_length}
for i in range(1, 6):
level_const = constructions[i]
const_indices['CONST' + str(i) + '_RATIO'] = len(level_const) / const_num
const_indices['CONST' + str(i) + '_DENSITY'] = len(level_const) / text_length
# combine level 1&2 to low, level 4&5 to high, level 3 by default the mid
const_indices['CONST_LOW_RATIO'] = const_indices['CONST1_RATIO'] + const_indices['CONST2_RATIO']
const_indices['CONST_HIGH_RATIO'] = const_indices['CONST4_RATIO'] + const_indices['CONST5_RATIO']
const_indices['CONST_LOW_DENSITY'] = const_indices['CONST1_DENSITY'] + const_indices['CONST2_DENSITY']
const_indices['CONST_HIGH_DENSITY'] = const_indices['CONST4_DENSITY'] + const_indices['CONST5_DENSITY']
return const_indices


def getLinguisticIndices(text_dict):
    '''
    Compute all indices for one text:
        - lexical
        - clausal
        - collocation based
        - bigram and dependency based
        - construction based
    '''
indices = {}
# for lexical
text_wordlist, text_level_dict = [], {k: 0 for k in range(1, 8)}
# for clausal: length based
sent_length_list, clause_length_list, T_unit_length_list = [], [], []
# for clausal: tree based
max_depth_list = []
# for collocation
collocation_list = []
# for bigram and dependency triples
text_bigrams, text_trigrams, text_depdists = [], {}, {}
# text constructions
text_constructions = {i: [] for i in range(1, 6)}
# get linguistic features for one text
for sent_id, info in text_dict.items():
sent, wordlist, wplist = info['sent'], info['wordlist'], info['wplist']
worddict = info['worddict']
# lexical features
level_dict = level_analyze(sent, wordlist, wplist)
        wordlist = [w for w in wordlist if isWordChinese(w)]  # drop punctuation, digits, English words, etc.
text_wordlist.extend(wordlist)
for level, wl in level_dict.items():
text_level_dict[level] += wl
# clausal units
        if not re.search('[,。?!;……]', sent):  # skip fragments without clause-delimiting punctuation
            continue
sent_length, clause_lengths, T_unit_lengths = clausal_index(sent, worddict)
sent_length_list.append(sent_length)
clause_length_list.extend(clause_lengths)
T_unit_length_list.extend(T_unit_lengths)
max_depth = getTreePath(worddict)
max_depth_list.append(max_depth)
# collocations
collocations = getColl(worddict)
collocation_list.extend(collocations)
# bigrams and dependency trigrams
bigrams = get_bigrams(wplist)
text_bigrams.extend(bigrams)
text_trigrams, text_depdists = get_dep_trigrams(worddict, text_trigrams, text_depdists)
# constructions
text_constructions = getLevelConstruction(worddict, sent, text_constructions)
############### linguistic feature extraction done ###############
# update length indices
text_length = sum(sent_length_list)
indices['CHAR_NUM'] = text_length
indices['WORD_NUM'] = len(text_wordlist)
# update lexical indices
    if text_wordlist:
lexical_indices = get_lexical_indices(text_wordlist, text_level_dict)
indices.update(lexical_indices)
    # update clausal indices
if sent_length_list:
clausal_indices = get_clausal_indices(sent_length_list, clause_length_list, T_unit_length_list, max_depth_list)
indices.update(clausal_indices)
# update collocation based indices
if collocation_list:
coll_indices = get_coll_indices(collocation_list)
indices.update(coll_indices)
# update bigram and dependency based indices
if text_bigrams:
ngram_indices = get_ngram_indices(text_bigrams, text_trigrams)
dist_indices = get_dep_distance(text_depdists)
indices.update(ngram_indices)
indices.update(dist_indices)
    const_num = sum(len(v) for v in text_constructions.values())
if const_num:
const_indices = get_construction_indices(text_constructions, text_length, const_num)
indices.update(const_indices)
return indices
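

# A minimal usage sketch. The expected shape of text_dict is inferred from
# the loop above: it maps a sentence id to a dict with keys 'sent' (raw
# sentence string), 'wordlist' (segmented words), 'wplist' (word/POS pairs),
# and 'worddict' (the sentence's dependency parse), produced by whatever
# preprocessing pipeline feeds this module.
#
#     indices = getLinguisticIndices(text_dict)
#     print(indices['CHAR_NUM'], indices['WORD_NUM'])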