-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
7ad2266
commit 2d8919b
Showing
1 changed file
with
255 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,255 @@ | ||
from math import sqrt | ||
import numpy as np | ||
from utils.hsk import * | ||
from utils.clausal import * | ||
from utils.coll import * | ||
from utils.bidep import * | ||
from utils.const import * | ||
|
||
|
||
def get_lexical_indices(text_wordlist, text_level_dict): | ||
lexical_indices = {} | ||
sophistication2 = (text_level_dict[5] + text_level_dict[6] + text_level_dict[7]) / sqrt(len(text_wordlist)) | ||
lexical_indices['LEXICAL_RTTR'] = len(set(text_wordlist)) / sqrt(len(text_wordlist)) | ||
lexical_indices['LEXICAL_SOP2'] = sophistication2 | ||
return lexical_indices | ||
|
||
|
||
def get_clausal_indices(sent_length_list, clause_length_list, T_unit_length_list, max_depth_list): | ||
clausal_indices = {} | ||
# length based | ||
clausal_indices['MLS'] = np.mean(sent_length_list) | ||
clausal_indices['MLC'] = np.mean(clause_length_list) | ||
clausal_indices['MLTU'] = np.mean(T_unit_length_list) | ||
# unit based | ||
clausal_indices['NCPS'] = len(clause_length_list) / len(sent_length_list) | ||
clausal_indices['NTPS'] = len(T_unit_length_list) / len(sent_length_list) | ||
# tree based | ||
clausal_indices['MEAN_TREE_DEPTH'] = np.mean(max_depth_list) | ||
clausal_indices['MAX_TREE_DEPTH'] = max(max_depth_list) | ||
return clausal_indices | ||
|
||
|
||
def get_coll_indices(collocation_list): | ||
coll_indices = {} | ||
coll_num = len(collocation_list) | ||
sqrt_coll_num = sqrt(coll_num) | ||
|
||
# coll_RTTR | ||
set_colls = set(collocation_list) | ||
coll_indices['COLL_RTTR'] = len(set_colls) / sqrt_coll_num | ||
|
||
# unique_RTTR | ||
unique_colls = [coll for coll in collocation_list if isUniqueColl(coll)] | ||
set_unique_colls = set(unique_colls) | ||
coll_indices['UNIQUE_RTTR'] = len(set_unique_colls) / (sqrt(len(unique_colls)) + 1) | ||
|
||
# general_RTTR | ||
general_colls = [coll for coll in collocation_list if coll not in set_unique_colls] | ||
set_general_colls = set(general_colls) | ||
general_coll_diversity = len(set_general_colls) / sqrt(len(general_colls)) | ||
coll_indices['GENERAL_RTTR'] = general_coll_diversity | ||
|
||
# unique_RATIO | ||
# coll_indices['UNIQUE_RATIO2'] = len(unique_colls) / coll_num | ||
coll_indices['UNIQUE_RATIO2'] = len(unique_colls) / sqrt_coll_num | ||
|
||
# lowfreq_RATIO | ||
lowfreq_collls = [coll for coll in collocation_list if isLowFreqColl(coll)] | ||
# coll_indices['LOWFREQ_RATIO1'] = len(lowfreq_collls) / coll_num | ||
coll_indices['LOWFREQ_RATIO2'] = len(lowfreq_collls) / sqrt_coll_num | ||
|
||
# coll indices based on different types | ||
coll_types = ['VO', 'SP', 'AN', 'AP', 'CN*', 'PP*', 'PV*', 'PC*'] | ||
coll_type_dict = {k: [] for k in coll_types} | ||
|
||
for coll in collocation_list: | ||
typ = coll_dict[coll.split('\t')[-1]] | ||
coll_type_dict[typ].append(coll) | ||
|
||
for ct, v in coll_type_dict.items(): | ||
coll_indices[ct + '_RATIO'] = len(v) / len(collocation_list) | ||
coll_indices[ct + '_RTTR'] = 0 | ||
if v: | ||
coll_indices[ct + '_RTTR'] = len(set(v)) / sqrt(len(v)) | ||
|
||
return coll_indices | ||
|
||
|
||
def get_ngram_indices(text_bigrams, text_trigrams): | ||
''' | ||
param: | ||
- text_bigrams: <type> list | ||
- text_trigrams: <type> dict | ||
return : | ||
- ngram_indices: <type> dict | ||
* bi_rttr, bi_sop2 | ||
* tri_rttr, tri_sop2 | ||
* each type of dep: rttr, ratio | ||
''' | ||
ngram_indices = {} | ||
trigram_list = [] | ||
for trigrams in text_trigrams.values(): | ||
trigram_list.extend(trigrams) | ||
|
||
# diversity | ||
ngram_indices['BIGRAM_RTTR'] = len(set(text_bigrams)) / sqrt(len(text_bigrams)) | ||
ngram_indices['DEP_RTTR'] = len(set(trigram_list)) / sqrt(len(trigram_list)) | ||
|
||
# sophistication | ||
sophis_bi = [b for b in text_bigrams if is_sophis_bigram(b)] | ||
sophis_tri = [t for t in trigram_list if is_sophis_trigram(t)] | ||
# bi_sop1 = len(sophis_bi) / len(text_bigrams) | ||
ngram_indices['BIGRAM_SOP2'] = len(sophis_bi) / sqrt(len(text_bigrams)) | ||
# tri_sop1 = len(sophis_tri) / len(trigram_list) | ||
ngram_indices['DEP_SOP2'] = len(sophis_tri) / sqrt(len(trigram_list)) | ||
|
||
# rttr and ratio for each dep label | ||
deplabels = ["HED", "COO", "SBV", "ADV", "ATT", "VOB", "FOB", "POB", "IOB", "DBL", "RAD", "CMP", "LAD"] | ||
for label in deplabels: | ||
ngram_indices[label + '_RTTR'] = 0 | ||
ngram_indices[label + '_RATIO'] = 0 | ||
if label in text_trigrams: | ||
trigrams = text_trigrams[label] | ||
ngram_indices[label + '_RTTR'] = len(set(trigrams)) / sqrt(len(trigrams)) | ||
ngram_indices[label + '_RATIO'] = len(trigrams) / len(trigram_list) | ||
|
||
return ngram_indices | ||
|
||
|
||
def get_dep_distance(depdist): | ||
dist_indices = {} | ||
sum_dist, num_dist = 0, 0 | ||
|
||
deplabels = ["COO", "SBV", "ADV", "ATT", "VOB", "FOB", "POB", "IOB", "DBL", "RAD", "CMP", "LAD"] | ||
for label in deplabels: | ||
dist_indices[label + '_DIST'] = 0 | ||
if label in depdist: | ||
distances = depdist[label] | ||
dist_indices[label + '_DIST'] = np.mean(distances) | ||
sum_dist += sum(distances) | ||
num_dist += len(distances) | ||
|
||
dist_indices['MEAN_DIST'] = sum_dist / num_dist | ||
|
||
return dist_indices | ||
|
||
|
||
def get_construction_indices(constructions, text_length, const_num): | ||
''' | ||
ratio and density indices from level 1 to 5 | ||
''' | ||
const_indices = {'CONST_DENSITY': const_num / text_length} | ||
|
||
for i in range(1, 6): | ||
level_const = constructions[i] | ||
const_indices['CONST' + str(i) + '_RATIO'] = len(level_const) / const_num | ||
const_indices['CONST' + str(i) + '_DENSITY'] = len(level_const) / text_length | ||
|
||
# combine level 1&2 to low, level 4&5 to high, level 3 by default the mid | ||
const_indices['CONST_LOW_RATIO'] = const_indices['CONST1_RATIO'] + const_indices['CONST2_RATIO'] | ||
const_indices['CONST_HIGH_RATIO'] = const_indices['CONST4_RATIO'] + const_indices['CONST5_RATIO'] | ||
const_indices['CONST_LOW_DENSITY'] = const_indices['CONST1_DENSITY'] + const_indices['CONST2_DENSITY'] | ||
const_indices['CONST_HIGH_DENSITY'] = const_indices['CONST4_DENSITY'] + const_indices['CONST5_DENSITY'] | ||
|
||
return const_indices | ||
|
||
|
||
def getLinguisticIndices(text_dict): | ||
''' lexical | ||
clausal | ||
collocation based | ||
bigram and dependency based | ||
construction based | ||
''' | ||
|
||
indices = {} | ||
|
||
# for lexical | ||
text_wordlist, text_level_dict = [], {k: 0 for k in range(1, 8)} | ||
|
||
# for clausal: length based | ||
sent_length_list, clause_length_list, T_unit_length_list = [], [], [] | ||
|
||
# for clausal: tree based | ||
max_depth_list = [] | ||
|
||
# for collocation | ||
collocation_list = [] | ||
|
||
# for bigram and dependency triples | ||
text_bigrams, text_trigrams, text_depdists = [], {}, {} | ||
|
||
# text constructions | ||
text_constructions = {i: [] for i in range(1, 6)} | ||
|
||
# get linguistic features for one text | ||
for sent_id, info in text_dict.items(): | ||
|
||
sent, wordlist, wplist = info['sent'], info['wordlist'], info['wplist'] | ||
worddict = info['worddict'] | ||
|
||
# lexical features | ||
level_dict = level_analyze(sent, wordlist, wplist) | ||
wordlist = [w for w in wordlist if isWordChinese(w)] # 去除标点数字英文等 | ||
text_wordlist.extend(wordlist) | ||
for level, wl in level_dict.items(): | ||
text_level_dict[level] += wl | ||
|
||
# clausal units | ||
if not re.search('[,。?!;……]', sent): | ||
continue | ||
sent_length, clause_lengths, T_unit_lengths = clausal_index(sent, worddict) | ||
sent_length_list.append(sent_length) | ||
clause_length_list.extend(clause_lengths) | ||
T_unit_length_list.extend(T_unit_lengths) | ||
max_depth = getTreePath(worddict) | ||
max_depth_list.append(max_depth) | ||
|
||
# collocations | ||
collocations = getColl(worddict) | ||
collocation_list.extend(collocations) | ||
|
||
# bigrams and dependency trigrams | ||
bigrams = get_bigrams(wplist) | ||
text_bigrams.extend(bigrams) | ||
text_trigrams, text_depdists = get_dep_trigrams(worddict, text_trigrams, text_depdists) | ||
|
||
# constructions | ||
text_constructions = getLevelConstruction(worddict, sent, text_constructions) | ||
|
||
############### linguistic feature extraction done ############### | ||
|
||
# update length indices | ||
text_length = sum(sent_length_list) | ||
indices['CHAR_NUM'] = text_length | ||
indices['WORD_NUM'] = len(text_wordlist) | ||
|
||
# update lexical indices | ||
if len(text_wordlist): | ||
lexical_indices = get_lexical_indices(text_wordlist, text_level_dict) | ||
indices.update(lexical_indices) | ||
|
||
# udpate clausal indices | ||
if sent_length_list: | ||
clausal_indices = get_clausal_indices(sent_length_list, clause_length_list, T_unit_length_list, max_depth_list) | ||
indices.update(clausal_indices) | ||
|
||
# update collocation based indices | ||
if collocation_list: | ||
coll_indices = get_coll_indices(collocation_list) | ||
indices.update(coll_indices) | ||
|
||
# update bigram and dependency based indices | ||
if text_bigrams: | ||
ngram_indices = get_ngram_indices(text_bigrams, text_trigrams) | ||
dist_indices = get_dep_distance(text_depdists) | ||
indices.update(ngram_indices) | ||
indices.update(dist_indices) | ||
|
||
const_num = sum([len(v) for v in text_constructions.values()]) | ||
if const_num: | ||
const_indices = get_construction_indices(text_constructions, text_length, const_num) | ||
indices.update(const_indices) | ||
|
||
return indices |