"""Text-level linguistic indices: lexical, clausal, collocation, n-gram,
dependency-distance and construction based.

The entry point is :func:`getLinguisticIndices`; the ``get_*`` functions turn
already-extracted per-text material into flat ``{INDEX_NAME: value}`` dicts.
Feature extraction itself is delegated to helpers pulled in by the ``utils.*``
star imports below.
"""
import re
from math import sqrt

import numpy as np

from utils.hsk import *
from utils.clausal import *
from utils.coll import *
from utils.bidep import *
from utils.const import *

# A sentence containing none of these marks is skipped by getLinguisticIndices
# for everything except the lexical features.
_CLAUSE_PUNCT_RE = re.compile('[,。?!;……]')

# Dependency labels that get their own distance index; the n-gram indices use
# the same labels plus "HED".
_DIST_DEP_LABELS = ("COO", "SBV", "ADV", "ATT", "VOB", "FOB", "POB", "IOB",
                    "DBL", "RAD", "CMP", "LAD")
_NGRAM_DEP_LABELS = ("HED",) + _DIST_DEP_LABELS

# Collocation types that get their own RATIO / RTTR indices.
_COLL_TYPES = ('VO', 'SP', 'AN', 'AP', 'CN*', 'PP*', 'PV*', 'PC*')


def _rttr(items):
    """Return the root type-token ratio ``len(set(items)) / sqrt(len(items))``.

    An empty sequence yields 0 instead of raising ``ZeroDivisionError``.
    """
    if not items:
        return 0
    return len(set(items)) / sqrt(len(items))


def _safe_div(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when the denominator is 0."""
    if not denominator:
        return 0
    return numerator / denominator


def get_lexical_indices(text_wordlist, text_level_dict):
    """Compute lexical diversity and sophistication for one text.

    param:
        - text_wordlist: list of word tokens; must be non-empty (the caller
          in this module only calls it with a non-empty list)
        - text_level_dict: dict of counts keyed by level; keys 5, 6 and 7
          are read here (presumably HSK levels -- confirm in utils.hsk)
    return:
        - dict with
            * LEXICAL_RTTR: distinct words / sqrt(number of words)
            * LEXICAL_SOP2: (count at levels 5+6+7) / sqrt(number of words)
    """
    root_tokens = sqrt(len(text_wordlist))
    advanced = text_level_dict[5] + text_level_dict[6] + text_level_dict[7]
    return {
        'LEXICAL_RTTR': len(set(text_wordlist)) / root_tokens,
        'LEXICAL_SOP2': advanced / root_tokens,
    }


def get_clausal_indices(sent_length_list, clause_length_list, T_unit_length_list, max_depth_list):
    """Compute length-, unit- and tree-based clausal indices.

    param:
        - sent_length_list: one length per sentence; must be non-empty
        - clause_length_list: one length per clause
        - T_unit_length_list: one length per T-unit
        - max_depth_list: one tree depth per sentence; must be non-empty
    return:
        - dict with MLS, MLC, MLTU (mean lengths), NCPS, NTPS (clauses and
          T-units per sentence), MEAN_TREE_DEPTH and MAX_TREE_DEPTH
    """
    sent_num = len(sent_length_list)
    return {
        # length based
        'MLS': np.mean(sent_length_list),
        'MLC': np.mean(clause_length_list),
        'MLTU': np.mean(T_unit_length_list),
        # unit based
        'NCPS': len(clause_length_list) / sent_num,
        'NTPS': len(T_unit_length_list) / sent_num,
        # tree based
        'MEAN_TREE_DEPTH': np.mean(max_depth_list),
        'MAX_TREE_DEPTH': max(max_depth_list),
    }


def get_coll_indices(collocation_list):
    """Compute collocation diversity and ratio indices.

    param:
        - collocation_list: non-empty list of collocation strings; the text
          after the last tab of each string is looked up in ``coll_dict`` to
          obtain the collocation type
    return:
        - dict with COLL_RTTR, UNIQUE_RTTR, GENERAL_RTTR, UNIQUE_RATIO2,
          LOWFREQ_RATIO2 and, for every collocation type, <TYPE>_RATIO and
          <TYPE>_RTTR
    raises:
        - KeyError if a collocation's tag is missing from ``coll_dict`` or
          maps to a type outside the known type list
    """
    coll_indices = {}
    coll_num = len(collocation_list)
    sqrt_coll_num = sqrt(coll_num)

    # diversity over all collocations
    coll_indices['COLL_RTTR'] = len(set(collocation_list)) / sqrt_coll_num

    # diversity over "unique" collocations; the +1 in the denominator keeps
    # the index defined (as 0) when the text has no unique collocation
    unique_colls = [coll for coll in collocation_list if isUniqueColl(coll)]
    set_unique_colls = set(unique_colls)
    coll_indices['UNIQUE_RTTR'] = len(set_unique_colls) / (sqrt(len(unique_colls)) + 1)

    # diversity over the remaining ("general") collocations; 0 when every
    # collocation is unique, which used to raise ZeroDivisionError
    general_colls = [coll for coll in collocation_list if coll not in set_unique_colls]
    coll_indices['GENERAL_RTTR'] = _rttr(general_colls)

    # the "2" ratios are normalised by sqrt(total), not by the total itself
    coll_indices['UNIQUE_RATIO2'] = len(unique_colls) / sqrt_coll_num
    lowfreq_colls = [coll for coll in collocation_list if isLowFreqColl(coll)]
    coll_indices['LOWFREQ_RATIO2'] = len(lowfreq_colls) / sqrt_coll_num

    # group collocations by type
    coll_type_dict = {coll_type: [] for coll_type in _COLL_TYPES}
    for coll in collocation_list:
        tag = coll.split('\t')[-1]
        coll_type_dict[coll_dict[tag]].append(coll)

    for coll_type, members in coll_type_dict.items():
        coll_indices[coll_type + '_RATIO'] = len(members) / coll_num
        coll_indices[coll_type + '_RTTR'] = _rttr(members)

    return coll_indices


def get_ngram_indices(text_bigrams, text_trigrams):
    """Compute bigram and dependency-trigram indices.

    param:
        - text_bigrams: list of (hashable) bigrams
        - text_trigrams: dict mapping a dependency label to its list of
          (hashable) dependency trigrams
    return:
        - ngram_indices: dict
            * BIGRAM_RTTR, BIGRAM_SOP2
            * DEP_RTTR, DEP_SOP2
            * <LABEL>_RTTR and <LABEL>_RATIO for each dependency label
          Any index whose denominator would be zero is reported as 0.
    """
    ngram_indices = {}
    trigram_list = []
    for trigrams in text_trigrams.values():
        trigram_list.extend(trigrams)

    # diversity
    ngram_indices['BIGRAM_RTTR'] = _rttr(text_bigrams)
    ngram_indices['DEP_RTTR'] = _rttr(trigram_list)

    # sophistication, normalised by the square root of the n-gram count
    sophis_bi = [b for b in text_bigrams if is_sophis_bigram(b)]
    sophis_tri = [t for t in trigram_list if is_sophis_trigram(t)]
    ngram_indices['BIGRAM_SOP2'] = _safe_div(len(sophis_bi), sqrt(len(text_bigrams)))
    ngram_indices['DEP_SOP2'] = _safe_div(len(sophis_tri), sqrt(len(trigram_list)))

    # rttr and ratio for each dep label; a missing or empty label gives 0
    for label in _NGRAM_DEP_LABELS:
        label_trigrams = text_trigrams.get(label)
        ngram_indices[label + '_RTTR'] = _rttr(label_trigrams)
        ngram_indices[label + '_RATIO'] = 0
        if label_trigrams:
            # trigram_list is non-empty here because it contains label_trigrams
            ngram_indices[label + '_RATIO'] = len(label_trigrams) / len(trigram_list)

    return ngram_indices


def get_dep_distance(depdist):
    """Compute mean dependency distance per label and overall.

    param:
        - depdist: dict mapping a dependency label to a list of distances
    return:
        - dict with <LABEL>_DIST for each label in the distance label list
          (0 when the label is missing or has no distances) and MEAN_DIST,
          the mean over all distances of those labels (0 when there are none)
    """
    dist_indices = {}
    sum_dist, num_dist = 0, 0

    for label in _DIST_DEP_LABELS:
        distances = depdist.get(label)
        dist_indices[label + '_DIST'] = 0
        if distances:
            dist_indices[label + '_DIST'] = np.mean(distances)
            sum_dist += sum(distances)
            num_dist += len(distances)

    # labels outside _DIST_DEP_LABELS do not contribute, so num_dist can be 0
    dist_indices['MEAN_DIST'] = _safe_div(sum_dist, num_dist)

    return dist_indices


def get_construction_indices(constructions, text_length, const_num):
    """Compute ratio and density indices for construction levels 1 to 5.

    param:
        - constructions: dict mapping each level 1..5 to a list of
          constructions found at that level
        - text_length: length of the text used as the density denominator
        - const_num: total number of constructions over all levels
    return:
        - dict with CONST_DENSITY, CONST<i>_RATIO and CONST<i>_DENSITY for
          i in 1..5, plus CONST_LOW_* (levels 1+2) and CONST_HIGH_*
          (levels 4+5) for both RATIO and DENSITY. A zero denominator gives
          0 instead of raising.
    """
    const_indices = {'CONST_DENSITY': _safe_div(const_num, text_length)}

    for level in range(1, 6):
        level_count = len(constructions[level])
        prefix = 'CONST' + str(level)
        const_indices[prefix + '_RATIO'] = _safe_div(level_count, const_num)
        const_indices[prefix + '_DENSITY'] = _safe_div(level_count, text_length)

    # combine level 1&2 to low, level 4&5 to high; level 3 is the mid band
    for suffix in ('_RATIO', '_DENSITY'):
        const_indices['CONST_LOW' + suffix] = (
            const_indices['CONST1' + suffix] + const_indices['CONST2' + suffix])
    # emit HIGH after LOW in the original key order: LOW_RATIO, HIGH_RATIO,
    # LOW_DENSITY, HIGH_DENSITY
    low_density = const_indices.pop('CONST_LOW_DENSITY')
    const_indices['CONST_HIGH_RATIO'] = const_indices['CONST4_RATIO'] + const_indices['CONST5_RATIO']
    const_indices['CONST_LOW_DENSITY'] = low_density
    const_indices['CONST_HIGH_DENSITY'] = const_indices['CONST4_DENSITY'] + const_indices['CONST5_DENSITY']

    return const_indices


def getLinguisticIndices(text_dict):
    """Extract every linguistic index for one text.

    Covers lexical, clausal, collocation based, bigram and dependency based,
    and construction based indices.

    param:
        - text_dict: dict whose values are per-sentence dicts with the keys
          'sent', 'wordlist', 'wplist' and 'worddict'
    return:
        - dict of index name -> value. CHAR_NUM and WORD_NUM are always
          present; each other group is added only when the text produced
          material for it.
    """
    indices = {}

    # for lexical
    text_wordlist, text_level_dict = [], {k: 0 for k in range(1, 8)}

    # for clausal: length based
    sent_length_list, clause_length_list, T_unit_length_list = [], [], []

    # for clausal: tree based
    max_depth_list = []

    # for collocation
    collocation_list = []

    # for bigram and dependency triples
    text_bigrams, text_trigrams, text_depdists = [], {}, {}

    # text constructions
    text_constructions = {i: [] for i in range(1, 6)}

    # get linguistic features for one text
    for info in text_dict.values():
        sent, wordlist, wplist = info['sent'], info['wordlist'], info['wplist']
        worddict = info['worddict']

        # lexical features (levels are computed on the unfiltered word list)
        level_dict = level_analyze(sent, wordlist, wplist)
        # drop punctuation, digits, English words, etc.
        text_wordlist.extend(w for w in wordlist if isWordChinese(w))
        for level, count in level_dict.items():
            text_level_dict[level] += count

        # NOTE(review): a sentence without clause punctuation is skipped for
        # ALL remaining features (collocations, n-grams, constructions), not
        # only the clausal ones -- confirm this is intended.
        if not _CLAUSE_PUNCT_RE.search(sent):
            continue

        # clausal units
        sent_length, clause_lengths, T_unit_lengths = clausal_index(sent, worddict)
        sent_length_list.append(sent_length)
        clause_length_list.extend(clause_lengths)
        T_unit_length_list.extend(T_unit_lengths)
        max_depth_list.append(getTreePath(worddict))

        # collocations
        collocation_list.extend(getColl(worddict))

        # bigrams and dependency trigrams
        text_bigrams.extend(get_bigrams(wplist))
        text_trigrams, text_depdists = get_dep_trigrams(worddict, text_trigrams, text_depdists)

        # constructions
        text_constructions = getLevelConstruction(worddict, sent, text_constructions)

    ############### linguistic feature extraction done ###############

    # update length indices; CHAR_NUM only counts the sentences kept above
    text_length = sum(sent_length_list)
    indices['CHAR_NUM'] = text_length
    indices['WORD_NUM'] = len(text_wordlist)

    # update lexical indices
    if text_wordlist:
        indices.update(get_lexical_indices(text_wordlist, text_level_dict))

    # update clausal indices
    if sent_length_list:
        indices.update(get_clausal_indices(
            sent_length_list, clause_length_list, T_unit_length_list, max_depth_list))

    # update collocation based indices
    if collocation_list:
        indices.update(get_coll_indices(collocation_list))

    # update bigram and dependency based indices
    if text_bigrams:
        ngram_indices = get_ngram_indices(text_bigrams, text_trigrams)
        dist_indices = get_dep_distance(text_depdists)
        indices.update(ngram_indices)
        indices.update(dist_indices)

    # update construction based indices
    const_num = sum(len(v) for v in text_constructions.values())
    if const_num:
        indices.update(get_construction_indices(text_constructions, text_length, const_num))

    return indices