From 9f8d85680dd32875b7d308801070a9c5fd812b7c Mon Sep 17 00:00:00 2001 From: LemonadeXyz <62137944+LemonadeXyz@users.noreply.github.com> Date: Fri, 20 Aug 2021 22:17:44 +0800 Subject: [PATCH] 2021-8-20 --- utils/text_repre.py | 55 --------------------------------------------- 1 file changed, 55 deletions(-) diff --git a/utils/text_repre.py b/utils/text_repre.py index b0b5af8..67a9ba1 100644 --- a/utils/text_repre.py +++ b/utils/text_repre.py @@ -71,61 +71,6 @@ def get_text_feature_from_ltp_results(text, text_dict, feature='c'): return corpus_line -# def get_text_feature(df, feature='c'): - -# """ -# input: -# - essay_revised.csv -# return: -# - X -# - Y -# """ -# if feature not in ['c', 'w', 'cw', 'wp', 'cwp']: -# print('feature not supported, please refer to: \ -# \n[char(c), word(w), char/word(cw), word/pos(wp), char/word/pos(cwp)]') -# exit() - -# corpus, Y = [], [] - -# # 分词这一步,在语言特征提取的时候就有,稍后可以将此处优化一下 -# for index, row in df.iterrows(): -# essay_id = row['essay_ID'] -# text = row['ESSAY'] -# order = score2ord[float(row['SCORE'])] - -# if feature == 'c': -# char_seg = ' '.join(list(text)) -# corpus.append(char_seg) -# elif feature == 'w': -# word_seg = ' '.join(list(jieba.cut(text))) -# corpus.append(word_seg) -# elif feature == 'cw': -# char_seg = ' '.join(list(text)) -# word_seg = ' '.join(list(jieba.cut(text))) -# corpus.append(char_seg + ' ' + word_seg) -# elif feature == 'wp': -# wordlist, poslist = [], [] -# wp_seg = pseg.cut(text) -# for w, p in wp_seg: -# wordlist.append(w) -# poslist.append(p) -# word_seg, pos_seg = ' '.join(wordlist), ' '.join(poslist) -# corpus.append(word_seg + ' ' + pos_seg) -# elif feature == 'cwp': -# char_seg = ' '.join(list(text)) -# wordlist, poslist = [], [] -# wp_seg = pseg.cut(text) -# for w, p in wp_seg: -# wordlist.append(w) -# poslist.append(p) -# word_seg, pos_seg = ' '.join(wordlist), ' '.join(poslist) -# corpus.append(char_seg + ' ' + word_seg + ' ' + pos_seg) - -# Y.append(order) - -# return corpus, np.array(Y) - - def get_text_matrix(corpus, ngram_min=1, ngram_max=1, df_threshold=20, sparse=False): """ transform the corpus to tf-idf matrix