-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
400 lines (303 loc) · 15.8 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
# Data loading: two Maoming tourism workbooks (2018-2019 and 2020-2021),
# five sheets each: 0=hotel reviews, 1=scenic-spot reviews, 2=travel notes,
# 3=dining reviews, 4=WeChat official-account articles.
# NOTE(review): each workbook is re-opened once per sheet; sheet_name=None
# would read all sheets in one pass — presumably left this way for clarity.
from tqdm import tqdm
import pandas as pd
tqdm.pandas()  # enables Series.progress_apply (apply with a progress bar)
Hotel_Info1 = pd.read_excel(
    'E:/Modeling/外挂/外挂/【2022-4-26更新】C题问题二/data/2018-2019茂名(含自媒体).xlsx', sheet_name=0)  # hotel reviews
Scenic_Info1 = pd.read_excel(
    'E:/Modeling/外挂/外挂/【2022-4-26更新】C题问题二/data/2018-2019茂名(含自媒体).xlsx', sheet_name=1)  # scenic-spot reviews
Travel_Info1 = pd.read_excel(
    'E:/Modeling/外挂/外挂/【2022-4-26更新】C题问题二/data/2018-2019茂名(含自媒体).xlsx', sheet_name=2)  # travel notes
Dining_Info1 = pd.read_excel(
    'E:/Modeling/外挂/外挂/【2022-4-26更新】C题问题二/data/2018-2019茂名(含自媒体).xlsx', sheet_name=3)  # dining reviews
Wechat_Info1 = pd.read_excel(
    'E:/Modeling/外挂/外挂/【2022-4-26更新】C题问题二/data/2018-2019茂名(含自媒体).xlsx', sheet_name=4)  # WeChat articles
Hotel_Info2 = pd.read_excel(
    'E:/Modeling/外挂/外挂/【2022-4-26更新】C题问题二/data/2020-2021茂名(含自媒体).xlsx', sheet_name=0)  # hotel reviews
Scenic_Info2 = pd.read_excel(
    'E:/Modeling/外挂/外挂/【2022-4-26更新】C题问题二/data/2020-2021茂名(含自媒体).xlsx', sheet_name=1)  # scenic-spot reviews
Travel_Info2 = pd.read_excel(
    'E:/Modeling/外挂/外挂/【2022-4-26更新】C题问题二/data/2020-2021茂名(含自媒体).xlsx', sheet_name=2)  # travel notes
Dining_Info2 = pd.read_excel(
    'E:/Modeling/外挂/外挂/【2022-4-26更新】C题问题二/data/2020-2021茂名(含自媒体).xlsx', sheet_name=3)  # dining reviews
Wechat_Info2 = pd.read_excel(
    'E:/Modeling/外挂/外挂/【2022-4-26更新】C题问题二/data/2020-2021茂名(含自媒体).xlsx', sheet_name=4)  # WeChat articles
# Stack the two year ranges vertically, one DataFrame per corpus type.
Hotel_Infos = pd.concat([Hotel_Info1, Hotel_Info2], axis=0)  # hotel reviews
Scenic_Infos = pd.concat([Scenic_Info1, Scenic_Info2], axis=0)  # scenic-spot reviews
Travel_Infos = pd.concat([Travel_Info1, Travel_Info2], axis=0)  # travel notes
Dining_Infos = pd.concat([Dining_Info1, Dining_Info2], axis=0)  # dining reviews
Wechat_Infos = pd.concat([Wechat_Info1, Wechat_Info2], axis=0)  # WeChat articles
'''
旅游产品,亦称旅游服务产品。是指由实物和服务构成。包括旅行商集合景点、交通、食宿、娱乐等设施设备、
项目及相应服务出售给旅游者的旅游线路类产品,旅游景区、旅游饭店等单个企业提供给旅游者的活动项目类产品
'''
Scenic_Infos.head(10)
def addstr(s):
    """Return the scenic-review corpus ID for raw ID *s* ('景区评论-<s>')."""
    return f'景区评论-{s}'
# Unified corpus columns for scenic-spot reviews:
# 语料ID = corpus ID, 文本 = text, 产品名称 = product name, 年份 = year.
Scenic_Infos['语料ID'] = Scenic_Infos['景区评论ID'].progress_apply(addstr)
Scenic_Infos['文本'] = Scenic_Infos['评论内容']
Scenic_Infos['产品名称'] = Scenic_Infos['景区名称']
# Year extracted from the review date.
Scenic_Infos['年份'] = pd.to_datetime(Scenic_Infos['评论日期']).dt.year
Hotel_Infos.head(10)
def addstr(s):
    """Return the hotel-review corpus ID for raw ID *s* ('酒店评论-<s>')."""
    return f'酒店评论-{s}'
# Unified corpus columns for hotel reviews (same schema as the scenic corpus).
Hotel_Infos['语料ID'] = Hotel_Infos['酒店评论ID'].progress_apply(addstr)
Hotel_Infos['文本'] = Hotel_Infos['评论内容']
Hotel_Infos['产品名称'] = Hotel_Infos['酒店名称']
Hotel_Infos['年份'] = pd.to_datetime(Hotel_Infos['评论日期']).dt.year
def addstr(s):
    """Return the dining-review corpus ID for raw ID *s* ('餐饮评论-<s>')."""
    return f'餐饮评论-{s}'
# Unified corpus columns for dining reviews; the text is the review body
# plus the title joined by a newline.
Dining_Infos['语料ID'] = Dining_Infos['餐饮评论ID'].progress_apply(addstr)
Dining_Infos['文本'] = Dining_Infos['评论内容'] + '\n' + Dining_Infos['标题']
Dining_Infos['产品名称'] = Dining_Infos['餐饮名称']
Dining_Infos['年份'] = pd.to_datetime(Dining_Infos['评论日期']).dt.year
# 采用Textrank提取关键词组算法
# 这部分待改进
from textrank4zh import TextRank4Keyword # 导入textrank4zh模块
import numpy as np
def get_keyphrase(s):
    """Extract the top TextRank key phrase from text *s*.

    Returns the highest-ranked phrase, or ``np.nan`` when none is found so
    callers can drop such rows with ``dropna``.
    """
    # Restrict to noun-like POS tags (the original list had 'nz' twice;
    # the duplicate is removed here — the filter semantics are unchanged).
    tr4w = TextRank4Keyword(
        allow_speech_tags=['n', 'nr', 'nr1', 'nr2', 'nrf', 'ns', 'nsf', 'nt', 'nz', 'nl', 'ng'])
    # Lowercase the text; co-occurrence window of 5 (the original comment
    # incorrectly said 2).
    tr4w.analyze(text=str(s), lower=True, window=5)
    # Consider at most 5 keywords; a phrase must occur at least once.
    phase_list = tr4w.get_keyphrases(keywords_num=5, min_occur_num=1)
    return phase_list[0] if phase_list else np.nan
# Travel notes: rebuild the combined DataFrame (resets the earlier concat).
Travel_Infos = pd.concat([Travel_Info1, Travel_Info2], axis=0)  # travel notes
def addstr(s):
    """Return the travel-note corpus ID for raw ID *s* ('旅游攻略-<s>')."""
    return f'旅游攻略-{s}'
Travel_Infos['语料ID'] = Travel_Infos['游记ID'].progress_apply(addstr)
# Text = title + body; travel notes have no explicit product column, so the
# product name is mined from the text via TextRank key phrases.
Travel_Infos['文本'] = Travel_Infos['游记标题'] + '\n' + Travel_Infos['正文']
Travel_Infos['年份'] = pd.to_datetime(Travel_Infos['发布时间']).dt.year
Travel_Infos['产品名称'] = Travel_Infos['文本'].progress_apply(get_keyphrase)
# WeChat official-account articles: rebuild the combined DataFrame.
Wechat_Infos = pd.concat([Wechat_Info1, Wechat_Info2], axis=0)  # WeChat articles
def addstr(s):
    """Return the WeChat-article corpus ID for raw ID *s* ('微信公共号文章-<s>')."""
    return f'微信公共号文章-{s}'
Wechat_Infos['语料ID'] = Wechat_Infos['文章ID'].progress_apply(addstr)
# Text = headline + body; product name mined via TextRank, as for travel notes.
Wechat_Infos['文本'] = Wechat_Infos['公众号标题'] + '\n' + Wechat_Infos['正文']
Wechat_Infos['年份'] = pd.to_datetime(Wechat_Infos['发布时间']).dt.year
Wechat_Infos['产品名称'] = Wechat_Infos['文本'].progress_apply(get_keyphrase)
# Drop rows where TextRank failed to produce a product name.
Travel_Infos = Travel_Infos.dropna(subset=["产品名称"])
Wechat_Infos = Wechat_Infos.dropna(subset=["产品名称"])
# Merge dining, hotel, scenic, and travel corpora into one DataFrame.
# NOTE(review): Wechat_Infos is cleaned above but never added to all_df —
# looks deliberate, but worth confirming against the task requirements.
all_df = pd.DataFrame(columns=['语料ID', '文本', '产品名称'])
all_df['语料ID'] = pd.concat([Dining_Infos['语料ID'], Hotel_Infos['语料ID'],
                            Scenic_Infos['语料ID'], Travel_Infos['语料ID']], axis=0)
all_df['产品名称'] = pd.concat([Dining_Infos['产品名称'], Hotel_Infos['产品名称'],
                            Scenic_Infos['产品名称'], Travel_Infos['产品名称']], axis=0)
all_df['文本'] = pd.concat([Dining_Infos['文本'], Hotel_Infos['文本'],
                          Scenic_Infos['文本'], Travel_Infos['文本']], axis=0)
all_df['年份'] = pd.concat([Dining_Infos['年份'], Hotel_Infos['年份'],
                          Scenic_Infos['年份'], Travel_Infos['年份']], axis=0)
all_df
# Sequential product IDs: ID1, ID2, ... in row order.
product_id = ['ID'+str(i+1) for i in range(len(all_df))]
all_df['产品ID'] = product_id
# Result table 2-1: corpus ID, product ID, product name.
result2 = all_df[['语料ID', '产品ID', '产品名称']]
result2
result2.to_csv('E:/Modeling/外挂/外挂/【2022-4-26更新】C题问题二/data/result2-1.csv', index=False)
all_df.to_csv('E:/Modeling/外挂/外挂/【2022-4-26更新】C题问题二/data/问题二所有数据汇总.csv', index=False)
# Stage 2: sentiment scoring over the merged corpus (reloaded from disk,
# so this part can run as a fresh session).
import warnings
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
warnings.filterwarnings('ignore')
all_df = pd.read_csv('E:/Modeling/外挂/外挂/【2022-4-26更新】C题问题二/data/问题二所有数据汇总.csv')
all_df
from cnsenti import Sentiment
# Sentiment analyzer with custom positive/negative word lists merged into
# cnsenti's built-in dictionary; both txt files are UTF-8 encoded.
senti = Sentiment(pos='E:/Modeling/外挂/外挂/【2022-4-26更新】C题问题二/data/pos.txt',  # positive-word dictionary (txt)
                  neg='E:/Modeling/外挂/外挂/【2022-4-26更新】C题问题二/data/neg.txt',  # negative-word dictionary (txt)
                  merge=True,  # merge built-in and user dictionaries
                  encoding='utf-8')  # both txt files are UTF-8
def emotion_score(s):
    """Return a normalized sentiment score for text *s*.

    Uses the module-level cnsenti ``senti`` analyzer: score is
    (pos_count - neg_count) / word_count, or 0 when the counts tie.
    Also returns 0 when no words were counted, guarding against
    ZeroDivisionError on empty/degenerate input.
    """
    r = senti.sentiment_count(str(s))
    words = r['words']
    # The original if/elif branches computed the identical expression;
    # collapsed into one, with a zero-word guard added.
    if words == 0 or r['pos'] == r['neg']:
        return 0
    return (r['pos'] - r['neg']) / words
# Score every document; 情感得分 = sentiment score.
all_df['情感得分'] = all_df['文本'].progress_apply(emotion_score)
# Dead exploratory code kept from the original (never executed):
#senti = Sentiment() #两txt均为utf-8编码
#for str1 in all_df['情感得分']:
#    result=senti.sentiment_count(str(str1))
#result=1
#    print(result)
# Split the corpus by year for per-year product-heat scoring.
year_2018_count = all_df[all_df['年份']==2018]
year_2019_count = all_df[all_df['年份'] == 2019]
year_2020_count = all_df[all_df['年份'] == 2020]
year_2021_count = all_df[all_df['年份'] == 2021]
dict_2018 = dict(year_2018_count['产品名称'].value_counts())  # product name -> 2018 mention count
def get_frequency(s):
    """Look up how often product name *s* appears in 2018 (reads global dict_2018)."""
    return dict_2018[s]
# 出现频次 = mention frequency per product within the year.
year_2018_count['出现频次'] = year_2018_count['产品名称'].progress_apply(get_frequency)
dict_2019 = dict(year_2019_count['产品名称'].value_counts())  # product name -> 2019 mention count
def get_frequency(s):
    """Look up how often product name *s* appears in 2019 (reads global dict_2019)."""
    return dict_2019[s]
year_2019_count['出现频次'] = year_2019_count['产品名称'].progress_apply(get_frequency)
dict_2020 = dict(year_2020_count['产品名称'].value_counts())  # product name -> 2020 mention count
def get_frequency(s):
    """Look up how often product name *s* appears in 2020 (reads global dict_2020)."""
    return dict_2020[s]
year_2020_count['出现频次'] = year_2020_count['产品名称'].progress_apply(get_frequency)
dict_2021 = dict(year_2021_count['产品名称'].value_counts())  # product name -> 2021 mention count
def get_frequency(s):
    """Look up how often product name *s* appears in 2021 (reads global dict_2021)."""
    return dict_2021[s]
year_2021_count['出现频次'] = year_2021_count['产品名称'].progress_apply(get_frequency)
# Composite heat score per row: 0.8*frequency + 200*sentiment + a per-year
# offset (0, 10, 30, 60) — constants appear hand-tuned; TODO confirm rationale.
year_2018_count['产品热度总分'] = 0.8*year_2018_count['出现频次']+200*year_2018_count['情感得分']+0
year_2019_count['产品热度总分'] = 0.8*year_2019_count['出现频次']+200*year_2019_count['情感得分']+1*10
year_2020_count['产品热度总分'] = 0.8*year_2020_count['出现频次']+200*year_2020_count['情感得分']+2*15
year_2021_count['产品热度总分'] = 0.8*year_2021_count['出现频次']+200*year_2021_count['情感得分']+3*20
# Normalize within each year so the heat values sum to 1.
year_2018_count['产品热度'] = year_2018_count['产品热度总分'].div(np.sum(year_2018_count['产品热度总分']), axis=0)
year_2019_count['产品热度'] = year_2019_count['产品热度总分'].div(np.sum(year_2019_count['产品热度总分']), axis=0)
year_2020_count['产品热度'] = year_2020_count['产品热度总分'].div(np.sum(year_2020_count['产品热度总分']), axis=0)
year_2021_count['产品热度'] = year_2021_count['产品热度总分'].div(np.sum(year_2021_count['产品热度总分']), axis=0)
# Sort each year by heat, descending.
year_2018 = year_2018_count.sort_values(by="产品热度", ascending=False).reset_index(drop=True)
year_2019 = year_2019_count.sort_values(by="产品热度", ascending=False).reset_index(drop=True)
year_2020 = year_2020_count.sort_values(by="产品热度", ascending=False).reset_index(drop=True)
year_2021 = year_2021_count.sort_values(by="产品热度", ascending=False).reset_index(drop=True)
# Stack all four (unsorted) yearly frames back into one table.
product_hot_score = pd.concat([year_2018_count, year_2019_count, year_2020_count, year_2021_count], axis=0)
product_hot_score
# 分词
import re
import jieba
# Load the Chinese stop-word list (one word per line, blanks skipped).
# Using `with` guarantees the file handle is closed — the original
# open(...).readlines() leaked it.
with open(
        'E://Modeling//外挂//外挂//【2022-4-26更新】C题问题二//stop//cn_stopwords.txt',
        encoding='utf8') as _stop_f:
    stopword_list = [k.strip() for k in _stop_f if k.strip() != '']
def clearTxt(line):
    """Clean and tokenize one comment text.

    Keeps only Chinese characters, segments with jieba, drops stop words
    (module-level ``stopword_list``) and tab tokens, and returns the
    remaining tokens joined by single spaces.

    Bug fix: the original implicitly returned None for empty input;
    this version always returns a string ('' for empty input).
    """
    line = str(line).strip()
    if not line:
        return ''
    # Remove ASCII letters/digits, then everything that is not Chinese or
    # alphanumeric — net effect: only Chinese characters survive.
    line = re.sub("[a-zA-Z0-9]", "", line)
    line = re.sub("[^0-9A-Za-z\u4e00-\u9fa5]", '', line)
    # Segment, then filter; building a list and joining once avoids the
    # original quadratic string '+=' accumulation.
    tokens = (w.strip() for w in jieba.cut(line, cut_all=False))
    kept = [w for w in tokens if w and w != '\t' and w not in stopword_list]
    return ' '.join(kept)
# Replace raw text with its cleaned, space-joined token form.
product_hot_score['文本'] = product_hot_score['文本'].progress_apply(clearTxt)
product_hot_score
# Product categories: scenic area, hotel, trending spot, homestay,
# specialty dining, rural tourism, cultural-creative.
def get_product_type(s):
    """Classify text *s* into a product category by keyword lookup.

    The first matching keyword (in priority order) wins; texts with no
    keyword fall back to '景点' (scenic spot).
    """
    rules = (
        ('景区', '景区'),
        ('酒店', '酒店'),
        ('餐饮', '特色餐饮'),
        ('景点', '景点'),
        ('民宿', '民宿'),
        ('乡村', '乡村旅游'),
        ('文创', '文创'),
    )
    for keyword, category in rules:
        if keyword in s:
            return category
    return '景点'
# Prepend the corpus ID so get_product_type can match its type prefix
# (e.g. '景区评论-', '酒店评论-') before falling back to text keywords.
product_hot_score['产品类型判断文本'] = product_hot_score['语料ID'] +' '+product_hot_score['文本']
product_hot_score['产品类型'] = product_hot_score['产品类型判断文本'].progress_apply(get_product_type)
# Deduplicate products by name (keeps the first occurrence).
product_hot_score2 = product_hot_score.drop_duplicates(['产品名称'])
product_hot_score2
# Result table 2-2: product ID, type, name, heat, year.
result2_2 = product_hot_score2[['产品ID','产品类型','产品名称','产品热度','年份']]
# Re-number product IDs sequentially after deduplication.
result2_2['产品ID'] = ['ID'+str(i+1) for i in range(len(result2_2))]
result2_2
result2_2.to_csv('E://Modeling//外挂//外挂//【2022-4-26更新】C题问题二//data//result2-2.csv',index=False)
# Pre/post-pandemic comparison: split the corpus at year 2020.
pre_data = all_df[all_df['年份']<2020]
after_data = all_df[all_df['年份']>2019]
dict_pre = dict(pre_data['产品名称'].value_counts())      # product name -> pre-2020 mention count
dict_after = dict(after_data['产品名称'].value_counts())  # product name -> 2020+ mention count
def get_pre_frequency(s):
    """Mention count of product *s* before 2020 (reads global dict_pre)."""
    return dict_pre[s]
def get_after_frequency(s):
    """Mention count of product *s* from 2020 onward (reads global dict_after)."""
    return dict_after[s]
pre_data['出现频次'] = pre_data['产品名称'].progress_apply(get_pre_frequency)
after_data['出现频次'] = after_data['产品名称'].progress_apply(get_after_frequency)
# Composite heat for this comparison: 3*frequency + 2*sentiment
# (different weights from the per-year scoring above; TODO confirm intent).
pre_data['产品热度总分'] = 3*pre_data['出现频次']+2*pre_data['情感得分']
after_data['产品热度总分'] = 3*after_data['出现频次']+2*after_data['情感得分']
# Normalize so each period's heat values sum to 1, then sort descending.
pre_data['产品热度'] = pre_data['产品热度总分'].div(np.sum(pre_data['产品热度总分']), axis=0)
after_data['产品热度'] = after_data['产品热度总分'].div(np.sum(after_data['产品热度总分']), axis=0)
pre_data_sort = pre_data.sort_values(by="产品热度", ascending=False).reset_index(drop=True)
after_data_sort = after_data.sort_values(
    by="产品热度", ascending=False).reset_index(drop=True)
# 判断产品类型
import re
import jieba
# Reload the stop-word list. Fixes two issues from the original line:
# the file handle was never closed, and the path was a non-raw string with
# backslashes ('\M', '\s', ...) — none are valid escapes so the runtime value
# is unchanged, but a raw string makes that explicit and silences the
# invalid-escape-sequence warning.
with open(
        r'E:\Modeling\外挂\外挂\【2022-4-26更新】C题问题二\stop\cn_stopwords.txt',
        encoding='utf8') as _stop_f:
    stopword_list = [k.strip() for k in _stop_f if k.strip() != '']
def clearTxt(line):
    """Clean and tokenize one comment text (duplicate of the earlier helper).

    Keeps only Chinese characters, segments with jieba, drops stop words
    (module-level ``stopword_list``) and tab tokens, and returns the tokens
    joined by single spaces.

    Bug fix: the original implicitly returned None for empty input;
    this version always returns a string ('' for empty input).
    """
    line = str(line).strip()
    if not line:
        return ''
    # Remove ASCII letters/digits, then everything that is not Chinese or
    # alphanumeric — net effect: only Chinese characters survive.
    line = re.sub("[a-zA-Z0-9]", "", line)
    line = re.sub("[^0-9A-Za-z\u4e00-\u9fa5]", '', line)
    # Segment and filter; list + join avoids quadratic '+=' concatenation.
    tokens = (w.strip() for w in jieba.cut(line, cut_all=False))
    kept = [w for w in tokens if w and w != '\t' and w not in stopword_list]
    return ' '.join(kept)
# Product categories: scenic area, hotel, trending spot, homestay,
# specialty dining, rural tourism, cultural-creative.
def get_product_type(s):
    """Classify text *s* into a product category (duplicate of earlier helper).

    Checks keywords in priority order and returns the first match;
    defaults to '景点' (scenic spot) when nothing matches.
    """
    for keyword, category in (
            ('景区', '景区'),
            ('酒店', '酒店'),
            ('餐饮', '特色餐饮'),
            ('景点', '景点'),
            ('民宿', '民宿'),
            ('乡村', '乡村旅游'),
            ('文创', '文创')):
        if keyword in s:
            return category
    return '景点'
# Clean text, classify product type, and export the pre/post-pandemic
# product-heat tables to CSV.
pre_data_sort['文本'] = pre_data_sort['文本'].progress_apply(clearTxt)
# Prepend the corpus ID so the type prefix ('景区评论-', ...) is matched first.
pre_data_sort['产品类型判断文本'] = pre_data_sort['语料ID'] + ' ' + pre_data_sort['文本']
pre_data_sort['产品类型'] = pre_data_sort['产品类型判断文本'].progress_apply(get_product_type)
after_data_sort['文本'] = after_data_sort['文本'].progress_apply(clearTxt)
after_data_sort['产品类型判断文本'] = after_data_sort['语料ID'] + \
    ' ' + after_data_sort['文本']
after_data_sort['产品类型'] = after_data_sort['产品类型判断文本'].progress_apply(
    get_product_type)
pre_data_sort.to_csv('E://Modeling//外挂//外挂//【2022-4-26更新】C题问题二//data//疫情前产品热度.csv', index=False)
after_data_sort.to_csv('E://Modeling//外挂//外挂//【2022-4-26更新】C题问题二//data//疫情后产品热度.csv', index=False)