-
Notifications
You must be signed in to change notification settings - Fork 70
/
main.py
90 lines (83 loc) · 3.01 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
#
# @Version : 1.0
# @Time : 2019/11/1
# @Author : 圈圈烃
# @File : main
# @Description:
#
#
import SougouSpider
import Scel2Txt
import os
# Categories to download, each written as "<category name>:<category id>".
# The id after the colon is what main() appends to the category index URL.
Categories = [
    '城市信息:167',
    '自然科学:1',
    '社会科学:76',
    '工程应用:96',
    '农林渔畜:127',
    '医学医药:132',
    '电子游戏:436',
    '艺术设计:154',
    '生活百科:389',
    '运动休闲:367',
    '人文科学:31',
    '娱乐休闲:403',
]

# Directory under which the downloaded .scel files are saved.
SavePath = r"f:\Users\QQT\Documents\Temp\scel1"

# Directory handed to the scel-to-txt conversion step as its output location.
txtSavePath = r"f:\Users\QQT\Documents\Temp\txt1"

# Start link (not referenced by the code visible in this file).
startUrl = "https://pinyin.sogou.com/dict/cate/index/436"
def _ensure_dir(path):
    """Create directory *path* (and any missing parents) on a best-effort basis."""
    try:
        # exist_ok=True keeps re-runs quiet when the directory already exists;
        # makedirs also creates missing parents, which plain mkdir cannot do.
        os.makedirs(path, exist_ok=True)
    except (OSError, ValueError) as e:
        # Report the problem and carry on: directory creation is best-effort.
        print(e)


def _download_subcategory(spider, url, dest_dir):
    """Download every dictionary listed under one sub-category URL into *dest_dir*."""
    # Get the total number of pages; on any failure fall back to a single page.
    try:
        resp = spider.GetHtml(url)
        pages = spider.GetPage(resp)
    except Exception as e:
        print(e)
        pages = 1
    # Collect the download links page by page.
    for page in range(1, pages + 1):
        page_url = url + "/default/" + str(page)
        resp = spider.GetHtml(page_url)
        download_urls = spider.GetDownloadList(resp)
        # Start downloading; files already on disk are skipped.
        for name, download_url in download_urls.items():
            # Paths are joined with "/" exactly as before so the on-disk
            # layout stays unchanged.
            file_path = dest_dir + "/" + name + ".scel"
            if os.path.exists(file_path):
                print(name + " 文件已存在>>>>>>")
                continue
            spider.Download(download_url, file_path)
            print(name + " 保存成功......")


def main():
    """Download the configured Sogou dictionaries, then convert them to TXT.

    For every entry in ``Categories`` a directory named after the category id
    is created under ``SavePath``; each sub-category reported by the spider
    gets its own sub-directory, into which the ``.scel`` files that are not
    already present are downloaded.  When all categories have been processed,
    ``Scel2Txt.batch_file(SavePath, txtSavePath)`` is called.
    """
    category_index_url = "https://pinyin.sogou.com/dict/cate/index/"
    # Id of the "城市信息" category, whose page needs the Type1 parser.
    city_category_id = "167"

    spider = SougouSpider.SougouSpider()
    # Create the root save directory.
    _ensure_dir(SavePath)

    # Categories we want: the id is the part after the last colon.
    category_ids = [entry.split(":")[-1] for entry in Categories]
    category_urls = [category_index_url + cid for cid in category_ids]
    print(category_urls)

    # Top-level categories.
    for category_id, category_url in zip(category_ids, category_urls):
        category_dir = SavePath + "/" + category_id
        _ensure_dir(category_dir)
        # Fetch the category page and extract the sub-category links.
        resp = spider.GetHtml(category_url)
        if category_id == city_category_id:
            subcategory_urls = spider.GetCategory2Type1(resp)
        else:
            subcategory_urls = spider.GetCategory2Type2(resp)
        # Sub-categories.
        for key, url in subcategory_urls.items():
            subcategory_dir = category_dir + "/" + key
            _ensure_dir(subcategory_dir)
            _download_subcategory(spider, url, subcategory_dir)

    # Convert the downloaded scel files to txt.
    Scel2Txt.batch_file(SavePath, txtSavePath)
    print("任务结束...")
# Run the download only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()