import asyncio
import os

import aiofiles
import aiohttp
from aiomultiprocess import Pool
from bs4 import BeautifulSoup

start_url = 'https://www.mzitu.com'
waiting_urls = []             # gallery URLs parsed from the start page
sem = asyncio.Semaphore(400)  # cap on concurrently running article handlers
img_total = 0                 # running count of saved images


# Request a URL and return the response.
async def fetch(url, session=None, referer=None):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/65.0.3325.162 Safari/537.36',
        'Referer': referer
    }
    try:
        resp = await session.get(url, headers=headers)
        return resp
    except Exception as e:
        print("Exception detected while fetching {}: {}".format(url, e))
        return None  # callers must be prepared for a failed request


# The crawl queue starts out empty; parse the gallery URLs from start_url.
async def init_urls(url, session):
    resp = await fetch(url, session, url)
    html = await resp.text()
    global waiting_urls
    soup = BeautifulSoup(html, "html.parser")
    # Positional selector: the third <div> on the page holds the gallery list.
    waiting_urls = [li.find("a").attrs["href"] for li in soup.find_all("div")[2].find_all("li")]
    return waiting_urls


# Parse the number of pictures in a gallery.
async def get_pic_num(url):
    async with aiohttp.ClientSession() as session:
        while True:
            try:
                resp = await fetch(url, session, url)
                html = await resp.read()
                soup = BeautifulSoup(html, "html.parser")
                # The page count sits just before the "下一页»" (next page) link.
                pic_num = int(soup.find(lambda tag: tag.name == 'a' and '下一页»' in tag.text).find_previous_sibling().text)
            except AttributeError:
                # Parse failed (e.g. throttled or empty page); back off and retry.
                await asyncio.sleep(1)
                continue
            else:
                break
    return pic_num
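
# Illustration of the pagination markup get_pic_num() assumes (a sketch, not
# copied from the site): the element holding the total page count is the
# immediate previous sibling of the "下一页»" link, e.g.
#   ... <span>55</span><a href="/12345/2"><span>下一页»</span></a> ...
# so find_previous_sibling().text on that <a> yields the picture count.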


# Parse the image link on each gallery page and save the picture to a file.
async def article_handle(url, num):
    async with sem:
        async with aiohttp.ClientSession() as session:
            while True:
                try:
                    resp = await fetch(url, session, url)
                    html = await resp.read()
                    soup = BeautifulSoup(html, "html.parser")
                    target_pic_link = soup.find_all("div")[2].find_all("div")[3].find("img").attrs["src"]
                except (IndexError, AttributeError):
                    # Parse failed; back off and retry.
                    await asyncio.sleep(1)
                    continue
                else:
                    break
            # The <img> alt text names the gallery; use it as the folder name.
            file_folder_name = "img" + os.path.sep + soup.find_all("div")[2].find_all("div")[3].find("img").attrs["alt"]
            if not os.path.exists(file_folder_name):
                os.makedirs(file_folder_name)
            img_file_path = file_folder_name + os.path.sep + '{}.jpg'.format(num)
            if not os.path.exists(img_file_path):
                res = await fetch(target_pic_link, session, url)
                content = await res.read()
                async with aiofiles.open(img_file_path, "wb") as fb:
                    await fb.write(content)
                print('Saved pic file {}.'.format(img_file_path))
                global img_total
                img_total += 1
                print(img_total)
            else:
                print('Pic file {} already exists.'.format(img_file_path))
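
# Note: res.read() above buffers the whole image in memory before writing it
# out. For larger files the body could be streamed in chunks instead; a
# minimal sketch with aiohttp's chunked iterator (same res/fb as above):
#   async for chunk in res.content.iter_chunked(64 * 1024):
#       await fb.write(chunk)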


# Dispatch a coroutine per gallery page -> parse it and save the image.
async def consumer():
    async with Pool() as pool:
        num_lis = await pool.map(get_pic_num, waiting_urls)
    for url, num in zip(waiting_urls, num_lis):
        for i in range(1, num + 1):
            link = url + "/{}".format(i)
            asyncio.ensure_future(article_handle(link, i))
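
# aiomultiprocess's Pool.map takes a coroutine function and runs it across
# worker processes; results come back in input order, which is why num_lis
# can be zipped one-to-one with waiting_urls above.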


# Crawler main logic.
async def main():
    async with aiohttp.ClientSession() as session:
        await init_urls(start_url, session)
    asyncio.ensure_future(consumer())


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    task = asyncio.ensure_future(main())
    loop.run_forever()
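
# run_forever() never returns, so the process has to be interrupted by hand
# once the downloads finish. A self-terminating variant is sketched below
# (commented out; it assumes we are free to fold consumer() into the
# entrypoint and await the download tasks):
#
#   async def run_to_completion():
#       async with aiohttp.ClientSession() as session:
#           await init_urls(start_url, session)
#       async with Pool() as pool:
#           num_lis = await pool.map(get_pic_num, waiting_urls)
#       tasks = [article_handle(url + "/{}".format(i), i)
#                for url, num in zip(waiting_urls, num_lis)
#                for i in range(1, num + 1)]
#       await asyncio.gather(*tasks)
#
#   asyncio.get_event_loop().run_until_complete(run_to_completion())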