-
Notifications
You must be signed in to change notification settings - Fork 0
/
spider.py
53 lines (48 loc) · 1.97 KB
/
spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# -*- coding: utf-8 -*-
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
class downloader(object):
    """Scrape every chapter of one novel and append the text to a local file.

    Constructing an instance runs the whole crawl: the index page at
    ``target`` is fetched, each ``<li>`` inside a ``<div class="box">`` is
    treated as one chapter entry, and every linked chapter page is fetched,
    cleaned and appended to ``output_path``.

    The selectors match the m.lewenxiaoshuo.com markup.  An earlier variant
    of this script scraped www.jjwxc.net using ``tr[itemprop=chapter]`` for
    the chapter rows and ``div.noveltext`` for the chapter body.
    """

    DEFAULT_TARGET = 'https://m.lewenxiaoshuo.com/books/chenghuashisinian/'
    DEFAULT_OUTPUT_PATH = r'D:\ETH\\test\test.txt'

    # Runs of these characters (ASCII punctuation, a few CJK brackets and any
    # whitespace) are collapsed into a single newline.  ``,`` and ``.`` are
    # listed because the pattern this replaces contained the range ``+-/``,
    # which covers them.
    # NOTE(review): ``:`` and ``;`` are deliberately absent, so stripping
    # ``,`` and ``.`` may have been accidental - confirm before changing.
    _NOISE_RE = re.compile(r'''[!"#$%&'()*+,\-./<=>?@★、【】《》\[\]^_`{|}~\s]+''')

    def __init__(self, target=DEFAULT_TARGET, output_path=DEFAULT_OUTPUT_PATH,
                 delay=2, timeout=30):
        """Run the crawl.

        Args:
            target: URL of the chapter index page.
            output_path: File that chapter text is appended to.
            delay: Seconds to sleep after each downloaded chapter.
            timeout: Per-request timeout in seconds passed to ``requests``.
        """
        self.target = target
        self.output_path = output_path
        self.timeout = timeout
        self.read_content()
        for index in range(len(self.texts)):
            # An entry without a usable link would otherwise re-download
            # whatever URL the previous entry left in ``self.url``.
            if not self.read_url(index):
                continue
            self.read_text()
            time.sleep(delay)

    @staticmethod
    def _clean_text(raw):
        """Return *raw* with each run of noise characters replaced by ``\\n``."""
        return downloader._NOISE_RE.sub("\n", raw)

    def read_content(self):
        """Fetch the index page and store its chapter entries in ``self.texts``.

        ``self.texts`` becomes a list of the ``<li>`` tags found inside every
        ``<div class="box">`` of the page, in document order.
        """
        response = requests.get(url=self.target, timeout=self.timeout)
        # Re-encode the text requests produced and decode it as GB18030.
        # NOTE(review): this only recovers the original bytes if requests
        # decoded the page as ISO-8859-1 - confirm for the target site.
        html = response.text.encode('ISO-8859-1', 'ignore').decode('GB18030', 'ignore')
        soup = BeautifulSoup(html, 'html.parser')
        self.texts = [
            item
            for box in soup.find_all('div', class_='box')
            for item in box.find_all('li')
        ]

    def read_url(self, i):
        """Record the title and URL of chapter entry *i*.

        Sets ``self.filename`` to the link text and ``self.url`` to the link
        target resolved against ``self.target`` (so relative links work).
        When the entry holds several links the last one wins.

        Args:
            i: Index into ``self.texts``.

        Returns:
            True if a link with an ``href`` was found, False otherwise (in
            which case ``self.filename`` and ``self.url`` are left untouched).
        """
        found = False
        for anchor in self.texts[i].find_all('a'):
            href = anchor.get('href')
            if not href:
                continue
            # get_text() never returns None, unlike ``.string`` on an anchor
            # that wraps nested tags.
            title = anchor.get_text()
            print(title, href)
            self.filename = title
            self.url = urljoin(self.target, href)
            found = True
        return found

    def read_text(self):
        """Download the chapter at ``self.url`` and append it to the output file.

        The chapter body is the first ``<div id="content">`` of the page.  A
        page without that element is reported and skipped.
        """
        response = requests.get(url=self.url, timeout=self.timeout)
        # 'ignore' keeps a single unencodable character from aborting the crawl.
        html = response.text.encode('gbk', 'ignore').decode('GB18030', 'ignore')
        soup = BeautifulSoup(html, 'html.parser')
        content = soup.find('div', id='content')
        if content is None:
            print('no chapter content found at', self.url)
            return
        body = self._clean_text(content.text)
        # GB18030 can decode characters that GBK cannot encode; replace them
        # rather than crash half-way through the book.
        with open(self.output_path, "a", encoding='gbk', errors='replace') as f:
            f.write('\n\n\n\n第' + self.filename + '章' + '\n\n' + body)
# Instantiating the class performs the entire crawl (see downloader.__init__).
p = downloader()