# -*- coding: utf-8 -*-
# pylint:disable=unused-argument
'''
Item pipelines for processing scraped items.
'''
import json
import codecs
import pymongo
from scrapy.exceptions import DropItem


class JsonWriterPipeline(object):
    '''
    Write items to a JSON Lines file.
    '''

    def __init__(self):
        '''
        Open the output file in append mode.
        '''
        self.file = codecs.open('items.json', 'a', encoding='utf-8')

    def close_spider(self, spider):
        '''
        Close the JSON file when the spider finishes.
        '''
        self.file.close()

    def process_item(self, item, spider):
        '''
        Serialize the item and write it as one JSON line.
        '''
        # ensure_ascii=False keeps non-ASCII text readable instead of escaping
        # it; the old str.decode('unicode_escape') call fails on Python 3 strings.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item
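
# Each line of items.json is an independent JSON document, so the file can be
# read back incrementally. Illustrative only, not part of the pipeline:
#
# with codecs.open('items.json', encoding='utf-8') as f:
#     items = [json.loads(line) for line in f if line.strip()]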


class MongoPipeline(object):
    '''
    Filter items against data already in MongoDB and store the ones that
    pass into MongoDB.
    '''

    def __init__(self, mongo_uri, mongo_db):
        '''
        Store the MongoDB settings and open the connection.
        '''
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    @classmethod
    def from_crawler(cls, crawler):
        '''
        Build the pipeline from the crawler settings.
        '''
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI', '192.168.6.183:27017'),
            mongo_db=crawler.settings.get('MONGODB_NAME', 'zy_crawler_test')
        )

    def close_spider(self, spider):
        '''
        Close the MongoDB connection.
        '''
        self.client.close()

    def process_item(self, item, spider):
        '''
        Drop empty items and store the rest, one collection per site.
        '''
        if item is None:
            raise DropItem("item is None")
        # Collection.insert() is deprecated in recent pymongo releases;
        # insert_one() stores a single document the same way.
        self.db['result_online_%s' % item['site']].insert_one(dict(item))
        return item
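

# To enable these pipelines, register them in the project's settings.py.
# A minimal sketch, assuming the Scrapy project package is named "zy_crawler"
# (the real package name is not visible in this file); lower numbers run first:
#
# ITEM_PIPELINES = {
#     'zy_crawler.pipelines.JsonWriterPipeline': 300,
#     'zy_crawler.pipelines.MongoPipeline': 400,
# }
# MONGO_URI = '192.168.6.183:27017'
# MONGODB_NAME = 'zy_crawler_test'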