-
Notifications
You must be signed in to change notification settings - Fork 2
/
scrapycsdemo.py
33 lines (27 loc) · 1.35 KB
/
scrapycsdemo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class EPSpeakerSpider(CrawlSpider):
name = 'epspeakers_crawlspider'
start_urls = ['https://ep2015.europython.eu/en/speakers/']
rules = [
Rule(LinkExtractor(allow=('/conference/',)),
callback='parse_speakerdetails')
]
def parse_speakerdetails(self, response):
item = {}
item['url'] = response.url
item['name'] = response.xpath('//section[@class="profile-name"]//h1/text()').extract()[0].strip()
item['avatar'] = response.urljoin(response.xpath('//img[@class="avatar"]/@src').extract()[0])
# Getting all attributes about the Speaker
for field in response.xpath('//dl[@class="dl-horizontal"]//dt'):
name = field.xpath('.//text()').extract()[0].strip()
description = ''.join(field.xpath('.//following-sibling::dd[1]//text()').extract()).strip()
item[name] = description
# Extracting list of talks
talks = []
for talk in response.xpath('//div[@class="speaker-talks well"]//li'):
talk_title = talk.xpath('.//text()').extract()[0]
talk_url = response.urljoin(talk.xpath('.//a/@href').extract()[0])
talks.append({'title': talk_title, 'url': talk_url})
item['talks'] = talks
return item