import scrapy
import re
class JobboleSpider(scrapy.Spider):
    """Spider that reads article fields out of blog.jobbole.com responses.

    ``parse`` selects the post links, title, creation-time text, tags and
    the vote / bookmark / comment counters from the response. The values
    are only computed at the moment; nothing is yielded or followed yet.
    """

    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    @staticmethod
    def _first_int(text):
        """Return the first whole number found in *text*, or 0 if none.

        ``re.search`` with ``\\d+`` is used so that multi-digit counts are
        read in full and leading whitespace/newlines do not prevent a match.
        """
        found = re.search(r'\d+', text or '')
        return int(found.group()) if found else 0

    def parse(self, response):
        """Extract article fields from *response*.

        Missing nodes fall back to an empty string / zero instead of
        raising ``IndexError``, so a response that lacks the article
        markup does not abort the callback.

        Args:
            response: the Scrapy response to read selectors from.

        Returns:
            None. The extracted values are not emitted yet.
        """
        # Links to individual posts found in the response.
        # NOTE(review): collected but not followed yet -- presumably the
        # next step is to schedule a request per URL; confirm intent.
        url_list = response.css(
            'div.post-meta p a.archive-title::attr(href)'
        ).extract()

        title = response.xpath(
            "//div[@class='entry-header']/h1/text()"
        ).extract_first('')

        # The first text node carries the creation time surrounded by
        # whitespace and a '·' separator; peel both off.
        create_time = response.xpath(
            '//p[@class="entry-meta-hide-on-mobile"]/text()'
        ).extract_first('')
        create_time = create_time.strip().strip('·').strip()

        # Joining an empty list already yields '', so no branch is needed.
        tags = ','.join(
            response.xpath(
                "//p[@class='entry-meta-hide-on-mobile']/a/text()"
            ).extract()
        )

        # The class attributes below are matched exactly, including their
        # leading and trailing spaces.
        vote_count = self._first_int(
            response.xpath(
                '//span[@class=" btn-bluet-bigger href-style vote-post-up register-user-only "]/h10/text()'
            ).extract_first('')
        )
        bookmark_count = self._first_int(
            response.xpath(
                '//span[@class=" btn-bluet-bigger href-style bookmark-btn register-user-only "]/text()'
            ).extract_first('')
        )
        # Previously r'.*(\d).*?' was used here; its greedy prefix left
        # only the final digit for the group, truncating counts >= 10.
        comment_count = self._first_int(
            response.xpath(
                '//span[@class="btn-bluet-bigger href-style hide-on-480"]/text()'
            ).extract_first('')
        )