爬虫实战scrapy

214 阅读1分钟
# -*- coding: utf-8 -*-
import scrapy
import re


class JobboleSpider(scrapy.Spider):
    """Spider that extracts article metadata from blog.jobbole.com pages.

    ``parse`` reads the article links plus the title, publication date,
    tags and the like / bookmark / comment counters from the response it
    is given.
    """

    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    @staticmethod
    def _first_int(text, default=0):
        """Return the first run of digits in *text* as an int.

        Falls back to *default* when *text* is empty/None or has no digits.
        """
        # Match the whole digit run with an unanchored search: a greedy
        # prefix such as '.*(\d)' keeps only the last digit ('12' -> 2),
        # and re.match with '.' cannot skip over a leading newline.
        found = re.search(r'\d+', text or '')
        return int(found.group()) if found else default

    def parse(self, response):
        """Extract article links and article metadata from *response*.

        Every lookup tolerates a missing node: text fields fall back to ''
        and counters fall back to 0 instead of raising IndexError.

        NOTE(review): the extracted values are neither yielded nor stored,
        so this callback still produces no items or follow-up requests.
        Decide how they should be emitted (e.g. yielding an item) before
        relying on this spider for output.
        """
        # Prefer class-based selectors over id-based ones: an id such as
        # "post-113735" exists on a single page only, so a selector built
        # on it would match nothing on any other page.
        article_urls = response.css(
            'div.post-meta p a.archive-title::attr(href)'
        ).extract()

        # Article title.
        title = response.xpath(
            "//div[@class='entry-header']/h1/text()"
        ).extract_first(default='')

        # Publication date: trim whitespace, drop the '·' separator that
        # surrounds the date text, then trim what the separator exposed.
        created_at = response.xpath(
            '//p[@class="entry-meta-hide-on-mobile"]/text()'
        ).extract_first(default='')
        created_at = created_at.strip().strip('·').strip()

        # Keyword tags as one comma-separated string ('' when there are none;
        # joining an empty list already yields '').
        tags = ','.join(
            response.xpath(
                "//p[@class='entry-meta-hide-on-mobile']/a/text()"
            ).extract()
        )

        # Like count. The class attribute values below are matched verbatim,
        # including their irregular spacing, so they must not be normalised.
        like_count = self._first_int(
            response.xpath(
                '//span[@class=" btn-bluet-bigger href-style vote-post-up   register-user-only "]/h10/text()'
            ).extract_first()
        )

        # Bookmark count: the number is embedded in surrounding label text.
        bookmark_count = self._first_int(
            response.xpath(
                '//span[@class=" btn-bluet-bigger href-style bookmark-btn  register-user-only "]/text()'
            ).extract_first()
        )

        # Comment count: also embedded in label text.
        comment_count = self._first_int(
            response.xpath(
                '//span[@class="btn-bluet-bigger href-style hide-on-480"]/text()'
            ).extract_first()
        )