# Simple crawler for Sohu news data.

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class SouhuSpiderSpider(CrawlSpider):
    """Crawl www.sohu.com and scrape the title and time of matching pages.

    The crawl starts at the URL in ``start_urls``. Every link that matches
    the single rule in ``rules`` is followed, and each fetched page is
    passed to :meth:`parse_item`.
    """

    name = 'souhu_spider'
    # NOTE: allowed_domains is not set. If it is ever enabled it must hold
    # bare domain names (e.g. 'sohu.com'), not URLs.
    # allowed_domains = ['http://www.sohu.com/']
    start_urls = ['http://www.sohu.com//']

    rules = (
        # Disabled alternative rule (follow only, no callback):
        # Rule(LinkExtractor(allow=r'http://.*?\.sohu\.com/\?\w+'), follow=True),
        #
        # The pattern is a raw string so that `\.` and `\w` reach the regex
        # engine verbatim instead of being parsed as (invalid) string escapes.
        # NOTE(review): the `?` below is unescaped, so it makes the preceding
        # `\w+` lazy rather than matching a literal '?'. Left unchanged to
        # preserve which links are matched -- confirm the intent.
        Rule(
            LinkExtractor(allow=r'http://www\.sohu\.com/\w+/\w+?\w+'),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Extract the title and time text from a fetched page.

        Args:
            response: The Scrapy response for a page matched by ``rules``.

        Returns:
            dict: ``{'title': ..., 'time': ...}``. Each value is the first
            text node selected by the corresponding XPath, or ``None`` when
            the XPath selects nothing.
        """
        item = {}
        item['title'] = response.xpath('//div[@class="text-title"]/h1/text()').extract_first()
        item['time'] = response.xpath('//span[@class="time"]/text()').extract_first()
        # Article body extraction is currently disabled:
        # item['article'] = response.xpath('//article[@class="article"]/p/text()').extract()
        return item