import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class SouhuSpiderSpider(CrawlSpider):
    """Crawl spider that starts from the Sohu home page.

    Links extracted from each response that match the pattern in ``rules``
    are followed, and every matching page is handed to :meth:`parse_item`.
    """

    name = 'souhu_spider'
    start_urls = ['http://www.sohu.com//']
    rules = (
        # The pattern is a raw string so that ``\.`` and ``\w`` are passed to
        # the regex engine verbatim instead of being (invalid) string escapes,
        # which emit a SyntaxWarning on recent Python versions. The pattern
        # text itself is unchanged.
        #
        # NOTE(review): the unescaped ``?`` makes the preceding ``\w+`` lazy;
        # it does NOT match a literal question mark. If the intent was to
        # require a query string, it should be ``\?`` -- confirm before
        # changing, since that would narrow the set of followed links.
        Rule(
            LinkExtractor(allow=r'http://www\.sohu\.com/\w+/\w+?\w+'),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Extract the title and time text from a followed page.

        Args:
            response: The Scrapy response for a page matched by ``rules``.

        Returns:
            A dict with two keys:
            ``'title'`` -- first text node of ``div.text-title > h1``;
            ``'time'`` -- first text node of ``span.time``.
            ``extract_first()`` yields ``None`` for a key whose XPath
            matches nothing.
        """
        item = {}
        item['title'] = response.xpath(
            '//div[@class="text-title"]/h1/text()'
        ).extract_first()
        item['time'] = response.xpath(
            '//span[@class="time"]/text()'
        ).extract_first()
        return item