1 scrapy官方文档
生成scrapy项目
scrapy startproject tutorial
2 生成spider
scrapy genspider <spider_name> <domain>（spider name必须唯一）
3 修改spider内容
import scrapy
# 文章地址 https://www.zhihu.com/people/woodenrobot/posts?page=2
# 运行命令 scrapy crawl douban_ajax -o douban_ajax.csv
import re
import json
from scrapy import Request
from scrapy.spiders import Spider
class DoubanMovieItem(scrapy.Item):
    """Container for one entry of Douban's movie chart, filled from the AJAX JSON API."""
    # Position of the movie in the chart (API field "rank").
    ranking = scrapy.Field()
    # Movie title (API field "title").
    movie_name = scrapy.Field()
    # Average rating (API field "score").
    score = scrapy.Field()
    # Number of votes behind the rating (API field "vote_count").
    score_num = scrapy.Field()
class DoubanAJAXSpider(Spider):
    """Crawl Douban's movie chart through its JSON AJAX endpoint, 20 entries per page.

    Run with: scrapy crawl douban_ajax -o douban_ajax.csv
    """
    name = 'douban_ajax'
    # Plain browser UA so the endpoint does not reject the default Scrapy agent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    }

    def start_requests(self):
        """Yield the request for the first page of the chart (start=0, limit=20)."""
        url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start=0&limit=20'
        yield Request(url, headers=self.headers)

    def parse(self, response):
        """Parse one JSON page into DoubanMovieItem objects and follow pagination.

        The endpoint returns a JSON list; an empty list means the chart is
        exhausted, which stops the pagination below.
        """
        # response.text decodes using the response's declared encoding,
        # instead of feeding raw bytes to json.loads.
        datas = json.loads(response.text)
        for data in datas:
            # Create a fresh item per row; reusing one instance would make
            # every yielded item alias the same mutated object.
            item = DoubanMovieItem()
            item['ranking'] = data['rank']
            item['movie_name'] = data['title']
            item['score'] = data['score']
            item['score_num'] = data['vote_count']
            yield item
        # Only request the next page while the current page returned data;
        # an unconditional request would loop on empty pages forever.
        if datas:
            page_num = re.search(r'start=(\d+)', response.url).group(1)
            next_start = 'start=' + str(int(page_num) + 20)
            next_url = re.sub(r'start=\d+', next_start, response.url)
            yield Request(next_url, headers=self.headers)
4 运行命令
scrapy crawl douban_ajax -o douban_ajax.csv