scrapy爬取斗鱼主播图片

322 阅读1分钟

目标

抓取斗鱼主播图片,保存到本地,并以主播昵称重命名

新建项目

# Create the project
scrapy startproject douyuSpider
# Create the spider
scrapy genspider douyu "douyucdn.cn"

项目目录: 项目目录结构

定义item文件

import scrapy

class DouyuspiderItem(scrapy.Item):
    """Container for the two values scraped per streamer."""

    # Streamer nickname.
    nickname = scrapy.Field()

    # Link to the streamer's image.
    imageurl = scrapy.Field()

settings设置

# Disable robots.txt compliance.
ROBOTSTXT_OBEY = False

# Directory where downloaded images are stored.
IMAGES_STORE = 'D:\\PythonProjects\\spiders\\douyuSpider\\images'

# Enable the pipeline class defined in douyuSpider/pipelines.py.
ITEM_PIPELINES = {'douyuSpider.pipelines.DouyuspiderPipeline': 300}

爬虫文件douyu.py

# -*- coding: utf-8 -*-
import scrapy
import json
from douyuSpider.items import DouyuspiderItem


# Spider that collects Douyu streamer images for saving.
class DouyuSpider(scrapy.Spider):
    """Page through the vertical-room API and emit one item per entry.

    Each response is JSON whose ``data`` list holds the room entries. The
    spider requests the next page after every non-empty page and stops at
    the first page whose ``data`` list is empty.
    """

    name = 'douyu'
    allowed_domains = ['douyucdn.cn']
    baseURL = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
    # Pagination offset appended to baseURL; advanced by 20 per page.
    offset = 0
    start_urls = [baseURL + str(offset)]

    def parse(self, response):
        """Yield an item for each room in the page, then the next-page request."""
        payload = json.loads(response.body)
        rooms = payload['data']

        # An empty page means everything has been crawled.
        if len(rooms) == 0:
            return

        for room in rooms:
            yield DouyuspiderItem(
                nickname=room['nickname'],
                imageurl=room['vertical_src'],
            )

        # Advance the offset and schedule the following page.
        self.offset += 20
        next_url = self.baseURL + str(self.offset)
        yield scrapy.Request(next_url, callback=self.parse)

管道文件pipelines.py

import scrapy
from douyuSpider.settings import IMAGES_STORE
from scrapy.pipelines.images import ImagesPipeline
import os

# Image pipeline: download each item's picture, then rename the file.
class DouyuspiderPipeline(ImagesPipeline):
    """Download the image of every item and rename it to ``<nickname>.jpg``.

    The renamed file is placed directly inside ``IMAGES_STORE``.
    """

    def get_media_requests(self, item, info):
        """Yield the download request for the item's ``imageurl``."""
        yield scrapy.Request(item['imageurl'])

    def item_completed(self, results, item, info):
        """Rename the downloaded file after the item's nickname.

        Args:
            results: iterable of ``(ok, detail)`` pairs; for successful
                downloads ``detail["path"]`` is the stored file path.
            item: the item whose ``nickname`` names the target file.
            info: unused here.

        Returns:
            The unchanged item.

        Raises:
            DropItem: if no download for this item succeeded.
        """
        # Local import keeps this block self-contained; scrapy is already a
        # dependency of this file.
        from scrapy.exceptions import DropItem

        stored_paths = [detail["path"] for ok, detail in results if ok]
        if not stored_paths:
            # Previously this fell through to stored_paths[0] -> IndexError.
            raise DropItem("Image download failed for %r" % item['imageurl'])

        # Per the Scrapy docs the reported path is relative to IMAGES_STORE,
        # so it must be joined with a separator; plain string concatenation
        # produced a non-existent "...imagesfull/<hash>.jpg" path.
        source = os.path.join(IMAGES_STORE, stored_paths[0])
        # NOTE(review): the nickname is used verbatim as a file name; names
        # containing characters the filesystem rejects will still fail here.
        target = os.path.join(IMAGES_STORE, item['nickname'] + ".jpg")

        # os.replace overwrites an existing target on every platform, whereas
        # os.rename raises on Windows when the target already exists.
        os.replace(source, target)
        return item

运行

切换到项目根目录(douyuSpider)下,执行:

# Run the spider whose name attribute is "douyu"
scrapy crawl douyu

爬取结果

结果列表1

结果