Scrapy爬虫框架5-图片管道

73 阅读2分钟

图片管道的使用

使用scrapy自带的图片管道需要满足以下3点:(首先需要安装pillow模块)

1.使用scrapy自带的图片管道类。

# Enable Scrapy's built-in image pipeline in settings.py.
# 300 is the pipeline priority (integers in 0-1000; lower values run earlier).
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline':300
}

2.设置图片存储路径

# Directory where ImagesPipeline saves downloaded files.
# BUG FIX: a raw string is required here — in a normal string literal the
# sequence '\U' begins an 8-hex-digit unicode escape, so the original
# 'C:\Users\...' raises "SyntaxError: (unicode error) ..." on Python 3.
IMAGES_STORE = r'C:\Users\DELL\PycharmProjects\python_spider\day05_scrapy_图片管道'

3.根据指定字段传输数据,传入的字段 key 必须是 image_urls ,value是一个可以被循环的对象。

# 'image_urls' is the exact field name the built-in ImagesPipeline reads;
# its value must be an iterable of URL strings, not a single bare string.
item['image_urls']=url
yield item

示范案例:(堆糖)

import scrapy
from lxml import etree
from ..items import DuitangImageItem

class DuitangSpider(scrapy.Spider):
    """Spider that collects image URLs from a duitang.com category page
    and yields them in the 'image_urls' field for the ImagesPipeline."""

    name = "duitang"
    allowed_domains = ["www.duitang.com"]
    start_urls = ["https://www.duitang.com/category/?cat=movie_music_books"]

    def parse(self, response):
        """Extract one item per image container on the category page."""
        html = etree.HTML(response.text)
        for node in html.xpath('//div[@class="j"]'):
            # BUG FIX: the original used an absolute XPath ('//div[1]/...')
            # inside the loop; '//' ignores the context node and re-selects
            # every matching image in the whole document on every iteration.
            # './/' keeps the query relative to the current container.
            # NOTE(review): './div[1]/...' (direct child) may be the exact
            # intent — confirm against the live page structure.
            urls = node.xpath('.//div[1]/a/img/@src')
            item = DuitangImageItem()
            # 'image_urls' is the fixed field name ImagesPipeline expects;
            # the XPath result is already a list, i.e. an iterable of URLs.
            item['image_urls'] = urls
            yield item

自定义图片管道案例

from scrapy.http import Request
from scrapy.http.request import NO_CALLBACK
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline


class DuitangImagePipeline(ImagesPipeline):
    """Image pipeline that saves each download under the item-supplied
    'image_name' instead of the default SHA1-hash filename.

    Expects items carrying two parallel, index-aligned lists:
    the configured URL field (``image_urls`` by default) and 'image_name'.
    """

    def get_media_requests(self, item, info):
        """Build one download Request per URL, tagging every request with
        its matching name so ``file_path`` can read it back later.

        Returns a list of ``scrapy.http.Request`` objects.
        """
        # self.images_urls_field defaults to 'image_urls' (configurable
        # via the IMAGES_URLS_FIELD setting).
        urls = ItemAdapter(item).get(self.images_urls_field, [])
        names = item['image_name']  # one display name per URL, by index

        # BUG FIX: the original wrote `for u, name in zip(urls, name)`,
        # rebinding the list variable `name` as the loop variable — it only
        # worked because zip() had already captured the list reference.
        requests = []
        for url, image_name in zip(urls, names):
            request = Request(url, callback=NO_CALLBACK)
            # Attach the name to the request object itself; Scrapy passes
            # the same request into file_path(), where it is read back.
            request.image_name = image_name
            requests.append(request)
        return requests

    def file_path(self, request, response=None, info=None, *, item=None):
        """Store the image as ``full/<image_name>.jpg`` using the name
        bound onto the request in ``get_media_requests``."""
        return f"full/{request.image_name}.jpg"

根据image.py文件,自定义存储数据:

from scrapy.http import Request
from scrapy.http.request import NO_CALLBACK
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline


class DuitangImagePipeline(ImagesPipeline):
    """Custom ImagesPipeline: files are named from the item's 'image_name'
    list rather than the default content-hash filename.

    Items must provide the URL field (``image_urls`` unless overridden by
    IMAGES_URLS_FIELD) and an 'image_name' list aligned with it by index.
    """

    def get_media_requests(self, item, info):
        """Create a download Request for every URL in the item and bind
        the corresponding name onto each request for use in ``file_path``.

        Returns a list of ``scrapy.http.Request`` objects.
        """
        adapter = ItemAdapter(item)
        urls = adapter.get(self.images_urls_field, [])
        names = item['image_name']

        # FIX: the original loop variable was also called `name`, shadowing
        # the list it iterated over; distinct names avoid that trap.
        return [
            self._named_request(url, image_name)
            for url, image_name in zip(urls, names)
        ]

    @staticmethod
    def _named_request(url, image_name):
        """Build a no-callback media Request carrying its target file name."""
        request = Request(url, callback=NO_CALLBACK)
        request.image_name = image_name  # read back by file_path()
        return request

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return ``full/<image_name>.jpg`` using the name that
        ``get_media_requests`` attached to this request."""
        return f"full/{request.image_name}.jpg"