图片管道的使用
使用scrapy自带的图片管道需要满足以下3点:(首先需要安装pillow模块)
1.使用scrapy自带的图片管道类。
ITEM_PIPELINES = {
'scrapy.pipelines.images.ImagesPipeline':300
}
2.设置图片存储路径
IMAGES_STORE = r'C:\Users\DELL\PycharmProjects\python_spider\day05_scrapy_图片管道'  # 注意: Windows 路径必须用原始字符串 r'...', 否则 \U 会被当作转义序列导致 SyntaxError
3.根据指定字段传输数据,传入的字段 key 必须是 image_urls ,value是一个可以被循环的对象。
item['image_urls']=url
yield item
示范案例:(堆糖)
import scrapy
from lxml import etree
from ..items import DuitangImageItem
class DuitangSpider(scrapy.Spider):
    """Spider that collects image URLs from duitang.com category pages.

    Yields ``DuitangImageItem`` instances whose ``image_urls`` field is
    consumed by Scrapy's images pipeline (that field name is fixed by
    the pipeline and must not be renamed).
    """

    name = "duitang"
    allowed_domains = ["www.duitang.com"]
    start_urls = ["https://www.duitang.com/category/?cat=movie_music_books"]

    def parse(self, response):
        """Parse a category page and yield one item per image card."""
        html = etree.HTML(response.text)
        for card in html.xpath('//div[@class="j"]'):
            # BUG FIX: the original used an absolute path ('//div[1]/...')
            # here; lxml evaluates '//' from the document root, so every
            # iteration returned the same full list of image URLs instead
            # of only this card's image. A relative path ('./') scopes the
            # query to the current node.
            urls = card.xpath('./div[1]/a/img/@src')
            item = DuitangImageItem()
            # 'image_urls' is the field name the images pipeline requires;
            # its value must be an iterable of URLs.
            item['image_urls'] = urls
            yield item
自定义图片管道案例
from scrapy.http import Request
from scrapy.http.request import NO_CALLBACK
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline
class DuitangImagePipeline(ImagesPipeline):
    """Images pipeline that stores each downloaded image under a name
    taken from the item's ``image_name`` field instead of the default
    URL-hash file name.
    """

    def get_media_requests(self, item, info):
        """Build one download request per image URL.

        Pairs each URL from the configured ``images_urls_field`` with the
        matching entry in ``image_name`` and attaches that name to the
        request so :meth:`file_path` can reuse it later (Scrapy hands the
        same request object back to ``file_path``).

        Returns a list of ``Request`` objects; if the two lists differ in
        length, ``zip`` silently ignores the surplus entries.
        """
        adapter = ItemAdapter(item)
        urls = adapter.get(self.images_urls_field, [])
        # Parallel list of human-readable file names; default to empty so a
        # missing field yields no requests instead of raising KeyError.
        names = adapter.get('image_name', [])
        requests = []
        for url, image_name in zip(urls, names):
            request = Request(url, callback=NO_CALLBACK)
            request.image_name = image_name  # stash the name for file_path()
            requests.append(request)
        return requests

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path ``full/<image_name>.jpg`` using the
        name attached in :meth:`get_media_requests`."""
        return f"full/{request.image_name}.jpg"
根据image.py文件,自定义存储数据:
from scrapy.http import Request
from scrapy.http.request import NO_CALLBACK
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline
class DuitangImagePipeline(ImagesPipeline):
    """Images pipeline that stores each downloaded image under a name
    taken from the item's ``image_name`` field instead of the default
    URL-hash file name.
    """

    def get_media_requests(self, item, info):
        """Build one download request per image URL.

        Pairs each URL from the configured ``images_urls_field`` with the
        matching entry in ``image_name`` and attaches that name to the
        request so :meth:`file_path` can reuse it later (Scrapy hands the
        same request object back to ``file_path``).

        Returns a list of ``Request`` objects; if the two lists differ in
        length, ``zip`` silently ignores the surplus entries.
        """
        adapter = ItemAdapter(item)
        urls = adapter.get(self.images_urls_field, [])
        # Parallel list of human-readable file names; default to empty so a
        # missing field yields no requests instead of raising KeyError.
        names = adapter.get('image_name', [])
        requests = []
        for url, image_name in zip(urls, names):
            request = Request(url, callback=NO_CALLBACK)
            request.image_name = image_name  # stash the name for file_path()
            requests.append(request)
        return requests

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path ``full/<image_name>.jpg`` using the
        name attached in :meth:`get_media_requests`."""
        return f"full/{request.image_name}.jpg"