Spider script
import scrapy
from scrapy import Request
from urllib import parse
from Article.items import CnblogsItem


class CnblogsPickSpider(scrapy.Spider):
    name = "cnblogs"
    allowed_domains = ["www.cnblogs.com"]
    start_urls = ["https://www.cnblogs.com/pick/"]

    def parse(self, response):
        # Collect every post entry on the listing page
        posts = response.xpath('//article[@class="post-item"]')
        for post in posts:
            # Extract the detail-page URL
            detail_url = post.xpath('.//a[@class="post-item-title"]/@href').extract_first("")
            # Extract the image URL (a plain string here)
            image_url = post.xpath('.//p[@class="post-item-summary"]//img/@src').extract_first("")
            # Hand the extracted data over to Scrapy for downloading
            yield {"detail_url": detail_url, "image_url": image_url}
        # Follow the next-page link
        next_page = response.xpath('//div[@class="pager"]/a[contains(text(),">")]/@href').extract_first("")
        if next_page:
            yield Request(url=response.urljoin(next_page), callback=self.parse)

    def parse_detail(self, response):
        pass
Problem
ValueError: Missing scheme in request url: h
2024-04-03 15:59:28 [scrapy.core.scraper] ERROR: Error processing {'detail_url': 'https://www.cnblogs.com/WizardWu/archive/2009/01/03/1367527.html', 'image_url': 'https://pic.cnblogs.com/face/35657/20160720124413.png'}
Problem analysis
The error is raised by the images pipeline, not by the spider itself. The pipeline expects the item field that holds image URLs to be a list and builds one download request per element. When the field is a plain string such as 'https://pic.cnblogs.com/...', iterating over it yields individual characters, so the first "URL" the pipeline tries to request is just 'h', which has no scheme, hence ValueError: Missing scheme in request url: h. Fields that only carry text can stay as strings; the field that stores image URLs must be a list.
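For context, this is roughly how Scrapy's built-in ImagesPipeline consumes that field. The subclass below is only an illustrative sketch, not this project's actual pipeline code; the class name is made up, and it assumes the project sets IMAGES_URLS_FIELD = "image_url" so the pipeline reads the field the spider yields.

import scrapy
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline


class SketchImagesPipeline(ImagesPipeline):
    """Illustrative sketch of how the built-in pipeline consumes the URL field."""

    def get_media_requests(self, item, info):
        # self.images_urls_field is "image_url" under the assumed settings
        urls = ItemAdapter(item).get(self.images_urls_field, [])
        # If `urls` is the string "https://...", this loop walks it character
        # by character, so the first request URL is just "h" and Request()
        # raises ValueError: Missing scheme in request url: h
        for url in urls:
            yield scrapy.Request(url)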
Solution
import scrapy
from scrapy import Request
from urllib import parse
from Article.items import CnblogsItem


class CnblogsPickSpider(scrapy.Spider):
    name = "cnblogs"
    allowed_domains = ["www.cnblogs.com"]
    start_urls = ["https://www.cnblogs.com/pick/"]

    def parse(self, response):
        # Collect every post entry on the listing page
        posts = response.xpath('//article[@class="post-item"]')
        for post in posts:
            # Extract the detail-page URL
            detail_url = post.xpath('.//a[@class="post-item-title"]/@href').extract_first("")
            # Extract the image URL
            image_url = post.xpath('.//p[@class="post-item-summary"]//img/@src').extract_first("")
            # Wrap image_url in a list so the images pipeline iterates over
            # whole URLs instead of single characters
            yield {"detail_url": detail_url, "image_url": [image_url]}
        # Follow the next-page link
        next_page = response.xpath('//div[@class="pager"]/a[contains(text(),">")]/@href').extract_first("")
        if next_page:
            yield Request(url=response.urljoin(next_page), callback=self.parse)

    def parse_detail(self, response):
        pass
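The list-wrapped field only matters if an images pipeline is enabled and pointed at it. The project's settings are not shown above, so the settings.py below is just a sketch of a typical configuration; the IMAGES_URLS_FIELD and IMAGES_STORE values are assumptions.

# settings.py -- sketch of an assumed configuration, not the project's real file

ITEM_PIPELINES = {
    # Built-in pipeline that downloads the URLs found in IMAGES_URLS_FIELD
    "scrapy.pipelines.images.ImagesPipeline": 300,
}

# The spider yields {"detail_url": ..., "image_url": [...]}, so point the
# pipeline at "image_url" instead of the default "image_urls".
IMAGES_URLS_FIELD = "image_url"

# Local directory where downloaded images are stored (assumed path).
IMAGES_STORE = "images"

With this in place, yielding image_url as a one-element list lets get_media_requests iterate whole URLs instead of single characters, and the ValueError goes away. Note that the built-in ImagesPipeline also requires Pillow to be installed.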