本文已参与「新人创作礼」活动,一起开启掘金创作之路。
CrawlSpider可以定义规则在解析html内容时候提取出指定的链接,再向这些链接发送请求,如果有跟进链接的需求,使用他是非常合适的
创建项目
scrapy genspider -t crawl read www.dushu.com/book/1175.html 创建爬虫文件:scrapy genspider -t crawl 爬虫文件名字 爬取域名,和平时不同的是需要加一个 -t crawl 参数
去数据结构定义一个你要获取内容的名称 在Items类中
class ScrapyReadbookItem(scrapy.Item):
    """Container for one book scraped from dushu.com.

    Declares the two fields the spider fills in; values are assigned
    by the spider, not here.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    src = scrapy.Field()
和我们平时创建项目文件不一样,在这一行写好要提取链接的正则表达式
Rule(LinkExtractor(allow=r'/book/1175_\d+\.html'))
在爬虫文件中通过正则表达式爬取书的名字和地址
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_readbook.items import ScrapyReadbookItem
class ReadSpider(CrawlSpider):
    """Crawl the dushu.com book-list pages and yield one item per book.

    CrawlSpider applies ``rules`` to every response: each pagination
    link matching ``/book/1175_<n>.html`` is followed (``follow=True``)
    and handed to ``parse_item``.
    """
    name = 'read'
    allowed_domains = ['www.dushu.com']
    start_urls = ['https://www.dushu.com/book/1175_1.html']
    rules = (
        # Regex for the pagination links to extract and keep following.
        Rule(LinkExtractor(allow=r'/book/1175_\d+\.html'),
             callback='parse_item',
             follow=True),
    )

    def parse_item(self, response):
        """Extract each book's title and cover-image URL from the list page.

        The images are lazy-loaded: the real image URL is in the
        ``data-original`` attribute, while ``alt`` holds the book title.
        """
        # //div[@class="bookslist"]//li//img//@data-original
        # //div[@class="bookslist"]//li//img//@alt
        img_list = response.xpath('//div[@class="bookslist"]//li//img')
        for img in img_list:
            # BUGFIX: the original assigned the attributes the wrong way
            # round (name got the image URL, src got the title).
            name = img.xpath('./@alt').extract_first()
            src = img.xpath('./@data-original').extract_first()
            item = ScrapyReadbookItem(name=name, src=src)
            yield item
来到管道pipelines保存爬取的文件数据内容
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class ScrapyReadbookPipeline:
    """Pipeline that appends every scraped item to ``book.json``.

    NOTE: the output is the ``str()`` of each item concatenated — it is
    not valid JSON despite the file name.
    """

    def open_spider(self, spider):
        # Called once when the spider starts: open the output file.
        self.fp = open('book.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Write the item and pass it on to any lower-priority pipeline.
        self.fp.write(str(item))
        return item

    def close_spider(self, spider):
        # BUGFIX: Scrapy invokes ``close_spider``, not ``end_spider`` —
        # the original hook never ran, so the file was never closed.
        self.fp.close()
接下来创建一个数据库
然后去settings配置数据库信息
接下来多创建一条管道
创建完成后要去settings 添加你新创建的管道信息,后面的数字越小优先级越高(越先执行)
写到数据库中
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class ScrapyReadbookPipeline:
    """Pipeline that appends every scraped item to ``book.json``.

    NOTE: the output is the ``str()`` of each item concatenated — it is
    not valid JSON despite the file name.
    """

    def open_spider(self, spider):
        # Called once when the spider starts: open the output file.
        self.fp = open('book.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Write the item and pass it on to any lower-priority pipeline.
        self.fp.write(str(item))
        return item

    def close_spider(self, spider):
        # BUGFIX: Scrapy invokes ``close_spider``, not ``end_spider`` —
        # the original hook never ran, so the file was never closed.
        self.fp.close()
#加载settings文件
from scrapy.utils.project import get_project_settings
#导入数据库包
import pymysql
#在多创建一条管道
class MysqlPipeline:
    """Pipeline that inserts every scraped book into the MySQL ``book`` table."""

    def open_spider(self, spider):
        """Read DB parameters from settings and open one connection for the crawl."""
        settings = get_project_settings()
        self.host = settings['DB_HOST']
        self.port = settings['DB_PORT']
        self.user = settings['DB_USER']
        # NOTE(review): the settings key 'DB_PASSWROD' is misspelled, but it is
        # kept as-is because it must match the key the tutorial puts in
        # settings.py — fix both together if you rename it.
        self.password = settings['DB_PASSWROD']
        self.name = settings['DB_NAME']
        self.charset = settings['DB_CHARSET']
        self.conn = pymysql.connect(
            host=self.host,
            port=self.port,
            user=self.user,
            password=self.password,
            # database name
            db=self.name,
            charset=self.charset,
        )
        # Cursor object used to execute SQL statements.
        self.cursor = self.conn.cursor()
        print("连接数据库成功")

    def process_item(self, item, spider):
        """Insert one row; on failure, log the error and keep crawling."""
        try:
            # SECURITY FIX: parameterized query instead of str.format(),
            # which was open to SQL injection via the scraped name/src.
            sql = 'insert into book(name,src) values(%s,%s)'
            self.cursor.execute(sql, (item['name'], item['src']))
            self.conn.commit()
        except Exception as error:
            # Best-effort, matching the original: report and continue.
            # (The original also re-committed here, which is pointless
            # after a failed execute and has been removed.)
            print(error)
        return item

    def close_spider(self, spider):
        """Release the cursor and connection when the spider finishes.

        BUGFIX: Scrapy calls ``close_spider``, not ``end_spider`` — the
        original hook never ran, so the connection was never closed.
        """
        self.cursor.close()
        self.conn.close()
效果图