在开始抓取网站数据之前,我们需要执行以下指令来创建 scrapy-redis 项目:
在pycharm内创建一个文件 右键点击该文件 点击“打开于”再点击“终端”。
之后在该终端界面输入:scrapy startproject dangdang_book(项目名可自定义)
然后执行:cd dangdang_book,再执行:scrapy genspider book_info(spider 名称可自定义) www.dangdang.com(允许抓取的域名)
代码:
Spider.py (book_info)代码:
import scrapy
from scrapy.http import HtmlResponse
from scrapy import cmdline
from scrapy_redis.spiders import RedisSpider
class BookInfoSpider(RedisSpider):
    """Distributed spider for dangdang.com book search results.

    Start URLs are not hard-coded: they are popped from the Redis list
    named by ``redis_key`` (seeded by insert_start_urls.py), which lets
    several spider processes share a single crawl queue.
    """
    name = "book_info"
    # Redis list that seed URLs are read from.
    redis_key = 'dd_book:start_urls'

    def make_request_from_data(self, data):
        """Turn the raw bytes popped from Redis into a scrapy Request."""
        url = data.decode('utf-8')
        return scrapy.Request(url=url, callback=self.parse)

    def parse(self, response: HtmlResponse, **kwargs):
        """Yield one dict per book on the page, then follow pagination."""
        # Each book is an <li> under the result list <ul class="bigimg">.
        li_list = response.xpath('//ul[@class="bigimg"]/li')
        for li in li_list:
            item = dict()
            # '空' ("empty") is the placeholder for any missing field.
            item['title'] = li.xpath('./a/@title').extract_first() or '空'
            item['price'] = li.xpath('./p[@class="price"]/span[1]/text()').extract_first() or '空'
            item['author'] = li.xpath('./p[@class="search_book_author"]/span[1]/a[1]/@title').extract_first() or '空'
            item['date_data'] = li.xpath(
                './p[@class="search_book_author"]/span[last()-1]/text()').extract_first() or '空'
            item['detail'] = li.xpath('./p[@class="detail"]/text()').extract_first() or '空'
            item['producer'] = li.xpath('./p[@class="search_book_author"]/span[last()]/a/text()').extract_first() or '空'
            print('一条数据已插入')
            yield item
        # Follow the "next page" link until it no longer exists.
        next_href = response.xpath('//ul[@name="Fy"]/li[@class="next"]/a/@href').extract_first()
        if next_href:
            next_url = response.urljoin(next_href)
            yield scrapy.Request(url=next_url, callback=self.parse)
        else:
            print('没有下一页了...')
# Fix: the guard was written `if name == 'main'`, which raises NameError
# (and would never match). The dunder form is required.
if __name__ == '__main__':
    cmdline.execute('scrapy crawl book_info'.split())
pipelines.py代码:
from itemadapter import ItemAdapter
import pymongo
import pymysql
class DangDangBookMongoPipeline:
    """Persist scraped book items into MongoDB (py_spider.dangdang_book)."""

    def __init__(self):
        self.mongo_client = None  # pymongo.MongoClient, created in open_spider
        self.db = None            # target Collection handle

    def open_spider(self, spider):
        """Open the MongoDB connection once, when the matching spider starts."""
        if spider.name == 'book_info':
            self.mongo_client = pymongo.MongoClient()
            self.db = self.mongo_client['py_spider']['dangdang_book']

    def process_item(self, item, spider):
        """Insert one item; always return it so later pipelines still run."""
        if spider.name == 'book_info':
            self.db.insert_one(item)
            print('[mongodb]数据插入成功:', item)
        return item

    def close_spider(self, spider):
        # Fix: a pymongo Collection has no close() method — attribute access
        # on it creates a sub-collection, so the old `self.db.close()` raised
        # TypeError at shutdown. Only the client needs closing.
        if spider.name == 'book_info':
            self.mongo_client.close()
class DangDangBookMysqlPipeline:
    """Persist scraped book items into MySQL (py_spider.book_info)."""

    def __init__(self):
        self.db = None      # pymysql connection, created in open_spider
        self.cursor = None  # cursor on that connection

    def open_spider(self, spider):
        """Connect to MySQL and create the target table if it is missing."""
        if spider.name == 'book_info':
            # NOTE(review): password is a single space here — confirm the
            # real credentials before running against a shared server.
            self.db = pymysql.connect(host='localhost', user='root', password=' ', db='py_spider')
            self.cursor = self.db.cursor()
            create_table = """
            create table if not exists book_info(
                id int primary key auto_increment not null,
                title varchar(255) not null,
                price varchar(255) not null,
                author varchar(255) not null,
                date_data varchar(255) not null,
                detail text,
                producer varchar(255) not null
            );
            """
            try:
                self.cursor.execute(create_table)
                print('表创建成功...')
            except Exception as e:
                print('表创建失败:', e)

    def process_item(self, item, spider):
        """Insert one item inside a transaction; roll back on any error."""
        if spider.name == 'book_info':
            insert_sql = """
            insert into book_info(title,price,author,date_data,detail,producer) values (%s,%s,%s,%s,%s,%s);
            """
            try:
                self.cursor.execute(insert_sql, (item['title'], item['price'], item['author'],
                                                 item['date_data'], item['detail'], item['producer']))
                self.db.commit()  # commit the transaction
                print('[mysql]数据插入成功', item['title'], item['price'], item['author'],
                      item['date_data'], item['detail'], item['producer'])
            except Exception as e:
                # Fix: failure log tag was misspelled '[mysq]'.
                print('[mysql]数据插入失败:', e)
                self.db.rollback()  # roll back the transaction
        return item

    def close_spider(self, spider):
        """Release the cursor and connection when the spider finishes."""
        if spider.name == 'book_info':
            self.cursor.close()
            self.db.close()
settings.py代码:
# Scrapy project settings for dangdang_book.
# Fix: several settings were fused onto single lines and the User-Agent
# value had an unterminated string literal, making the module unparseable.
BOT_NAME = "dangdang_book"

SPIDER_MODULES = ["dangdang_book.spiders"]
NEWSPIDER_MODULE = "dangdang_book.spiders"

ADDONS = {}

# Dangdang's robots.txt would otherwise block the crawl.
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'

DEFAULT_REQUEST_HEADERS = {
    # Placeholder — fill in a real browser User-Agent string here.
    'User-Agent': '打印你的UA信息',
}

ITEM_PIPELINES = {
    # RedisPipeline runs first and mirrors every item into Redis;
    # the Mongo and MySQL pipelines then persist it.
    "scrapy_redis.pipelines.RedisPipeline": 300,
    "dangdang_book.pipelines.DangDangBookMongoPipeline": 301,
    "dangdang_book.pipelines.DangDangBookMysqlPipeline": 302,
}
FEED_EXPORT_ENCODING = "utf-8"

# scrapy-redis: shared scheduler and dupefilter backed by Redis db 1,
# so multiple spider processes can cooperate on one crawl.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST = True  # keep queue/dupefilter in Redis after the crawl ends
REDIS_URL = "redis://localhost:6379/1"
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
insert_start_urls.py代码:
import redis

# Seed the distributed crawl: push the first search-results URL onto the
# Redis list that BookInfoSpider reads (redis_key = 'dd_book:start_urls').
# Fix: the original URL was truncated ('…') and had no scheme — scrapy's
# Request rejects scheme-less URLs, so the spider would crash on startup.
with redis.Redis(db=1) as redis_client:
    redis_client.lpush('dd_book:start_urls', 'http://search.dangdang.com/?key=python')
    print('上传成功...')
"""如果把该自定义脚本放在spider文件夹之下,则这个自定义脚本无需手动执行,因为scrapy会加载这个脚本并执行 但是放在spider外面的话,就得手动执行"""
代码完成后,先打开 redis、mongo、mysql,确保其中没有其他数据;运行 book_info 文件后,稍等片刻即可分别在 redis、mongo、mysql 中看到从该网站提取的全部信息。