scrapy-redis获取某个网站信息

阅读时间约 2 分钟

在开始获取网站之前 我们需要以下指令来创建scrapy-redis文件:

在pycharm内创建一个文件 右键点击该文件 点击“打开于”再点击“终端”。

之后在该终端界面输入:scrapy startproject dangdang_book(文件名可自定义)

接着依次执行:cd dangdang_book,然后 scrapy genspider book_info www.dangdang.com(其中 book_info 是 spider 的名字,可自定义;www.dangdang.com 是允许访问的域名)

代码:

Spider.py (book_info)代码:

import scrapy

from scrapy.http import HtmlResponse

from scrapy import cmdline

from scrapy_redis.spiders import RedisSpider

class BookInfoSpider(RedisSpider):
    """Distributed spider: pops start URLs from a Redis list and scrapes
    Dangdang book search-result pages, following pagination links."""

    name = "book_info"
    # Redis list key the seed URLs are pushed to (see insert_start_urls.py).
    redis_key = 'dd_book:start_urls'

    def make_request_from_data(self, data):
        """Build a scrapy Request from the raw bytes popped off the Redis queue.

        :param data: URL as UTF-8 encoded bytes (as stored by lpush).
        :return: scrapy.Request routed to self.parse.
        """
        url = data.decode('utf-8')
        return scrapy.Request(url=url, callback=self.parse)

    def parse(self, response: HtmlResponse, **kwargs):
        """Yield one dict per book on the result page, then follow the next-page link.

        Missing fields fall back to the placeholder string '空'.
        """
        li_list = response.xpath('//ul[@class="bigimg"]/li')

        for li in li_list:
            item = dict()
            item['title'] = li.xpath('./a/@title').extract_first() or '空'
            item['price'] = li.xpath('./p[@class="price"]/span[1]/text()').extract_first() or '空'
            item['author'] = li.xpath('./p[@class="search_book_author"]/span[1]/a[1]/@title').extract_first() or '空'
            # span[last()-1] holds the publication date in the author line.
            item['date_data'] = li.xpath(
                './p[@class="search_book_author"]/span[last()-1]/text()').extract_first() or '空'
            item['detail'] = li.xpath('./p[@class="detail"]/text()').extract_first() or '空'
            item['producer'] = li.xpath('./p[@class="search_book_author"]/span[last()]/a/text()').extract_first() or '空'
            print('一条数据已插入')
            yield item

        # Pagination: keep following the "next" link until it disappears.
        next_href = response.xpath('//ul[@name="Fy"]/li[@class="next"]/a/@href').extract_first()
        if next_href:
            next_url = response.urljoin(next_href)
            yield scrapy.Request(url=next_url, callback=self.parse)
        else:
            print('没有下一页了...')

# Bug fix: the pasted code had `if name == 'main'`, which raises NameError;
# the script entry guard must compare the dunder module name.
if __name__ == '__main__':
    cmdline.execute('scrapy crawl book_info'.split())

pipelines.py代码:

from itemadapter import ItemAdapter

import pymongo

import pymysql

class DangDangBookMongoPipeline:
    """Persist scraped items into MongoDB (database py_spider, collection dangdang_book)."""

    def __init__(self):
        self.mongo_client = None  # pymongo.MongoClient, created lazily in open_spider
        self.db = None            # handle to the target collection

    def open_spider(self, spider):
        # Only set up the connection for the spider this pipeline serves.
        if spider.name == 'book_info':
            self.mongo_client = pymongo.MongoClient()
            self.db = self.mongo_client['py_spider']['dangdang_book']

    def process_item(self, item, spider):
        if spider.name == 'book_info':
            # insert_one mutates its argument (adds `_id`); insert a copy so
            # pipelines that run after this one see the item unchanged.
            self.db.insert_one(dict(item))
            print('[mongodb]数据插入成功:', item)
        return item

    def close_spider(self, spider):
        if spider.name == 'book_info':
            # Bug fix: `self.db` is a Collection and has no close() method —
            # calling it raised at shutdown. Closing the client is sufficient.
            self.mongo_client.close()

class DangDangBookMysqlPipeline:
    """Persist scraped items into MySQL (database py_spider, table book_info),
    creating the table on spider start if it does not exist."""

    def __init__(self):
        self.db = None      # pymysql connection, created lazily in open_spider
        self.cursor = None  # cursor on that connection

    def open_spider(self, spider):
        if spider.name == 'book_info':
            # NOTE(review): password is a single space here — confirm the real credentials.
            self.db = pymysql.connect(host='localhost', user='root', password=' ', db='py_spider')
            self.cursor = self.db.cursor()

            create_table = """
                create table if not exists book_info(
                 id int primary key auto_increment not null,
                 title varchar(255) not null,
                 price varchar(255) not null,
                 author varchar(255) not null,
                 date_data varchar(255) not null,
                 detail text,
                 producer varchar(255) not null
            );
            """
            try:
                self.cursor.execute(create_table)
                print('表创建成功...')
            except Exception as e:
                print('表创建失败:', e)

    def process_item(self, item, spider):
        if spider.name == 'book_info':
            insert_sql = """
                insert into book_info(title,price,author,date_data,detail,producer) values (%s,%s,%s,%s,%s,%s);
            """
            try:
                self.cursor.execute(insert_sql, (item['title'], item['price'], item['author'],
                                                 item['date_data'], item['detail'], item['producer']))
                self.db.commit()  # commit the transaction
                print('[mysql]数据插入成功', item['title'], item['price'], item['author'],
                      item['date_data'], item['detail'], item['producer'])
            except Exception as e:
                print('[mysql]数据插入失败:', e)
                self.db.rollback()  # roll back the failed transaction
        # Bug fix: return the item unconditionally — the original returned it only
        # inside the name guard, silently dropping items from any other spider.
        return item

    def close_spider(self, spider):
        if spider.name == 'book_info':
            self.cursor.close()
            self.db.close()

settings.py代码:

# Scrapy project settings for dangdang_book.
BOT_NAME = "dangdang_book"

# Bug fix: the pasted settings fused several statements onto single lines and
# contained an unclosed string literal in DEFAULT_REQUEST_HEADERS — reformatted
# into valid Python without changing any values.
SPIDER_MODULES = ["dangdang_book.spiders"]
NEWSPIDER_MODULE = "dangdang_book.spiders"

ADDONS = {}

# Dangdang's robots.txt would otherwise block the crawl.
ROBOTSTXT_OBEY = False

LOG_LEVEL = 'ERROR'

# Replace the placeholder with a real browser User-Agent string.
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': '打印你的UA信息',
}

# RedisPipeline first (300), then Mongo (301) and MySQL (302) persistence.
ITEM_PIPELINES = {
    "scrapy_redis.pipelines.RedisPipeline": 300,
    "dangdang_book.pipelines.DangDangBookMongoPipeline": 301,
    "dangdang_book.pipelines.DangDangBookMysqlPipeline": 302,
}

FEED_EXPORT_ENCODING = "utf-8"

# scrapy-redis: shared scheduler + request-fingerprint dedup, with the queue
# persisted in Redis DB 1 so a crawl can be resumed across runs.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST = True
REDIS_URL = "redis://localhost:6379/1"
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'

insert_start_urls.py代码:

# insert_start_urls.py — seed the scrapy-redis queue.
# Pushes the start URL onto the Redis list the spider reads from
# (redis_key = 'dd_book:start_urls', Redis DB 1, matching REDIS_URL in settings).
import redis

with redis.Redis(db=1) as redis_client:
    redis_client.lpush('dd_book:start_urls', 'search.dangdang.com/?key=python…')
    print('上传成功...')

注意:如果把这个自定义脚本放在 spiders 文件夹之下,则无需手动执行它——scrapy 启动时会导入 spiders 目录下的模块,模块级代码随之被执行;如果放在 spiders 文件夹之外,就必须手动运行该脚本。

代码完成后 打开redis,mongo,mysql确保里面没有别的数据 运行book_info文件后,稍等一会时间后即可分别在redis,mongo,mysql里面看见该网站被提取的全部信息