A simple wrapper for the Node.js crawler library

Installation

npm install crawler

or

yarn add crawler

GitHub: github.com/bda-researc…
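
For comparison, here is a minimal sketch of using crawler directly, based on the 1.x callback API that the wrapper below builds on (the URL is only a placeholder):

const Crawler = require('crawler')

const c = new Crawler({
  maxConnections: 5,
  // Called for each crawled page; res.$ is a cheerio instance when jQuery is enabled (the default)
  callback (error, res, done) {
    if (error) {
      console.error(error)
    } else {
      console.log(res.$('title').text())
    }
    done() // tell the pool this task is finished
  }
})

c.queue('https://example.com') // placeholder URL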

MCrawler.js

'use strict'

// Crawler utility library: MCrawler.js

const Crawler = require('crawler')

class MCrawler {
  constructor (newConfig, callback) {
    this.c = this.createCrawler(newConfig, callback)
  }

  /**
   * Initialize the crawler configuration.
   * @param {object} newConfig see the official Crawler options on GitHub
   * @param {function} [cb] global callback, invoked for each crawled page; it must call done() itself
   */
  createCrawler (newConfig, cb) {
    let config = Object.assign({
      headers: { // send browser-like request headers to reduce the risk of an IP ban
        'User-Agent':
          'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4295.400'
      },
      maxConnections: 5,
      jQuery: true
    }, newConfig)
    return new Crawler({
      headers: config.headers,
      jQuery: config.jQuery,
      maxConnections: config.maxConnections, // number of concurrent connections
      // This will be called for each crawled page
      callback (error, res, done) {
        if (error) {
          console.log(error)
        }
        if (cb) {
          cb(error, res, done) // hand off to the caller, who is responsible for calling done()
        } else {
          done()
        }
      }
    })
  }
  /**
   * Start crawling a URL.
   * @param {string} requestUrl
   * @returns {Promise} resolves with the parsed page ($)
   */
  startCrawler (requestUrl) {
    return new Promise((resolve, reject) => {
      this.c.queue([
        {
          uri: requestUrl,
          jQuery: true,
          // A task-level callback overrides the global one for this request
          callback (error, res, done) {
            if (error) {
              console.log('Crawler error', error)
              done()
              reject(error)
            } else {
              let $ = res.$ // server-side jQuery (cheerio) bound to the fetched page
              done()
              resolve($)
            }
          }
        }
      ])
    })
  }
}
module.exports = MCrawler
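
Note that startCrawler attaches a task-level callback to each queued request; in crawler, a callback given on the queue options overrides the global one, so the callback passed to the constructor only fires for requests queued directly on this.c without a handler of their own. Wrapping queue in a Promise lets callers consume the parsed page with async/await, as the example below shows.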



Example

const MCrawler = require('./MCrawler')
let crawlerNew = new MCrawler()
const crawlingText = async searchObj => {
  let requestUrl = `http://.....` // build the target URL from searchObj; the real address is elided here
  let $ = await crawlerNew.startCrawler(requestUrl)
  return {
    text: $('title').text()
  }
}
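
The constructor's two arguments can also be used to override the defaults. A hedged sketch: mc, the { maxConnections: 2 } override, and the URL are all hypothetical, and the second argument only matters for requests queued without a task-level callback:

const MCrawler = require('./MCrawler')

const mc = new MCrawler(
  { maxConnections: 2 }, // override the default of 5
  (error, res, done) => {
    // Global callback: fires only for requests queued without their own callback
    if (!error) console.log(res.statusCode)
    done()
  }
)

mc.startCrawler('https://example.com') // hypothetical URL
  .then($ => console.log($('title').text()))
  .catch(err => console.error('crawl failed:', err))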