Puppeteer: Scraping the Materials You Need (Web Crawler)

Scraping materials with Node.js

Why Puppeteer?
A form auto-submission feature is planned next, so this is a chance to scout out the pitfalls first.

Steps

  1. Observe the target
  2. Find the patterns
  3. Implement it in code

Background

A personal project needs some raw material.
While browsing the site, I found the Daily Sentence section interesting.

Dependencies: Node.js + Puppeteer + MongoDB

Installing the dependencies

Omitted...
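(For reference, this is essentially npm i puppeteer mongodb in the project root; note that installing Puppeteer also downloads a bundled Chromium.)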

Content scraping

  1. Get the bilingual sentences
  2. Get some good-looking images

Analyzing the page structure

  1. When switching cards, only the date in the address bar changes
  2. Counting back through the dates, it is unclear when this module went live
  3. Browsing further, there is a list page with an entry for each day's card
  • getDetail.js
    !(async () => {
      const puppeteer = require('puppeteer')
      let browser = await puppeteer.launch({
        headless: false,
        timeout: 10 * 60 * 1000,
        defaultViewport: { width: 1366, height: 768 }
      })
    
      let page = await browser.newPage()
      await page.setRequestInterception(true)
      // once interception is on, every request must be continued or aborted,
      // otherwise they all stall and page.goto() times out (see "Pitfalls" below)
      page.on('request', request => {
        if (request.url().includes('google.com')) {
          request.abort() // requests to this blocked domain hang forever, so drop them
        } else {
          request.continue()
        }
      })
    
      let url = `http://news.iciba.com/views/dailysentence/daily.html#!/detail/title/2018-11-11`
      try {
        await page.goto(url, {
          timeout: 10 * 60 * 1000,
          waitUntil: 'networkidle0'
        })
        await page.waitFor(3 * 1000) // give the client-side render a little extra time
        let { zhCn, enUs } = await page.evaluate(() => {
          let enUs = document.querySelector('.detail-content .detail-content-en').innerText
          let zhCn = document.querySelector('.detail-content .detail-content-zh').innerText
          return {
            zhCn,
            enUs
          }
        })
    
        let data = {
          source: 'iciba',
          zhCn,
          enUs
        }
        console.log(data)
      } catch (err) {
        console.log(err)
      }
      await browser.close()
    })()
    
    
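Note that the page.on('request') handler is mandatory once setRequestInterception(true) is called: if each request is not explicitly continued or aborted, every request stalls and page.goto() times out. This is exactly the pitfall described at the end of this post.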

The full list

  • The list entry page
  • getLinks.js
    !(async () => {
      const puppeteer = require('puppeteer')
    
      let browser = await puppeteer.launch({
        headless: false,
        timeout: 10 * 60 * 1000,
        defaultViewport: { width: 1366, height: 768 }
      })
    
      let page = await browser.newPage()
      await page.setRequestInterception(true)
      // same as getDetail.js: every intercepted request must be continued or aborted
      page.on('request', request => {
        request.url().includes('google.com') ? request.abort() : request.continue()
      })
    
      // total and step could also be scraped from the page itself
      const conf = {
        total: 500, // total count shown on the page; could also be read from the DOM (.sort_btn>p span)
        step: 8, // items per page
        uri: 'http://news.iciba.com/appv3/wwwroot/ds.php?action=history'
      }
    
      // http://news.iciba.com/appv3/wwwroot/ds.php?action=history&order=2&page=1
      /**
       * order: sort mode
       *   2 = by date
       *   1 = by popularity
       * page: page number
       */
    
      const getPageLinks = async url => {
        try {
          await page.goto(url, {
            waitUntil: 'networkidle0'
          })
          await page.waitFor(3 * 1000)
          let pageLinks = await page.evaluate(() => {
            let pageLinks = []
            let items = document.querySelectorAll('#content .main .sort_list')
            items.forEach((item, index) => {
              console.log('item==>', index)
              let a = item.querySelector('.c_l_m_en>a')
              a && pageLinks.push(a.href)
            })
            return pageLinks
          })
          return pageLinks
        } catch (err) {
          console.log(err)
          return [] // keep the caller's spread from crashing on a failed page
        }
      }
    
      /**
       * collect the links from every page
       */
      const getLinks = async () => {
        let pages = Math.ceil(conf.total / conf.step)
        let arrLinks = []
        let baseUrl = conf.uri
    
        for (let index = 1; index <= pages; index++) {
          let url = `${baseUrl}&order=2&page=${index}`
          let startTime = new Date().getTime()
          let links = await getPageLinks(url)
          arrLinks.push(...links)
          let endTime = new Date().getTime()
          console.log(`${url}:${endTime - startTime}`)
        }
        return arrLinks
      }
      console.log(await getLinks())
      await browser.close()
    })()
    
    

Fetching every image-and-text card

  • Visit each link returned by getLinks one by one (a minimal sketch follows below)
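
A minimal sketch of that loop, assuming the body of getDetail.js has been wrapped into a hypothetical scrapeDetail(page, url) helper that returns the { source, zhCn, enUs } object:

    // sketch: reuse one page and visit every detail link collected by getLinks.js
    const scrapeAll = async (page, links) => {
      const results = []
      for (const link of links) {
        try {
          results.push(await scrapeDetail(page, link)) // hypothetical wrapper around getDetail.js
        } catch (err) {
          console.log(`failed: ${link}`, err) // one bad card should not stop the whole run
        }
      }
      return results
    }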

Image scraping

  1. Use the images from the cards above directly
  2. Batch-scrape images from another site

Target site (royalty-free images)

  • pixabay
  • Get all the image links: getAllLinks.js
    const puppeteer = require('puppeteer')
    const fs = require('fs')
    const path = require('path')
    
    let config = {
      baseUrl: 'https://pixabay.com/images/search/',
      limit: 5 // only fetch 5 pages
    }
    // collect the image URLs on the current page
    const getPageImgs = async (url, page) => {
      await page.goto(url, {
        timeout: 60 * 1000,
        waitUntil: 'networkidle0'
      })
      await page.waitFor(3 * 1000)
      let { srcList } = await page.evaluate(() => {
        let srcList = []
        let $ = window.$ // relies on the jQuery the page itself loads
        let $items = $('.flex_grid.credits.search_results .item')
        $.each($items, (i, item) => {
          let $img = $(item).find('a img')
          let _src = ''
          if ($img.attr('data-lazy')) {
            _src = $img.attr('data-lazy') // lazy-loaded images keep the real URL here
          } else {
            _src = $img.attr('src')
          }
          srcList.push(`${_src.replace('__340', '_960_720')}`) // swap the thumbnail suffix for the larger size
        })
        return { srcList }
      })
      return srcList
    }
    // collect the links from every page
    const run = async () => {
      let startTime = new Date().getTime()
      let browser = await puppeteer.launch({
        headless: false,
        timeout: 30000,
        defaultViewport: { width: 1366, height: 768 }
      })
    
      let page = await browser.newPage()
      await page.setRequestInterception(true)
    
      page.on('request', request => {
        let url = request.url()
        // console.log(url)
        if (url.includes('google.com')) {
          request.abort()
        } else {
          request.continue()
        }
      })
    
      let srcList = []
      for (let index = 1; index <= config.limit; index++) {
        let _arr = await getPageImgs(`${config.baseUrl}?pagi=${index}`, page)
        const appendFile = () => {
          return new Promise((resolve, reject) => {
            fs.appendFile(
              path.join(__dirname, './per_page_links.json'),
              JSON.stringify({
                page: index,
                total: _arr.length,
                list: _arr
              }) + '\n', // one JSON object per line (JSON Lines), so the file stays parseable
              err => {
                if (err) {
                  reject(err)
                }
                resolve()
              }
            )
          })
        }
        // write each page as we go, so an interrupted run can resume from the last finished page
        await appendFile()
        console.log(`page:${index} --- written`)
        srcList.push(..._arr)
      }
      console.log(srcList)
      await browser.close()
      let endTime = new Date().getTime()
      console.log(`took ${(endTime - startTime) / 1000}s`)
      const writeFile = () => {
        return new Promise((resolve, reject) => {
          fs.writeFile(
            path.join(__dirname, './total_links.json'),
            JSON.stringify({
              total: srcList.length,
              list: srcList
            }),
            err => {
              if (err) {
                reject(err)
              }
              resolve()
            }
          )
        })
      }
      // writeFile
      await writeFile()
    }
    run()
    
  • Download all the images: downAllImgs.js
    const axios = require('axios')
    const fs = require('fs')
    const path = require('path')
    const uuid = require('uuid/v4') // uuid v3-style import; newer versions: const { v4: uuid } = require('uuid')
    
    const savePath = path.join(__dirname, './imgs') // assumed output directory; create it before running
    
    // sleep helper
    const sleep = ms => {
      return new Promise(resolve => setTimeout(resolve, ms))
    }
    // download one image, resolving only after the file is fully written to disk
    const downloadImage = async (url, fullPath) => {
      let res = await axios({
        method: 'get',
        responseType: 'stream',
        url
      })
      return new Promise((resolve, reject) => {
        res.data
          .pipe(fs.createWriteStream(fullPath))
          .on('finish', resolve)
          .on('error', reject)
      })
    }
    let arrLinks = [
      'https://cdn.pixabay.com/photo/2019/06/29/16/28/mouse-4306520_960_720.jpg',
      'https://cdn.pixabay.com/photo/2019/06/29/18/51/rails-4306770_960_720.jpg',
      'https://cdn.pixabay.com/photo/2019/06/29/09/42/barley-4305844_960_720.jpg',
      'https://cdn.pixabay.com/photo/2019/06/30/08/30/frog-4307564_960_720.jpg',
      'https://cdn.pixabay.com/photo/2019/06/25/13/06/fiat-4298163_960_720.jpg',
      'https://cdn.pixabay.com/photo/2019/07/01/11/23/dog-4309752_960_720.jpg',
      'https://cdn.pixabay.com/photo/2019/06/25/18/50/sunflower-4298808_960_720.jpg'
    ]
    
    // let arrLinks = require('./total_links.js')
    
    const run = async () => {
      // when resuming an interrupted run against the full list, skip what is already done:
      // arrLinks = arrLinks.splice(1380, arrLinks.length)
      let len = arrLinks.length
      for (let index = 0; index < len; index++) {
        let src = arrLinks[index]
        // split('_960_720')[1] keeps the original file extension, e.g. '.jpg'
        await downloadImage(src, `${savePath}/${uuid()}${src.split('_960_720')[1]}`)
        // console.log(`${Math.floor(((index + 1) / len) * 100)}%`)
        console.log(`${index + 1}/${len}`)
        await sleep(Math.floor(Math.random() * 10 * 1000)) // random delay so the requests are not flagged as abusive crawling
      }
      process.exit()
    }
    run()
    
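Since getAllLinks.js appends one JSON object per line, reading the progress file back is straightforward. A minimal read-back sketch (assuming the per_page_links.json written above):

    const fs = require('fs')
    const path = require('path')
    
    // parse the JSON Lines progress file into an array of page records
    const pages = fs
      .readFileSync(path.join(__dirname, './per_page_links.json'), 'utf8')
      .split('\n')
      .filter(Boolean) // drop the trailing empty line
      .map(JSON.parse)
    
    // the highest page number already on disk is where an interrupted run can resume
    console.log(Math.max(...pages.map(p => p.page)))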

Data persistence

  • Store the text content in MongoDB (a sketch follows below)
  • Upload the images to Qiniu Cloud
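
A minimal sketch of the MongoDB side, assuming a local instance and database/collection names (materials, sentences) chosen here for illustration; the Qiniu upload goes through their Node SDK and is not shown:

    const { MongoClient } = require('mongodb')
    
    // persist one { source, zhCn, enUs } object produced by getDetail.js
    const saveSentence = async doc => {
      const client = await MongoClient.connect('mongodb://localhost:27017')
      try {
        await client.db('materials').collection('sentences').insertOne(doc)
      } finally {
        await client.close()
      }
    }
    
    saveSentence({ source: 'iciba', zhCn: '...', enUs: '...' })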

Pitfalls

  • Requests kept throwing timeout errors
    1. Increasing the timeout setting helps, but network and connection hiccups can still trigger it
    2. Log every request and analyze by hand
      // the console got stuck right after printing a certain URL, then the timeout fired;
      // aborting requests to that blocked domain fixed it
      page.on('request', request => {
        let url = request.url()
        // console.log(url)
        if (url.includes('google.com')) {
          request.abort()
        } else {
          request.continue()
        }
      })
      
  • Mind the scope of page.evaluate() (see the sketch after this list)
  • For performance, tune Puppeteer's browser launch args
    args: [
      '--disable-gpu',
      '--disable-dev-shm-usage',
      '--disable-setuid-sandbox',
      '--no-first-run',
      '--no-sandbox',
      '--no-zygote',
      '--single-process'
    ]
    
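On the page.evaluate() scope point: the callback is serialized and runs inside the browser page, not in Node, so it cannot reach Node-side variables through the closure; anything it needs must be passed as extra arguments to evaluate. A minimal illustration:

    // `sel` arrives in the page as a serialized argument, not via closure
    const getText = async (page, selector) => {
      return page.evaluate(sel => {
        const el = document.querySelector(sel)
        return el ? el.innerText : null
      }, selector)
    }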


Left for later

  • Installing and deploying a Puppeteer project on Linux