爬取深圳boss直聘前端30K以上职位

354 阅读2分钟

写理由很烦,直接上代码吧

用到的插件是

const puppeteer = require('puppeteer');
const xlsx = require('node-xlsx');
const fs = require('fs');

不是插件教程,所以需要自己有点puppeteer经验就行了,可以参考

结合项目来谈谈 Puppeteer

Puppeteer

因为boss直聘限制IP的访问频率,所以各位自己准备动态ip,或者每次请求之后,sleep长一点时间直接上代码,有注释

const sleep = timer => new Promise((resolve, reject) => {
  setTimeout(resolve, timer)
})
(async () => {
  async function first () {
    console.log('开启第一次爬虫');
    const browser = await puppeteer.launch({
      headless: false,   //有浏览器界面启动
      slowMo: 100,       //放慢浏览器执行速度,方便测试观察
      args: [            //启动 Chrome 的参数,详见上文中的介绍
          '–no-sandbox',
          '--window-size=1280,960'
      ],
    });
    let arr = []
    const page = await browser.newPage();
    // 因为30K以上就九个分页,所以直接写个循环就是了
    for (let i = 1; i < 10; i++){
      console.log(`${i}/10`)
      try {
        await page.goto(`https://www.zhipin.com/c101280600/y_7/?query=%E5%89%8D%E7%AB%AF&page=${i}&ka=page-${i}`,{
          timeout: 1200000,
          waitUntil: 'networkidle2'
        })
        const result = await page.evaluate(() => {
          let $ = window.$
          let li = $('.job-list li')
          let eles = []
          if (li?.length > 0) {
            li.map(async (i, v) => {
              let name = $(v).find('.job-name').text()
              let price = $(v).find('.job-limit .red').text()
              let limit = $(v).find('.job-limit p').text()
              let company = $(v).find('.company-text .name').text()
              let href = `https://www.zhipin.com` + $(v).find('.job-name a').attr('href')
              eles.push([
                name,
                price,
                limit,
                company,
                href])
            })
          }
          return eles
        })
        arr.push(...result)
      } catch (error) {
        // await page.waitFor(40000)
      }
    }
    console.log('结束第一次爬虫');
    // 生成目录
    let name = new Date().getTime().valueOf()
    const buffer = xlsx.build([{name: 'mySheetName', data: arr}]);
    fs.writeFileSync(`${name}.csv`, buffer, 'binary');
    return new Promise((resolve) => {
      resolve(arr)
    })
  }
  first().then(async res => {
    console.log('开启第二次爬虫');
    const browser = await puppeteer.launch({
      headless: false,   //有浏览器界面启动
      slowMo: 100,       //放慢浏览器执行速度,方便测试观察
      args: [            //启动 Chrome 的参数,详见上文中的介绍
          '–no-sandbox',
          '--window-size=1280,960'
      ],
    });
    const page = await browser.newPage();
    for (let i = 0; i < res.length; i++) {
      console.log(`${i}/${res.length}`);
      try {
        await page.goto(res[i][4],{
          timeout: 1200000,
          waitUntil: 'networkidle2'
        })
        const result = await page.evaluate(() => {
          let $ = window.$
          let job = $('.job-sec .text')[0].innerHTML ?? null
          return job
        })
        if (result) {
          console.log('完成爬取');
          res[i].push(result) 
        } else {
          // 懒得提取爬取方法,爬取失败后,停止20S,然后在爬,再失败,就去catch了
          console.log(`重新爬取${i}条`);
          await sleep(20000)
          await page.goto(arr1[i][4],{
            timeout: 1200000,
            waitUntil: 'networkidle2'
          })
          const result = await page.evaluate(() => {
            let $ = window.$
            let job = $('.job-sec .text')[0].innerHTML ?? null
            return job
          })
          if (result) {
            console.log('完成爬取');
            res[i].push(result)
          } else {
            throw new Error('123')
          }
        }
      } catch (error) {
        await sleep(30000)
        res[i].push('result') 
      }
    }
    console.log(res);
    const buffer = xlsx.build([{name: 'mySheetName', data: res}]);
    let name = new Date().getTime().valueOf()
    fs.writeFileSync(`${name}.csv`, buffer, 'binary');
    console.log('结束第二次爬虫');
  })();
})()

然后就爬出来了

image.png

结语:今年互联网行业是真的有点冷,并不像人们乐观所说的到了高级就好找工作,一共只有两百多条数据,除掉一些大公司发布多个相同岗位的,只有不到六十家公司有在招聘。所以各位互勉吧

下载地址: http://47.112.238.202:10003/mulu.csv 不保证啥时候失效