Node 抓取 GitHub Trending 数据

233 阅读1分钟

数据抓取

基于 Node.js 请求 GitHub Trending 页面,解析 HTML 后以 JSON API 的形式返回数据。

接口测试:test.52shujun.cn

抱歉,拉低【掘金】代码质量。

安装库

npm i cheerio express superagent -S

app.js

const express = require('express')
const app = express()
const { getTrending, getNationality, getLanguage, getDate } = require('./util')

/**
 * Wrap an async route handler so that a rejected promise is answered with an
 * HTTP 500 JSON body. Express versions before 5 do not catch rejections from
 * async handlers, so without this the request would never receive a response.
 *
 * @param {Function} handler - Async `(req, res)` route handler.
 * @returns {Function} An `(req, res)` handler that never rejects.
 */
const asyncRoute = (handler) => async (req, res) => {
  try {
    await handler(req, res)
  } catch (err) {
    // The util helpers can reject with a plain string, so normalise to text.
    const message = err instanceof Error ? err.message : String(err)
    console.error(`${req.method} ${req.originalUrl} failed: ${message}`)
    if (!res.headersSent) {
      res.status(500).json({ error: message })
    }
  }
}

// Spoken-language ("nationality") filter options
app.get(
  '/nationality',
  asyncRoute(async (req, res) => {
    res.json(await getNationality())
  })
)

// Programming-language filter options
app.get(
  '/language',
  asyncRoute(async (req, res) => {
    res.json(await getLanguage())
  })
)

// Date-range filter options
app.get(
  '/date',
  asyncRoute(async (req, res) => {
    res.json(await getDate())
  })
)

// Trending repositories; each filter falls back to 'any' when not supplied
app.get(
  '/',
  asyncRoute(async (req, res) => {
    const { language = 'any', since = 'any', nationality = 'any' } = req.query
    const data = await getTrending(language, since, nationality)
    res.json(data)
  })
)

app.listen(3000, () => {
  console.log('服务已运行')
})

util.js

// 类似 jQuery
const cheerio = require('cheerio')
// 基于原生 node 的 ajax 请求库
const request = require('superagent')

// Base URL of the GitHub Trending page used by the scraping helpers
const trending = 'https://github.com/trending'

/**
 * Fetch the GitHub Trending page and extract the repositories listed on it.
 *
 * @param {string} language - Programming language used as a URL path segment,
 *   or 'any' to request the unfiltered page. Pass the raw (decoded) name; it
 *   is percent-encoded here.
 * @param {string} since - Value for the `since` query parameter, or 'any' to
 *   omit it.
 * @param {string} nationality - Value for the `spoken_language_code` query
 *   parameter, or 'any' to omit it.
 * @returns {Promise<Array<{title: string, author: string, avatarUrl: (string|undefined), discription: string, language: string, star: string, url: string}>>}
 *   Resolves with at least one repository. Rejects with the string 'null'
 *   when no repository row could be parsed, and with the underlying error
 *   when the request or the parsing fails.
 */
async function getTrending(language, since, nationality) {
  // Build the request URL. The language is a path segment, so it must be
  // percent-encoded (a raw "c#" would otherwise be cut off as a fragment).
  const pageUrl =
    language !== 'any'
      ? `${trending}/${encodeURIComponent(language)}`
      : trending

  // Only send the filters the caller actually asked for
  const queryData = {}
  if (since !== 'any') {
    queryData.since = since
  }
  if (nationality !== 'any') {
    queryData.spoken_language_code = nationality
  }

  // Request the page and parse the HTML
  const response = await request.get(pageUrl).query(queryData)
  const $ = cheerio.load(response.text)

  const list = []
  $('.Box-row').each((index, item) => {
    const row = $(item)
    const link = row.find('h1 > a')
    const href = link.attr('href')
    // The link text has the form "author / title"
    const [author, title] = link
      .text()
      .split('/')
      .map((part) => part.trim())

    // Skip rows that do not look like a repository entry instead of throwing
    if (!href || !author || !title) {
      return
    }

    const starMatch = row
      .find('.d-inline-block.float-sm-right')
      .text()
      .match(/\d[\d,]*/)

    list.push({
      title,
      author,
      avatarUrl: row.find('img.avatar').attr('src'),
      discription: row.find('p').text().trim(),
      language: row.find('div [itemprop="programmingLanguage"]').text(),
      // Counts may contain thousands separators ("1,234"); strip them so the
      // whole number is kept. The value stays a string as before.
      star: starMatch ? starMatch[0].replace(/,/g, '') : '0',
      url: `https://github.com${href}`,
    })
  })

  if (list.length === 0) {
    // Kept as the string 'null' so callers see the same rejection value
    return Promise.reject('null')
  }
  return list
}

/**
 * Fetch the GitHub Trending page and extract the spoken-language filter
 * options from it.
 *
 * @returns {Promise<Array<{key: string, value: string}>>} Resolves with at
 *   least one option, where `key` is the link's label and `value` is the
 *   `spoken_language_code` found in the link's href. Rejects with the string
 *   'null' when no option was found, and with the underlying error when the
 *   request or the parsing fails.
 */
async function getNationality() {
  const response = await request.get(trending)
  const $ = cheerio.load(response.text)

  const list = []
  $('[data-filterable-for="text-filter-field-spoken-language"] a').each(
    (index, item) => {
      const anchor = $(item)
      const href = anchor.attr('href')
      if (!href) {
        return
      }
      // Read the parameter by name so that additional query parameters in
      // the href cannot end up in the value.
      const value = new URL(href, trending).searchParams.get(
        'spoken_language_code'
      )
      if (value === null) {
        return
      }
      list.push({
        key: anchor.find('> span').text().trim(),
        value,
      })
    }
  )

  if (list.length === 0) {
    // Kept as the string 'null' so callers see the same rejection value
    return Promise.reject('null')
  }
  return list
}

/**
 * Fetch the GitHub Trending page and extract the programming-language filter
 * options from it.
 *
 * @returns {Promise<Array<{key: string, value: string}>>} Resolves with at
 *   least one option, where `key` is the link's label and `value` is the
 *   last path segment of the link's href, exactly as written there (it is
 *   not decoded). Rejects with the string 'null' when no option was found,
 *   and with the underlying error when the request or the parsing fails.
 */
async function getLanguage() {
  const response = await request.get(trending)
  const $ = cheerio.load(response.text)

  const list = []
  $('#languages-menuitems > div > a').each((index, item) => {
    const anchor = $(item)
    const href = anchor.attr('href') || ''
    // Last path segment: the run of characters that ends right before the
    // query string, or at the end of the href when there is no query string.
    const segment = href.match(/[^\/?]+(?=\?|$)/)
    if (!segment) {
      return
    }
    list.push({
      key: anchor.find('> span').text().trim(),
      value: segment[0],
    })
  })

  if (list.length === 0) {
    // Kept as the string 'null' so callers see the same rejection value
    return Promise.reject('null')
  }
  return list
}

/**
 * Fetch the GitHub Trending page and extract the date-range filter options
 * from it.
 *
 * @returns {Promise<Array<{key: string, value: string}>>} Resolves with at
 *   least one option, where `key` is the link's label and `value` is the
 *   `since` parameter found in the link's href. Rejects with the string
 *   'null' when no option was found, and with the underlying error when the
 *   request or the parsing fails.
 */
async function getDate() {
  const response = await request.get(trending)
  const $ = cheerio.load(response.text)

  const list = []
  $('.select-menu-list > a.select-menu-item').each((index, item) => {
    const anchor = $(item)
    const href = anchor.attr('href')
    if (!href) {
      return
    }
    // Read the parameter by name so that additional query parameters in the
    // href cannot end up in the value.
    const value = new URL(href, trending).searchParams.get('since')
    if (value === null) {
      return
    }
    list.push({
      key: anchor.find('> span').text().trim(),
      value,
    })
  })

  if (list.length === 0) {
    // Kept as the string 'null' so callers see the same rejection value
    return Promise.reject('null')
  }
  return list
}

// Scraping helpers exposed to app.js
module.exports = { getDate, getLanguage, getNationality, getTrending }