nodejs爬虫实现

204 阅读1分钟

爬取图片

废话不说,直接上代码

  • app.js
// app.js
// 引入模块
// cheerio、iconv-lite、request 需要通过 npm 安装(fs 为 Node.js 内置模块)
const cheerio = require('cheerio');
const fs = require('fs');
var iconv = require('iconv-lite');
const request = require('request');
const Utils = require('./utils')
const baseUrl = 'https://pic.netbian.com'

// Reset the ./pic/ output directory (delete it if present, then recreate it)
// before the crawl starts, so each run begins with an empty folder.
Utils.deleteDir('./pic/')
/**
 * Crawl one listing page, then schedule the following page 20 seconds later.
 *
 * Page 1 lives at /4kmeinv/index.html; page N (N > 1) at /4kmeinv/index_N.html.
 * Crawling stops after the last page (58).
 *
 * @param {number} index - 1-based number of the listing page to fetch.
 */
function start(index) {
    const lastPage = 58
    const nextPageDelayMs = 20000
    // Numeric strings are still accepted, as the loose comparisons used to allow.
    const page = Number(index)
    // Non-integer input (e.g. NaN) can never reach the stop condition, so it
    // is rejected up front instead of re-scheduling itself forever.
    if (!Number.isInteger(page) || page > lastPage) return

    const pagePath = page === 1 ? '/4kmeinv/index.html' : `/4kmeinv/index_${page}.html`
    console.log(`------------第${page}页--------------`);
    requestSrc(baseUrl + pagePath)

    // Arm the timer only when a next page exists; otherwise a pending timer
    // would keep the process alive for another 20s with nothing left to do.
    if (page < lastPage) {
        setTimeout(() => start(page + 1), nextPageDelayMs)
    }
}

/**
 * Fetch one listing page and pass its HTML to findHref.
 *
 * Network errors and non-200 responses are logged rather than thrown, so a
 * single bad page does not stop the crawl.
 *
 * @param {string} url - Absolute URL of the listing page.
 * @returns {Promise<void>} Resolves once the response (or error) has been
 *   handled, so callers may await the page if they want to.
 */
const requestSrc = (url) => new Promise((resolve) => {
    request({
        url,
        method: 'GET',
        timeout: 5000
    }, function (err, res, body) {
        if (err) {
            console.error(err)
        } else if (res.statusCode === 200) {
            findHref(body)
        } else {
            // Previously dropped silently, which made blocked or missing
            // pages impossible to tell apart from empty ones.
            console.error(`Unexpected status ${res.statusCode} for ${url}`)
        }
        resolve()
    })
})

/**
 * Extract the detail-page link of every thumbnail on a listing page and
 * request each detail page.
 *
 * @param {string} body - HTML of a listing page.
 */
function findHref(body) {
    const $ = cheerio.load(body)
    $('.slist .clearfix a').each((index, dom) => {
        const href = $(dom).attr('href')
        // An anchor without an href would otherwise be requested as
        // "<baseUrl>undefined"; skip it and continue with the next anchor.
        if (!href) return
        requestHref(href)
    })
}

/**
 * Fetch a detail page and pass its decoded HTML to findImg, which extracts
 * the image and its title.
 *
 * Network errors and non-200 responses are logged, not thrown.
 *
 * @param {string} href - Detail-page link taken from a listing page.
 */
function requestHref(href) {
    // Resolve against the site root so that both root-relative and absolute
    // hrefs produce a valid URL (plain concatenation only handled the former).
    const url = new URL(href, baseUrl).href
    request({
        url,
        // Same limit as the listing-page request, so a stalled detail page
        // cannot hang indefinitely.
        timeout: 5000,
        // encoding: null makes `body` a raw Buffer so it can be decoded manually.
        encoding: null
    }, function (err, res, body) {
        if (err) {
            return console.error(err)
        }
        if (res.statusCode !== 200) {
            return console.error(`Unexpected status ${res.statusCode} for ${url}`)
        }
        // Decode the bytes as GB2312 to avoid garbled Chinese text.
        findImg(iconv.decode(body, 'gb2312'))
    })
}

/**
 * Locate the main image on a detail page and save it under a file name
 * built from the image title and the extension of its source path.
 *
 * @param {string} body - Decoded HTML of a detail page.
 */
function findImg(body) {
    const $ = cheerio.load(body)
    const dom = $('.photo-pic img')
    const src = dom.attr('src')
    if (!src) {
        // Without this guard `src.split` throws inside the request callback
        // and the uncaught exception terminates the whole crawl.
        console.error('No image found in ".photo-pic img"; page skipped')
        return
    }

    const extension = src.split('.').pop()
    const rawTitle = dom.attr('title')
    // Fall back to the source file name when the title attribute is missing,
    // instead of writing every such image to "undefined.<ext>".
    const fileName = rawTitle ? rawTitle + '.' + extension : src.split('/').pop()
    // Replace characters that are not allowed in file names (path separators
    // in particular would make the write stream fail to open).
    const title = fileName.replace(/[\\/:*?"<>|]/g, '_')
    saveImgFile(src, title)
}

/**
 * Download an image and stream it into ./pic/<title>.
 *
 * Download and write failures are logged; they no longer surface as
 * unhandled 'error' events.
 *
 * @param {string} src - Image path or URL taken from the detail page.
 * @param {string} title - File name (including extension) to save under.
 */
function saveImgFile(src, title) {
    const writeStream = fs.createWriteStream('./pic/' + title)
    // Resolve against the site root so absolute image URLs also work.
    const download = request(new URL(src, baseUrl).href)

    // A stream 'error' event with no listener is thrown and would kill the
    // process, so both ends of the pipe get a handler.
    download.on('error', function (err) {
        console.error(err)
        // Stop writing; 'finish' is then never emitted, so no false success log.
        writeStream.destroy()
    })
    writeStream.on('error', function (err) {
        console.error(err)
    })
    writeStream.on('finish', function () {
        console.log('文件写入成功:', title)
    })

    download.pipe(writeStream)
}

// Kick off the crawl at the first listing page.
start(1)

  • utils.js
const fs = require('fs')

/**
 * List the entries of a directory synchronously.
 *
 * @param {string} path - Directory to read.
 * @returns {string[]|false} The entry names, or `false` (after logging the
 *   error) when the directory cannot be read.
 */
function readDir(path) {
    // fs.readdirSync takes no callback: it returns the entries directly and
    // throws on failure, so the error has to be caught here.
    try {
        return fs.readdirSync(path)
    } catch (err) {
        console.error(err)
        return false
    }
}

/**
 * Empty a directory by deleting it with all of its contents and creating it
 * again. A directory that does not exist yet is simply created.
 *
 * @param {string} path - Directory to reset; a trailing separator is optional.
 */
function deleteDir(path) {
    // Recursive removal handles nested directories and does not depend on
    // `path` ending with a separator; `force` ignores a missing directory.
    fs.rmSync(path, { recursive: true, force: true })
    // Recreate the folder so callers always end up with an empty directory.
    fs.mkdirSync(path)
}

// Public API of utils.js: synchronous directory helpers.
module.exports = {
    readDir,
    deleteDir
}

结果展示

image.png