Nodejs 爬取素材
为什么是
Puppeteer?
后续会做表单自动提交功能,先踩踩坑.
步骤
- 观察目标
- 寻找规律
- 代码实现
背景交代
个人项目需要一些素材
浏览网站时,发现每日一句板块很有意思。
依赖:Nodejs + Puppeteer + Mongodb
安装相关依赖
略...
内容爬取
- 获取双语句子
- 获取好看的图片
分析页面结构
- 切换卡片时候地址栏只是时间在变化
- 往前推算时间,无法确定此模块是何时上线的
- 继续浏览发现有个列表页,包含每日入口
// getDetail.js
// Opens one iciba "daily sentence" detail page and logs its English/Chinese sentence pair.
!(async () => {
  const puppeteer = require('puppeteer')

  // Version-independent fixed delay (page.waitFor is deprecated in newer Puppeteer releases).
  const sleep = ms => new Promise(resolve => setTimeout(resolve, ms))

  const url = `http://news.iciba.com/views/dailysentence/daily.html#!/detail/title/2018-11-11`

  let browser
  try {
    browser = await puppeteer.launch({
      headless: false,
      timeout: 10 * 60 * 1000,
      defaultViewport: {
        width: 1366,
        height: 768
      }
    })
    const page = await browser.newPage()

    // Once interception is enabled every request stalls until it is explicitly
    // continued or aborted, so a handler must be registered or goto() never finishes.
    await page.setRequestInterception(true)
    page.on('request', request => {
      if (request.url().includes('google.com')) {
        // Requests to google.com were observed to hang until the navigation timed out.
        request.abort()
      } else {
        request.continue()
      }
    })

    await page.goto(url, {
      timeout: 10 * 60 * 1000,
      waitUntil: 'networkidle0'
    })
    // Extra settle time for the client-side rendered card.
    await sleep(3 * 1000)

    const { zhCn, enUs } = await page.evaluate(() => {
      // Runs inside the page: only browser globals are visible here.
      const enNode = document.querySelector('.detail-content .detail-content-en')
      const zhNode = document.querySelector('.detail-content .detail-content-zh')
      if (!enNode || !zhNode) {
        throw new Error('daily sentence nodes not found on the page')
      }
      return { zhCn: zhNode.innerText, enUs: enNode.innerText }
    })

    const data = {
      source: 'iciba',
      zhCn,
      enUs
    }
    console.log(data)
  } catch (err) {
    console.log(err)
  } finally {
    // Always release the browser, even when launch succeeded but scraping failed.
    if (browser) {
      await browser.close()
    }
  }
})()
全部列表
- 列表入口
// getLinks.js
// Walks the paginated iciba history list and logs the detail link of every entry found.
!(async () => {
  const puppeteer = require('puppeteer')

  // Version-independent fixed delay (page.waitFor is deprecated in newer Puppeteer releases).
  const sleep = ms => new Promise(resolve => setTimeout(resolve, ms))

  // total and step could also be scraped from the page itself.
  const conf = {
    total: 500, // total number of entries shown on the page; could also be read from the DOM (.sort_btn>p span)
    step: 8, // number of entries per page
    uri: 'http://news.iciba.com/appv3/wwwroot/ds.php?action=history'
  }
  // Example: http://news.iciba.com/appv3/wwwroot/ds.php?action=history&order=2&page=1
  /**
   * order - sort mode
   *   2 sort by time
   *   1 sort by popularity
   * page  - page number
   */

  let browser
  try {
    browser = await puppeteer.launch({
      headless: false,
      timeout: 10 * 60 * 1000,
      defaultViewport: {
        width: 1366,
        height: 768
      }
    })
    const page = await browser.newPage()

    // Once interception is enabled every request stalls until it is explicitly
    // continued or aborted, so a handler must be registered or goto() never finishes.
    await page.setRequestInterception(true)
    page.on('request', request => {
      if (request.url().includes('google.com')) {
        // Requests to google.com were observed to hang until the navigation timed out.
        request.abort()
      } else {
        request.continue()
      }
    })

    /**
     * Collect the entry links of one list page.
     * @param {string} url - list page URL
     * @returns {Promise<string[]>} links found; an empty array when the page fails,
     *   so one bad page does not abort the whole crawl
     */
    const getPageLinks = async url => {
      try {
        await page.goto(url, {
          waitUntil: 'networkidle0'
        })
        await sleep(3 * 1000)
        return await page.evaluate(() => {
          // Runs inside the page; this console.log prints to the browser console, not to Node.
          const pageLinks = []
          const items = document.querySelectorAll('#content .main .sort_list')
          items.forEach((item, index) => {
            console.log('item==>', index)
            const a = item.querySelector('.c_l_m_en>a')
            if (a) {
              pageLinks.push(a.href)
            }
          })
          return pageLinks
        })
      } catch (err) {
        console.log(err)
        return []
      }
    }

    /**
     * Collect the links of every list page, one page at a time.
     * @returns {Promise<string[]>} all links in page order
     */
    const getLinks = async () => {
      const pages = Math.ceil(conf.total / conf.step)
      const arrLinks = []
      for (let index = 1; index <= pages; index++) {
        const url = `${conf.uri}&order=2&page=${index}`
        const startTime = new Date().getTime()
        const links = await getPageLinks(url)
        arrLinks.push(...links)
        const endTime = new Date().getTime()
        // Per-page timing in milliseconds.
        console.log(`${url}:${endTime - startTime}`)
      }
      return arrLinks
    }

    console.log(await getLinks())
  } catch (err) {
    console.log(err)
  } finally {
    // Always release the browser, even when the crawl failed part-way.
    if (browser) {
      await browser.close()
    }
  }
})()
获取全部的图文卡片
- 逐个获取 getLinks 的值
图片爬取
- 直接获取使用上面的图片
- 批量爬取别的网站图片
目标网址(图片无版权)
- pixabay
- 获取全部图片链接
// getAllLinks.js
// Crawls the first pages of the pixabay image search and stores every image URL found.
const fs = require('fs')
const path = require('path')
const puppeteer = require('puppeteer')

const config = {
  baseUrl: 'https://pixabay.com/images/search/',
  limit: 5 // only crawl the first 5 result pages
}

// Version-independent fixed delay (page.waitFor is deprecated in newer Puppeteer releases).
const sleep = ms => new Promise(resolve => setTimeout(resolve, ms))

// Promise wrappers around the callback fs API; each settles exactly once.
const appendFileAsync = (file, data) =>
  new Promise((resolve, reject) => {
    fs.appendFile(file, data, err => (err ? reject(err) : resolve()))
  })
const writeFileAsync = (file, data) =>
  new Promise((resolve, reject) => {
    fs.writeFile(file, data, err => (err ? reject(err) : resolve()))
  })

/**
 * Collect the image URLs shown on one search result page.
 * @param {string} url - search result page URL
 * @param {object} page - Puppeteer page used for navigation
 * @returns {Promise<string[]>} image URLs with the '__340' size marker replaced by '_960_720'
 */
const getPageImgs = async (url, page) => {
  await page.goto(url, {
    timeout: 60 * 1000,
    waitUntil: 'networkidle0'
  })
  await sleep(3 * 1000)
  const { srcList } = await page.evaluate(() => {
    // Runs inside the page and relies on the page exposing jQuery as window.$.
    const srcList = []
    const $ = window.$
    const $items = $('.flex_grid.credits.search_results .item')
    $.each($items, (i, item) => {
      const $img = $(item).find('a img')
      // Prefer the data-lazy attribute when present, otherwise fall back to src.
      const _src = $img.attr('data-lazy') || $img.attr('src')
      // Skip items without an image URL instead of throwing on undefined.
      if (_src) {
        srcList.push(`${_src.replace('__340', '_960_720')}`)
      }
    })
    return { srcList }
  })
  return srcList
}

/**
 * Crawl config.limit pages, checkpoint each page to per_page_links.json and
 * write the combined list to total_links.json.
 */
const run = async () => {
  const startTime = new Date().getTime()
  const browser = await puppeteer.launch({
    headless: false,
    timeout: 30000,
    defaultViewport: {
      width: 1366,
      height: 768
    }
  })
  const srcList = []
  try {
    const page = await browser.newPage()
    await page.setRequestInterception(true)
    page.on('request', request => {
      const url = request.url()
      if (url.includes('google.com')) {
        request.abort()
      } else {
        request.continue()
      }
    })

    for (let index = 1; index <= config.limit; index++) {
      const _arr = await getPageImgs(`${config.baseUrl}?pagi=${index}`, page)
      // Checkpoint every page so an interrupted crawl can resume from the failed page.
      // One JSON object per line keeps the appended file parseable.
      await appendFileAsync(
        path.join(__dirname, './per_page_links.json'),
        `${JSON.stringify({
          page: index,
          total: _arr.length,
          list: _arr
        })}\n`
      )
      console.log(`page:${index}---写入成功`)
      srcList.push(..._arr)
    }
    console.log(srcList)
  } finally {
    // Close the browser even when a page fails, so no Chromium process is leaked.
    await browser.close()
  }

  const endTime = new Date().getTime()
  console.log(`耗时 ${(endTime - startTime) / 1000}s`)

  await writeFileAsync(
    path.join(__dirname, './total_links.json'),
    JSON.stringify({
      total: srcList.length,
      list: srcList
    })
  )
}

run().catch(err => {
  console.log(err)
  process.exitCode = 1
})

// - Download all images (next section of the article)
// downAllImgs.js
// Downloads every URL in arrLinks to disk, pausing a random interval between requests.
// NOTE(review): axios, fs, uuid and savePath are referenced below but not defined in
// this excerpt — presumably declared earlier in the file; confirm.

// Resolve after `ms` milliseconds.
const sleep = ms => new Promise(resolve => setTimeout(resolve, ms))

/**
 * Download one image to the local disk.
 * Resolves only after the file has been completely written, rejects on any stream error.
 * @param {string} url - image URL
 * @param {string} fullPath - destination file path
 */
const downloadImage = async (url, fullPath) => {
  const res = await axios({
    method: 'get',
    responseType: 'stream',
    url
  })
  await new Promise((resolve, reject) => {
    const writer = fs.createWriteStream(fullPath)
    res.data.on('error', reject)
    writer.on('error', reject)
    writer.on('finish', resolve)
    res.data.pipe(writer)
  })
}

const arrLinks = [
  'https://cdn.pixabay.com/photo/2019/06/29/16/28/mouse-4306520_960_720.jpg',
  'https://cdn.pixabay.com/photo/2019/06/29/18/51/rails-4306770_960_720.jpg',
  'https://cdn.pixabay.com/photo/2019/06/29/09/42/barley-4305844_960_720.jpg',
  'https://cdn.pixabay.com/photo/2019/06/30/08/30/frog-4307564_960_720.jpg',
  'https://cdn.pixabay.com/photo/2019/06/25/13/06/fiat-4298163_960_720.jpg',
  'https://cdn.pixabay.com/photo/2019/07/01/11/23/dog-4309752_960_720.jpg',
  'https://cdn.pixabay.com/photo/2019/06/25/18/50/sunflower-4298808_960_720.jpg'
]
// The original also sketched loading the list from './total_links.js' instead of the inline sample.

// Index of the first link to download. Raise it to resume an interrupted run;
// 0 downloads the whole list (a hard-coded 1380 skipped every entry of the sample list).
const startIndex = 0

/**
 * Download the links one by one, then exit the process.
 */
const run = async () => {
  const pending = arrLinks.slice(startIndex)
  const len = pending.length
  for (let index = 0; index < len; index++) {
    const src = pending[index]
    // The file extension is whatever follows the '_960_720' size marker in the URL.
    await downloadImage(src, `${savePath}/${uuid()}${src.split('_960_720')[1]}`)
    console.log(`${index + 1}/${len}`)
    // Random pause of up to 10s so the requests are not flagged as abusive crawling.
    await sleep(Math.floor(Math.random() * 10 * 1000))
  }
  process.exit()
}

run().catch(err => {
  console.log(err)
  process.exit(1)
})
数据持久化
- 内容文字存入 Mongodb
- 图片内容存入七牛云
踩坑注意
- 请求进行时总是抛出 timeout 超时
  - 增大 timeout 设置,但是还会有网络或链接情况影响
  - 添加请求代码手动分析猜测
// Printing each request URL showed the console stalling right after one particular URL,
// followed by a timeout error. Requests to google.com are therefore aborted and
// everything else is allowed through.
page.on('request', req => {
  const target = req.url()
  if (!target.includes('google.com')) {
    req.continue()
    return
  }
  req.abort()
})
- 增大
- 注意 page.evaluate() 作用域
- 性能优化,注意 Puppeteer browser 启动项 args: [ '--disable-gpu', '--disable-dev-shm-usage', '--disable-setuid-sandbox', '--no-first-run', '--no-sandbox', '--no-zygote', '--single-process' ]
参考资料
留坑
linux安装部署puppeteer项目