本文将实现用Puppeteer爬取网页然后将爬取内容生成截图或填充到PDF
先看下通过爬虫爬取实现的效果
首先感谢这篇文章提供的思路和方法 如何使用 Node.js 和 puppeteer 抓取网页
Puppeteer文档API
知识背景
Puppeteer 是一个 Node 库,它提供了一套高级 API,通过 DevTools 协议控制 Chromium 或 Chrome。Puppeteer 默认以 headless(无头)模式运行,但也可以运行“有头”模式:在创建 browser 时传入配置项(例如 puppeteer.launch({ headless: false }))即可控制是否使用无头模式
[案例1] 先动手实现一个 5行核心代码实现自动打开浏览器 输入网站并生成截图 的小案例,展示下Puppeteer的能力;
1.新建目录 puppet
2.新建 puppet/package.json
{
"name": "yg-puppeteer-demo",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "MES",
"license": "ISC",
"dependencies": {
"axios": "^1.3.4",
"puppeteer": "^14.1.2",
"request": "^2.88.2"
}
}
3.新建 puppet/index.js
const puppeteer = require('puppeteer');
// Opens https://baidu.com in headless Chromium and saves a screenshot to
// baidu.png in the current working directory.
(async () => {
  // The five core lines: launch, open a page, navigate, screenshot, close.
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://baidu.com');
    await page.screenshot({ path: 'baidu.png' });
  } finally {
    // Always shut Chromium down, even when navigation or the screenshot
    // fails; otherwise the child process is left running.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});
- npm install 安装依赖
- node index.js 运行后在根目录会自动生成baidu.png图片(根据打开URL生成的截图)
可以看到5行代码实现 模拟浏览器打开指定页面并生成截图留存成功。
这种无感的 node 交互看不见、不够直观?如果想在本地浏览器(谷歌)上实时观看运行过程,请使用以下方法;静默执行请略过此步。
1.右键点击谷歌浏览器点击属性
2.在目标栏中后面先加一个空格,然后加--remote-debugging-port=9222,点击应用
3.关闭你所有的谷歌浏览器(很重要:所有的),重新启动,访问 http://localhost:9222/json/version
4.如果有数据,说明配置成功了,如果无,请百度
5.打开浏览器,运行程序!
打开浏览器 输入:http://localhost:9222/json/version 测试配置是否成功
注意:一定要关闭所有浏览器然后回到桌面,双击刚才那个加了后缀的谷歌浏览器打开(千万不要在任务栏或开始菜单的快捷方式打开会报错)
在 Chrome 中打开该地址,如果页面返回了 JSON 数据,说明配置成功
可视环境搭建完毕,我们改造下 index.js 复制一份重命名为 indexShow.js
// yg puppeteer 爬虫演示demo
const puppeteer = require('puppeteer');
const request = require("request");
// Attaches to a Chrome instance that was started with
// --remote-debugging-port=9222, opens https://baidu.com in a new tab and
// saves a screenshot to baidu.png.
(async () => {
  // Ask the running Chrome for its DevTools metadata; the response carries
  // the WebSocket endpoint puppeteer needs in order to connect.
  const wsKey = await new Promise((resolve, reject) => {
    request({ url: 'http://localhost:9222/json/version', method: "get" }, (err, response) => {
      if (err) {
        // Without this the promise would never settle when Chrome is not
        // listening on the debugging port, and the script would hang.
        reject(err);
        return;
      }
      try {
        resolve(JSON.parse(response.body));
      } catch (parseErr) {
        reject(parseErr);
      }
    });
  });
  console.log(wsKey);

  const browser = await puppeteer.connect({
    browserWSEndpoint: wsKey.webSocketDebuggerUrl,
    defaultViewport: null
  });
  try {
    const page = await browser.newPage();
    // setViewport is asynchronous; wait for it before navigating.
    await page.setViewport({
      width: 1920,
      height: 1080,
      deviceScaleFactor: 2,
    });
    await page.goto('https://baidu.com');
    await page.screenshot({ path: 'baidu.png' });
    console.log('yg puppeteer 爬虫演示带视窗demo执行成功!');
    await browser.close();
  } catch (err) {
    // On failure only drop the DevTools connection so Node can exit.
    await browser.disconnect();
    throw err;
  }
})().catch((err) => {
  // Report the failure instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});
自动打开浏览器输入网址然后截图留存,实现窗口可视调试更直观。
[案例2]爬取网页生成PDF
const puppeteer = require('puppeteer');
// Renders https://www.baidu.com in headless Chromium and writes it to hn.pdf
// as an A4 document.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // networkidle2 is passed so the page has mostly finished loading before
    // it is printed.
    await page.goto('https://www.baidu.com', { waitUntil: 'networkidle2' });
    await page.pdf({ path: 'hn.pdf', format: 'A4' });
  } finally {
    // Always shut Chromium down, even when navigation or printing fails.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});
[案例3]抓取网页中标题和URL
const puppeteer = require('puppeteer');
// Blog whose post list is scraped.
const URL = 'https://coding.napolux.com';

// Scrapes the title, link and link text of every post heading on URL and
// prints one line per post. Exits with status 0 on success and 1 on failure.
puppeteer.launch({ headless: true, args: ['--no-sandbox', '--disable-setuid-sandbox'] }).then(async browser => {
  let result;
  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 320, height: 600 });
    await page.setUserAgent('Mozilla/5.0 (iPhone; CPU iPhone OS 9_0_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13A404 Safari/601.1');
    // networkidle0: navigation counts as finished once there have been no
    // network connections for at least 500 ms.
    // https://www.w3cschool.cn/puppeteer/puppeteer-rip537tj.html
    await page.goto(URL, { waitUntil: 'networkidle0' });
    // Wait until <body class="blog"> is present in the DOM.
    await page.waitForSelector('body.blog');
    // Inject jQuery so the in-page callback below can use `$`.
    await page.addScriptTag({ url: 'https://code.jquery.com/jquery-3.2.1.min.js' });
    // Runs inside the page. An exception thrown here rejects the evaluate()
    // promise, so no in-page try/catch is needed (the previous one called an
    // undefined `reject`, which hid the real error behind a ReferenceError).
    result = await page.evaluate(() => {
      const data = [];
      $('h3.loop__post-title').each(function () {
        const link = $(this).find('a');
        data.push({
          'title': link.attr('title'),
          'url': link.attr('href'),
          'text': link.text()
        });
      });
      return data;
    });
  } finally {
    // Close the browser whether or not scraping succeeded.
    await browser.close();
  }
  // Log the scraped posts.
  for (const post of result) {
    console.log('Post: ' + post.title + ' URL: ' + post.url + 'Text: ' + post.text);
  }
  process.exit();
}).catch(function (error) {
  // Include the error itself so failures can be diagnosed, and signal the
  // failure through a non-zero exit status.
  console.error('No way Paco!', error);
  process.exit(1);
});
[案例4] 小试牛刀--爬取本文作者博客首页文章试试。【注意:请合法合理使用,NOZUONODIE】
const puppeteer = require('puppeteer');
// Profile page whose article list is scraped.
const URL = 'https://juejin.cn/user/501800125607752';

// Prints the profile's user name, the article links, and the title and
// excerpt of every listed article. Exits with status 0 on success and 1 on
// failure.
puppeteer.launch({ headless: true, args: ['--no-sandbox', '--disable-setuid-sandbox'] }).then(async browser => {
  let result;
  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 320, height: 600 });
    await page.setUserAgent('Mozilla/5.0 (iPhone; CPU iPhone OS 9_0_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13A404 Safari/601.1');
    // networkidle0: navigation counts as finished once there have been no
    // network connections for at least 500 ms.
    // https://www.w3cschool.cn/puppeteer/puppeteer-rip537tj.html
    await page.goto(URL, { waitUntil: 'networkidle0' });
    // Wait for the application root and for at least one article link.
    await page.waitForSelector('body #__nuxt');
    await page.waitForSelector('.activity-list-box .activity-list a.post-link');
    // Inject jQuery so the in-page callback in step 3 can use `$`.
    await page.addScriptTag({ url: 'https://code.jquery.com/jquery-3.2.1.min.js' });

    // 1. Read the user name.
    const username = await page.$eval('.info-box .user-name', el => el.innerText);
    console.log('用户名:', username);

    // 2. Collect the article URLs shown on the profile page.
    const urls = await page.$$eval('.activity-list-box .activity-list a.post-link', links =>
      links.map(link => link.href)
    );
    console.log('文章URL:', urls);

    // 3. Collect the title and excerpt of every article. Runs inside the
    // page; an exception thrown here rejects the evaluate() promise, so no
    // in-page try/catch is needed (the previous one called an undefined
    // `reject`, which hid the real error behind a ReferenceError).
    result = await page.evaluate(() => {
      const data = [];
      $('.list-body .activity-list-box .activity-list').each(function () {
        const item = $(this);
        data.push({
          'title': item.find('h3').text().trim(),
          'excerpt': item.find('.post-item-excerpt').text().trim()
        });
      });
      return data;
    });
  } finally {
    // Close the browser whether or not scraping succeeded.
    await browser.close();
  }
  // Log the scraped articles.
  for (const article of result) {
    console.log(' TITLE: ' + article.title + ' EXCEPRPT: ' + article.excerpt);
  }
  process.exit();
}).catch(function (error) {
  console.error('No way yg!', error);
  // Signal the failure through a non-zero exit status.
  process.exit(1);
});
业务延伸
需求:每天7点将A站登录后打开B页截图后发送给C邮箱。
const puppeteer = require("puppeteer");
const fs = require("fs");
const request = require("request");
// Endpoint the screenshots are uploaded to.
const urlIp = "http://192.168.3.103/apis/pmp/board/upload";

// Login accounts, one per department, listed as [userName, dpetid].
// Every account uses the same password, so it is filled in while the
// records are built.
const userData = [
    ["luzhiyong", "1"],      // software engineering
    ["chenshengqing", "3"],  // quality department
    ["pengjianping", "4"],   // PMC
    ["pengchaoliang", "5"],  // purchasing
    ["yichao", "8"],         // electrical
    ["yangweiping", "10"],   // production / automated assembly department
].map(([userName, dpetid]) => ({ userName, passWord: "123456", dpetid }));

// Index into userData of the account processed next.
let userDataIndex = 0;
// Shared puppeteer browser handle, assigned by the functions below.
let browser;
// Parsed response of Chrome's /json/version endpoint, assigned by the functions below.
let wsKey;
/**
 * Department screenshot step.
 *
 * Logs in as userData[userDataIndex] through the Chrome instance listening on
 * debugging port 9222, screenshots the ".app-main" element to
 * "<dpetid>.png", uploads the image as a base64 data URL to urlIp, then
 * advances userDataIndex and processes the next account. Once every account
 * has been processed the index is reset and getjimu() is run instead.
 *
 * Reads and writes the module-level userDataIndex, wsKey and browser.
 *
 * @returns {Promise<void>} settles after the remaining accounts and the final
 *   getjimu() call have been processed; rejects when connecting, logging in
 *   or taking the screenshot fails.
 */
async function getBase64() {
    if (userDataIndex >= userData.length) {
        userDataIndex = 0;
        console.log("重置:", userDataIndex);
        // All departments done: capture the report page as the last step.
        await getjimu();
        return;
    }
    const account = userData[userDataIndex];

    // Ask the running Chrome for its DevTools WebSocket endpoint. Reject on
    // a request or parse error so the caller is not left waiting forever.
    wsKey = await new Promise((resolve, reject) => {
        request({ url: 'http://localhost:9222/json/version', method: "get" }, (err, response) => {
            if (err) {
                reject(err);
                return;
            }
            try {
                resolve(JSON.parse(response.body));
            } catch (parseErr) {
                reject(parseErr);
            }
        });
    });
    console.log(wsKey);
    browser = await puppeteer.connect({
        browserWSEndpoint: wsKey.webSocketDebuggerUrl,
        defaultViewport: null
    });

    let page;
    try {
        page = await browser.newPage();
        // setViewport is asynchronous; wait for it before navigating.
        await page.setViewport({
            width: 1920,
            height: 1080,
            deviceScaleFactor: 2,
        });
        console.log(3);
        await page.goto("http://192.168.1.181");
        await page.type(".login-from-cont>div:nth-of-type(1) .el-input__inner", account.userName);
        await page.type(".login-from-cont>div:nth-of-type(2) .el-input__inner", account.passWord);
        await page.type(".login-from-cont>div:nth-of-type(3) .el-input__inner", "0");
        await page.click(".login_btn", { delay: 1500 });
        // Best-effort click: the element may be absent, which is not an error.
        await page.click(".xuyujie", { delay: 1500 }).catch(() => { });
        // Keep clicking the login button until the post-login layout
        // (".hamburger-container") can be found, or until the button itself
        // can no longer be clicked.
        let layout = await page.$eval(".hamburger-container", e => e).catch(() => null);
        while (layout == null) {
            layout = await page.$eval(".hamburger-container", e => e).catch(() => null);
            await page.click(".login_btn", { delay: 1500 }).catch(async () => {
                layout = true;
            });
        }
        // NOTE(review): delay is the time between mouse-down and mouse-up, so
        // this click takes 15 s; presumably it doubles as a wait for the
        // dashboard to render — confirm.
        await page.click(".hamburger-container", { delay: 15000 });
        // Element screenshot.
        const element = await page.$(".app-main");
        await element.screenshot({ path: `${account.dpetid}.png` });
        // Drop the session cookies so the next account starts logged out.
        await page.deleteCookie({ name: "Admin-Token" });
        await page.deleteCookie({ name: "sidebarStatus" });
    } finally {
        // Close the tab and drop the DevTools connection on every run;
        // otherwise each department leaves a tab and a socket behind.
        if (page) {
            await page.close().catch((err) => console.error("关闭页面失败:", err));
        }
        await browser.disconnect();
    }

    // Read the screenshot once and upload it as a base64 data URL.
    let bitmap = null;
    try {
        bitmap = await fs.promises.readFile(`./${account.dpetid}.png`);
    } catch (err) {
        console.error(`${account.dpetid}.png 读取失败:`, err);
    }
    if (bitmap !== null) {
        const base64Png = `data:image/png;base64,${Buffer.from(bitmap).toString('base64')}`;
        const response = await new Promise((resolve) => {
            request({
                url: urlIp,
                method: "post",
                json: true,
                body: {
                    departType: account.dpetid,
                    baseImg: base64Png
                }
            }, (error, res) => {
                if (error) {
                    console.error(`${account.dpetid}.png 上传失败:`, error);
                }
                resolve(res);
            });
        });
        if (response) {
            console.log(response.body);
        }
    }

    // Move on to the next department whether or not the upload succeeded.
    userDataIndex++;
    console.log("++", userDataIndex);
    return getBase64();
}
// Scheduler settings passed to timerTask at start-up.
const config = {
    // Time of day ("HH:MM:SS") at which the task runs.
    time: "07:00:00",
    // Number of days between two runs.
    interval: 1,
    // Whether to also run the task once immediately at start-up.
    runNow: true,
};
/**
 * Scheduler: runs cleanLog() at a fixed time of day.
 *
 * @param {{time: string, interval: number, runNow: boolean}} config
 *   time     - "HH:MM:SS" time of day of the first scheduled run
 *   interval - number of days between runs after the first one
 *   runNow   - when truthy, cleanLog() is also called immediately
 */
function timerTask(config) {
    const DAY_MS = 24 * 60 * 60 * 1000;

    // Optional immediate run.
    if (config.runNow) {
        cleanLog();
    }

    // Timestamp of today's occurrence of config.time; when that moment has
    // already passed, use tomorrow's occurrence instead.
    const startedAt = new Date().getTime();
    const clockParts = config.time.split(":").map((part) => parseInt(part));
    let nextRun = new Date().setHours(...clockParts);
    if (nextRun <= startedAt) {
        nextRun += DAY_MS;
    }

    // Wait until the first scheduled run, then repeat every `interval` days.
    setTimeout(() => {
        cleanLog();
        const repeatEvery = config.interval * 24 * 60 * 60 * 1000;
        setInterval(() => {
            cleanLog();
        }, repeatEvery);
    }, nextRun - startedAt);
}
/**
 * Entry point invoked by the scheduler; runs the screenshot job.
 *
 * Despite its name, this function does not clean any logs: it only calls
 * getBase64(). The timestamp string that used to be assembled here was never
 * used and has been removed.
 *
 * The scheduler calls this function without awaiting it, so a failure is
 * logged here rather than escaping as an unhandled promise rejection.
 *
 * @returns {Promise<void>} resolves once getBase64() has settled; never rejects.
 */
async function cleanLog() {
    try {
        // The task to execute goes here.
        await getBase64();
    } catch (err) {
        console.error("定时任务执行失败:", err);
    }
}
/**
 * Captures the report page.
 *
 * Launches a headless browser, opens the report URL, inserts the current
 * date as a text node before the "#app" element, takes a full-page
 * screenshot to jimu_renli.png, uploads it as a base64 data URL to urlIp
 * with departType "77", and finally deletes the local file.
 *
 * A browser is launched here; connecting to the Chrome instance on debugging
 * port 9222 would be the alternative when the run should be watched live.
 * The browser handle is kept local and no longer assigned to the shared
 * module-level `browser` variable.
 *
 * @returns {Promise<void>} settles after the upload attempt and the file
 *   clean-up; rejects when launching, navigating or the screenshot fails.
 */
async function getjimu() {
    const reportBrowser = await puppeteer.launch({ slowMo: 1000 });
    try {
        const page = await reportBrowser.newPage();
        // setViewport is asynchronous; wait for it before navigating.
        await page.setViewport({
            width: 1920,
            height: 1080,
            deviceScaleFactor: 2,
        });
        await page.goto("http://192.168.0.171:4002/jmreport/view/691791748505595904");
        // Stamp the page with the capture time before taking the screenshot.
        await page.evaluate(() => {
            const stamp = document.createTextNode(new Date());
            document.body.insertBefore(stamp, document.getElementById("app"));
        });
        // `maxHeight` is not a screenshot option and had no effect, so it is
        // no longer passed.
        await page.screenshot({ path: `jimu_renli.png`, fullPage: true });
    } finally {
        // The screenshot is on disk (or the run failed); either way the
        // browser is no longer needed and must not be left running.
        await reportBrowser.close();
    }

    // Read the screenshot once and upload it as a base64 data URL.
    let bitmap;
    try {
        bitmap = await fs.promises.readFile(`./jimu_renli.png`);
    } catch (err) {
        console.error("jimu_renli.png 读取失败:", err);
        return;
    }
    const base64Png = `data:image/png;base64,${Buffer.from(bitmap).toString('base64')}`;
    await new Promise((resolve) => {
        request({
            url: urlIp,
            method: "post",
            json: true,
            body: {
                departType: "77",
                baseImg: base64Png
            }
        }, (error, response) => {
            if (error) {
                console.error("jimu_renli.png 上传失败:", error);
            }
            if (response) {
                console.log(response.body);
            }
            resolve();
        });
    });

    // Remove the local copy now that the upload attempt has finished.
    try {
        await fs.promises.unlink(`./jimu_renli.png`);
        console.log(`jimu_renli删除成功了`+new Date());
    } catch (err) {
        console.error("jimu_renli.png 删除失败:", err);
    }
}
// Start-up: log a banner and hand the settings object to the scheduler.
console.log("程序启动了!")
timerTask(config)
这里 Puppeteer 每天固定在七点把生成的截图通过 API 接口上传,后台程序八点再取出图片并发送邮件。
结束!