以下代码通过 Node 后端实现请求并逆向解析数据;如需运行,请自行补充 egg.js 和 TypeScript 环境。
以下代码仅供学习参考,切勿用于违法用途。
/**
 * Fetches one news-list page from the portal's DWR endpoint and responds with
 * the parsed entries.
 *
 * Steps: obtain a script session id, POST the DWR call parameters with
 * browser-like headers, strip the DWR/XML wrapper from the reply, parse the
 * remainder with JSDOM and turn every `<i>` element into `{ url, content }`.
 * The resulting array is assigned to `ctx.body`.
 */
public async index() {
  const _origScriptSessionId = "_scriptSessionId关键key";
  // Passing null asks the helper to derive a fresh id from the original key.
  const _scriptSessionId = await this._getScriptSessionId(null, _origScriptSessionId);

  // Pages that can be requested; the index below selects which one is crawled.
  const currentUrl = ['010212.html', '01021201.html', "01021202.html"];
  const page = currentUrl[2];

  // Browser-like request headers.
  // NOTE(review): "Content-Length" is hard-coded although the body varies with
  // the page and session id — confirm the HTTP client overrides it.
  const headers = {
    "Host": "学校官网地址",
    "Connection": "keep-alive",
    "Content-Length": "306",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
    "DNT": "1",
    "Content-Type": "text/plain",
    "Accept": "*/*",
    "Origin": "学校官网地址",
    "Referer": `学校官网地址/newkm/colunm/${page}`,
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    // The server does not appear to validate the cookie value, presumably
    // because the content is public.
    "Cookie": "JSESSIONID=BB598025E17A847357963806D340B29F"
  };

  // DWR call parameters.
  const data = {
    "callCount": "1",
    "page": `/newkm/colunm/${page}`,
    "httpSessionId": "",
    "scriptSessionId": `${_scriptSessionId}`,
    "c0-scriptName": "portalAjax",
    "c0-methodName": "getNewsXml",
    "c0-id": "0",
    "c0-param0": "string:0102",
    "c0-param1": `string:${page.split('.html')[0]}`,
    "c0-param2": "string:news_",
    "c0-param3": "number:20",
    "c0-param4": "number:1",
    "c0-param5": "null:null",
    "batchId": "0"
  };

  const { ctx } = this;
  const url = '学校官网/newkm/dwr/call/plaincall/portalAjax.getNewsXml.dwr';
  // NOTE(review): the contentType option differs from the "Content-Type"
  // header above — confirm which one the client actually sends.
  const result = await ctx.curl(url, {
    method: 'POST',
    contentType: "text/html; charset=UTF-8",
    data,
    headers
  });

  // Strip the DWR reply wrapper and the XML prologue so only the item markup
  // remains. Each call removes the first occurrence only.
  let str = result.data.toString();
  str = str.replace("//#DWR-INSERT", '');
  str = str.replace("//#DWR-REPLY", "");
  str = str.replace(`dwr.engine._remoteHandleCallback('2','0',"`, "");
  str = str.replace(`");`, "");
  str = str.replace(`<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<s><n>4051</n>`, "");
  str = str.replace(`</s>`, "");

  const dom = new JSDOM(str);
  const t = dom.window.document.querySelectorAll("i");

  // Collected entries.
  const arr: Array<{ url: string; content: string }> = [];
  t.forEach((item) => {
    // Text before the first ".html" is the link path; the segment after it
    // carries the escaped title plus a date.
    const parts = (item.textContent ?? "").split(".html");
    const path = parts[0];
    const rest = parts[1];
    // Skip elements without a ".html" marker instead of crashing on an
    // undefined segment or emitting an "undefined.html" link.
    if (path === undefined || rest === undefined) {
      return;
    }
    // Drop dates such as 2021-8-1 and decode the escaped title.
    const title = rest.replace(/\d{4}-\d{1,2}-\d{1,2}/g, "");
    arr.push({
      url: path + '.html',
      content: this.reconvert(title)
    });
  });
  ctx.body = arr;
}
// Decoder for escaped text: turns \uXXXX escapes and numeric character references back into characters
/**
 * Decodes escaped character sequences in `str` back into real characters.
 *
 * Three forms are handled, in this order:
 *  1. `\uXXXX` — backslash, `u`, then 1–4 hex digits (UTF-16 code unit, so
 *     surrogate pairs written as two escapes combine correctly);
 *  2. `&#xHHHH;` — hexadecimal numeric character reference (1–6 hex digits);
 *  3. `&#DDDD;` — decimal numeric character reference (1–6 digits).
 *
 * Text that matches none of these forms is returned unchanged.
 *
 * @param str Text that may contain escaped characters.
 * @returns The text with every recognised escape replaced by its character.
 */
public reconvert(str: string): string {
  // Numeric references may address code points above 0xFFFF, so they need
  // fromCodePoint; values beyond the Unicode range are left as written
  // because fromCodePoint would throw on them.
  const decodeReference = (raw: string, codePoint: number): string =>
    codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : raw;

  return str
    // The backslash must be escaped in the pattern: a bare `\u` in a
    // non-unicode regex matches a plain letter "u".
    .replace(/\\u([0-9a-f]{1,4})/gi, (_match: string, hex: string) =>
      String.fromCharCode(parseInt(hex, 16)))
    .replace(/&#x([0-9a-f]{1,6});/gi, (match: string, hex: string) =>
      decodeReference(match, parseInt(hex, 16)))
    .replace(/&#(\d{1,6});/g, (match: string, dec: string) =>
      decodeReference(match, parseInt(dec, 10)));
}
// Generation algorithm for the DWR scriptSessionId
/**
 * Resolves the script session id to use for a request.
 *
 * An id that is already set (anything other than null/undefined) is returned
 * untouched; otherwise a new one is built by appending a random integer in
 * the range 0–999 to the original key.
 */
public async _getScriptSessionId(_scriptSessionId, _origScriptSessionId) {
  // Guard clause: keep an existing id as-is.
  if (_scriptSessionId != null) {
    return _scriptSessionId;
  }
  const randomSuffix = Math.floor(Math.random() * 1000);
  return _origScriptSessionId + randomSuffix;
}