学校官网数据逆向挖掘记录

417 阅读1分钟

以下通过 Node.js 后端实现请求并逆向解析返回的数据;如需运行,请自行补充 egg.js 与 TypeScript 环境。

以下代码仅供学习参考,切勿用于违法用途。


/**
 * Requests one news-list page from the school site's DWR endpoint, strips the
 * DWR callback wrapper from the reply, and responds with the list entries.
 *
 * Each `<i>` element of the reply is split at ".html": the part before it
 * becomes the entry URL, the part after it (with date-like `YYYY-M-D` runs
 * removed) is decoded through `reconvert` and becomes the entry content.
 * Elements whose text contains no ".html" are skipped.
 *
 * The result is written to `ctx.body` as an array of `{ url, content }`.
 */
public async index() {
  const { ctx } = this;

  // Prefix from which the DWR script session id is derived.
  const _origScriptSessionId = "_scriptSessionId关键key";
  // No session id exists yet, so pass null and let the helper generate one.
  const _scriptSessionId = await this._getScriptSessionId(null, _origScriptSessionId);

  // Pages that can be requested; the index below selects which one is crawled.
  const currentUrl = ['010212.html','01021201.html',"01021202.html",];
  const page = currentUrl[2];

  // Forged browser-like request headers.
  // NOTE(review): Content-Length is hard-coded although the body depends on
  // `page` and on the random session id suffix — confirm the HTTP client
  // tolerates or overrides this value.
  const headers = {
    "Host": "学校官网地址",
    "Connection": "keep-alive",
    "Content-Length": "306",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
    "DNT": "1",
    "Content-Type": "text/plain",
    "Accept": "*/*",
    "Origin": "学校官网地址",
    "Referer": `学校官网地址/newkm/colunm/${page}`,
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    // The site does not appear to validate the cookie value, presumably
    // because the content is public.
    "Cookie": "JSESSIONID=BB598025E17A847357963806D340B29F"
  };

  // DWR call parameters for portalAjax.getNewsXml.
  const data = {
    "callCount":"1",
    "page":`/newkm/colunm/${page}`,
    "httpSessionId":"",
    "scriptSessionId":`${_scriptSessionId}`,
    "c0-scriptName":"portalAjax",
    "c0-methodName":"getNewsXml",
    "c0-id":"0",
    "c0-param0":"string:0102",
    "c0-param1":`string:${page.split('.html')[0]}`,
    "c0-param2":"string:news_",
    "c0-param3":"number:20",
    "c0-param4":"number:1",
    "c0-param5":"null:null",
    "batchId":"0"
  };

  const url = '学校官网/newkm/dwr/call/plaincall/portalAjax.getNewsXml.dwr';
  const result = await ctx.curl(url,{
    method: 'POST',
    contentType: "text/html; charset=UTF-8",
    data: data,
    headers:headers
  });

  // Strip the DWR reply markers and callback wrapper so only the markup
  // payload remains. Each replace removes the first occurrence only.
  // NOTE(review): the XML prolog pattern hard-codes <n>4051</n>; presumably
  // that number varies between responses — verify against real replies.
  const payload = result.data.toString()
    .replace("//#DWR-INSERT",'')
    .replace("//#DWR-REPLY","")
    .replace(`dwr.engine._remoteHandleCallback('2','0',"`,"")
    .replace(`");`,"")
    .replace(`<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<s><n>4051</n>`,"")
    .replace(`</s>`,"");

  const dom = new JSDOM(payload);

  // Collected entries that make up the response body.
  const entries: { url: string; content: string }[] = [];
  dom.window.document.querySelectorAll("i").forEach((item) => {
    const parts = (item.textContent ?? "").split(".html");
    const path = parts[0];
    const rest = parts[1];
    // Without a ".html" separator there is no URL/content pair to extract;
    // skip the node instead of calling replace() on undefined.
    if (path === undefined || rest === undefined) {
      return;
    }
    // Drop date-like runs from the text that follows the URL.
    const text = rest.replace(/\d{4}-\d{1,2}-\d{1,2}/g, "");
    entries.push({
      url: path+'.html',
      content: this.reconvert(text)
    });
  });
  ctx.body = entries;
}
//ascii码转换方法
/**
 * Decodes escaped character references embedded in a string.
 *
 * Three forms are decoded, in this order:
 *  - JavaScript-style escapes written out literally: a backslash, `u`, and
 *    exactly four hex digits (e.g. the six characters `\u4e2d`);
 *  - hexadecimal numeric entities: `&#x4e2d;`;
 *  - decimal numeric entities: `&#20013;`.
 *
 * Numeric values above U+10FFFF are not valid code points and are left in the
 * string unchanged. All other text is returned untouched.
 *
 * @param str Text that may contain escaped characters.
 * @returns The text with every recognised escape replaced by its character.
 */
public reconvert(str: string): string {
  const maxCodePoint = 0x10ffff;

  // Turns the captured digits into a character, or keeps the original match
  // when the value cannot be represented as a code point.
  const decode = (match: string, digits: string, radix: number): string => {
    const codePoint = parseInt(digits, radix);
    return codePoint <= maxCodePoint ? String.fromCodePoint(codePoint) : match;
  };

  return str
    // The backslash must be matched explicitly: a bare \u in a non-unicode
    // regex only matches the letter "u" and would corrupt ordinary words.
    .replace(/\\u([0-9a-f]{4})/gi, (match: string, hex: string) => decode(match, hex, 16))
    .replace(/&#x([0-9a-f]{1,6});/gi, (match: string, hex: string) => decode(match, hex, 16))
    .replace(/&#(\d{1,7});/g, (match: string, dec: string) => decode(match, dec, 10));
}
//_getScriptSessionId生成算法
/**
 * Resolves the script session id to send with a DWR request.
 *
 * An id that is already set (neither null nor undefined) is returned as is;
 * otherwise a new one is built by appending a random integer in [0, 999] to
 * the given original id.
 */
public async _getScriptSessionId(_scriptSessionId,_origScriptSessionId){
  // An existing id wins; nothing is generated in that case.
  if (_scriptSessionId != null) {
    return _scriptSessionId;
  }
  const randomSuffix = Math.floor(Math.random() * 1000);
  return _origScriptSessionId + randomSuffix;
}

组装后的数据结构

image.png