获取 HTML 字符串中 h1-h5 标题的内容,生成带父节点 id(pId)的扁平标题列表(可据此构建树结构)

322 阅读2分钟

规则为:

1.标题不能嵌套;

2.标题id生成规则为:h[1-5]Id_*(例如:h2Id_2222);

// Sample HTML used to exercise the heading extraction below. It mixes h1-h6
// headings (most with nested <strong> markup) and empty paragraphs; the h6
// heading's id ("h6Id6666") has no underscore after the "Id" marker.
const htmlCon = '<header><h2 id="h2Id_2222">st<strong>2222</strong></h2><p></p><p></p><p></p><p></p><h3 id="h3Id_3333"><strong>3333</strong></h3><h4 id="h4Id_444444"><strong>444444</strong></h4><h3 id="h3Id_35333"><strong>35333</strong></h3><h4 id="h4Id_4444444"><strong>444444</strong></h4><h5 id="h5Id_5555"><strong>5555</strong></h5><h6 id="h6Id6666"><strong>6666</strong></h6><h1 id="h1Id_111112">11111</h1><h2 id="h2Id_111113">11111</h2><h1 id="h1Id_111113">11111</h1><p></p><p></p><p><strong><br /><br /><br /><br /></strong></p><p></p>';
// Print the raw input so it can be compared with the extracted result.
console.log('htmlCon', htmlCon);
/**
 * Returns the heading level (1-5) encoded in a heading id.
 *
 * Ids are expected to carry a level marker of the form `h<N>Id` (N = 1..5),
 * e.g. `h2Id_2222`. The level is taken from the FIRST marker found in the id,
 * so a free-form suffix that happens to contain another marker (such as
 * `h1Id_h5Id_x`) no longer overrides the real level.
 *
 * @param id heading id to inspect
 * @returns the level 1-5, or 0 when the id contains no `h[1-5]Id` marker
 */
function getTitleIndex(id: string): number {
  // The match is case-sensitive, like the original indexOf-based checks.
  const marker = /h([1-5])Id/.exec(id);
  return marker ? Number(marker[1]) : 0;
}
/**
 * Finds the parent id for a heading about to be appended to the list.
 *
 * The parent is the most recently collected entry whose level (`index`) is
 * strictly smaller than the level encoded in `id`.
 *
 * @param data entries collected so far, in document order
 * @param id id of the heading whose parent is being looked up
 * @returns the parent entry's id, or '' when `data` is empty or holds no
 *          entry with a smaller level
 */
function getPId(data: Array<any>, id: string) {
  let parentId = '';
  if (data.length > 0) {
    // Level of the heading we are placing.
    const level = getTitleIndex(id);
    // Walk every entry in order; the last one with a smaller level wins.
    let candidate: any;
    data.forEach(entry => {
      if (entry.index < level) {
        candidate = entry;
      }
    });
    if (candidate !== undefined) {
      parentId = candidate.id;
    }
  }
  return parentId;
}
/**
 * Extracts the plain-text title of the heading with the given id.
 *
 * The first `<hN ... id="...">...</hN>` element whose level and id match is
 * located (case-insensitively), then every tag inside it is stripped so only
 * the text content remains.
 *
 * @param htmlContent HTML text to search
 * @param id id of the heading; matched literally (regex metacharacters in
 *           the id are escaped)
 * @param hIndex heading level, i.e. the N in `<hN>`
 * @returns the heading text with all markup removed, or '' when no matching
 *          heading exists
 */
function getTitle(htmlContent: string, id: string, hIndex: number): string {
  // Escape the id so characters like "." or "(" are matched literally instead
  // of being interpreted as regex syntax (which could match the wrong heading
  // or throw a SyntaxError).
  const escapedId = id.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  // Accept either quote style for the id attribute.
  const idAttr = `(?:"${escapedId}"|'${escapedId}')`;
  // - other attributes may precede or follow the id attribute
  // - [\s\S]*? lets the heading content span several lines
  const reg = new RegExp(
    `<h${hIndex}(?:\\s[^>]*?)?\\sid\\s*=\\s*${idAttr}[^>]*>[\\s\\S]*?<\\/h${hIndex}\\s*>`,
    'i'
  );
  const match = reg.exec(htmlContent);
  if (!match) {
    return '';
  }
  // Drop every tag (including the heading's own open/close tags).
  return match[0].replace(/<[^>]*>/g, '');
}
/**
 * Collects the h1-h5 headings of an HTML string as a flat list that can be
 * turned into a navigation tree.
 *
 * Each entry holds the heading's id, its level (`index`), its plain-text
 * title (`value`) and the id of its parent heading (`pId`, '' for a root).
 * Headings without an id, or whose id carries no level marker recognised by
 * getTitleIndex, are skipped.
 *
 * @param htmlContent HTML string to scan
 * @returns the heading entries in document order
 */
function getTitleTrees(htmlContent: string) {
  // Opening tags of h1-h5 (g: find all matches, i: case-insensitive).
  // [^>]* rather than .*? so a tag broken across lines is still found.
  const hReg = /<h[1-5]\b[^>]*>/gi;
  // The id attribute itself: the leading whitespace keeps attributes such as
  // data-id="..." from being mistaken for it. Accepts double-quoted,
  // single-quoted and unquoted values.
  const idReg = /\sid\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/i;
  // Flat list of heading entries built so far.
  const hSrc: Array<{ pId: string; id: string; index: number; value: string }> = [];
  const hArr = htmlContent.match(hReg);
  if (!hArr) {
    return hSrc;
  }
  for (const openTag of hArr) {
    const idMatch = idReg.exec(openTag);
    const currentId = idMatch ? idMatch[1] || idMatch[2] || idMatch[3] : '';
    if (!currentId) {
      continue;
    }
    const hIndex = getTitleIndex(currentId);
    // An id without a level marker would yield a level-0 entry with an empty
    // title that every later heading would then pick as its parent.
    if (hIndex === 0) {
      continue;
    }
    hSrc.push({
      pId: getPId(hSrc, currentId),
      id: currentId,
      index: hIndex,
      value: getTitle(htmlContent, currentId, hIndex)
    });
  }
  return hSrc;
}
// Run the extraction on the sample HTML and print the resulting heading list.
const titleData = getTitleTrees(htmlCon);
console.log('titleData', titleData);