使用有限状态机手写 htmlParse

324 阅读2分钟

一道萌时科技 moego 的面试题

// Sample markup from the interview question: a <div> wrapping a heading and a paragraph.
const input = "<div><h1>Title</h1><p>Description here</p></div>";

期望输出

// Tree expected for the sample markup: every element becomes
// { tagName, children } and text content becomes a plain string child.
const output = {
  tagName: "div",
  children: [
    { tagName: "h1", children: ["Title"] },
    { tagName: "p", children: ["Description here"] },
  ],
};

复用之前在《编译原理之词法分析(一)》里写的有限状态机来做词法分析，代码一行都不用改；再写一个 buildDom，把得到的 token 序列组装成嵌套对象，问题就解决了。

// Names of every token kind the tokenizer can emit. Each name is also its own
// value, so the lookup table is derived from the list of names.
const tokenTypes = Object.fromEntries(
  [
    "LeftParentheses", // "<"
    "JSXIdentifier", // tag name
    "AttributeKey", // attribute name
    "AttributeStringValue", // attribute value written as a quoted string
    "AttributeExpressionValue", // attribute value written as a {expression}
    "RightParentheses", // ">" that ends a tag
    "JSXText", // text content
    "BackSlash", // the "/" of a closing tag (a forward slash, despite the name)
  ].map((name) => [name, name]),
);

// Single characters the state handlers accept as part of a name:
// lowercase ASCII letters and digits.
const LETTERS = /[a-z0-9]/;

// Token currently being accumulated character by character.
let currentToken = { type: "", value: "" };

// Every token emitted so far, in emission order.
const tokens = [];
/**
 * Appends a finished token to the shared `tokens` list and replaces
 * `currentToken` with a fresh, empty accumulator.
 *
 * @param {{type: string, value: string}} token - Token to record.
 */
function emit(token) {
  tokens.push(token);
  // A new object is created rather than clearing the old one, because the
  // token just pushed may be that very object.
  currentToken = { type: "", value: "" };
}

/**
 * Initial state: the very first character of the input must open a tag.
 *
 * @param {string} char - First character of the input.
 * @returns {Function} The `foundLeftParentheses` state.
 * @throws {Error} When the character is anything other than "<".
 */
function start(char) {
  if (char !== "<") {
    throw new Error("第一个字符必须是<");
  }
  emit({ type: tokenTypes.LeftParentheses, value: "<" });
  return foundLeftParentheses;
}
/**
 * End of file: emits the token still being accumulated, if it holds any text.
 * Does nothing when the accumulator is empty.
 */
function eof() {
  const hasPendingText = currentToken.value.length > 0;
  if (!hasPendingText) {
    return;
  }
  emit(currentToken);
}
/**
 * State entered right after "<" (and again right after the "/" of a closing
 * tag): expects either the first character of a tag name or a "/".
 *
 * @param {string} char - Current input character.
 * @returns {Function|undefined} `jsxIdentifier` when a tag name starts, this
 *   same state after a "/", or `undefined` for any other character.
 */
function foundLeftParentheses(char) {
  if (char === "/") {
    emit({ type: tokenTypes.BackSlash, value: "/" });
    // The tag name of a closing tag is read by the same state.
    return foundLeftParentheses;
  }
  if (!LETTERS.test(char)) {
    return undefined;
  }
  // First character of the tag name, e.g. the "h" of "h1".
  currentToken.type = tokenTypes.JSXIdentifier;
  currentToken.value += char;
  return jsxIdentifier;
}

/**
 * State that collects the remaining characters of a tag name.
 *
 * @param {string} char - Current input character.
 * @returns {Function|undefined} This same state while the name continues,
 *   `attribute` after a space, `foundRightParentheses` after ">", or
 *   `undefined` for any other character.
 */
function jsxIdentifier(char) {
  if (char === ">") {
    // No attributes: the name and the ">" are emitted back to back.
    emit(currentToken);
    emit({ type: tokenTypes.RightParentheses, value: ">" });
    return foundRightParentheses;
  }
  if (char === " ") {
    // A space ends the name; an attribute is expected next.
    emit(currentToken);
    return attribute;
  }
  if (LETTERS.test(char)) {
    currentToken.value += char;
    return jsxIdentifier;
  }
  return undefined;
}

/**
 * State entered after the space that follows a tag name or an attribute
 * value: the character must start an attribute name.
 *
 * @param {string} char - Current input character.
 * @returns {Function} The `attributeKey` state.
 * @throws {TypeError} When the character cannot start an attribute name.
 */
function attribute(char) {
  if (!LETTERS.test(char)) {
    throw new TypeError("Error");
  }
  // First character of the attribute name, e.g. the "i" of "id".
  currentToken.type = tokenTypes.AttributeKey;
  currentToken.value += char;
  return attributeKey;
}

/**
 * State that collects the remaining characters of an attribute name.
 *
 * @param {string} char - Current input character.
 * @returns {Function|undefined} `attributeValue` once "=" ends the name, this
 *   same state while the name continues, or `undefined` for any other
 *   character.
 */
function attributeKey(char) {
  if (char === "=") {
    // "=" ends the attribute name; its value comes next.
    emit(currentToken);
    return attributeValue;
  }
  if (!LETTERS.test(char)) {
    return undefined;
  }
  currentToken.value += char;
  return attributeKey;
}

/**
 * State entered right after "=": decides from the opening delimiter whether
 * the value is a quoted string or a {expression}. The delimiter becomes the
 * first character of the token value.
 *
 * @param {string} char - Current input character.
 * @returns {Function|undefined} `attributeStringValue` after a double quote,
 *   `attributeExpressionValue` after "{", or `undefined` for any other
 *   character.
 */
function attributeValue(char) {
  const opensString = char === '"';
  if (!opensString && char !== "{") {
    return undefined;
  }
  currentToken.type = opensString
    ? tokenTypes.AttributeStringValue
    : tokenTypes.AttributeExpressionValue;
  // Assigned, not appended: the value starts over with the delimiter.
  currentToken.value = char;
  return opensString ? attributeStringValue : attributeExpressionValue;
}
/**
 * State that collects the body of a {expression} attribute value. The token
 * value keeps both braces.
 *
 * @param {string} char - Current input character.
 * @returns {Function} This same state while the expression continues, or
 *   `tryLeaveAttribute` once "}" closes it.
 * @throws {TypeError} When the character is neither "}" nor matched by
 *   `LETTERS`.
 */
function attributeExpressionValue(char) {
  const closesExpression = char === "}";
  if (!closesExpression && !LETTERS.test(char)) {
    throw new TypeError("Error");
  }
  currentToken.value += char;
  if (closesExpression) {
    // End of the expression: emit it, closing brace included.
    emit(currentToken);
    return tryLeaveAttribute;
  }
  return attributeExpressionValue;
}
/**
 * State that collects the body of a double-quoted attribute value.
 *
 * Every character up to the closing quote belongs to the value, so spaces,
 * uppercase letters, hyphens and other punctuation are accepted (previously
 * anything outside `[a-z0-9]` raised a TypeError, which rejected ordinary
 * values such as "Main Title" or "btn-primary"). The token value keeps both
 * the opening and the closing quote.
 *
 * @param {string} char - Current input character.
 * @returns {Function} This same state while the string continues, or
 *   `tryLeaveAttribute` once the closing quote has been read.
 */
function attributeStringValue(char) {
  currentToken.value += char;
  if (char === '"') {
    // Closing quote: the value is complete, e.g. '"title"'.
    emit(currentToken);
    return tryLeaveAttribute;
  }
  return attributeStringValue;
}
/**
 * State entered after an attribute value: either another attribute follows
 * or the tag ends here.
 *
 * @param {string} char - Current input character.
 * @returns {Function|undefined} `attribute` after a space,
 *   `foundRightParentheses` after ">", or `undefined` for any other character.
 */
function tryLeaveAttribute(char) {
  switch (char) {
    case " ":
      // A space announces another attribute.
      return attribute;
    case ">":
      // End of the tag, e.g. the ">" of <h1 id="title">.
      emit({ type: tokenTypes.RightParentheses, value: ">" });
      return foundRightParentheses;
    default:
      return undefined;
  }
}
/**
 * State entered right after ">": the next character either opens another tag
 * or starts a run of text.
 *
 * @param {string} char - Current input character.
 * @returns {Function} `foundLeftParentheses` after "<", otherwise `jsxText`.
 */
function foundRightParentheses(char) {
  if (char !== "<") {
    // Anything else is the first character of text content.
    currentToken.type = tokenTypes.JSXText;
    currentToken.value += char;
    return jsxText;
  }
  // Another tag starts immediately, e.g. <h1 id="title"><span>.
  emit({ type: tokenTypes.LeftParentheses, value: "<" });
  return foundLeftParentheses;
}
/**
 * State that collects text content until the next "<".
 *
 * @param {string} char - Current input character.
 * @returns {Function} This same state while the text continues, or
 *   `foundLeftParentheses` once "<" ends it.
 */
function jsxText(char) {
  if (char !== "<") {
    currentToken.value += char;
    return jsxText;
  }
  // "<" ends the text: emit the collected text, then the "<" itself.
  emit(currentToken);
  emit({ type: tokenTypes.LeftParentheses, value: "<" });
  return foundLeftParentheses;
}

/**
 * Runs the state machine over `input`, one character at a time, and returns
 * the tokens it produced.
 *
 * Each state handler receives a character and returns the handler for the
 * next one. Scanning stops early when a handler returns nothing, which is how
 * handlers signal a character they do not accept.
 *
 * @param {string} input - Markup to tokenize; it must start with "<".
 * @returns {Array<{type: string, value: string}>} The shared module-level
 *   `tokens` array, holding only the tokens of this call.
 * @throws {Error|TypeError} Whatever the state handlers throw on input they
 *   reject.
 */
function tokenizer(input) {
  // Start from a clean slate so a second call does not return the previous
  // call's tokens or continue a half-built token. The array is emptied in
  // place because `emit` pushes into this same object.
  tokens.length = 0;
  currentToken = { type: "", value: "" };

  let state = start;
  for (const char of input) {
    state = state(char);
    if (!state) {
      break;
    }
  }

  // Flush text that runs up to the end of the input (e.g. "<p>a</p>tail");
  // nothing else would ever emit it. Skipped when scanning stopped early, so
  // the output for rejected input is unchanged.
  if (state) {
    eof();
  }
  return tokens;
}

// Sample markup from the interview question, tokenized and logged as a quick check.
let sourceCode = `<div><h1>Title</h1><p>Description here</p></div>`;
console.log(tokenizer(sourceCode));

/**
 * Assembles a flat token list into a nested tree of
 * `{ tagName, children }` nodes, with text content as plain string children.
 *
 * The token list is only read, never modified. Attribute tokens and ">"
 * tokens carry nothing the tree needs and are skipped. A closing tag with no
 * matching open element is ignored, and a "<" at the very end of the list is
 * dropped.
 *
 * @param {Array<{type: string, value: string}>} tokens - Tokens in source order.
 * @returns {{tagName: string, children: Array}|string|undefined} The first
 *   top-level node, or `undefined` when the tokens contain none.
 */
function buildDom(tokens) {
  const root = { tagName: null, children: [] };
  // Chain of currently open elements; the synthetic root always stays at the bottom.
  const openElements = [root];
  let index = 0;

  while (index < tokens.length) {
    const token = tokens[index++];
    const parent = openElements[openElements.length - 1];

    if (token.type === tokenTypes.JSXText) {
      parent.children.push(token.value);
      continue;
    }
    if (token.type !== tokenTypes.LeftParentheses) {
      continue;
    }

    // The token after "<" is either "/" (closing tag) or the tag name.
    const afterBracket = tokens[index++];
    if (afterBracket === undefined) {
      break;
    }

    if (afterBracket.type === tokenTypes.BackSlash) {
      // Closing tag: leave the current element, but never pop the root.
      if (openElements.length > 1) {
        openElements.pop();
      }
      index++; // skip the closing tag's name
    } else {
      // Opening tag: attach a new element and descend into it.
      const node = { tagName: afterBracket.value, children: [] };
      parent.children.push(node);
      openElements.push(node);
    }
  }

  return root.children[0];
}
// Build the tree from the module-level `tokens` array filled by the tokenizer
// run above, then pretty-print it as JSON with 2-space indentation.
let dom = buildDom(tokens);
console.log(JSON.stringify(dom, null, 2));

博主面试没过, 有点莫名其妙 哈哈