编译原理之语法分析(二): tokens 转 AST 树 —— 以将代码 2+3*4 转为 AST 为例

即 tokens 转 AST 树

先上一个简单的例子,学习下语法分析的原理以及递归下降算法

需求: 将代码: 2+3*4 转为AST

tokens

[
{type:'NUMBER',value:'2'},
{type:'PLUS',value:'+'},
{type:'NUMBER',value:'3'},
{type:'MULTIPLY',value:'*'},
{type:'NUMBER',value:'4'},
]

生成的 AST

{
  "type": "Program",
  "children": [
    {
      "type": "Additive",
      "children": [
        {
          "type": "Numeric",
          "value": "2"
        },
        {
          "type": "Multiplicative",
          "children": [
            {
              "type": "Numeric",
              "value": "3"
            },
            {
              "type": "Numeric",
              "value": "4"
            }
          ]
        }
      ]
    }
  ]
}

本文介绍语法分析的原理以及递归下降算法
并初步介绍上下文无关文法

递归下降算法

每条文法规则的左边是一个非终结符(Non-terminal)
右边是它的产生式(Production Rule)
在语法解析的过程中, 左边会被右边替代, 如果替代之后还有非终结符, 那么继续这个替代过程, 直到最后全部都是终结符(Terminal), 也就是Token
只有终结符才可以成为AST的叶子节点, 这个过程, 也叫做推导(Derivation)过程
上级文法嵌套下级文法, 上级的算法调用下级的算法, 表现在生成AST中, 上级算法生成上级节点, 下级算法生成下级节点. 这就是下降的含义

上下文无关文法

上下文无关的意思是, 无论在任何情况下,文法的推导规则都是一样的
规则分成两级: 第一级是加法规则, 第二级是乘法规则, 把乘法规则作为加法规则的子规则
解析形成AST时, 乘法节点就一定是加法节点的子节点, 从而被优先计算
加法规则中还递归地又引用了加法规则

算数表达式

2+3*4

语法规则(规则也可以根据业务自己制定)

add -> multiple | multiple + add // 左边add非终结符 -> 右边是产生式
multiple -> NUMBER | NUMBER * multiple

add 表示加法, multiple 表示乘法。第一句表示: 加法由「乘法」或者「乘法 + 加法」产生。第二句表示: 乘法由「NUMBER」或者「NUMBER * 乘法」产生。

终结符: NUMBER、+、*。上级文法 add 嵌套下级文法 multiple: 加法里需要推导乘法, 乘法里需要推导 NUMBER, 因此应该先推导加法。

推导流程:

1. 先匹配加法规则, additive(tokenReader); 加法里又匹配乘法规则, 因此下降到乘法规则, 开始推导乘法: multiple -> NUMBER | NUMBER * multiple
2. 取第一个 token 2, 匹配到了 NUMBER, 但是还没完, 需要继续看是否匹配第二个规则 NUMBER * multiple
3. 查看 2 后面的 token +, 不是 * 号, 因此不匹配第二条, 乘法推导到此结束, 回退到加法推导函数里, 返回值记为 child1, 继续判断是否满足下一个匹配项
4. 取出下一个 token +, 匹配到了 + add, 继续调用自身 additive, 返回值记为 child2
5. 包装 child1 和 child2 为一个 ASTNode 的 children
6. 在继续调用 additive 自身时, 又进入到乘法推导里, 取下一个 token 3, 是数字, 匹配上了乘法的第一个匹配项, 取出来消耗掉, 再判断是否满足乘法第二个匹配项
7. 取出下一个 token *, 匹配到了 NUMBER * multiple, 取出来消耗掉, 继续调用自身 multiple, 看下一个 token 4 是否满足第一个匹配项 NUMBER, 满足, 那就取出来, 再看下一个是否是乘号, 以匹配第二个匹配项, 发现是 null 了
8. 结束

代码转抽象语法树

/**
 * Turns source text into an AST in two stages: lexical analysis
 * (`tokenize`) followed by syntax analysis (`toAST`).
 *
 * @param {*} script - Source text handed to `tokenize` unchanged.
 * @returns {*} Whatever `toAST` returns for the tokenized input.
 */
function parse(script) {
  // The tokenizer's result is fed straight into the tree builder.
  return toAST(tokenize(script));
}

token 转 AST 树时, 会有需要查看下一个 token 等操作, 直接用 tokens 数组不太方便, 因此会把 tokens 先包装成一个类

TokenReader 类

/**
 * Cursor over a token array that supports consuming a token, looking at the
 * current token without consuming it, and stepping back one position.
 */
class TokenReader {
  /**
   * @param {Array} tokens - Token array to read from.
   */
  constructor(tokens) {
    this.tokens = tokens; // token array
    this.pos = 0; // index of the current token
  }

  /**
   * Returns the token at the current position and advances past it.
   * @returns {*} The current token, or null when the cursor is at the end.
   */
  read() {
    const exhausted = !(this.pos < this.tokens.length);
    if (exhausted) {
      return null;
    }
    const current = this.tokens[this.pos];
    this.pos += 1; // consume the token
    return current;
  }

  /**
   * Returns the token at the current position without advancing.
   * @returns {*} The current token, or null when the cursor is at the end.
   */
  peek() {
    return this.pos < this.tokens.length ? this.tokens[this.pos] : null;
  }

  /**
   * Moves the cursor back one position; does nothing at position 0.
   */
  unread() {
    if (!(this.pos > 0)) {
      return;
    }
    this.pos -= 1;
  }
}

这就叫递归下降

/**
 * Builds the root Program node and attaches the tree derived from the
 * token stream as its child.
 *
 * @param {*} tokenReader - Token cursor passed through to `additive`.
 * @returns {ASTNode} The Program node; it has no child when `additive`
 *   returns a falsy value.
 */
function toAST(tokenReader) {
  const program = new ASTNode(nodeTypes.Program);
  // Derivation starts at the additive rule, which in turn descends into
  // the multiplicative rule.
  const expression = additive(tokenReader);
  if (expression) {
    program.children.push(expression);
  }
  return program;
}
// Each grammar rule maps to one function; additive corresponds to the
// addition rule (add -> multiple | multiple + add).
// NOTE(review): skeleton only. It descends into the multiplicative rule
// but does not use or return the result yet.
function additive(tokenReader) {
  const firstChild = multiple(tokenReader);
}
// Each grammar rule maps to one function; multiple corresponds to the
// multiplication rule (multiple -> NUMBER | NUMBER * multiple).
// NOTE(review): skeleton only. It descends into the NUMBER rule but does
// not use or return the result yet.
function multiple(tokenReader) {
  const firstChild = number(tokenReader);
}
// Each grammar rule maps to one function; number corresponds to the
// NUMBER rule.
// NOTE(review): skeleton only. The body is empty, so it returns undefined.
function number(tokenReader) {}

这种方式实现加减乘除会有些结合性以及左递归的问题,不过此处暂不关心

jsx 转 AST

先定义文法结构

 jsxElement => <JSXIdentifier attribute*>child</JSXIdentifier>
 attribute => AttributeKey='AttributeStringValue'
 child => jsxElement | JSXText

使用循环的方式来实现, 性能好一些,但是可读性不如上面的函数式

const { tokenizer } = require("./tokenizer");
const tokenTypes = require("./tokenTypes");
const nodeTypes = require("./nodeTypes");

/**
 * Parses JSX source text into an AST.
 *
 * Grammar:
 *   jsxElement => <JSXIdentifier attribute*>child</JSXIdentifier>
 *   attribute  => AttributeKey='AttributeStringValue'
 *   child      => jsxElement | JSXText
 *
 * @param {*} sourceCode - Source text handed to `tokenizer` unchanged.
 * @returns {object} A Program node whose body holds a single
 *   ExpressionStatement wrapping the node produced for the first rule matched.
 * @throws {TypeError} When the token stream ends before a rule is complete,
 *   or when a closing tag name differs from its opening tag name.
 * @throws {Error} When the current token matches none of the rules.
 */
function parser(sourceCode) {
  const tokens = tokenizer(sourceCode); // token array
  let pos = 0; // index of the current token

  // Returns the token at `index`, failing with a descriptive error instead
  // of letting a later property access crash on `undefined`.
  function tokenAt(index) {
    const token = tokens[index];
    if (token === undefined || token === null) {
      throw new TypeError(
        `Unexpected end of input: no token at index ${index}`
      );
    }
    return token;
  }

  // True when the cursor sits on the `<` `/` pair that opens a closing tag.
  function atClosingTag() {
    return (
      tokenAt(pos).type === tokenTypes.LeftParentheses &&
      tokenAt(pos + 1).type === tokenTypes.BackSlash
    );
  }

  function walk(parent) {
    let token = tokenAt(pos); // current token, not consumed yet
    const startsTag = token.type === tokenTypes.LeftParentheses;
    // The lookahead token is only needed (and required) after a `<`.
    const nextType = startsTag ? tokenAt(pos + 1).type : undefined;

    // jsxElement rule: `<` followed by a tag name.
    if (startsTag && nextType === tokenTypes.JSXIdentifier) {
      const node = {
        type: nodeTypes.JSXElement,
        openingElement: null,
        children: [],
        closingElement: null,
      };
      // Step 1: fill in the opening element from the tag name.
      token = tokenAt(++pos);
      node.openingElement = {
        type: nodeTypes.JSXOpeningElement,
        name: {
          type: nodeTypes.JSXIdentifier,
          name: token.value,
        },
        attributes: [],
      };
      // Step 2: collect attributes for as long as the cursor is on a key.
      token = tokenAt(++pos);
      while (token.type === tokenTypes.AttributeKey) {
        node.openingElement.attributes.push(walk());
        token = tokenAt(pos);
      }
      // The cursor is now on the token that ends the opening tag (its type
      // is not verified); step over it to reach the first child.
      pos++;
      // Step 3: derive child => jsxElement | JSXText until the cursor
      // reaches the `<` `/` pair of a closing tag.
      while (!atClosingTag()) {
        node.children.push(walk());
      }
      // Step 4: parse the closing tag, passing this node so the tag names
      // can be compared.
      node.closingElement = walk(node);
      return node;
    }

    // attribute rule: a key followed by its value token.
    if (token.type === tokenTypes.AttributeKey) {
      const valueToken = tokenAt(++pos);
      const node = {
        type: nodeTypes.JSXAttribute,
        name: {
          type: nodeTypes.JSXIdentifier,
          name: token.value,
        },
        value: {
          type: nodeTypes.Literal,
          value: valueToken.value,
        },
      };
      pos++; // step past the value token
      return node;
    }

    // JSXText rule: a text child.
    if (token.type === tokenTypes.JSXText) {
      pos++;
      return {
        type: nodeTypes.JSXText,
        value: token.value,
      };
    }

    // Closing tag: only valid when the caller supplied the element being closed.
    if (parent && startsTag && nextType === tokenTypes.BackSlash) {
      pos += 2; // skip `<` and `/`, landing on the tag name
      token = tokenAt(pos);
      pos += 2; // skip the tag name and the token after it (type not verified)
      if (parent.openingElement.name.name !== token.value) {
        throw new TypeError(
          `开始标签${parent.openingElement.name.name}不匹配结束标签${token.value}`
        );
      }
      return {
        type: nodeTypes.JSXClosingElement,
        name: {
          type: nodeTypes.JSXIdentifier,
          name: token.value,
        },
      };
    }

    throw new Error("不可能走到这");
  }

  return {
    type: nodeTypes.Program,
    body: [
      {
        type: nodeTypes.ExpressionStatement,
        expression: walk(),
      },
    ],
  };
}
module.exports = { parser };

// Demo: parse a small JSX snippet and print the resulting AST as
// 2-space-indented JSON. This runs whenever the file is loaded.
const sourceCode = `<h1 id="title"><span>hello</span>world</h1>`;
const demoAst = parser(sourceCode);

console.log(JSON.stringify(demoAst, null, 2));

/**
 
 [
  { type: 'LeftParentheses', value: '<' },
  { type: 'JSXIdentifier', value: 'h1' },
  { type: 'AttributeKey', value: 'id' },
  { type: 'AttributeStringValue', value: '"title"' },
  { type: 'RightParentheses', value: '>' },
  { type: 'LeftParentheses', value: '<' },
  { type: 'JSXIdentifier', value: 'span' },
  { type: 'RightParentheses', value: '>' },
  { type: 'JSXText', value: 'hello' },
  { type: 'LeftParentheses', value: '<' },
  { type: 'BackSlash', value: '/' },
  { type: 'JSXIdentifier', value: 'span' },
  { type: 'RightParentheses', value: '>' },
  { type: 'JSXText', value: 'world' },
  { type: 'LeftParentheses', value: '<' },
  { type: 'BackSlash', value: '/' },
  { type: 'JSXIdentifier', value: 'h1' },
  { type: 'RightParentheses', value: '>' }
]
 */


{
  "type": "Program",
  "body": [
    {
      "type": "ExpressionStatement",
      "expression": {
        "type": "JSXElement",
        "openingElement": {
          "type": "JSXOpeningElement",
          "name": {
            "type": "JSXIdentifier",
            "name": "h1"
          },
          "attributes": [
            {
              "type": "JSXAttribute",
              "name": {
                "type": "JSXIdentifier",
                "name": "id"
              },
              "value": {
                "type": "Literal",
                "value": "\"title\""
              }
            }
          ]
        },
        "children": [
          {
            "type": "JSXElement",
            "openingElement": {
              "type": "JSXOpeningElement",
              "name": {
                "type": "JSXIdentifier",
                "name": "span"
              },
              "attributes": []
            },
            "children": [
              {
                "type": "JSXText",
                "value": "hello"
              }
            ],
            "closingElement": {
              "type": "JSXClosingElement",
              "name": {
                "type": "JSXIdentifier",
                "name": "span"
              }
            }
          },
          {
            "type": "JSXText",
            "value": "world"
          }
        ],
        "closingElement": {
          "type": "JSXClosingElement",
          "name": {
            "type": "JSXIdentifier",
            "name": "h1"
          }
        }
      }
    }
  ]
}

对比 astExplorer 的结果,是一样的

代码已上传 github github.com/vaynevayne/…