编译原理笔记-源码学习-语法分析(1)

105 阅读3分钟

入口

从下面的代码可以看出, 解析的重点就是循环调用parseStatement, 直到遇到type === tt.eof, 此时就完成了解析.

然后再进行格式检查和Directive处理.

/**
 * Entry point of parsing: populates and finishes the Program node.
 *
 * Statements are parsed one at a time with `parseStatement` until the current
 * token type is `tt.eof`; each parsed statement is appended to `node.body`.
 * Afterwards undefined exports are reported (module mode only), the directive
 * prologue is adapted, and the node is finished as `NodeTypes.Program`.
 *
 * @param {Node} node Node to populate; `node.body` is created when missing.
 * @returns {Node} Whatever `finishNode(node, NodeTypes.Program)` returns.
 */
pp.parseTopLevel = function(node) {
  const exportedNames = Object.create(null)
  if (!node.body) {
    node.body = []
  }

  // Core of the parser: keep consuming statements until the end-of-file token.
  while (this.type !== tt.eof) {
    const statement = this.parseStatement(null, true, exportedNames)
    node.body.push(statement)
  }

  if (this.inModule) {
    // Every name still listed in `undefinedExports` is reported at its recorded
    // `start`. NOTE(review): per the original author's note these entries are
    // inserted by `checkLocalExport` (not visible here) — confirm.
    Object.keys(this.undefinedExports).forEach((name) => {
      const position = this.undefinedExports[name].start
      this.raiseRecoverable(position, `Export '${name}' is not defined`)
    })
  }

  this.adaptDirectivePrologue(node.body)
  this.next()
  node.sourceType = this.options.sourceType
  return this.finishNode(node, NodeTypes.Program)
}

parseStatement

解析单条语句(statement), 是每一段解析的入口. 从代码中可以看出, 其操作是根据starttype(由Tokenizer读出并区分的token类型)去调用不同的处理函数.

按深度优先的递归的方式读完所有的解析函数, 就算是完成了语法解析的第一部分的解读.

/**
 * Parse a single statement and return its node.
 *
 * For example `class node {}`, `let n = new node();` and `const num = 1` are
 * each one statement. The handler is chosen from the type of the current
 * token; anything that does not start with a recognised keyword, brace or
 * semicolon falls through to expression / labeled-statement parsing.
 *
 * @param {null | string} context Name of the enclosing construct, or a falsy
 *   value when there is none. The values this function itself tests are "if"
 *   and "label"; any truthy context forbids class declarations, async function
 *   declarations and non-`var` declarations.
 * @param {boolean} topLevel Whether this is a top-level statement. When falsy,
 *   `import`/`export` raise unless `options.allowImportExportEverywhere` is set.
 * @param {Record<string, unknown>} exports Passed through to `parseExport`.
 * @returns {Node}
 */
pp.parseStatement = function(context, topLevel, exports) {
  let tokenType = this.type
  const node = this.startNode()
  let kind

  // A `let` declaration is dispatched through the `var` branch, remembering
  // its real kind.
  if (this.isLet(context)) {
    tokenType = tt._var
    kind = "let"
  }

  // Most statements are identified by their leading keyword. Many are trivial
  // to parse, a few need extra checks first.
  switch (tokenType) {
  case tt._break:
  case tt._continue:
    return this.parseBreakContinueStatement(node, tokenType.keyword)

  case tt._debugger:
    return this.parseDebuggerStatement(node)

  case tt._do:
    return this.parseDoStatement(node)

  case tt._for:
    return this.parseForStatement(node)

  case tt._function: {
    // A function as the sole body of an `if` or of a labeled statement is
    // accepted in non-strict code; every other nested context is rejected
    // from ecmaVersion 6 on.
    if (context) {
      const tolerated = !this.strict && (context === "if" || context === "label")
      if (!tolerated && this.options.ecmaVersion >= 6) {
        this.unexpected()
      }
    }
    return this.parseFunctionStatement(node, false, !context)
  }

  case tt._class: {
    if (context) {
      this.unexpected()
    }
    return this.parseClass(node, true)
  }

  case tt._if:
    return this.parseIfStatement(node)

  case tt._return:
    return this.parseReturnStatement(node)

  case tt._switch:
    return this.parseSwitchStatement(node)

  case tt._throw:
    return this.parseThrowStatement(node)

  case tt._try:
    return this.parseTryStatement(node)

  case tt._const:
  case tt._var: {
    // `kind` is already "let" when the isLet branch above fired.
    if (!kind) {
      kind = this.value
    }
    if (context && kind !== "var") {
      this.unexpected()
    }
    return this.parseVarStatement(node, kind)
  }

  case tt._while:
    return this.parseWhileStatement(node)

  case tt._with:
    return this.parseWithStatement(node)

  case tt.braceL:
    return this.parseBlock(true, node)

  case tt.semi:
    return this.parseEmptyStatement(node)

  case tt._export:
  case tt._import: {
    const isImport = tokenType === tt._import

    // With ecmaVersion > 10, `import` directly followed by '(' or '.' is
    // parsed as an ordinary expression statement instead of a declaration.
    if (this.options.ecmaVersion > 10 && isImport) {
      skipWhiteSpace.lastIndex = this.pos
      const skipped = skipWhiteSpace.exec(this.input)
      const nextPos = this.pos + skipped[0].length
      const nextCh = this.input.charCodeAt(nextPos)
      if (nextCh === 40 || nextCh === 46) { // '(' or '.'
        return this.parseExpressionStatement(node, this.parseExpression())
      }
    }

    if (!this.options.allowImportExportEverywhere) {
      if (!topLevel) {
        this.raise(this.start, "'import' and 'export' may only appear at the top level")
      }
      if (!this.inModule) {
        this.raise(this.start, "'import' and 'export' may appear only with 'sourceType: module'")
      }
    }

    if (isImport) {
      return this.parseImport(node)
    }
    return this.parseExport(node, exports)
  }

  // Anything else is an ExpressionStatement or a LabeledStatement. An
  // expression is parsed first; if it turned out to be a plain Identifier
  // and a colon follows, it is reinterpreted as a label.
  default: {
    if (this.isAsyncFunction()) {
      if (context) {
        this.unexpected()
      }
      this.next()
      // `!context` is expected to be true here, assuming `unexpected()` throws.
      return this.parseFunctionStatement(node, true, !context)
    }

    const maybeName = this.value
    const expr = this.parseExpression()
    const isLabel = tokenType === tt.name && expr.type === "Identifier" && this.eat(tt.colon)
    if (isLabel) {
      return this.parseLabeledStatement(node, maybeName, expr, context)
    }
    return this.parseExpressionStatement(node, expr)
  }
  }
}

简单介绍几个常见的函数作用

  1. eat 判断当前token是否为传入的类型, 如果是则调用next并返回true
  2. insertSemicolon 判断当前位置是否可以自动插入分号, 具体为判断是否遇到}或者换行, 如果是则返回true
  3. unexpected 顾名思义, 抛出错误
  4. semicolon 尝试消费一个分号, 如果没有, 则尝试插入一个分号, 如果都不行, 则报错.

parseBreakContinueStatement

解析break和continue的函数很清晰

先执行next获取下一个token, 然后

  1. 判断是否有分号或者换行, 如果是, 则设置label为null
  2. 判断当前token是否类型为name, 如果是, 则node.label 设置为parseIdent解析出来的值, 否则报错
  3. 对labels进行检查, 对于continue关键词, 要求跳出的节点必须是循环, 对于break, 可以不是循环. labels的值可以在后面再看.
/**
 * Parse a `break` or `continue` statement.
 *
 * After consuming the keyword, the statement either ends immediately (label
 * is `null`) or must be followed by a name token, which becomes `node.label`.
 * The enclosing `this.labels` entries are then searched for a valid target;
 * if none is found an "Unsyntactic <keyword>" error is raised at `node.start`.
 *
 * @param {Node} node Node to fill in and finish.
 * @param {string} keyword Either "break" or "continue".
 * @returns {Node} A BreakStatement or ContinueStatement node.
 */
pp.parseBreakContinueStatement = function(node, keyword) {
  const isBreak = keyword === "break"
  this.next()

  if (this.eat(tt.semi) || this.insertSemicolon()) {
    // Statement ended right after the keyword: no label.
    node.label = null
  } else if (this.type === tt.name) {
    node.label = this.parseIdent()
    this.semicolon()
  } else {
    this.unexpected()
  }

  // Verify that there is an actual destination to break or continue to.
  let index = 0
  while (index < this.labels.length) {
    const candidate = this.labels[index]
    const nameMatches = node.label == null || candidate.name === node.label.name
    if (nameMatches) {
      // An entry with a non-null kind is a target for `break`; `continue`
      // additionally requires that kind to be "loop".
      const kindAccepts = candidate.kind != null && (isBreak || candidate.kind === "loop")
      // A labeled `break` may target any name-matching label, loop or not.
      const labeledBreak = node.label && isBreak
      if (kindAccepts || labeledBreak) break
    }
    index += 1
  }
  if (index === this.labels.length) {
    this.raise(node.start, "Unsyntactic " + keyword)
  }

  const nodeType = isBreak ? NodeTypes.BreakStatement : NodeTypes.ContinueStatement
  return this.finishNode(node, nodeType)
}

parseIdent

  1. 首先判断type是否为name, 如果是, 则直接设置node.name
  2. 否则判断type.keyword, 如果存在, 则设置node.name为type.keyword; 如果该关键词是class或function, 并且前一个token不是单独的点号(即不是xxx.class这种写法), 则弹出一个context.
/**
 * Parse the current token as an Identifier node.
 *
 * A name token contributes its value; a keyword token contributes its keyword
 * text; any other token is reported through `unexpected()`.
 *
 * @param {boolean} liberal When truthy, `true` is forwarded to `next()` and the
 *   reserved-word check and `await` position tracking are skipped.
 *   NOTE(review): the original author describes this as "parsing properties"
 *   — confirm against callers.
 * @returns {Node} The finished Identifier node.
 */
pp.parseIdent = function(liberal) {
  const node = this.startNode()

  if (this.type === tt.name) {
    node.name = this.value
  } else if (this.type.keyword) {
    const keyword = this.type.keyword
    node.name = keyword

    // To fix https://github.com/acornjs/acorn/issues/575
    // `class` and `function` keywords push a new context into this.context,
    // but nothing pops it when the keyword is consumed as an identifier such
    // as a property name. When the previous token is a dot this does not
    // apply, because the context-managing code already ignored the keyword.
    if (keyword === "class" || keyword === "function") {
      const previousTokenIsDot =
        this.lastTokEnd === this.lastTokStart + 1 &&
        this.input.charCodeAt(this.lastTokStart) === 46 // 46 is '.'
      // Only the `xxx.class` form skips the pop.
      if (!previousTokenIsDot) {
        this.context.pop()
      }
    }
  } else {
    this.unexpected()
  }

  this.next(Boolean(liberal))
  this.finishNode(node, "Identifier")

  if (liberal) {
    return node
  }

  // Validate the identifier and remember the first `await` used as a name.
  this.checkUnreserved(node)
  if (node.name === "await" && !this.awaitIdentPos) {
    this.awaitIdentPos = node.start
  }
  return node
}

parseDoStatement

  1. 首先在labels中推入一个label, 用于continue和break的校验合法性
  2. 然后调用parseStatement("do")解析出{}中的执行体
  3. 退出labels
  4. 读出while, 然后调用parseParenExpression读出括号内的条件表达式, 结束
/**
 * Parse a `do ... while (...)` statement.
 *
 * `loopLabel` is pushed onto `this.labels` while the body is parsed, which is
 * what `break` / `continue` validation looks at, and popped again afterwards.
 *
 * @param {Node} node Node to fill in (`body`, `test`) and finish.
 * @returns {Node} The finished DoWhileStatement node.
 */
pp.parseDoStatement = function(node) {
  this.next()

  // Register the loop so that break/continue inside the body can find it.
  this.labels.push(loopLabel)
  const body = this.parseStatement("do")
  node.body = body
  this.labels.pop()

  this.expect(tt._while)
  const test = this.parseParenExpression()
  node.test = test

  // From ecmaVersion 6 on a trailing semicolon is merely consumed if present;
  // for older versions the regular `semicolon()` handling applies.
  const semicolonIsOptional = this.options.ecmaVersion >= 6
  if (semicolonIsOptional) {
    this.eat(tt.semi)
  } else {
    this.semicolon()
  }

  return this.finishNode(node, NodeTypes.DoWhileStatement)
}