编译原理笔记-源码学习-语法分析(1)

入口

从下面的代码可以看出, 解析的重点就是循环地调用parseStatement, 直到遇到type === tt.eof为止, 此时就完成了解析.

解析完成后, 如果处于模块模式(inModule), 会检查是否存在未定义的导出并报错, 然后再进行Directive处理(adaptDirectivePrologue).

/**
 * Entry point of the parser: populates and returns the Program node.
 *
 * Statements are parsed one after another until the end-of-file token is
 * reached. Afterwards, in module mode, every entry left in
 * `this.undefinedExports` is reported, and the directive prologue of the
 * body is adapted.
 * @param {Node} node Node to populate; `node.body` is created when missing.
 * @returns {Node} The result of `finishNode` for the Program node.
 */
pp.parseTopLevel = function(node) {
  const exports = Object.create(null)
  if (!node.body) {
    node.body = []
  }

  // Core of the parser: one parseStatement call per top-level statement.
  while (this.type !== tt.eof) {
    const statement = this.parseStatement(null, true, exports)
    node.body.push(statement)
  }

  if (this.inModule) {
    // Per the original author's note, entries are recorded by checkLocalExport;
    // anything still listed here is an export of an undefined local binding.
    Object.keys(this.undefinedExports).forEach((name) => {
      this.raiseRecoverable(this.undefinedExports[name].start, `Export '${name}' is not defined`)
    })
  }

  this.adaptDirectivePrologue(node.body)
  this.next()
  node.sourceType = this.options.sourceType
  return this.finishNode(node, NodeTypes.Program)
}

parseStatement

解析单个语句(statement), 是每一段解析的入口. 从代码中可以看出, 其操作是根据starttype(在Tokenizer中读出并区分的token类型)去调用不同的处理函数.

按深度优先的递归的方式读完所有的解析函数, 就算是完成了语法解析的第一部分的解读.

/**
 * Parses a single statement by dispatching on the type of the current token.
 * For example `class node {}`, `let n = new node();` and `const num = 1`
 * are each one separate statement; per the original author's note, statements
 * are delimited by semicolons, line breaks, or braces.
 * @param {null | string} context Kind of the enclosing construct, if any. This function
 *   compares it against "if" and "label"; the original note lists do/if/while/for/with
 *   as the possible keywords. A truthy context rejects class, async function and
 *   non-`var` declarations.
 * @param {boolean} topLevel Whether this is a top-level statement. When falsy, import and
 *   export raise an error unless `options.allowImportExportEverywhere` is set.
 * @param {Record<string, unknown>} exports Export registry, forwarded to parseExport.
 * @returns {Node}
 */
pp.parseStatement = function(context, topLevel, exports) {
  let starttype = this.type, node = this.startNode(), kind

  if (this.isLet(context)) {
    // A `let` declaration is routed through the `var` case, with kind recorded as "let".
    starttype = tt._var
    kind = "let"
  }

  // Most types of statements are recognized by the keyword they
  // start with. Many are trivial to parse, some require a bit of
  // complexity.

  switch (starttype) {
  case tt._break: case tt._continue: return this.parseBreakContinueStatement(node, starttype.keyword)
  case tt._debugger: return this.parseDebuggerStatement(node)
  case tt._do: return this.parseDoStatement(node)
  case tt._for: return this.parseForStatement(node)
  case tt._function:
    // Function as sole body of either an if statement or a labeled statement
    // works, but not when it is part of a labeled statement that is the sole
    // body of an if statement.
    if ((context && (this.strict || context !== "if" && context !== "label")) && this.options.ecmaVersion >= 6) this.unexpected()
    return this.parseFunctionStatement(node, false, !context)
  case tt._class:
    if (context) this.unexpected()
    return this.parseClass(node, true)
  case tt._if: return this.parseIfStatement(node)
  case tt._return: return this.parseReturnStatement(node)
  case tt._switch: return this.parseSwitchStatement(node)
  case tt._throw: return this.parseThrowStatement(node)
  case tt._try: return this.parseTryStatement(node)
  case tt._const: case tt._var:
    // kind is already "let" when isLet matched above; otherwise take it from the token value.
    kind = kind || this.value
    if (context && kind !== "var") this.unexpected()
    return this.parseVarStatement(node, kind)
  case tt._while: return this.parseWhileStatement(node)
  case tt._with: return this.parseWithStatement(node)
  case tt.braceL: return this.parseBlock(true, node)
  case tt.semi: return this.parseEmptyStatement(node)
  case tt._export:
  case tt._import:
    // With ecmaVersion > 10, peek past whitespace after `import`: a following
    // '(' or '.' means it is parsed as an expression statement instead of an
    // import declaration.
    if (this.options.ecmaVersion > 10 && starttype === tt._import) {
      skipWhiteSpace.lastIndex = this.pos
      let skip = skipWhiteSpace.exec(this.input)
      let next = this.pos + skip[0].length, nextCh = this.input.charCodeAt(next)
      if (nextCh === 40 || nextCh === 46) // '(' or '.'
        return this.parseExpressionStatement(node, this.parseExpression())
    }

    if (!this.options.allowImportExportEverywhere) {
      if (!topLevel)
        this.raise(this.start, "'import' and 'export' may only appear at the top level")
      if (!this.inModule)
        this.raise(this.start, "'import' and 'export' may appear only with 'sourceType: module'")
    }
    return starttype === tt._import ? this.parseImport(node) : this.parseExport(node, exports)

    // If the statement does not start with a statement keyword or a
    // brace, it's an ExpressionStatement or LabeledStatement. We
    // simply start parsing an expression, and afterwards, if the
    // next token is a colon and the expression was a simple
    // Identifier node, we switch to interpreting it as a label.
  default:
    if (this.isAsyncFunction()) {
      if (context) this.unexpected()
      this.next()
      return this.parseFunctionStatement(node, true, !context) // context is falsy here (assuming unexpected() throws), so !context is true
    }

    let maybeName = this.value, expr = this.parseExpression()
    if (starttype === tt.name && expr.type === "Identifier" && this.eat(tt.colon))
      return this.parseLabeledStatement(node, maybeName, expr, context)
    else return this.parseExpressionStatement(node, expr)
  }
}

简单介绍几个常见的函数作用

eat 判断当前token是否传入的类型, 如果是则调用next并返回true
insertSemicolon 判断当前位置是否可以自动插入分号, 具体为判断当前token是否为}或者之前是否有换行, 如果可以则返回true
unexpected 顾名思义, 抛出错误
semicolon 尝试消费一个分号, 如果没有, 则尝试插入一个分号, 如果都不行, 则报错.

parseBreakContinueStatement

解析break和continue的函数很清晰

先执行next获取下一个token, 然后

判断是否有分号或者换行, 如果是, 则设置label为null
判断当前token是否类型为name, 如果是, 则node.label 设置为parseIdent解析出来的值, 否则报错
对labels进行检查, 对于continue关键词, 要求跳出的节点必须是循环, 对于break, 可以不是循环. labels的值可以在后面再看.

/**
 * Parses a `break` or `continue` statement, including its optional label,
 * and verifies that an enclosing target exists in `this.labels`.
 * @param {Node} node Node to populate; `node.label` is set to null or an Identifier.
 * @param {string} keyword Compared against "break"; any other value is treated as continue.
 * @returns {Node} The result of `finishNode` for a BreakStatement or ContinueStatement.
 */
pp.parseBreakContinueStatement = function(node, keyword) {
  const isBreak = keyword === "break"
  this.next()

  if (this.eat(tt.semi) || this.insertSemicolon()) {
    // No label: the statement ended right after the keyword.
    node.label = null
  } else if (this.type === tt.name) {
    node.label = this.parseIdent()
    this.semicolon()
  } else {
    this.unexpected()
  }

  // Verify that there is an actual destination to break or continue to.
  const hasTarget = this.labels.some((lab) => {
    const applies = node.label == null || lab.name === node.label.name
    if (!applies) return false
    // A label with a kind is accepted for break; continue additionally requires kind "loop".
    if (lab.kind != null && (isBreak || lab.kind === "loop")) return true
    // A labelled break may target a matching label whatever its kind.
    return Boolean(node.label) && isBreak
  })
  if (!hasTarget) this.raise(node.start, "Unsyntactic " + keyword)

  const nodeType = isBreak ? NodeTypes.BreakStatement : NodeTypes.ContinueStatement
  return this.finishNode(node, nodeType)
}

parseIdent

首先判断type是否为name, 如果是, 则直接设置node.name
否则判断type.keyword, 如果存在, 则设置node.name 为type.keyword; 如果该关键词是class或function, 并且前一个token不是`.`(即不是xxx.class这种情况), 则弹出一个context. 两者都不满足时报错.

/**
 * Parses the current token as an Identifier node and advances past it.
 * Accepts a name token or any keyword token; anything else goes to `unexpected`.
 * @param {boolean} liberal When truthy, `true` is passed to `next` and the
 *   reserved-word check and `await` position bookkeeping are skipped
 *   (the original note describes this as parsing property names).
 * @returns {Node} The finished Identifier node.
 */
pp.parseIdent = function(liberal) {
  const ident = this.startNode()

  if (this.type === tt.name) {
    ident.name = this.value
  } else if (this.type.keyword) {
    const keyword = this.type.keyword
    ident.name = keyword

    // To fix https://github.com/acornjs/acorn/issues/575
    // `class` and `function` keywords push new context into this.context.
    // But there is no chance to pop the context if the keyword is consumed as an identifier such as a property name.
    // If the previous token is a dot, this does not apply because the context-managing code already ignored the keyword
    const pushesContext = keyword === "class" || keyword === "function"
    if (pushesContext) {
      // 46 is the char code of '.'; only the `xxx.class` form needs no pop.
      const previousIsDot =
        this.lastTokEnd === this.lastTokStart + 1 &&
        this.input.charCodeAt(this.lastTokStart) === 46
      if (!previousIsDot) this.context.pop()
    }
  } else {
    this.unexpected()
  }

  this.next(Boolean(liberal))
  this.finishNode(ident, "Identifier")
  if (liberal) return ident

  // Validate the identifier and remember the first `await` used as a name.
  this.checkUnreserved(ident)
  if (ident.name === "await" && !this.awaitIdentPos) {
    this.awaitIdentPos = ident.start
  }
  return ident
}

parseDoStatement

首先在labels中推入一个label, 用于continue和break的校验合法性
然后调用parseStatement("do")解析出{}中的执行体
从labels中弹出刚才推入的label
读出while, 然后调用parseParenExpression读出括号内的条件表达式, 最后处理结尾的分号(ecmaVersion >= 6时分号可省略), 结束

/**
 * Parses a `do ... while (...)` statement.
 * @param {Node} node Node to populate; receives `body` and `test`.
 * @returns {Node} The result of `finishNode` with `NodeTypes.DoWhileStatement`.
 */
pp.parseDoStatement = function(node) {
  // Skip the current token (presumably the `do` keyword that led the dispatcher here).
  this.next()
  // Keep a loop label on the stack while the body is parsed so that
  // break/continue inside it can find a target in this.labels.
  this.labels.push(loopLabel)
  node.body = this.parseStatement("do")
  this.labels.pop()
  this.expect(tt._while)
  node.test = this.parseParenExpression()
  // From ecmaVersion 6 on, a trailing semicolon is consumed only if present;
  // older versions go through the regular semicolon() handling.
  if (this.options.ecmaVersion >= 6)
    this.eat(tt.semi)
  else
    this.semicolon()
  return this.finishNode(node, NodeTypes.DoWhileStatement)
}