编译原理笔记-源码学习-词法分析(tokenizer)(2)

178 阅读1分钟

接上篇词法分析1

其次看字符串读取

readString 从起始引号之后开始向后读取字符串内容, 直到再次遇到同一种引号(quote)时结束; 如果在此之前遇到换行符或者输入结尾, 则抛出 "Unterminated string constant" 错误

/**
 * Scans a quoted string literal and emits it as a `tt.string` token.
 *
 * Entered with `this.pos` on the opening quote. Raw text is copied in
 * segments: every backslash flushes the pending segment and delegates to
 * `this.readEscapedChar(false)`, which is expected to advance `this.pos`
 * past the escape. Scanning stops at the first char code equal to `quote`.
 *
 * Reports "Unterminated string constant" through `this.raise` when the input
 * ends first, when a character accepted by `isNewLine` appears, or when
 * U+2028 / U+2029 appears while `options.ecmaVersion < 10`.
 *
 * @param {number} quote Char code of the opening quote; the same code closes the literal.
 * @returns {*} Whatever `this.finishToken(tt.string, value)` returns.
 */
pp.readString = function(quote) {
  this.pos += 1 // step over the opening quote
  let segmentStart = this.pos
  let value = ""

  while (true) {
    if (this.pos >= this.input.length) this.raise(this.start, "Unterminated string constant")

    const code = this.input.charCodeAt(this.pos)
    if (code === quote) break

    if (code === 92) { // '\'
      // Flush the raw text collected so far, then append the decoded escape.
      value += this.input.slice(segmentStart, this.pos) + this.readEscapedChar(false)
      segmentStart = this.pos
      continue
    }

    // U+2028 / U+2029 are tolerated inside strings from ecmaVersion 10 on;
    // any other line break always ends the literal with an error.
    const isUnicodeLineBreak = code === 0x2028 || code === 0x2029
    if (isUnicodeLineBreak) {
      if (this.options.ecmaVersion < 10) this.raise(this.start, "Unterminated string constant")
    } else if (isNewLine(code)) {
      this.raise(this.start, "Unterminated string constant")
    }

    this.pos += 1

    // An accepted separator still starts a new line for location tracking.
    if (isUnicodeLineBreak && this.options.locations) {
      this.curLine += 1
      this.lineStart = this.pos
    }
  }

  value += this.input.slice(segmentStart, this.pos)
  this.pos += 1 // step over the closing quote
  return this.finishToken(tt.string, value)
}

接下来看 getTokenFromCode 中的分支: 根据当前字符的编码(code), 分派给不同的读取函数

// Excerpt from a larger `switch` whose header is not shown here: each case
// label is the char code of an operator character, and the case returns the
// result of the reader method responsible for tokens starting with it.
case 47: // '/'
    return this.readToken_slash()

  case 37: case 42: // '%*'
    return this.readToken_mult_modulo_exp(code)

  case 124: case 38: // '|&'
    return this.readToken_pipe_amp(code)

  case 94: // '^'
    return this.readToken_caret()

  case 43: case 45: // '+-'
    return this.readToken_plus_min(code)

  case 60: case 62: // '<>'
    return this.readToken_lt_gt(code)

  case 61: case 33: // '=!'
    return this.readToken_eq_excl(code)

  case 63: // '?'
    return this.readToken_question()

  case 126: // '~'
    return this.finishOp(tt.prefix, 1)

  case 35: // '#' — per the original note, starts a private name in newer ECMAScript versions
    return this.readToken_numberSign()

比较常见的是readToken_question

/**
 * Tokenizes an operator that starts with `?` at `this.pos`.
 *
 * Depending on `options.ecmaVersion` and the following characters it emits:
 *   - `?.`  (`tt.questionDot`, length 2): ecmaVersion >= 11 and the char after
 *           the dot is not a decimal digit, so `a?.5:1` stays a conditional
 *   - `??=` (`tt.assign`, length 3): ecmaVersion >= 12
 *   - `??`  (`tt.coalesce`, length 2): ecmaVersion >= 11
 *   - `?`   (`tt.question`, length 1): everything else
 *
 * @returns {*} Whatever `this.finishOp` returns for the chosen token.
 */
pp.readToken_question = function() { // '?'
  const version = this.options.ecmaVersion
  const hasQuestionOperators = version >= 11
  if (!hasQuestionOperators) return this.finishOp(tt.question, 1)

  const second = this.input.charCodeAt(this.pos + 1)
  const third = this.input.charCodeAt(this.pos + 2)

  // Written as two range checks on purpose: when `?.` is the last thing in
  // the input, `third` is NaN, both comparisons are false, and the `?` is
  // emitted on its own.
  if (second === 46 && (third < 48 || third > 57)) { // '.' not followed by 0-9
    return this.finishOp(tt.questionDot, 2)
  }

  if (second === 63) { // '?'
    const isLogicalAssign = version >= 12 && third === 61 // '='
    return isLogicalAssign
      ? this.finishOp(tt.assign, 3) // ??=
      : this.finishOp(tt.coalesce, 2) // ??
  }

  return this.finishOp(tt.question, 1) // conditional (ternary) operator
}

接下来继续看readWord

/**
 * Reads one identifier-like word and emits it as a token.
 *
 * The word comes from `this.readWord1()`. If `this.keywords` matches it, the
 * token type is looked up in `keywordTypes`; otherwise the word is emitted as
 * a plain `tt.name`.
 *
 * NOTE(review): the original note says `let` is handled separately and comes
 * back as a name. That depends on `this.keywords` not matching it, which is
 * not visible in this block; confirm against the keyword setup.
 *
 * @returns {*} Whatever `this.finishToken(type, word)` returns.
 */
pp.readWord = function() {
  const text = this.readWord1()
  const tokenType = this.keywords.test(text) ? keywordTypes[text] : tt.name
  return this.finishToken(tokenType, text)
}
/**
 * Reads the identifier text starting at `this.pos` and returns it decoded.
 *
 * Plain identifier characters are copied from the input in segments. A
 * backslash must introduce a Unicode escape (`\uXXXX` or whatever else
 * `this.readCodePoint()` accepts after the `u`); the decoded code point is
 * validated as an identifier start when it is the first character of the
 * word, and as an identifier part otherwise. `this.containsEsc` records
 * whether any escape was seen.
 *
 * Errors are reported through `this.invalidStringToken`.
 *
 * @returns {string} The identifier text with escapes resolved.
 */
pp.readWord1 = function() {
  this.containsEsc = false
  const astral = this.options.ecmaVersion >= 6
  let text = ""
  let segmentStart = this.pos
  let atFirstChar = true

  // The update clause runs after every character that was accepted.
  for (; this.pos < this.input.length; atFirstChar = false) {
    const code = this.fullCharCodeAtPos()

    if (isIdentifierChar(code, astral)) {
      // Code points above U+FFFF take two UTF-16 code units in the input.
      this.pos += code <= 0xffff ? 1 : 2
      continue
    }

    if (code !== 92) break // anything other than "\" ends the word

    this.containsEsc = true
    text += this.input.slice(segmentStart, this.pos)
    const escapeStart = this.pos

    this.pos += 1
    if (this.input.charCodeAt(this.pos) !== 117) { // "u"
      this.invalidStringToken(this.pos, "Expecting Unicode escape sequence \\uXXXX")
    }
    this.pos += 1

    const codePoint = this.readCodePoint()
    const isAllowedHere = atFirstChar ? isIdentifierStart : isIdentifierChar
    if (!isAllowedHere(codePoint, astral)) {
      this.invalidStringToken(escapeStart, "Invalid Unicode escape")
    }

    text += codePointToString(codePoint)
    segmentStart = this.pos
  }

  return text + this.input.slice(segmentStart, this.pos)
}

总结

  1. 在readToken的时候, 首先判断当前字符是否是合法的变量名开头(字母、`$`、`_`, 或者以 `\` 开头的 Unicode 转义), 如果是, 则调用readWord读出keyword或者变量名
  2. 如果不是变量名开始, 则调用getTokenFromCode读出对应的类型, 函数内部根据不同的字符, 区别调用其他函数解析出对应的Token类型.