本文已参与「新人创作礼」活动，一起开启掘金创作之路。

通过上一节我们分析了 Vue 的编译流程入口（不清楚的可以先回顾上一节的内容）。Vue 编译的核心是执行 baseCompile，它包括三个步骤：解析生成 AST 树、优化 AST 树、生成代码。本节我们来分析第一个步骤 parse：

/**
 * Core compile step: runs parse, an optional optimize pass, and generate,
 * in that order.
 *
 * @param template - template source; it is trimmed before being parsed
 * @param options - compiler options; `options.optimize === false` skips the
 *   optimize pass
 * @returns the AST together with `render` and `staticRenderFns` taken from
 *   the generated code
 */
function baseCompile (
  template: string,
  options: CompilerOptions
): CompiledResult {
  // Phase 1: parse the template into an AST
  const ast = parse(template.trim(), options)
  // Phase 2: optimize the AST (runs unless explicitly disabled)
  if (options.optimize !== false) {
    optimize(ast, options)
  }
  // Phase 3: generate code from the AST
  const code = generate(ast, options)
  return {
    ast,
    render: code.render,
    staticRenderFns: code.staticRenderFns
  }
}

Vue编译（compile）流程之parse（template的解析）

parse函数定义在src/compiler/parser/index.js中：

/**
 * Convert HTML string to AST.
 *
 * Copies parser configuration from `options` into variables declared outside
 * this excerpt, then drives `parseHTML` with a set of hook callbacks.
 *
 * @param template - template source, passed straight to `parseHTML`
 * @param options - compiler options (warn, platform helpers, modules,
 *   delimiters, ...)
 * @returns `root`; nothing in the visible code assigns it, so presumably the
 *   elided hooks do — TODO confirm against the full source
 */
export function parse (
  template: string,
  options: CompilerOptions
): ASTElement | void {
  // Resolve options, falling back to `baseWarn` / `no` when a value is absent
  warn = options.warn || baseWarn

  platformIsPreTag = options.isPreTag || no
  platformMustUseProp = options.mustUseProp || no
  platformGetTagNamespace = options.getTagNamespace || no

  // Look up the three transform hooks by name on options.modules
  // (pluckModuleFunction is defined elsewhere; presumably returns one list per hook)
  transforms = pluckModuleFunction(options.modules, 'transformNode')
  preTransforms = pluckModuleFunction(options.modules, 'preTransformNode')
  postTransforms = pluckModuleFunction(options.modules, 'postTransformNode')

  // Delimiters can be customised through options
  delimiters = options.delimiters

  // Parser state shared by the nested helpers and hooks below
  const stack = []
  // Whitespace is preserved unless the option is explicitly set to false
  const preserveWhitespace = options.preserveWhitespace !== false
  let root
  let currentParent
  let inVPre = false
  let inPre = false
  let warned = false

  // Body elided in this excerpt
  function warnOnce (msg) {
    ......
  }

  // Body elided in this excerpt
  function closeElement (element) {
    ......
  }

  // Run parseHTML with an options object carrying configuration values and
  // the start / end / chars / comment hook functions
  parseHTML(template, {
    warn,
    expectHTML: options.expectHTML,
    isUnaryTag: options.isUnaryTag,
    canBeLeftOpenTag: options.canBeLeftOpenTag,
    shouldDecodeNewlines: options.shouldDecodeNewlines,
    shouldDecodeNewlinesForHref: options.shouldDecodeNewlinesForHref,
    shouldKeepComment: options.comments,
    // Hook bodies are elided in this excerpt
    start (tag, attrs, unary) {
      ......
    },

    end () {
      ......
    },

    chars (text: string) {
      ......
    },
    comment (text: string) {
      ......
    }
  })
  return root
}

parse 函数先对传入的 options 配置做了一些解析，最后执行了 parseHTML 函数。调用 parseHTML 时传入了两个参数：第一个参数是 template，第二个参数是一个对象，其中包含一些配置属性，以及 start、end、chars、comment 四个钩子方法。parseHTML 定义在 src/compiler/parser/html-parser.js 中：

parseHTML

/**
 * Scans `html` from left to right, consuming it piece by piece and reporting
 * what it finds through callbacks on `options` (`comment`, `chars` and `warn`
 * are called directly here; the nested helpers are elided in this excerpt).
 *
 * @param html - template source; it is shortened as input is consumed
 * @param options - configuration flags plus optional callback functions
 */
export function parseHTML (html, options) {
  const stack = []
  const expectHTML = options.expectHTML
  const isUnaryTag = options.isUnaryTag || no
  const canBeLeftOpenTag = options.canBeLeftOpenTag || no
  // `index` tracks how many characters of the original input have been consumed
  let index = 0
  let last, lastTag
  // Loop for as long as unconsumed template text remains
  while (html) {
    // Remember the input at the start of this pass to detect "no progress" below
    last = html
    // Make sure we're not in a plaintext content element like script/style
    // Taken when there is no lastTag, or lastTag is not a plain-text element
    if (!lastTag || !isPlainTextElement(lastTag)) {
      // Position of the first '<' in the remaining input (-1 when absent)
      let textEnd = html.indexOf('<')
      if (textEnd === 0) {
        // Comment:
        // Check whether the input starts with a comment node
        if (comment.test(html)) {
          const commentEnd = html.indexOf('-->')

          if (commentEnd >= 0) {
            // shouldKeepComment and the comment callback both come from the caller's options
            if (options.shouldKeepComment) {
              // Pass the text between the first 4 characters and '-->'
              options.comment(html.substring(4, commentEnd))
            }
            // Advance past the comment, including the 3-character '-->'
            advance(commentEnd + 3)
            continue
          }
        }

        // http://en.wikipedia.org/wiki/Conditional_comment#Downlevel-revealed_conditional_comment
        // Conditional comments of the form <![xx]> are skipped
        if (conditionalComment.test(html)) {
          const conditionalEnd = html.indexOf(']>')

          if (conditionalEnd >= 0) {
            advance(conditionalEnd + 2)
            continue
          }
        }

        // Doctype:
        // A DOCTYPE declaration is skipped
        const doctypeMatch = html.match(doctype)
        if (doctypeMatch) {
          advance(doctypeMatch[0].length)
          continue
        }

        // End tag:
        const endTagMatch = html.match(endTag)
        if (endTagMatch) {
          // Offset before advancing = start of the end tag
          const curIndex = index
          // Advance by the length of the end tag
          advance(endTagMatch[0].length)
          parseEndTag(endTagMatch[1], curIndex, index)
          continue
        }

        // Start tag:
        // Try to match a start tag
        const startTagMatch = parseStartTag()
        if (startTagMatch) {
          handleStartTag(startTagMatch)
          if (shouldIgnoreFirstNewline(lastTag, html)) {
            advance(1)
          }
          continue
        }
      }

      let text, rest, next
      // Text handling: a '<' exists somewhere in the remaining input
      // (textEnd can still be 0 here if none of the branches above matched)
      if (textEnd >= 0) {
        // Remaining input starting at that '<'
        rest = html.slice(textEnd)
        // While `rest` does not begin with an end tag, start tag, comment or
        // conditional comment, the '<' is plain text: keep extending the text run
        while (
          !endTag.test(rest) &&
          !startTagOpen.test(rest) &&
          !comment.test(rest) &&
          !conditionalComment.test(rest)
        ) {
          // < in plain text, be forgiving and treat it as text
          // Look for the next '<' after the current one
          next = rest.indexOf('<', 1)
          if (next < 0) break
          textEnd += next
          rest = html.slice(textEnd)
        }
        // Slice off the text run
        text = html.substring(0, textEnd)
        // Advance past it
        advance(textEnd)
      }

      // No '<' anywhere in the remaining input: all of it is text
      if (textEnd < 0) {
        text = html
        html = ''
      }

      // Hand non-empty text to the chars callback
      if (options.chars && text) {
        options.chars(text)
      }
    } else {
      // lastTag is a plain-text content element (per isPlainTextElement):
      // consume everything up to its closing tag in one step
      let endTagLength = 0
      const stackedTag = lastTag.toLowerCase()
      // Regex (cached per tag name in reCache): lazily capture the content, then the closing tag
      const reStackedTag = reCache[stackedTag] || (reCache[stackedTag] = new RegExp('([\\s\\S]*?)(</' + stackedTag + '[^>]*>)', 'i'))
      const rest = html.replace(reStackedTag, function (all, text, endTag) {
        endTagLength = endTag.length
        if (!isPlainTextElement(stackedTag) && stackedTag !== 'noscript') {
          // Unwrap comment and CDATA markers, keeping only their contents
          text = text
            .replace(/<!\--([\s\S]*?)-->/g, '$1') // #7298
            .replace(/<!\[CDATA\[([\s\S]*?)]]>/g, '$1')
        }
        if (shouldIgnoreFirstNewline(stackedTag, text)) {
          text = text.slice(1)
        }
        if (options.chars) {
          options.chars(text)
        }
        // Remove the matched content and closing tag from the input
        return ''
      })
      // Advance index by the number of characters the replace removed
      index += html.length - rest.length
      html = rest
      parseEndTag(stackedTag, index - endTagLength, index)
    }

    // Nothing was consumed in this pass: emit the remainder as text, warn in
    // non-production builds when no elements are open, and stop looping
    if (html === last) {
      options.chars && options.chars(html)
      if (process.env.NODE_ENV !== 'production' && !stack.length && options.warn) {
        options.warn(`Mal-formatted tag at end of template: "${html}"`)
      }
      break
    }
  }

  // Clean up any remaining tags
  parseEndTag()

  // Move forward n characters: bump index and drop the consumed prefix of html
  function advance (n) {
    index += n
    html = html.substring(n)
  }

  // Parses a start tag (body elided in this excerpt)
  function parseStartTag () {
    ......
  }

  // Processes a parsed start tag (body elided in this excerpt)
  function handleStartTag (match) {
    ......
  }

  // Parses an end tag (body elided in this excerpt)
  function parseEndTag (tagName, start, end) {
    ......
  }
}

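parseHTML 函数主要是遍历传入的 template，一边扫描一边解析。每次循环首先查找第一个 '<' 的位置（textEnd），分为三种情况：'<' 在起始位置（textEnd === 0）、'<' 的位置大于 0，以及找不到 '<'（textEnd < 0，此时剩余内容全部作为文本处理）。下面重点分析前两种情况：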

1.'<'在起始位置

首先会判断是否是注释节点，是注释节点，前进注释节点长度，并截取后面的字符串，进行下一次循环；
匹配<![xx]>这种格式的标签，前进节点长度，并截取后面的字符串，进行下一次循环；
匹配 DOCTYPE 标签，前进节点长度，并截取后面的字符串，进行下一次循环；以上的三种情况是直接跳过，截取剩余的字符串，剩下的几种情况需要做进一步处理：
1.匹配开始标签

// A name: a letter or underscore, followed by word characters, hyphens or dots
const ncname = '[a-zA-Z_][\\w\\-\\.]*'
// A name with an optional `prefix:` part, captured as a single group
const qnameCapture = `((?:${ncname}\\:)?${ncname})`
// '<' immediately followed by a (possibly prefixed) name, anchored at the start of the input
const startTagOpen = new RegExp(`^<${qnameCapture}`)
// Optional whitespace, an optional captured '/', then '>'
const startTagClose = /^\s*(\/?)>/

// Start tag:
// Try to parse a start tag at the current position
const startTagMatch = parseStartTag()
if (startTagMatch) {
  // Process the match object produced by the scan
  handleStartTag(startTagMatch)
  // Skip one character when shouldIgnoreFirstNewline says so
  if (shouldIgnoreFirstNewline(lastTag, html)) {
    advance(1)
  }
  continue
}

/**
 * Tries to read a start tag, with its attributes, from the front of `html`.
 *
 * @returns a match object ({ tagName, attrs, start, unarySlash, end }) when
 *   the tag's closing '>' or '/>' is found; otherwise undefined (the input
 *   may already have been advanced past the tag name and attributes by then)
 */
function parseStartTag() {
  // Match the tag opener, e.g. for <div class="name"></div>
  const start = html.match(startTagOpen) // start[1] is 'div' in the example
  if (start) {
    // Build the match object
    const match = {
      tagName: start[1], // tag name, 'div'
      attrs: [],
      start: index
    }
    advance(start[0].length) // '<div'.length
    let end, attr
    // Until the tag's close is matched, keep matching attributes (class="name" in the example)
    while (!(end = html.match(startTagClose)) && (attr = html.match(attribute))) {
      advance(attr[0].length)
      // Store the raw regex match for the attribute
      match.attrs.push(attr)
    }
    // The close of the start tag was matched
    if (end) {
      // For a self-closed tag such as <img />, unarySlash is '/'; otherwise ''
      match.unarySlash = end[1]
      advance(end[0].length)
      match.end = index
      // Hand the match object back to the caller
      return match
    }
  }
}

扫描开始标签，目的就是将tag标签及属性等通过正则表达式从template中解析出来，并最终生成match对象，方便后面对开始标签的处理，我们继续看下handleStartTag：

/**
 * Processes the match object produced by parseStartTag: normalises its
 * attributes, records non-unary tags on `stack`, and calls `options.start`.
 *
 * @param match - object with tagName, attrs (raw regex matches), unarySlash,
 *   start and end
 */
function handleStartTag (match) {
  const tagName = match.tagName
  const unarySlash = match.unarySlash

  // With expectHTML on, close tags implicitly in two cases
  if (expectHTML) {
    // The open tag is <p> and the new tag is a non-phrasing tag: close the <p>
    if (lastTag === 'p' && isNonPhrasingTag(tagName)) {
      parseEndTag(lastTag)
    }
    // The new tag may be left open and equals the currently open tag: close the previous one
    if (canBeLeftOpenTag(tagName) && lastTag === tagName) {
      parseEndTag(tagName)
    }
  }

  // Unary when the platform says so or the tag was written with a trailing '/'
  const unary = isUnaryTag(tagName) || !!unarySlash

  const l = match.attrs.length
  // New array for the normalised attributes
  const attrs = new Array(l)
  // Walk the raw attribute matches
  for (let i = 0; i < l; i++) {
    const args = match.attrs[i]
    // hackish work around FF bug https://bugzilla.mozilla.org/show_bug.cgi?id=369778
    if (IS_REGEX_CAPTURING_BROKEN && args[0].indexOf('""') === -1) {
      if (args[3] === '') { delete args[3] }
      if (args[4] === '') { delete args[4] }
      if (args[5] === '') { delete args[5] }
    }
    // Value is the first truthy capture among groups 3-5, else ''
    // (presumably one group per quoting style — the attribute regex is not shown here)
    const value = args[3] || args[4] || args[5] || ''
    // href on <a> uses its own newline-decoding flag
    const shouldDecodeNewlines = tagName === 'a' && args[1] === 'href'
      ? options.shouldDecodeNewlinesForHref
      : options.shouldDecodeNewlines
      
    // Store as a { name, value } object
    attrs[i] = {
      name: args[1],
      value: decodeAttr(value, shouldDecodeNewlines)
    }
  }

  // Non-unary tags are pushed onto the stack
  if (!unary) {
    stack.push({ tag: tagName, lowerCasedTag: tagName.toLowerCase(), attrs: attrs })
    // The pushed tag becomes the current open tag
    lastTag = tagName
  }

  // Notify the caller through the start callback
  if (options.start) {
    options.start(tagName, attrs, unary, match.start, match.end)
  }
}

handleStartTag函数的作用就是将解析后的attr属性数组转换成对象数组的形式，并最终调用传入的start函数，start函数的逻辑我们下一节分析；

2.匹配结束标签

// End tag:
const endTagMatch = html.match(endTag)
if (endTagMatch) {
  // Offset before advancing = start of the end tag
  const curIndex = index
  // Advance by the length of the end tag
  advance(endTagMatch[0].length)
  // After advancing, index is the end offset of the end tag
  parseEndTag(endTagMatch[1], curIndex, index)
  continue
}

/**
 * Closes open elements recorded on `stack`.
 *
 * @param tagName - name of the end tag; when omitted, every open element is closed
 * @param start - start offset of the end tag; defaults to the current `index`
 * @param end - end offset of the end tag; defaults to the current `index`
 */
function parseEndTag (tagName, start, end) {
  let pos, lowerCasedTagName
  if (start == null) start = index
  if (end == null) end = index

  if (tagName) {
    // Compare tag names case-insensitively
    lowerCasedTagName = tagName.toLowerCase()
  }

  // Find the closest opened tag of the same type
  if (tagName) {
    // Search from the top of the stack; pos ends at -1 when nothing matches
    for (pos = stack.length - 1; pos >= 0; pos--) {
      if (stack[pos].lowerCasedTag === lowerCasedTagName) {
        break
      }
    }
  } else {
    // If no tag name is provided, clean shop
    pos = 0
  }

  if (pos >= 0) {
    // Close all the open elements, up the stack
    // Everything from the top of the stack down to pos gets closed.
    // In non-production builds, warn for each element above the match
    // (or for every element when no tagName was given).
    for (let i = stack.length - 1; i >= pos; i--) {
      if (process.env.NODE_ENV !== 'production' &&
        (i > pos || !tagName) &&
        options.warn
      ) {
        options.warn(
          `tag <${stack[i].tag}> has no matching end tag.`
        )
      }
      // Notify the caller through the end callback
      if (options.end) {
        options.end(stack[i].tag, start, end)
      }
    }

    // Remove the open elements from the stack
    // Truncate the stack to pos, dropping the matched element and everything above it
    stack.length = pos
    // New current open tag; when pos is 0 this evaluates to 0 (falsy)
    lastTag = pos && stack[pos - 1].tag
  } 
  ......
}

parseEndTag 的作用就是在 stack 中自栈顶向下查找与结束标签同名的开始标签：找到后，把该位置及其之上的元素全部出栈，并依次调用 end 钩子（其间没有正确闭合的标签在非生产环境下会报出警告）；不传 tagName 时则会关闭并清空 stack 中的所有元素。这样，当 template 模板完全扫描结束时，stack 也被清空。

2.'<'的位置大于0

这种情况是当扫描到文本字符串的时候，这时候的'<'的index就会大于0:

let text, rest, next
// Text handling: a '<' exists somewhere in the remaining input
if (textEnd >= 0) {
  // Remaining input starting at that '<'
  rest = html.slice(textEnd)
  // While `rest` does not begin with an end tag, start tag, comment or
  // conditional comment, the '<' is plain text: keep extending the text run
  while (
    !endTag.test(rest) &&
    !startTagOpen.test(rest) &&
    !comment.test(rest) &&
    !conditionalComment.test(rest)
  ) {
    // < in plain text, be forgiving and treat it as text
    // Look for the next '<' after the current one
    next = rest.indexOf('<', 1)
    if (next < 0) break
    textEnd += next
    rest = html.slice(textEnd)
  }
  // Slice off the text run
  text = html.substring(0, textEnd)
  // Advance past it
  advance(textEnd)
}

// Hand non-empty text to the chars callback
if (options.chars && text) {
  options.chars(text)
}

扫描到完整的文本字符串后，会截取保存起来，执行chars方法对文本进行解析，这儿的逻辑我们也在下一节进行分析。

总结

到此为止，我们分析了parse过程的大致流程，执行parseHTML函数对我们传入的template模板字符串进行解析，根据不同的情况，扫描出其中的tag标签，扫描出tag绑定的所有属性，并将它们解析成一个对象，方便下一步的解析操作。下一节我们继续分析ast树是如何生成的。

源码分析：Vue 编译（compile）核心流程之parse（上）

Vue编译（compile）流程之parse（template的解析）

parseHTML

1.'<'在起始位置

2.'<'的位置大于0

总结