Nim 语言提取文章元数据

194 阅读2分钟

Nim中文教程

我们可以使用 Nim 语言,来提取文章的元数据。

文章的元数据限定在文章开头,由两行 --- 包围,并以此与正文分隔。每一行的格式为"关键词: 值",例如在 author: flywind 中,author 为关键词,flywind 为值。关键词是我们限定的属性,只能是以下这些:"title", "layout", "id", "author", "datetime", "excerpt", "tags" 。

--- 
author: flywind 
title: it's a test file.
id: 12 
tags: id, bae, nae
---

导入的标准库

我们主要使用 parseutils 模块来解析文章的元数据。

import sets, parseutils, strutils, strformat, times

# Set of recognised metadata keys (lower-case spellings).
let lookupSet = toHashSet([
  "title", "layout", "id", "author", "datetime", "excerpt", "tags"
])

定义元数据的数据结构

元数据就是我们希望提取的属性。

type
  # Attribute values taken from an article header. A field whose key is not
  # present keeps its zero value ("" or an empty seq).
  MetaData* = tuple
    title: string
    layout: string
    id: string
    author: string
    dateTime: string
    excerpt: string
    tags: seq[string]
  Article* = object
    ## A parsed article: header attributes plus the remaining body text.
    data*: MetaData
    content*: string
    # Per-attribute flags; they are private to this module and there is no
    # flag for `layout`.
    title: bool
    id: bool
    author: bool
    dateTime: bool
    excerpt: bool
    tags: bool
  ParseError* = object of CatchableError
    ## Error type for header parsing failures.
    ##
    ## This is a distinct catchable type rather than an alias of `Exception`,
    ## so `except ParseError` no longer also catches unrelated errors and
    ## defects, and `except CatchableError` does catch it.

解析元数据

我们使用 parseutils 按照步骤解析文章。

proc parseHeader*(s: string): Article =
  ## Splits `s` into its `---`-fenced metadata header and the body text.
  ##
  ## `s` must begin with a `---` line. The lines after it are read as
  ## `key: value` pairs until a line starting with `---` or the end of `s`
  ## is reached; everything after the closing fence becomes `content`
  ## (empty when there is no closing fence). Keys go through `normalize`
  ## and must be members of `lookupSet`. `dateTime` defaults to the current
  ## local time and is overridden by a `datetime` key.
  ##
  ## Raises `ParseError` when the opening fence is missing, shares its line
  ## with other text, or is followed by nothing, and `KeyError` when a key
  ## is not in `lookupSet`.
  const
    fence = "---"
    lineBlanks = {' ', '\t', '\r'} # whitespace that does not end a line

  result.dateTime = true
  result.data.dateTime = $local(now())

  template assign(attr, value: untyped): untyped =
    result.attr = true
    result.data.attr = value

  var
    pos: int
    key, value: string

  if skip(s, fence, pos) != fence.len:
    raise newException(ParseError, "No `---` in the head")
  pos += fence.len

  # Only blanks may share the line with the opening fence.
  pos += skipWhile(s, lineBlanks, pos)
  if pos < s.len and s[pos] != '\n':
    raise newException(ParseError, "can't have words after `---`")

  # Move to the first key; a header with nothing after the fence is an error.
  pos += skipWhitespace(s, pos)
  if pos >= s.len:
    raise newException(ParseError, "No metadata after `---`")

  while true:
    pos += parseUntil(s, key, {':'}, pos) + 1
    key = normalize(key)
    if key notin lookupSet:
      raise newException(KeyError, fmt"key should be in {lookupSet}")

    # Skip blanks but stay on the key's own line, so that an empty value
    # does not swallow the following `key: value` line.
    pos += skipWhile(s, lineBlanks, pos)
    pos += parseUntil(s, value, {'\n'}, pos)

    case key
    of "title":
      assign(title, value.strip)
    of "layout":
      # `Article` has no flag field for layout, so only the value is stored.
      result.data.layout = value.strip
    of "id":
      assign(id, value.strip)
    of "author":
      assign(author, value.strip)
    of "datetime":
      assign(datetime, value.strip)
    of "excerpt":
      assign(excerpt, value.strip)
    of "tags":
      result.tags = true
      result.data.tags = value.strip.split(", ")
    else: discard

    # Blank lines between entries are allowed.
    pos += skipWhitespace(s, pos)
    if pos >= s.len:
      break
    # `continuesWith` is bounds-safe even with fewer than three chars left.
    if continuesWith(s, fence, pos):
      pos += fence.len
      break

  # `substr` clamps its range, so a `pos` at or past the end gives "".
  result.content = substr(s, pos)


proc parseHeader*(s: File): Article =
  ## Reads all of `s` with `readAll` and parses the text with the string
  ## overload of `parseHeader`.
  let text = readAll(s)
  parseHeader(text)

测试程序

程序只是为了学习的目的,所以不保证正确性与健壮性。

# Sample input: a fenced header with blank lines between some entries,
# followed by a short body.
const tmp = """
--- 
author: flywind 

title:  it's a test file.
id: 12 

tags:   id, bae, nae
---

## title
Let's play with Nim lang.
"""

# Parse the sample and print the resulting Article.
echo tmp.parseHeader()

输出:

(data: (title: "it\'s a test file.", layout: "", id: "12", author: "flywind", dateTime: "2019-12-09T19:39:38+08:00", excerpt: "", tags: @["id", "bae", "nae"]), content: "\n\n## title\nLet\'s play with Nim lang.\n", title: true, id: true, author: true, dateTime: true, excerpt: false, tags: true)