ES + HDFS storage architecture: ES for indexing, HDFS for the data


Key point: ES is not good at storing very large raw documents or huge volumes of blobs; HDFS is not good at low-latency random small reads. In practice the usual patterns are "large HDFS files + index-based locating" or "HDFS (Parquet/ORC) + a fetch-back service with caching".

This architecture is well suited to appending and querying data, and poorly suited to deleting or updating it. To modify a document, create a new file (or record) and update the HDFS path stored in ES so that it points at the new copy.
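
For that update path, here is a minimal sketch of the pointer swap, assuming the same go-elasticsearch client and imports as the full examples below (updateHDFSPointer is an illustrative helper, not part of Scheme A/B): write the replacement file to HDFS first, then partially update the ES document so hdfs_path (and checksum) point at the new location.

// updateHDFSPointer is an illustrative helper: after writing the replacement
// file to HDFS, repoint the existing ES document at it via a partial update.
func updateHDFSPointer(es *elasticsearch.Client, index, docID, newPath, newChecksum string) error {
	body, _ := json.Marshal(map[string]any{
		"doc": map[string]any{ // partial update: only the pointer fields change
			"hdfs_path": newPath,
			"checksum":  newChecksum,
		},
	})
	req := esapi.UpdateRequest{
		Index:      index,
		DocumentID: docID,
		Body:       bytes.NewReader(body),
	}
	resp, err := req.Do(context.Background(), es)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.IsError() {
		return fmt.Errorf("es update error: %s", resp.String())
	}
	return nil
}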


Overall architecture and data model

1) Write path (ingest)

  1. Generate a global docID (UUID/snowflake); a minimal sketch follows this list
  2. Write the raw data to HDFS
    • Suggested path: /data/app/yyyy/mm/dd/docID.json (small scale)
    • Or append to a daily rolling large file (a better fit for HDFS): /data/app/yyyy/mm/dd/part-0000, recording the offset/length
  3. Write the index document to ES, containing only
    • Searchable fields (title, tags, time, userId, …)
    • The HDFS pointer: hdfs_path, plus optionally offset, length, checksum, version
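
A minimal sketch of step 1 using only the standard library (crypto/rand, encoding/hex); a UUID library or a snowflake service is the more common production choice:

// newDocID returns a globally unique, roughly time-ordered ID:
// nanosecond timestamp + 8 random bytes.
func newDocID() string {
	var r [8]byte
	if _, err := rand.Read(r[:]); err != nil { // crypto/rand
		panic(err)
	}
	return fmt.Sprintf("%d-%s", time.Now().UnixNano(), hex.EncodeToString(r[:]))
}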

2) Read path (query)

  1. Search ES (full text / filters / aggregations)
  2. For each hit, fetch the raw document from HDFS by hdfs_path (or path + offset)
  3. Return: ES highlights/snippets plus the raw document (or a slice of it)
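
Step 3 mentions ES highlights/snippets; the search code later in this article omits them, so here is a sketch of the extra highlight block in the query body (field names follow the mapping defined below, and HitSource is the struct used in the search examples):

// buildHighlightQuery is the same multi_match query used later, plus a
// "highlight" block so ES returns marked-up title fragments per hit.
func buildHighlightQuery(q string, size int) map[string]any {
	return map[string]any{
		"query": map[string]any{
			"multi_match": map[string]any{
				"query":  q,
				"fields": []string{"title^2", "tags"},
			},
		},
		"highlight": map[string]any{
			"fields": map[string]any{
				"title": map[string]any{}, // default fragmenter and <em> tags
			},
		},
		"size": size,
	}
}

// With highlighting enabled, each entry of hits.hits also carries a
// "highlight" object next to "_source":
type hitWithHighlight struct {
	Source    HitSource           `json:"_source"`
	Highlight map[string][]string `json:"highlight"` // e.g. {"title": ["...<em>match</em>..."]}
}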

Key design choices

How to store in HDFS: small files vs. large files

  • Small files (one file per record): simplest to implement, but it stresses the NameNode and breaks down at scale
  • Large files (append per batch/day, or write Parquet): recommended
    • Requires additionally recording offset/length, or using a columnar format and fetching back by primary key
    • Random reads at an offset within a large file work fine from Go on HDFS, but the write path must produce trackable offsets and the reader must support Seek

The code below covers two schemes:

  • Scheme A: one file per record (most straightforward and easiest to follow)
  • Scheme B: daily rolling large files (closer to production, slightly more complex)

Scheme A: one file per record

ES Mapping

(the index stores only searchable fields plus the HDFS pointer)

Example (you can create it via Kibana or a script; a Go sketch follows the mapping):

PUT my_index
{
  "mappings": {
    "properties": {
      "doc_id":    { "type": "keyword" },
      "title":     { "type": "text", "analyzer": "standard" },
      "tags":      { "type": "keyword" },
      "ts":        { "type": "date" },

      "hdfs_path": { "type": "keyword" },
      "offset":    { "type": "long" },      
      "length":    { "type": "integer" },   
      "checksum":  { "type": "keyword" }    
    }
  }
}
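
A sketch of creating this index from Go instead of Kibana, using the official client's Indices.Create API (mappingJSON would be the body of the PUT request above; imports as in the examples below, plus strings):

// createIndex creates the index with the mapping shown above.
func createIndex(es *elasticsearch.Client, index, mappingJSON string) error {
	resp, err := es.Indices.Create(
		index,
		es.Indices.Create.WithBody(strings.NewReader(mappingJSON)),
	)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.IsError() {
		// re-running against an existing index returns resource_already_exists_exception
		return fmt.Errorf("create index %s: %s", index, resp.String())
	}
	return nil
}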

Client libraries used below:

  • ES: the official Go client github.com/elastic/go-elasticsearch/v8
  • HDFS: the commonly used github.com/colinmarc/hdfs/v2 (talks to the NameNode directly over RPC, no dependency on the hdfs shell)

go get github.com/elastic/go-elasticsearch/v8
go get github.com/colinmarc/hdfs/v2

Write to HDFS + index into ES

package main

import (
	"bytes"
	"context"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"path"
	"time"

	"github.com/colinmarc/hdfs/v2"
	"github.com/elastic/go-elasticsearch/v8"
	"github.com/elastic/go-elasticsearch/v8/esapi"
)

type RawDoc struct {
	DocID string    `json:"doc_id"`
	Title string    `json:"title"`
	Tags  []string  `json:"tags"`
	TS    time.Time `json:"ts"`
	Body  any       `json:"body"` // the original large JSON / raw event goes here
}

type IndexDoc struct {
	DocID    string    `json:"doc_id"`
	Title    string    `json:"title"`
	Tags     []string  `json:"tags"`
	TS       time.Time `json:"ts"`
	HDFSPath string    `json:"hdfs_path"`
	Checksum string    `json:"checksum"`
	// Scheme A does not need offset/length
}

func sha256Hex(b []byte) string {
	sum := sha256.Sum256(b)
	return hex.EncodeToString(sum[:])
}

func mustJSON(v any) []byte {
	b, err := json.Marshal(v)
	if err != nil {
		panic(err)
	}
	return b
}

// writeToHDFS writes the raw document to HDFS: /data/app/yyyy/mm/dd/<docID>.json
func writeToHDFS(client *hdfs.Client, baseDir string, doc RawDoc) (hdfsPath string, checksum string, err error) {
	rawBytes := mustJSON(doc)
	checksum = sha256Hex(rawBytes)

	dateDir := doc.TS.Format("2006/01/02")
	dir := path.Join(baseDir, dateDir)
	if err = client.MkdirAll(dir, 0755); err != nil {
		return "", "", fmt.Errorf("mkdir %s: %w", dir, err)
	}

	hdfsPath = path.Join(dir, doc.DocID+".json")
	// Create overwrites; for idempotency, Stat first or use a temp file + Rename
	f, err := client.Create(hdfsPath)
	if err != nil {
		return "", "", fmt.Errorf("create %s: %w", hdfsPath, err)
	}
	defer f.Close()

	if _, err = f.Write(rawBytes); err != nil {
		return "", "", fmt.Errorf("write %s: %w", hdfsPath, err)
	}
	return hdfsPath, checksum, nil
}

func indexToES(es *elasticsearch.Client, index string, idx IndexDoc) error {
	body := mustJSON(idx)
	req := esapi.IndexRequest{
		Index:      index,
		DocumentID: idx.DocID,           // use doc_id as the ES _id for idempotent upserts
		Body:       bytes.NewReader(body),
		Refresh:    "false",             // for bulk ingestion prefer "false" or "wait_for"; see the BulkIndexer sketch after this program
	}
	resp, err := req.Do(context.Background(), es)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.IsError() {
		b, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("es index error: %s", string(b))
	}
	return nil
}

func ingestOne(hdfsClient *hdfs.Client, es *elasticsearch.Client, hdfsBaseDir, esIndex string, doc RawDoc) error {
	hdfsPath, checksum, err := writeToHDFS(hdfsClient, hdfsBaseDir, doc)
	if err != nil {
		return err
	}

	idx := IndexDoc{
		DocID:    doc.DocID,
		Title:    doc.Title,
		Tags:     doc.Tags,
		TS:       doc.TS,
		HDFSPath: hdfsPath,
		Checksum: checksum,
	}
	// Consistency: write HDFS first, then ES; if ES fails, retry/compensate
	return indexToES(es, esIndex, idx)
}

func main() {
	// 1) HDFS client (configure for your cluster: NameNode address, Kerberos, etc.)
	hdfsClient, err := hdfs.New("namenode1:8020") // can also be loaded from HADOOP_CONF_DIR
	if err != nil {
		log.Fatal(err)
	}

	// 2) ES client
	es, err := elasticsearch.NewClient(elasticsearch.Config{
		Addresses: []string{"http://es1:9200"},
		// configure Username/Password/TLS as needed
	})
	if err != nil {
		log.Fatal(err)
	}

	doc := RawDoc{
		DocID: "doc-001",
		Title: "hello es+hdfs",
		Tags:  []string{"demo", "hdfs"},
		TS:    time.Now(),
		Body: map[string]any{
			"message": "raw big json here",
			"n":       123,
		},
	}

	if err := ingestOne(hdfsClient, es, "/data/app", "my_index", doc); err != nil {
		log.Fatal(err)
	}
	log.Println("ok")
}
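
The Refresh comment above hints at batching. For real ingest volume, a sketch using the client's esutil.BulkIndexer (package github.com/elastic/go-elasticsearch/v8/esutil; the worker and flush settings here are illustrative):

// bulkIndex indexes many IndexDoc values in batches instead of issuing one
// IndexRequest per document.
func bulkIndex(es *elasticsearch.Client, index string, docs []IndexDoc) error {
	bi, err := esutil.NewBulkIndexer(esutil.BulkIndexerConfig{
		Client:        es,
		Index:         index,
		NumWorkers:    4,               // illustrative
		FlushBytes:    5 * 1024 * 1024, // illustrative
		FlushInterval: 30 * time.Second,
	})
	if err != nil {
		return err
	}
	for _, d := range docs {
		if err := bi.Add(context.Background(), esutil.BulkIndexerItem{
			Action:     "index",
			DocumentID: d.DocID, // same idempotency trick: doc_id as _id
			Body:       bytes.NewReader(mustJSON(d)),
			OnFailure: func(_ context.Context, _ esutil.BulkIndexerItem, res esutil.BulkIndexerResponseItem, err error) {
				// push the failed item to your retry queue here (see the consistency section below)
				log.Printf("bulk item failed: %v %s", err, res.Error.Reason)
			},
		}); err != nil {
			return err
		}
	}
	if err := bi.Close(context.Background()); err != nil {
		return err
	}
	stats := bi.Stats()
	log.Printf("bulk indexed=%d failed=%d", stats.NumIndexed, stats.NumFailed)
	return nil
}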

Code: query ES, then fetch the raw documents back from HDFS

func readFromHDFS(client *hdfs.Client, hdfsPath string) ([]byte, error) {
	f, err := client.Open(hdfsPath)
	if err != nil {
		return nil, err
	}
	defer f.Close()
	return io.ReadAll(f)
}

ES search example (simplified):

type HitSource struct {
	DocID    string `json:"doc_id"`
	Title    string `json:"title"`
	HDFSPath string `json:"hdfs_path"`
}

type ESSearchResp struct {
	Hits struct {
		Hits []struct {
			Source HitSource `json:"_source"`
		} `json:"hits"`
	} `json:"hits"`
}

func searchAndFetch(es *elasticsearch.Client, hdfsClient *hdfs.Client, index, q string) error {
	query := map[string]any{
		"query": map[string]any{
			"multi_match": map[string]any{
				"query":  q,
				"fields": []string{"title^2", "tags"},
			},
		},
		"size": 10,
	}
	var buf bytes.Buffer
	_ = json.NewEncoder(&buf).Encode(query)

	resp, err := es.Search(
		es.Search.WithIndex(index),
		es.Search.WithBody(&buf),
	)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	var r ESSearchResp
	if err := json.NewDecoder(resp.Body).Decode(&r); err != nil {
		return err
	}

	for _, h := range r.Hits.Hits {
		raw, err := readFromHDFS(hdfsClient, h.Source.HDFSPath)
		if err != nil {
			return fmt.Errorf("hdfs read %s: %w", h.Source.HDFSPath, err)
		}
		fmt.Printf("doc_id=%s title=%s raw_len=%d\n", h.Source.DocID, h.Source.Title, len(raw))
	}
	return nil
}

Consistency and reliability (a must in production)

Because this is a dual write (HDFS + ES), consistency issues are unavoidable. Common practices:

  1. Write order: HDFS first, then ES (so an ES pointer never references data that does not exist)
  2. Idempotency
    • ES: use doc_id as the _id (duplicate writes simply overwrite)
    • HDFS: write a tmp file and Rename it into place atomically (see the sketch after this list), or skip if the target already exists
  3. Failure compensation
    • ES write fails: push docID + hdfsPath onto a retry queue (Kafka/DB)
    • HDFS write fails: do not write to ES
  4. Observability: record checksums/version numbers
  5. Verify the checksum on read-back (optional)
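
A minimal sketch of item 2's HDFS side, the tmp file + Rename pattern, with the same colinmarc/hdfs client as above (the .tmp suffix is an illustrative convention):

// writeAtomically writes data to a temporary file and renames it into place,
// so a reader never observes a half-written <docID>.json.
func writeAtomically(client *hdfs.Client, finalPath string, data []byte) error {
	tmpPath := finalPath + ".tmp" // illustrative; use a unique per-writer suffix if writers can race
	f, err := client.Create(tmpPath)
	if err != nil {
		return fmt.Errorf("create %s: %w", tmpPath, err)
	}
	if _, err := f.Write(data); err != nil {
		f.Close()
		return fmt.Errorf("write %s: %w", tmpPath, err)
	}
	if err := f.Close(); err != nil {
		return fmt.Errorf("close %s: %w", tmpPath, err)
	}
	// The rename is a NameNode metadata operation, so the swap is effectively
	// atomic for readers. Depending on cluster/client semantics, renaming over
	// an existing destination may fail; if you re-write the same path, Stat and
	// remove the old file first.
	if err := client.Rename(tmpPath, finalPath); err != nil {
		return fmt.Errorf("rename %s -> %s: %w", tmpPath, finalPath, err)
	}
	return nil
}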

Drawback of Scheme A

  • The large number of small files puts heavy pressure on the cluster (NameNode metadata), so it does not hold up well at scale.

Scheme B: daily rolling large files

HDFS appends to a daily rolling large file; each record is stored as length-prefix (4-byte big-endian) + payload (JSON).

package main

import (
    "bytes"
    "context"
    "crypto/sha256"
    "encoding/binary"
    "encoding/hex"
    "encoding/json"
    "errors"
    "fmt"
    "io"
    "log"
    "path"
    "time"

    "github.com/colinmarc/hdfs/v2"
    "github.com/elastic/go-elasticsearch/v8"
    "github.com/elastic/go-elasticsearch/v8/esapi"
)

/*
Record format in HDFS:
  [4 bytes big-endian uint32: payloadLen] [payload bytes: JSON]
We store in ES:
  hdfs_path, offset (start of record), length (total bytes = 4 + payloadLen)
*/

type RawDoc struct {
    DocID string    `json:"doc_id"`
    Title string    `json:"title"`
    Tags  []string  `json:"tags"`
    TS    time.Time `json:"ts"`
    Body  any       `json:"body"` // the original large JSON / raw event
}

type IndexDoc struct {
    DocID    string    `json:"doc_id"`
    Title    string    `json:"title"`
    Tags     []string  `json:"tags"`
    TS       time.Time `json:"ts"`
    HDFSPath string    `json:"hdfs_path"`
    Offset   int64     `json:"offset"`
    Length   int       `json:"length"`   // record total length (4 + payloadLen)
    Checksum string    `json:"checksum"` // sha256(payload)
}

func mustJSON(v any) []byte {
    b, err := json.Marshal(v)
    if err != nil {
       panic(err)
    }
    return b
}

func sha256Hex(b []byte) string {
    sum := sha256.Sum256(b)
    return hex.EncodeToString(sum[:])
}

func dailyLogPath(baseDir string, t time.Time) string {
    // /data/app/yyyy/mm/dd/part-0000.log
    dateDir := t.Format("2006/01/02")
    return path.Join(baseDir, dateDir, "part-0000.log")
}

func ensureParentDir(client *hdfs.Client, filePath string) error {
    dir := path.Dir(filePath)
    return client.MkdirAll(dir, 0755)
}

// appendRecord appends one record to an HDFS file and returns offset/length.
// Important: file must have single writer to avoid race/overlap offsets.
func appendRecord(client *hdfs.Client, filePath string, payload []byte) (offset int64, recLen int, err error) {
    if len(payload) > int(^uint32(0)) {
       return 0, 0, fmt.Errorf("payload too large: %d", len(payload))
    }

    if err := ensureParentDir(client, filePath); err != nil {
       return 0, 0, fmt.Errorf("ensureParentDir: %w", err)
    }

    // Open existing file for append, or create if not exists.
    var f *hdfs.FileWriter
    // Try append first.
    f, err = client.Append(filePath)
    if err != nil {
       // If file doesn't exist, create it.
       // Unfortunately different clusters may return different errors; do a stat check.
       _, statErr := client.Stat(filePath)
       if statErr != nil {
          // does not exist (or inaccessible) -> create
          f, err = client.Create(filePath)
          if err != nil {
             return 0, 0, fmt.Errorf("create %s: %w", filePath, err)
          }
       } else {
          // exists but append failed
          return 0, 0, fmt.Errorf("append %s: %w", filePath, err)
       }
    }
    defer f.Close()

    // Current file size as offset (start position of this record)
    info, err := client.Stat(filePath)
    if err != nil {
       return 0, 0, fmt.Errorf("stat %s: %w", filePath, err)
    }
    offset = info.Size()

    // Build record bytes: len-prefix + payload
    recLen = 4 + len(payload)
    buf := make([]byte, recLen)
    binary.BigEndian.PutUint32(buf[0:4], uint32(len(payload)))
    copy(buf[4:], payload)

    // Write atomically from writer perspective; HDFS itself is streaming, so ensure full write
    n, err := f.Write(buf)
    if err != nil {
       return 0, 0, fmt.Errorf("write record: %w", err)
    }
    if n != len(buf) {
       return 0, 0, fmt.Errorf("short write: %d/%d", n, len(buf))
    }
    // Close() will flush pipeline
    return offset, recLen, nil
}

// readRecord reads one record from HDFS by offset.
// It validates length-prefix and returns payload bytes.
func readRecord(client *hdfs.Client, filePath string, offset int64) ([]byte, error) {
    f, err := client.Open(filePath)
    if err != nil {
       return nil, fmt.Errorf("open %s: %w", filePath, err)
    }
    defer f.Close()

    if _, err := f.Seek(offset, io.SeekStart); err != nil {
       return nil, fmt.Errorf("seek %s offset=%d: %w", filePath, offset, err)
    }

    var lenBuf [4]byte
    if _, err := io.ReadFull(f, lenBuf[:]); err != nil {
       return nil, fmt.Errorf("read length-prefix: %w", err)
    }
    payloadLen := binary.BigEndian.Uint32(lenBuf[:])
    if payloadLen == 0 {
       return nil, errors.New("invalid payloadLen=0")
    }
    // optionally cap the length to guard against corrupt or malicious prefixes
    if payloadLen > 64*1024*1024 {
       return nil, fmt.Errorf("payloadLen too large: %d", payloadLen)
    }

    payload := make([]byte, payloadLen)
    if _, err := io.ReadFull(f, payload); err != nil {
       return nil, fmt.Errorf("read payload: %w", err)
    }
    return payload, nil
}

func indexToES(es *elasticsearch.Client, index string, doc IndexDoc) error {
    body := mustJSON(doc)
    req := esapi.IndexRequest{
       Index:      index,
       DocumentID: doc.DocID,
       Body:       bytes.NewReader(body),
       Refresh:    "false",
    }
    resp, err := req.Do(context.Background(), es)
    if err != nil {
       return err
    }
    defer resp.Body.Close()
    if resp.IsError() {
       b, _ := io.ReadAll(resp.Body)
       return fmt.Errorf("es index error: %s", string(b))
    }
    return nil
}

func ingestOne(hdfsClient *hdfs.Client, es *elasticsearch.Client, hdfsBaseDir, esIndex string, doc RawDoc) (IndexDoc, error) {
    // payload = the raw document (you could store only Body; storing the whole RawDoc makes replay easier)
    payload := mustJSON(doc)
    checksum := sha256Hex(payload)

    hdfsPath := dailyLogPath(hdfsBaseDir, doc.TS)
    offset, recLen, err := appendRecord(hdfsClient, hdfsPath, payload)
    if err != nil {
       return IndexDoc{}, err
    }

    idx := IndexDoc{
       DocID:    doc.DocID,
       Title:    doc.Title,
       Tags:     doc.Tags,
       TS:       doc.TS,
       HDFSPath: hdfsPath,
       Offset:   offset,
       Length:   recLen,
       Checksum: checksum,
    }
    if err := indexToES(es, esIndex, idx); err != nil {
       // Consistency: on ES failure, push idx onto a retry queue (Kafka/DB) to avoid "written to HDFS but never indexed"
       return IndexDoc{}, err
    }
    return idx, nil
}

// ---- ES Search + HDFS fetch ----

type HitSource struct {
    DocID    string    `json:"doc_id"`
    Title    string    `json:"title"`
    Tags     []string  `json:"tags"`
    TS       time.Time `json:"ts"`
    HDFSPath string    `json:"hdfs_path"`
    Offset   int64     `json:"offset"`
    Length   int       `json:"length"`
    Checksum string    `json:"checksum"`
}

type ESSearchResp struct {
    Hits struct {
       Hits []struct {
          Source HitSource `json:"_source"`
       } `json:"hits"`
    } `json:"hits"`
}

func search(es *elasticsearch.Client, index string, q string, size int) ([]HitSource, error) {
    query := map[string]any{
       "query": map[string]any{
          "multi_match": map[string]any{
             "query":  q,
             "fields": []string{"title^2", "tags"},
          },
       },
       "size": size,
    }
    var buf bytes.Buffer
    _ = json.NewEncoder(&buf).Encode(query)

    resp, err := es.Search(
       es.Search.WithIndex(index),
       es.Search.WithBody(&buf),
    )
    if err != nil {
       return nil, err
    }
    defer resp.Body.Close()
    if resp.IsError() {
       b, _ := io.ReadAll(resp.Body)
       return nil, fmt.Errorf("es search error: %s", string(b))
    }

    var r ESSearchResp
    if err := json.NewDecoder(resp.Body).Decode(&r); err != nil {
       return nil, err
    }

    out := make([]HitSource, 0, len(r.Hits.Hits))
    for _, h := range r.Hits.Hits {
       out = append(out, h.Source)
    }
    return out, nil
}

func fetchRawByHit(hdfsClient *hdfs.Client, hit HitSource) (RawDoc, []byte, error) {
    payload, err := readRecord(hdfsClient, hit.HDFSPath, hit.Offset)
    if err != nil {
       return RawDoc{}, nil, err
    }
    if hit.Checksum != "" {
       if sha256Hex(payload) != hit.Checksum {
          return RawDoc{}, nil, fmt.Errorf("checksum mismatch doc_id=%s", hit.DocID)
       }
    }
    var doc RawDoc
    if err := json.Unmarshal(payload, &doc); err != nil {
       return RawDoc{}, payload, fmt.Errorf("unmarshal payload: %w", err)
    }
    return doc, payload, nil
}

func main() {
    // HDFS client
    hdfsClient, err := hdfs.New("namenode1:8020")
    if err != nil {
       log.Fatal(err)
    }

    // ES client
    es, err := elasticsearch.NewClient(elasticsearch.Config{
       Addresses: []string{"http://es1:9200"},
    })
    if err != nil {
       log.Fatal(err)
    }

    const (
       hdfsBaseDir = "/data/app"
       esIndex     = "my_index"
    )

    // --- Ingest demo ---
    now := time.Now()
    doc := RawDoc{
       DocID: fmt.Sprintf("doc-%d", now.UnixNano()),
       Title: "Scheme B: length-prefix + offset read-back",
       Tags:  []string{"demo", "hdfs", "es"},
       TS:    now,
       Body: map[string]any{
          "message": "this is raw data stored in HDFS append log",
          "ts":      now.Format(time.RFC3339Nano),
       },
    }

    idx, err := ingestOne(hdfsClient, es, hdfsBaseDir, esIndex, doc)
    if err != nil {
       log.Fatal("ingest error:", err)
    }
    log.Printf("ingested doc_id=%s hdfs=%s offset=%d length=%d\n", idx.DocID, idx.HDFSPath, idx.Offset, idx.Length)

    // --- Search + fetch demo ---
    hits, err := search(es, esIndex, "length-prefix", 5)
    if err != nil {
       log.Fatal("search error:", err)
    }
    for _, hit := range hits {
       raw, _, err := fetchRawByHit(hdfsClient, hit)
       if err != nil {
          log.Fatal("fetch error:", err)
       }
       log.Printf("fetched doc_id=%s title=%s body=%v\n", raw.DocID, raw.Title, raw.Body)
    }
}

Benefits of Scheme B:

  1. Scales better: avoids NameNode metadata blow-up. With one file per record, every file consumes NameNode memory (inode and block metadata). As volume grows, the bottleneck is NameNode memory and RPC, not DataNode disk. The large-file model turns "millions or tens of millions of files" into "a few hundred or a few thousand files per day", cutting NameNode pressure by orders of magnitude.

  2. Higher throughput: sequential writes and reads match HDFS's design. HDFS is optimized for streaming I/O on large files: pipelined writes, sequential disk access, sequential network transfer. Small files generate large numbers of open/create/close and block-allocation RPC round trips, and metadata operations eat the throughput.

  3. Lower cost: for the same replication factor, less management overhead and better compression.

    • Fewer files → less metadata and less waste from small blocks
    • With a columnar format such as Parquet/ORC, compression ratio and encoding efficiency are usually far better than raw JSON
  4. Friendlier to downstream compute: plugs straight into Spark/Flink/Hive. Large files (especially Parquet) are the staple input of big-data compute jobs. Small files cause heavy task startup/scheduling/scan overhead, hurting both performance and stability.

  5. Simpler lifecycle management: archive, tier, and delete by day or batch. Large files are managed by partition (e.g. dt=2026-02-19): expired data is dropped by deleting the partition directory (see the sketch after this list). Deleting or migrating small files means huge numbers of RPCs, which is slow and hard on the cluster.

  6. More robust writes: fewer objects mean fewer failure points. With small files, any hiccup can leave a fragmented "some files succeeded, some failed" state; batched writes to large files are easier to make idempotent and retryable (e.g. per-batch file names, a manifest).
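
For point 5, a sketch of expiring one day's partition with the same client. It assumes your colinmarc/hdfs version exposes a recursive delete named RemoveAll; delete semantics have differed between releases, so verify against your client's documentation:

// dropDailyPartition deletes one day's directory, e.g. /data/app/2026/02/19,
// expiring everything written that day in a single metadata operation.
func dropDailyPartition(client *hdfs.Client, baseDir string, day time.Time) error {
	dir := path.Join(baseDir, day.Format("2006/01/02"))
	if _, err := client.Stat(dir); err != nil {
		return nil // already gone (or inaccessible); refine error handling for production
	}
	// RemoveAll is assumed here; some client versions make Remove itself recursive.
	return client.RemoveAll(dir)
}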