Avoiding sync.Pool's Extra Memory Allocation with a BytePool


Normally, allocating memory through sync.Pool incurs an extra 24-byte heap allocation. Pool.Put takes an interface value, so storing a []byte boxes its 24-byte slice header (pointer, length, capacity on 64-bit platforms), which escapes to the heap.
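A minimal sketch of that cost (the pool setup and sizes here are illustrative; on a 64-bit platform this typically reports one allocation, 24 bytes, per Get/Put pair):

package main

import (
    "fmt"
    "sync"
    "testing"
)

func main() {
    p := sync.Pool{New: func() any { return make([]byte, 0, 1024) }}
    allocs := testing.AllocsPerRun(1000, func() {
        buf := p.Get().([]byte)
        // Put takes an interface value, so storing the []byte boxes its
        // 24-byte slice header (pointer, len, cap) on the heap.
        p.Put(buf)
    })
    fmt.Println("allocs per Get/Put:", allocs) // typically prints 1
}

A common mitigation is to pool *[]byte so that only a pointer is stored in the interface; the approach below instead removes the per-operation allocation entirely.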

Trading space for time

When the total number of []byte buffers we need is bounded, we can preallocate them all up front and reuse them:

package utils

import (
    "reflect"
    "sync/atomic"
    "unsafe"
)

// BytePool is used to concurrently obtain byte arrays of size cap.
type BytePool struct {
    i      atomic.Int64
    caches [][]byte
    used   []atomic.Bool
    head   uintptr
    cap    int
}

// NewBytePool creates a byte array pool with max byte arrays, each with a capacity of cap.
// If sync.Pool is used, at least 24 bytes of memory will be allocated each time,
// see https://blog.mike.norgate.xyz/unlocking-go-slice-performance-navigating-sync-pool-for-enhanced-efficiency-7cb63b0b453e.
// By using the space-for-time method, zero memory allocation can be achieved.
// Here, one large byte array is allocated up front and then divided into small
// byte arrays, so adjacent small arrays sit a fixed address stride (cap bytes) apart.
// Each small array has a matching atomic.Bool that marks whether it is in use:
// if it is false, Get swaps it to true and hands the array to the caller;
// if it is true, the array is already taken and the scan moves on.
// When the array is returned, Put recovers its index from the difference between
// its base address and the base address of the large array, then stores false.
func NewBytePool(cap, max int) *BytePool {
    data := make([]byte, cap*max)
    caches := make([][]byte, max)
    for i := range caches {
        // The full slice expression pins each chunk's capacity to exactly cap
        // bytes, so an append cannot spill into the neighboring buffer.
        caches[i] = data[i*cap : i*cap : (i+1)*cap]
    }
    rt := (*reflect.SliceHeader)(unsafe.Pointer(&data))
    return &BytePool{
       caches: caches,
       used:   make([]atomic.Bool, max),
       head:   rt.Data,
       cap:    cap,
    }
}

// Get returns a zero-length byte array from the pool,
// spinning until a free slot is found.
func (b *BytePool) Get() []byte {
    for {
       cur := b.i.Add(1)
       if cur >= int64(len(b.caches)) {
          cur = 0
          b.i.Store(0)
       }
       if b.used[cur].CompareAndSwap(false, true) {
          return b.caches[cur][:0]
       }
    }
}

// Put returns x to the BytePool.
// x must have been obtained from Get and never grown past its capacity:
// Put recovers the slot index from x's base address, so a reallocated
// slice would map to a bogus index.
func (b *BytePool) Put(x []byte) {
    rt := (*reflect.SliceHeader)(unsafe.Pointer(&x))
    i := int(rt.Data-b.head) / b.cap
    b.used[i].Store(false)
}
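
A minimal usage sketch (the 1024/16 sizes are arbitrary). A buffer handed back to Put must still point into the pool's backing array, so callers must never grow it past its capacity:

package utils

import "fmt"

// ExampleBytePool shows a typical Get/append/Put round trip.
func ExampleBytePool() {
    pool := NewBytePool(1024, 16) // 16 buffers of 1 KiB each

    buf := pool.Get() // len(buf) == 0, cap(buf) == 1024
    buf = append(buf, "hello"...)
    fmt.Println(string(buf), cap(buf))

    // Appending past cap would reallocate buf outside the pool and break
    // Put's address arithmetic, so always stay within the fixed capacity.
    pool.Put(buf)
    // Output: hello 1024
}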

Benchmarks

Benchmark results: about twice as fast as the sync.Pool-based baseline (mcache from github.com/bytedance/gopkg, which allocates from sync.Pool internally), with zero memory allocations.

// goos: linux
// goarch: amd64
// pkg: pool
// cpu: Intel(R) Xeon(R) Platinum 8269CY CPU @ 2.50GHz
// Benchmark/byte_pool-32          38809202                30.97 ns/op            0 B/op          0 allocs/op
// Benchmark/pool-32               20152437                61.58 ns/op           24 B/op          1 allocs/op
package utils

import (
    "sync"
    "testing"

    "github.com/bytedance/gopkg/lang/mcache"
    "github.com/stretchr/testify/assert"
)

func Benchmark(b *testing.B) {
    b.Run("byte pool", func(b *testing.B) {
       x := NewBytePool(10, 10)
       b.ReportAllocs()
       for i := 0; i < b.N; i++ {
          data := x.Get()
          x.Put(data)
       }
    })
    b.Run("pool", func(b *testing.B) {
       b.ReportAllocs()
       for i := 0; i < b.N; i++ {
          data := mcache.Malloc(0, 10)
          mcache.Free(data)
       }
    })
}

func TestConcurrent(t *testing.T) {
    x := NewBytePool(65000, 100)
    var wg sync.WaitGroup
    for i := 0; i < 10; i++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            // assert (not require) is used here: require.FailNow must not be
            // called from a goroutine other than the one running the test.
            data := x.Get()
            assert.Equal(t, 0, len(data))
            data = append(data, []byte("test")...)
            assert.Equal(t, []byte("test"), data)
            x.Put(data)
        }()
    }
    wg.Wait()
}