一、布隆过滤器的原理（来源：Golang中的布隆过滤器 - 布史 - 博客园 (cnblogs.com)）

布隆过滤器（Bloom Filter）是由 Burton Howard Bloom 在 1970 年提出的二进制向量数据结构，它具有很好的空间和时间效率，被用来检测一个元素是不是集合中的一个成员，即判定“可能已存在”和“绝对不存在”两种情况。如果检测结果为是，该元素不一定在集合中；但如果检测结果为否，该元素一定不在集合中，因此 Bloom filter 具有 100% 的召回率。

布隆过滤器的核心是一个超大的位数组和几个哈希函数。

下图表示有三个hash函数，比如一个集合中有x，y，z三个元素，分别用三个hash函数映射到二进制序列的某些位上，假设我们判断w是否在集合中，同样用三个hash函数来映射，结果发现取得的结果不全为1，则表示w不在集合里面。

二、布隆过滤器的使用场景

不关注原有数据是什么，只需要知道原有数据是否存在的情况时考虑使用布隆过滤器。例如：

ip黑名单
爬虫的URL过滤
防止缓存穿透（大量查询不存在的 key，绕过缓存直接打到数据库）

三、布隆过滤器的优化思考

使用多个hash函数，实现多个bit对应一条数据，但不能过多的使用hash函数，应按实际需求使用，过多的使用hash会增加空间占用，使得冲突更容易发生，所以hash函数的增加对应存储bit的空间也要相应的增大。
布隆过滤器由于可能产生冲突，无法删除某条数据，所以在发生数据变动时，只能通过重建整个位数组来替换。
布隆过滤器存在假阳性误判，对于不存在的数据可能会被误判为存在，对于已存在的数据不可能发生误判，所以实际使用中，对于存在的判断不能完全信任。

四、布隆过滤器的实现

代码中使用的hash函数是goframe框架内的hash函数，可自行替换，仅供参考。代码中的cache使用的是内存变量，可替换其他方案，如共享内存、redis等

// Blond is a Bloom filter whose bits live in an in-memory byte slice.
// Create one with New and size it with Init before calling Add or Check.
type Blond struct {
	// cache is the backing storage for the filter's bits.
	cache []byte
	// cacheSize is the filter size computed by Init. Hash values are reduced
	// modulo this number and bitGet/bitSet ignore indexes at or above it.
	cacheSize uint64
	// cacheCount is incremented by Add each time a value is stored without
	// being reported as a conflict.
	cacheCount uint64

	// hashCount is the number of hash functions applied to each value.
	hashCount uint
	// hashMaxCount is the upper limit Init enforces on hashCount.
	hashMaxCount uint
}

// New returns an empty filter that allows at most 8 hash functions, which is
// the number of hash cases handled by Add and Check. No storage is allocated
// here; call Init before using the filter.
func New() *Blond {
	filter := new(Blond)
	filter.hashMaxCount = 8
	return filter
}

// Init sizes the filter and allocates a fresh, zeroed bit array.
//
// cacheSize is multiplied by hashCount to obtain the number of bits, so it
// acts as the number of hashCount-bit slots (presumably the expected number of
// values). hashCount is the number of hash functions to apply per value and
// is capped at b.hashMaxCount.
//
// After Init, b.cacheSize holds the number of addressable bits (at least 1)
// and b.cache holds enough bytes to store them. It returns b so calls can be
// chained.
func (b *Blond) Init(cacheSize uint64, hashCount uint) *Blond {
	if hashCount > b.hashMaxCount {
		hashCount = b.hashMaxCount
	}
	b.hashCount = hashCount

	// Add, Check, bitGet and bitSet all use b.cacheSize as the modulus and
	// bound for *bit* indexes, so it must be the bit count. Storing the byte
	// count here would leave seven eighths of the allocated bits unreachable.
	bits := cacheSize * uint64(hashCount)
	if bits == 0 {
		// Keep one addressable bit so that `hash % b.cacheSize` in Add and
		// Check can never divide by zero.
		bits = 1
	}
	b.cacheSize = bits

	// Round up so a bit count that is not a multiple of 8 still fits.
	byteLen := bits / 8
	if bits%8 != 0 {
		byteLen++
	}
	b.cache = make([]byte, byteLen)
	b.cacheCount = 0 // a fresh bit array holds no values
	return b
}

// Test is a small demonstration routine. It re-initialises the filter with
// Init(100, 8), adds three host names (the first one twice), dumps the message
// of any error Add returns, dumps the result of Check for four host names, and
// finally prints the raw cache bytes and their length. It returns b.
func (b *Blond) Test() *Blond {
	b.Init(100, 8)

	// The repeated entry exercises the conflict path of Add.
	for _, host := range []string{"www.baidu.com", "www.baidu1.com", "www.baidu.com"} {
		if err := b.Add(gconv.Bytes(host)); err != nil {
			g.Dump(err.Error())
		}
	}

	for _, host := range []string{"www.baidu.com", "www.baidu1.com", "www.baidu2.com", "www.baidu3.com"} {
		g.Dump(b.Check(gconv.Bytes(host)))
	}

	fmt.Printf("\n %v \n %d \n", b.cache, len(b.cache))
	return b
}

// Check reports whether value may have been added to the filter.
//
// A false result means at least one of the bits selected by the hash
// functions is unset. A true result means every selected bit is set, which
// can also happen for a value that was never added (false positive).
//
// Check never modifies the filter. A filter with cacheSize == 0 holds nothing,
// so Check returns false for it.
func (b *Blond) Check(value []byte) bool {
	if b.cacheSize == 0 {
		// Also avoids an integer divide-by-zero panic in the modulo below.
		return false
	}
	for i := uint(0); i < b.hashCount; i++ {
		var index uint64
		switch i % b.hashMaxCount {
		case 0:
			index = ghash.AP64(value) % b.cacheSize
		case 1:
			index = ghash.BKDR64(value) % b.cacheSize
		case 2:
			index = ghash.DJB64(value) % b.cacheSize
		case 3:
			index = ghash.ELF64(value) % b.cacheSize
		case 4:
			index = ghash.JS64(value) % b.cacheSize
		case 5:
			index = ghash.PJW64(value) % b.cacheSize
		case 6:
			index = ghash.RS64(value) % b.cacheSize
		case 7:
			index = ghash.SDBM64(value) % b.cacheSize
		default:
			// Only reachable when hashMaxCount exceeds the 8 hash
			// functions handled above.
			return false
		}
		// Probe with bitGet, not bitSet: bitSet turns the bit on as a side
		// effect, so a lookup would pollute the filter and repeated lookups
		// of an absent value would eventually report it as present.
		if !b.bitGet(index) {
			return false
		}
	}
	return true
}

// Add stores value in the filter by setting the bit selected by each of the
// b.hashCount hash functions.
//
// It returns an error when the filter has no storage (cacheSize == 0), and a
// conflict error when every selected bit was already set, which means either
// the value was added before or it collides with earlier values. On success
// cacheCount is incremented and nil is returned.
func (b *Blond) Add(value []byte) (err error) {
	if b.cacheSize == 0 {
		// Without this guard the modulo below panics with an integer
		// divide by zero.
		return errors.New("bloom filter is not initialized")
	}

	allAlreadySet := true
	for i := uint(0); i < b.hashCount; i++ {
		var index uint64
		switch i % b.hashMaxCount {
		case 0:
			index = ghash.AP64(value) % b.cacheSize
		case 1:
			index = ghash.BKDR64(value) % b.cacheSize
		case 2:
			index = ghash.DJB64(value) % b.cacheSize
		case 3:
			index = ghash.ELF64(value) % b.cacheSize
		case 4:
			index = ghash.JS64(value) % b.cacheSize
		case 5:
			index = ghash.PJW64(value) % b.cacheSize
		case 6:
			index = ghash.RS64(value) % b.cacheSize
		case 7:
			index = ghash.SDBM64(value) % b.cacheSize
		default:
			// Only reachable when hashMaxCount exceeds the 8 hash
			// functions handled above.
			// NOTE(review): this reports success without counting the
			// value; confirm that is intended.
			return nil
		}
		// bitSet turns the bit on and reports whether it was already on.
		if !b.bitSet(index) {
			allAlreadySet = false
		}
	}

	if allAlreadySet {
		return errors.New("this value is in conflict！")
	}
	b.cacheCount++
	return nil
}

// bitGet reports whether the bit at index is set. Indexes at or above
// b.cacheSize are treated as unset. The filter is not modified.
func (b *Blond) bitGet(index uint64) bool {
	if index >= b.cacheSize {
		return false
	}
	// index/8 selects the byte, index%8 the bit inside that byte.
	mask := byte(1) << (index % 8)
	return b.cache[index/8]&mask != 0
}

// bitSet turns on the bit at index and reports whether it was already on.
// Indexes at or above b.cacheSize are ignored and reported as false.
func (b *Blond) bitSet(index uint64) bool {
	if index >= b.cacheSize {
		return false
	}
	// index/8 selects the byte, index%8 the bit inside that byte.
	slot := &b.cache[index/8]
	mask := byte(1) << (index % 8)
	wasSet := *slot&mask != 0
	*slot |= mask
	return wasSet
}

运行截图：

微信图片_20240510100540.png

Golang 布隆过滤器的实现

一、布隆过滤器的原理 （来源：Golang中的布隆过滤器 - 布史 - 博客园 (cnblogs.com)）

二、布隆过滤器的使用场景

三、布隆过滤器的优化思考

四、布隆过滤器的实现

一、布隆过滤器的原理（来源：Golang中的布隆过滤器 - 布史 - 博客园 (cnblogs.com)）