背景
业务开发中为了从map中随机取一个元素,不想用传统方式随机key的index来获取,而是直接依据map遍历的无序性获取随机值,结果造成了随机分布不均匀的问题,特此分析记录
情况分析
单元测试
从单元测试可以看出来,map随机获取的分布很不均匀,方差远远超过常规随机算法,接下来,分析下具体原因
// GetRandEquipmentType picks an equipment type with the random-number helper
// instead of relying on map iteration order.
//
// It asks randutil.Int for an index bounded by len(pb.EquipmentType_name)-1
// (the original author notes the resulting range as [0,11]), shifts the index
// up by one, and converts it to pb.EquipmentType.
// NOTE(review): this presumably relies on the enum values being contiguous
// starting at 1 — confirm against the pb definition.
func GetRandEquipmentType() (eqType pb.EquipmentType) {
	candidates := len(pb.EquipmentType_name) - 1
	idx := randutil.Int(candidates)
	return pb.EquipmentType(idx + 1)
}
// GetMapRandEquipmentType returns the first non-zero key produced by ranging
// over pb.EquipmentType_name, i.e. it uses the map's iteration order as the
// source of randomness.
//
// If the map yields no non-zero key, the zero value of pb.EquipmentType is
// returned.
func GetMapRandEquipmentType() (eqType pb.EquipmentType) {
	for key := range pb.EquipmentType_name {
		// Key 0 is never handed out; move on to the next key.
		if key != 0 {
			return pb.EquipmentType(key)
		}
	}
	return eqType
}
// TestBoxUC_GetRandEquipmentType samples both pickers 1000 times each, then
// logs the variance of the per-type hit counts and the raw counts so the two
// distributions can be compared. It only logs; it makes no assertions.
func TestBoxUC_GetRandEquipmentType(t *testing.T) {
	const rounds = 1000

	// tally calls pick `rounds` times and counts how often each type comes back.
	tally := func(pick func() pb.EquipmentType) map[pb.EquipmentType]int32 {
		hits := make(map[pb.EquipmentType]int32)
		for n := 0; n < rounds; n++ {
			hits[pick()]++
		}
		return hits
	}

	randHits := tally(GetRandEquipmentType)   // picked via the random helper
	rangeHits := tally(GetMapRandEquipmentType) // picked via map iteration

	// Variance of the hit counts for each picker.
	randVar, rangeVar := variance(randHits), variance(rangeHits)

	// Report the results.
	t.Logf(" var:%f", randVar)
	t.Logf(" mapVar:%f", rangeVar)
	t.Log(randHits)
	t.Log(rangeHits)
}
=== RUN TestBoxUC_GetRandEquipmentType
usecase_test.go:41: var:46.722222
usecase_test.go:42: mapVar:2922.555556
usecase_test.go:43: map[EquipmentTypeShoulder:80 EquipmentTypeHelmet:92 EquipmentTypeNecklace:85 EquipmentTypeBracelet:71 EquipmentTypeArmor:83 EquipmentTypeGlove:79 EquipmentTypeBelt:84 EquipmentTypePants:72 EquipmentTypeWeapon:94 EquipmentTypeOrnament:87 EquipmentTypeShoes:83 EquipmentTypeShield:90]
usecase_test.go:44: map[EquipmentTypeShoulder:253 EquipmentTypeHelmet:57 EquipmentTypeNecklace:119 EquipmentTypeBracelet:53 EquipmentTypeArmor:71 EquipmentTypeGlove:63 EquipmentTypeBelt:58 EquipmentTypePants:49 EquipmentTypeWeapon:73 EquipmentTypeOrnament:58 EquipmentTypeShoes:78 EquipmentTypeShield:68]
--- PASS: TestBoxUC_GetRandEquipmentType (0.00s)
map原理
Go 中 map 的底层基础结构是 runtime.hmap
//src/runtime/map.go line 115
// A header for a Go map.
type hmap struct {
// Note: the format of the hmap is also encoded in cmd/compile/internal/gc/reflect.go.
// Make sure this stays in sync with the compiler's definition.
//
count int //len(map)时,返回的值
flags uint8 //表示是否有其他协程在操作map
B uint8 //上图中[]bmap的''长度'' 2^B
noverflow uint16 //// 溢出的bucket个数
hash0 uint32 // hash seed
buckets unsafe.Pointer //buckets 数组指针
oldbuckets unsafe.Pointer // 扩容的时候用于赋值的buckets数组
nevacuate uintptr // 搬迁进度
extra *mapextra // 用于扩容的指针
// mapextra groups the overflow-related pointers that hmap reaches through its
// extra field.
type mapextra struct {
overflow *[]*bmap // pointer to a slice of bucket pointers
oldoverflow *[]*bmap // same shape as overflow; NOTE(review): presumably tracks the old bucket array during growth — confirm against runtime/map.go
nextOverflow *bmap // a single bucket pointer; NOTE(review): presumably the next available overflow bucket — confirm against runtime/map.go
}
// A bucket for a Go map.
type bmap struct {
tophash [bucketCnt]uint8 // array of length 8 (bucketCnt is 1 << 3), one entry per slot in the bucket
}
// Constants defined by the runtime for bucket sizing.
const (
	// Maximum number of key/value pairs a bucket can hold.
	bucketCntBits = 3
	bucketCnt     = 1 << bucketCntBits
)
内存模型
简单来说,hmap 内部由多个 bucket(即 bmap)组成,每个 bucket 最多存放 8 个 key/value。理论上遍历只需要按顺序遍历每个 bucket,同时按顺序遍历 bucket 中的 key
而根据map扩容的特性,在map触发扩容时,内部会进行buckets迁移,并对key进行rehash,导致顺序发生改变,所以对map的遍历顺序是不可靠的
但是,那为什么静态的map也是无序的呢?
因为go底层为了保证使用统一,特地对所有map进行了无序处理,防止程序员滥用特性触发bug.
迭代器
在迭代器初始化时,会通过 fastrand() 函数随机选出一个 bucket 和桶内的 offset 作为开始遍历的位置,这就导致了 map 遍历的随机性
// mapiterinit (excerpt; elided parts are shown as "...") chooses a random
// starting position for an iteration over h, stores it in it, and then calls
// mapiternext to advance to the first element.
func mapiterinit(t *maptype, h *hmap, it *hiter) {
...
...
// decide where to start
r := uintptr(fastrand())
// When B exceeds 31-bucketCntBits, a second fastrand() value is shifted left
// by 31 bits and added in, so r carries more random bits.
if h.B > 31-bucketCntBits {
r += uintptr(fastrand()) << 31
}
it.startBucket = r & bucketMask(h.B) // random starting bucket: r masked by bucketMask(h.B)
it.offset = uint8 (r >> h.B & ( bucketCnt - 1 )) // random starting slot inside a bucket, in [0, bucketCnt-1]
// iterator state
it.bucket = it.startBucket
...
...
mapiternext(it)
}
// mapiternext (excerpt; elided parts are shown as "...") advances the iterator
// to the next occupied slot. It scans the slots of the current bucket b
// starting at it.offset, stores the key/value pointers of the first occupied
// slot in it and returns; if the bucket has no further occupied slot it moves
// on to b's overflow bucket and jumps back to the "next" label.
func mapiternext(it *hiter) {
...
...
for ; i < bucketCnt; i++ {
offi := (i + it.offset) & ( bucketCnt - 1 ) // slot index: i shifted by the random offset, wrapping around within the bucket
if isEmpty(b.tophash[offi]) || b.tophash[offi] == evacuatedEmpty {
// Nothing is stored in this slot, so skip it and try the next one.
continue
}
// Addresses of the key and value for slot offi, computed from the bucket's
// base address: keys start at dataOffset, values follow the bucketCnt keys.
k := add(unsafe.Pointer(b), dataOffset+uintptr(offi)*uintptr(t.keysize))
v := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+uintptr(offi)*uintptr(t.valuesize))
// Use the slot's own key/value when it is not marked evacuatedX/evacuatedY,
// or when the key type is not reflexive and the key does not equal itself.
if (b.tophash[offi] != evacuatedX && b.tophash[offi] != evacuatedY) ||
!(t.reflexivekey() || alg.equal(k, k)) {
it.key = k
it.value = v
} else {
// Otherwise look the key up again through mapaccessK and use what it returns.
rk, rv := mapaccessK(t, h, k)
it.key = rk
it.value = rv
}
// Record where to resume on the next call, then hand this element back.
it.bucket = bucket
it.i = i + 1
return
}
// No more occupied slots in this bucket: continue with its overflow bucket.
b = b.overflow(t)
i = 0
goto next
}
这里省略了一些判断逻辑,将主体逻辑放在这里
简单解释一下:
runtime.mapiternext 剩余代码的作用是从桶中找到下一个遍历的元素,在大多数情况下都会直接操作内存获取目标键值的内存地址,不过如果哈希表处于扩容期间就会调用 runtime.mapaccessK 获取键值对
简单总结一下哈希表遍历的顺序
首先会选出一个绿色的正常桶开始遍历,随后遍历所有黄色的溢出桶,最后依次按照索引顺序遍历哈希表中其他的桶,直到所有的桶都被遍历完成。
而桶内的bmap遍历则是会通过随机一个下标[0-7]来获取对应的迭代元素,不存在则会跳过查找下一个
结论
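由于每个 bucket(bmap)固定有 8 个槽位,当元素个数不是 8 的倍数时,桶内就会存在空槽位。随机到的起始 offset 如果落在空槽位上,迭代器会跳过它继续向后查找(到末尾后绕回桶内前面的第一个元素),于是紧跟在空槽位之后的元素会额外获得这些空槽位的概率,从而获得高于其他元素的随机权重,而这样的结果也会和 buckets 的个数有关。另外,GetMapRandEquipmentType 中对 k == 0 的跳过,同样会把这部分概率让给遍历顺序中的下一个元素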
所以就导致了map随机权重分布不均匀,且前几个元素显著高于其他的元素的现象