数据结构

哈希表数据结构

/* One hash table: a bucket array plus the bookkeeping needed to index it. */
typedef struct dictht
{
    dictEntry **table;      // bucket array; each slot points to a dictEntry (entries link to one another via dictEntry.next)
    unsigned long size;     // size of the hash table (length of the bucket array)
    unsigned long sizemask; // mask used to compute a bucket index; always equal to size - 1
    unsigned long used;     // number of entries currently stored (entries, not buckets: rehashing adjusts it once per moved entry)
} dictht;

哈希表节点

/* One key/value node stored in a hash table bucket. */
typedef struct dictEntry
{
    void *key; // the entry's key
    union
    {
        void *val;
        // Memory saving: integer and double values are already 64 bits wide,
        // so they are stored inline in the entry itself instead of being
        // reached through a pointer.
        uint64_t u64;
        int64_t s64;
        double d;
    } v;
    struct dictEntry *next; // next entry in the same bucket's chain
} dictEntry;

字典

/* The dictionary: two hash tables plus the state needed to rehash between them. */
typedef struct dict
{
    dictType *type; // type-specific functions; must provide the hash function used to compute the bucket index
    void *privdata; // NOTE(review): opaque pointer; its use is not shown in this excerpt
    dictht ht[2];   // two hash tables, so entries can be migrated from ht[0] to ht[1] during a rehash
    long rehashidx;          /* rehashing not in progress if rehashidx == -1 */
    unsigned long iterators; /* number of iterators currently running */
} dict;

哈希表的各种操作

哈希算法计算

uint64_t (*hashFunction)(const void *key);
    // How the bucket index is computed:
    // 1. hash = dict->type->hashFunction(key)
    // 2. index = hash & ht[0].sizemask

计算出key的hash值
然后将hash值与ht[0]的掩码(sizemask)做按位与(&)运算，计算出index

ReHash过程

什么时候触发rehash

_dictExpandIfNeeded函数

/* Decide whether the primary table ht[0] has to grow and, if so, start the
 * expansion.
 *
 * Returns DICT_OK when no expansion is needed (or one is already running);
 * otherwise returns the result of dictExpand(). */
static int _dictExpandIfNeeded(dict *d)
{
    unsigned long buckets;
    unsigned long entries;

    /* A rehash is already migrating entries (rehashidx != -1 according to
     * the original annotation); do not start another expansion. */
    if (dictIsRehashing(d))
        return DICT_OK;

    buckets = d->ht[0].size;
    entries = d->ht[0].used;

    /* Table never allocated: create it at the initial size
     * (4 according to the original annotation). */
    if (buckets == 0)
        return dictExpand(d, DICT_HT_INITIAL_SIZE);

    /* Load factor below 1:1, nothing to do. */
    if (entries < buckets)
        return DICT_OK;

    /* At or above 1:1 we normally grow. When resizing is globally disabled
     * (dict_can_resize == 0) we still grow, but only once the integer
     * ratio entries / buckets exceeds dict_force_resize_ratio. */
    if (!dict_can_resize && entries / buckets <= dict_force_resize_ratio)
        return DICT_OK;

    /* Ask for twice the current number of entries. */
    return dictExpand(d, entries * 2);
}

总结触发条件
- 当ht[0]为空的时候，自动初始化为容量为4的hash表
- 当ht[0] → used ≥ ht[0] → size 并且 redis可以扩容的标志位为1的时候可以进行扩容
  - 那么什么时候不可以扩容呢？(server.c)
```
/* Enable or disable dict resizing depending on whether an RDB or AOF child
 * process is recorded in the server state.
 *
 * Per the original annotation, dictEnableResize() sets dict_can_resize = 1
 * and dictDisableResize() sets it to 0. */
void updateDictResizePolicy(void)
{
    int child_active = (server.rdb_child_pid != -1 ||
                        server.aof_child_pid != -1);

    if (child_active)
    {
        /* A child process exists: turn resizing off. */
        dictDisableResize();
        return;
    }

    /* No RDB child and no AOF child: resizing is allowed. */
    dictEnableResize();
}
```
    总结一下：
    - 当没有执行BGSAVE或者BGREWRITEAOF并且负载因子>= 1会执行扩容
- 当负载因子（used / size，整数除法）大于 dict_force_resize_ratio（5）时，即使 dict_can_resize 为 0 也会强制扩容
什么字典操作会造成rehash
- dictAdd：用来往 Hash 表中添加一个键值对。
- dictReplace：用来往 Hash 表中添加一个键值对，或者键值对存在时，修改键值对。
- dictAddOrFind：直接调用 dictAddRaw

扩容机制

每次扩容扩多大？

首先来看什么函数调用了dictExpand

/* Abridged excerpt of _dictExpandIfNeeded: the "..." lines stand for code
 * left out by the author of these notes. It is shown to highlight the two
 * places where dictExpand() is called and the size requested in each. */
static int _dictExpandIfNeeded(dict *d)
{
		...

    /* If the hash table is empty expand it to the initial size. */
    // Empty table: expand to the initial size (4 according to the original note)
    if (d->ht[0].size == 0)
        return dictExpand(d, DICT_HT_INITIAL_SIZE);

		...
    // Otherwise, once used >= size (and resizing is allowed or the forced
    // ratio is exceeded), request twice the current number of entries.
    if (d->ht[0].used >= d->ht[0].size &&
        (dict_can_resize ||
         d->ht[0].used / d->ht[0].size > dict_force_resize_ratio))
    {
        return dictExpand(d, d->ht[0].used * 2);
    }
}

也就是说，只要ht[0]不是空，就以 ht[0].used * 2 作为目标容量申请扩容；实际分配的大小由 _dictNextPower 向上取整为 2 的幂（第一个大于等于 used * 2 的 2 的幂）

怎么实现扩容？

/* Expand or create the hash table.
 *
 * d    - dictionary to grow.
 * size - requested minimum capacity; the real capacity is whatever
 *        _dictNextPower(size) returns.
 *
 * Returns DICT_ERR when a rehash is already in progress, when size is
 * smaller than the number of stored entries, or when the computed capacity
 * equals the current one. Returns DICT_OK otherwise. */
int dictExpand(dict *d, unsigned long size)
{
    /* Refuse while a rehash is running (per the original annotation this
     * means rehashidx != -1). */
    if (dictIsRehashing(d))
        return DICT_ERR;

    /* The requested size cannot hold the entries already stored. */
    if (size < d->ht[0].used)
        return DICT_ERR;

    /* Per the original annotation, _dictNextPower starts from the initial
     * capacity of 4 and doubles until the value is >= size; a size beyond
     * the maximum capacity yields that maximum + 1. */
    unsigned long realsize = _dictNextPower(size);

    /* Rehashing to the same table size is not useful. */
    if (realsize == d->ht[0].size)
        return DICT_ERR;

    /* Build the new table with every bucket pointer zeroed by zcalloc. */
    dictht fresh; /* the new hash table */
    fresh.used = 0;
    fresh.size = realsize;
    fresh.sizemask = realsize - 1;
    fresh.table = zcalloc(realsize * sizeof(dictEntry *));

    if (d->ht[0].table != NULL)
    {
        /* ht[0] already exists: park the new table in ht[1] and arm the
         * incremental rehash by pointing the cursor at bucket 0. */
        d->ht[1] = fresh;
        d->rehashidx = 0;
    }
    else
    {
        /* First initialization: not really a rehash, just install the
         * table as ht[0] so it can accept keys. */
        d->ht[0] = fresh;
    }
    return DICT_OK;
}

渐进式rehash实现过程

/* Run up to n steps of incremental rehashing. One step moves one non-empty
 * bucket, together with its whole collision chain, from ht[0] to ht[1].
 *
 * Returns 1 if there are still keys left to move from the old table to the
 * new one, and 0 if the dict is not rehashing or the migration completed
 * during this call.
 *
 * Empty buckets are skipped, but at most n*10 of them are visited per call.
 * Once that budget is used up the function returns 1 early, so a call is
 * not guaranteed to move even a single bucket. Without the cap the amount
 * of work would be unbounded and the call could block for a long time.
 *
 * When rehashing happens, according to the original annotation:
 * 1. every add/delete/update/lookup on the dict rehashes 1 bucket;
 * 2. NOTE(review): when the dict is otherwise idle, a rehash runs every
 *    100ms handling 100 buckets at a time. The caller is not shown here,
 *    so verify this against the server code. */
int dictRehash(dict *d, int n)
{
    /* n is the number of buckets to migrate in this call. */
    int empty_budget = n * 10; /* Max number of empty buckets to visit. */

    if (!dictIsRehashing(d))
        return 0;

    /* Stop once n buckets were handled or ht[0] has no entries left. */
    for (; n != 0 && d->ht[0].used != 0; n--)
    {
        dictEntry *entry, *following;

        /* rehashidx cannot run past the table: ht[0].used != 0 means a
         * non-empty bucket still lies ahead of the cursor. */
        assert(d->ht[0].size > (unsigned long)d->rehashidx);

        /* Skip empty buckets, charging each one to the budget. */
        while (d->ht[0].table[d->rehashidx] == NULL)
        {
            d->rehashidx++;
            if (--empty_budget == 0)
                return 1; /* budget spent; resume on the next call */
        }

        /* Move every entry of this bucket from the old table to the new. */
        for (entry = d->ht[0].table[d->rehashidx]; entry != NULL; entry = following)
        {
            uint64_t slot;

            /* Remember the successor before relinking this entry. */
            following = entry->next;

            /* Bucket index of the key in the new table. */
            slot = dictHashKey(d, entry->key) & d->ht[1].sizemask;

            /* Head insertion into the destination bucket. */
            entry->next = d->ht[1].table[slot];
            d->ht[1].table[slot] = entry;

            d->ht[0].used--;
            d->ht[1].used++;
        }

        /* The source bucket is now empty; advance the cursor. */
        d->ht[0].table[d->rehashidx] = NULL;
        d->rehashidx++;
    }

    /* Entries remain in ht[0]: more to rehash on a later call. */
    if (d->ht[0].used != 0)
        return 1;

    /* The whole table was migrated: free the old bucket array, copy ht[1]
     * into ht[0], reset ht[1] and mark rehashing as finished. */
    zfree(d->ht[0].table);
    d->ht[0] = d->ht[1];
    _dictReset(&d->ht[1]);
    d->rehashidx = -1;
    return 0;
}

每次rehash多少？
```
/* Perform a single-bucket rehash step, but only when no iterators are
 * currently running on the dict. After this one step the hash table goes
 * back to serving normal CRUD operations. */
static void _dictRehashStep(dict *d)
{
    if (d->iterators != 0)
        return;

    dictRehash(d, 1);
}
```
从 Redis 的源码中我们可以看到，一共会有 5 个函数通过调用 _dictRehashStep 函数，进而调用 dictRehash 函数，来执行 rehash，它们分别是：dictAddRaw，dictGenericDelete，dictFind，dictGetRandomKey，dictGetSomeKeys。

但是除了这种情况外，即使server对dict没有任何操作，服务器的定时任务（serverCron，默认约每100ms执行一次）也会主动推进rehash：每次以100个桶为一批调用 dictRehash(d, 100)，并且单次耗时上限约为1ms

Redis 哈希表和字典总结

数据结构

哈希表的各种操作