Redis——字典

前言

Follow my step, read the code.

数据结构

首先定义的是一个dictEntry结构体,字典本身就是多个键值对，因此一个key和一个val

typedef struct dictEntry {
    void *key;
    union {
        void *val;
        uint64_t u64;
        int64_t s64;
        double d;
    } v;
    struct dictEntry *next;     /* Next entry in the same hash bucket. */
    void *metadata[];           /* An arbitrary number of bytes (starting at a
                                 * pointer-aligned address) of size as returned
                                 * by dictType's dictEntryMetadataBytes(). */
} dictEntry;
1
2
3
4
5
6
7
8
9
10
11
12
13

在这里插入图片描述
上面展示的就就是一个典型的哈希表，由诸多bucket组成，每个bucket指向一个dictEntry对象，由于dictEntry对象本身存在next成员，所以类似于链表，可以指向下一个dictEntry对象。因此有两点需要注意：

多个键的哈希值可能相等，一个bucket后可能链接多个dictEntry对象
bucket的数目会≥dictEntry对象的数目

因此就有一个问题，我查找的时候根据键key找到的是一个bucket的位置，如果这个bucket之后有多个dictEntry对象，我怎么知道是哪一个？

答案很简单，dictEntry对象本身就存储有key这个变量，直接一一比较即可。

dictType

里面是一些函数指针用来指定哈希函数，键复制函数，值复制函数，键比较函数，键析构函数，值析构函数

typedef struct dictType {
    uint64_t (*hashFunction)(const void *key);
    void *(*keyDup)(dict *d, const void *key);
    void *(*valDup)(dict *d, const void *obj);
    int (*keyCompare)(dict *d, const void *key1, const void *key2);
    void (*keyDestructor)(dict *d, void *key);
    void (*valDestructor)(dict *d, void *obj);
    int (*expandAllowed)(size_t moreMem, double usedRatio);
    /* Allow a dictEntry to carry extra caller-defined metadata.  The
     * extra memory is initialized to 0 when a dictEntry is allocated. */
    size_t (*dictEntryMetadataBytes)(dict *d);
} dictType;
1
2
3
4
5
6
7
8
9
10
11
12

hash函数dictHashKey

这里是一个宏定义，调用指定的hashFunction得到hash值，注意返回的是一个uint64_t类型

#define dictHashKey(d, key) (d)->type->hashFunction(key)
1

dict

struct dict {
    dictType *type;

    dictEntry **ht_table[2];//两个哈希表
    unsigned long ht_used[2];

    long rehashidx; /* rehashing not in progress if rehashidx == -1 */

    /* Keep small vars at end for optimal (minimal) struct padding */
    int16_t pauserehash; /* If >0 rehashing is paused (<0 indicates coding error) */
    signed char ht_size_exp[2]; //这个成员记录的是两个哈希表的尺寸，实际值的指数就是哈希表的尺寸，如果是8，则哈希表的尺寸为2^8
};
1
2
3
4
5
6
7
8
9
10
11
12

字典的迭代器dictIterator

如果safe变量设置为1则意味着这是一个安全的迭代器，可以调用dictAdd,dictFind,不然的话只能调用 dictNext()

typedef struct dictIterator {
    dict *d;
    long index;
    int table, safe;
    dictEntry *entry, *nextEntry;
    /* unsafe iterator fingerprint for misuse detection. */
    unsigned long long fingerprint;
} dictIterator;
1
2
3
4
5
6
7
8

哈希表的初始大小

//初始大小是4
#define DICT_HT_INITIAL_EXP      2
#define DICT_HT_INITIAL_SIZE     (1<<(DICT_HT_INITIAL_EXP))
1
2
3

感觉有点复杂，还是从dict创建开始讲起吧

具体实现

创建一个新的哈希表

根据传入的dictType创建一个dict,函数相对比较简单

dict *dictCreate(dictType *type)
{
    dict *d = zmalloc(sizeof(*d));
    _dictInit(d,type);
    return d;
}
1
2
3
4
5
6

int _dictInit(dict *d, dictType *type)
{
    _dictReset(d, 0);
    _dictReset(d, 1);
    d->type = type;
    d->rehashidx = -1;
    d->pauserehash = 0;
    return DICT_OK;
}
1
2
3
4
5
6
7
8
9

static void _dictReset(dict *d, int htidx)
{
    d->ht_table[htidx] = NULL;
    d->ht_size_exp[htidx] = -1;//初始的时候两个哈希表的大小都小于1
    d->ht_used[htidx] = 0;//初始的时候两个哈希表都没用
}
1
2
3
4
5
6

增大哈希表的大小

int dictExpand(dict *d, unsigned long size) {
    return _dictExpand(d, size, NULL);
}
1
2
3

具体函数为：

int _dictExpand(dict *d, unsigned long size, int* malloc_failed)
{
    if (malloc_failed) *malloc_failed = 0;

    //如果正在重新哈希或者本来尺寸就足够的话，返回错误
    if (dictIsRehashing(d) || d->ht_used[0] > size)
        return DICT_ERR;

    //新的哈希表
    dictEntry **new_ht_table;
    unsigned long new_ht_used;
    //新的最合适的大小（power(2,new_ht_size_exp)>=size）
    signed char new_ht_size_exp = _dictNextExp(size);

    // ul指的是unsigned long
    size_t newsize = 1ul<<new_ht_size_exp;
    if (newsize < size || newsize * sizeof(dictEntry*) < newsize)
        return DICT_ERR;

    /* Rehashing to the same table size is not useful. */
    if (new_ht_size_exp == d->ht_size_exp[0]) return DICT_ERR;

    if (malloc_failed) {//malloc失败的话就调用 zcalloc
        new_ht_table = ztrycalloc(newsize*sizeof(dictEntry*));
        *malloc_failed = new_ht_table == NULL;
        if (*malloc_failed)
            return DICT_ERR;
    } else
    //这里申请的内存大小是newsize*sizeof(dictEntry*)，也就是newsize个dictEntry*
        new_ht_table = zcalloc(newsize*sizeof(dictEntry*));

    new_ht_used = 0;

    //在上面初始化的时候，两个ht_table都是NULL,因此在这里判断是否是真正的rehash
    if (d->ht_table[0] == NULL) {
        d->ht_size_exp[0] = new_ht_size_exp;
        d->ht_used[0] = new_ht_used;
        d->ht_table[0] = new_ht_table;
        return DICT_OK;
    }

    //如果第一个ht_table不是NULL，之前申请的空间就是为第二个ht_table准备的
    //该空间申请之后并未立即进行节点的转移，反而是将d->rehashidx设置为0
    //接下来便开始rehash操作了
    d->ht_size_exp[1] = new_ht_size_exp;
    d->ht_used[1] = new_ht_used;
    d->ht_table[1] = new_ht_table;
    d->rehashidx = 0;
    return DICT_OK;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50

根据需要增大哈希表的大小

上一个函数是直接增大哈希表的大小，这里判断该哈希表需不需要扩充

static int _dictExpandIfNeeded(dict *d)
{
    //如果正在进行rehash，直接返回
    if (dictIsRehashing(d)) return DICT_OK;

    //计算第一个的哈希表大小，初始大小是4
    if (DICTHT_SIZE(d->ht_size_exp[0]) == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE);

    //这个判断的条件就很多，需要同时满足三个条件：
    //1.ht_table[0]的所有空间都被使用
    //2.字典能够被调整大小（dict_can_resize是一个静态变量，初始值是1）或者ht_used[0]/ht_table[0]的尺寸大于dict_force_resize_ratio
    //3.dictTypeExpandAllowed(d)为真，因为一次可能会加载大量的内存，需要使用成员函数判断一下
        if (d->ht_used[0] >= DICTHT_SIZE(d->ht_size_exp[0]) &&
        (dict_can_resize ||
         d->ht_used[0]/ DICTHT_SIZE(d->ht_size_exp[0]) > dict_force_resize_ratio) &&
        dictTypeExpandAllowed(d))
    {
        return dictExpand(d, d->ht_used[0] + 1);
    }
    return DICT_OK;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21

Rehash

这个是dict的一个非常重要的步骤，意味着之前的bucket数量已经不足以容纳当前的节点，需要重新创建新的多个bucket，并将之前的值移动,但是在Redis中有两点需要注意：

同时移动大量的节点非常耗时，这将会影响Redis的速度，因此Redis中采用的是惰性移动方式，一次移动少量的节点
Redis中所有的dictEntry对象都是在堆中存储，移动的时候并不需要深拷贝，资源并未复制

int dictRehash(dict *d, int n) {
    int empty_visits = n*10; //最多访问n*10个bucket，如果连着n*10个bucket都是空的话，直接返回成功
    if (!dictIsRehashing(d)) return 0;

    while(n-- && d->ht_used[0] != 0) {//第一个表不能为空，这里写成d->ht_used[0] != NULL更好
        dictEntry *de, *nextde;

        //防止rehashidx溢出
        assert(DICTHT_SIZE(d->ht_size_exp[0]) > (unsigned long)d->rehashidx);
        //连着n*10个bucket都是空的话，直接返回成功
        while(d->ht_table[0][d->rehashidx] == NULL) {
            d->rehashidx++;
            if (--empty_visits == 0) return 1;
        }
        de = d->ht_table[0][d->rehashidx];
        //这个rehashidx是bucket的索引，有效的话把该bucket的dictEntry链全部移动
        while(de) {
            uint64_t h;

            nextde = de->next;
            h = dictHashKey(d, de->key) & DICTHT_SIZE_MASK(d->ht_size_exp[1]);//计算在新的哈希表中的hash值
            /*下面这两行代码虽然简单，但是笔者认为不好理解，这两行代码是这样一个过程：
            ht_table[1][h]是计算出来指定key的新位置，但该位置是一个指针，初始值为NULL
            第一次移动节点，de就是原节点的指针，将 de->next指向d->ht_table[1][h]，也就是de->next
            置为空,然后此时d->ht_table[1][h]指向de,完成第一个节点的转移；接着对于一个新的节点de，将 de->next指向d->ht_table[1][h]，也就是将de->next指向当前 d->ht_table[1][h]指向的节点，d->ht_table[1][h]又指向新的de，以此类推*/
            de->next = d->ht_table[1][h];
            d->ht_table[1][h] = de;
            d->ht_used[0]--;
            d->ht_used[1]++;
            de = nextde;
        }
        d->ht_table[0][d->rehashidx] = NULL;
        d->rehashidx++;
    }

    //节点全部转移完毕，废弃ht[1]仍然用ht[0]
    if (d->ht_used[0] == 0) {
        zfree(d->ht_table[0]);
        /* Copy the new ht onto the old one */
        d->ht_table[0] = d->ht_table[1];
        d->ht_used[0] = d->ht_used[1];
        d->ht_size_exp[0] = d->ht_size_exp[1];
        _dictReset(d, 1);
        d->rehashidx = -1;
        return 0;
    }

    /* More to rehash... */
    return 1;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50

添加键

int dictAdd(dict *d, void *key, void *val)
{
    dictEntry *entry = dictAddRaw(d,key,NULL);

    if (!entry) return DICT_ERR;
    dictSetVal(d, entry, val);
    return DICT_OK;
}
1
2
3
4
5
6
7
8

这个函数非常重要

dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing)
{
    long index;
    dictEntry *entry;
    int htidx;

    if (dictIsRehashing(d)) _dictRehashStep(d);

    //获取新元素的索引，返回值为-1时表明已经存在
    if ((index = _dictKeyIndex(d, key, dictHashKey(d,key), existing)) == -1)
        return NULL;

    //如果是在进行rehash，新节点添加到ht[1]上
    htidx = dictIsRehashing(d) ? 1 : 0;
    size_t metasize = dictMetadataSize(d);//调用type中的dictEntryMetadataBytes计算metasize
    entry = zmalloc(sizeof(*entry) + metasize);//每个dictEntry对象申请的内存有这两部分
    if (metasize > 0) {
        memset(dictMetadata(entry), 0, metasize);
    }
    entry->next = d->ht_table[htidx][index];
    d->ht_table[htidx][index] = entry;
    d->ht_used[htidx]++;

    //这个函数（宏定义）主要是用来判断是否需要调用type->keyDup函数对key进行复制
    dictSetKey(d, entry, key);
    return entry;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27

找到指定键的索引

static long _dictKeyIndex(dict *d, const void *key, uint64_t hash, dictEntry **existing)
{
    unsigned long idx, table;
    dictEntry *he;
    if (existing) *existing = NULL;

    /* Expand the hash table if needed */
    if (_dictExpandIfNeeded(d) == DICT_ERR)
        return -1;
    for (table = 0; table <= 1; table++) {
        idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[table]);
        /* Search if this slot does not already contain the given key */
        he = d->ht_table[table][idx];
        while(he) {
            if (key==he->key || dictCompareKeys(d, key, he->key)) {
                if (existing) *existing = he;
                return -1;
            }
            he = he->next;
        }
        if (!dictIsRehashing(d)) break;
    }
    return idx;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24

删除键

既然有增加键，也就有删除键

dictDelete

int dictDelete(dict *ht, const void *key) {
    return dictGenericDelete(ht,key,0) ? DICT_OK : DICT_ERR;
}
1
2
3

dictGenericDelete

static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) {
    uint64_t h, idx;
    dictEntry *he, *prevHe;
    int table;

    //原本就是空的话直接返回就好
    if (dictSize(d) == 0) return NULL;

	//发现正在进行rehash的话就来一步
    if (dictIsRehashing(d)) _dictRehashStep(d);
    h = dictHashKey(d, key);

    for (table = 0; table <= 1; table++) {
        idx = h & DICTHT_SIZE_MASK(d->ht_size_exp[table]);
        he = d->ht_table[table][idx];
        prevHe = NULL;
        while(he) {
            //这个dictEntry对象位于这条链上，因此要逐个查找比较
            if (key==he->key || dictCompareKeys(d, key, he->key)) {//同样是宏定义
                if (prevHe)
                    prevHe->next = he->next;
                else//prevHe为空，说明是第一个节点
                    d->ht_table[table][idx] = he->next;
                if (!nofree) {
                    //只把节点断开，但是不释放这块内存
                    dictFreeUnlinkedEntry(d, he);
                }
                d->ht_used[table]--;
                return he;//在这里无论he的返回值不是NULL,无论是否释放这块内存
            }
            prevHe = he;
            he = he->next;
        }
        if (!dictIsRehashing(d)) break;
    }
    return NULL; /* not found */
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37

有了删除键的相关函数，查找键和这个非常类似，这里就不详细叙述了

清除整个表

有了前面的代码，这部分也很好理解，先把所有节点中key val指向的内存释放干净之后直接free整张表

int _dictClear(dict *d, int htidx, void(callback)(dict*)) {
    unsigned long i;

    //这个判断条件很有意思，DICTHT_SIZE(d->ht_size_exp[htidx])和d->ht_used[htidx]有一个为0就终止遍历
    for (i = 0; i < DICTHT_SIZE(d->ht_size_exp[htidx]) && d->ht_used[htidx] > 0; i++) {
        dictEntry *he, *nextHe;

        if (callback && (i & 65535) == 0) callback(d);

        if ((he = d->ht_table[htidx][i]) == NULL) continue;
        while(he) {
            nextHe = he->next;
            dictFreeKey(d, he);
            dictFreeVal(d, he);
            zfree(he);
            d->ht_used[htidx]--;
            he = nextHe;
        }
    }
    /* Free the table and the allocated cache structure */
    zfree(d->ht_table[htidx]);
    /* Re-initialize the table */
    _dictReset(d, htidx);
    return DICT_OK; /* never fails */
}
//清空两个hash表
void dictRelease(dict *d)
{
    _dictClear(d,0,NULL);
    _dictClear(d,1,NULL);
    zfree(d);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32

fingerprint

//指纹就是一个64位的数，用来表示字典在特定时间的状态，只是一个异或的字典属性；
//当一个不安全的迭代器初始化时，我们可以获得字典饿指纹，并在迭代器释放之后校验指纹
//如果两个指纹不一样，就意味着使用者在迭代时执行了违禁的操作
unsigned long long dictFingerprint(dict *d) {
    unsigned long long integers[6], hash = 0;
    int j;

    integers[0] = (long) d->ht_table[0];//将ht[0]的地址保存
    integers[1] = d->ht_size_exp[0];
    integers[2] = d->ht_used[0];
    integers[3] = (long) d->ht_table[1];//将ht[1]的地址保存
    integers[4] = d->ht_size_exp[1];
    integers[5] = d->ht_used[1];

    /* We hash N integers by summing every successive integer with the integer
     * hashing of the previous sum. Basically:
     *
     * Result = hash(hash(hash(int1)+int2)+int3) ...
     *
     * This way the same set of integers in a different order will (likely) hash
     * to a different number. */
    for (j = 0; j < 6; j++) {
        hash += integers[j];
        //使用一个特定的算法获得这6个unsigned long long数的hash值
        /* For the hashing step we use Tomas Wang's 64 bit integer hash. */
        hash = (~hash) + (hash << 21); // hash = (hash << 21) - hash - 1;
        hash = hash ^ (hash >> 24);
        hash = (hash + (hash << 3)) + (hash << 8); // hash * 265
        hash = hash ^ (hash >> 14);
        hash = (hash + (hash << 2)) + (hash << 4); // hash * 21
        hash = hash ^ (hash >> 28);
        hash = hash + (hash << 31);
    }
    return hash;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35

迭代器

迭代器的基本组成为：

```c
typedef struct dictIterator {
    dict *d;
    long index;
    int table, safe;
    dictEntry *entry, *nextEntry;
    /* unsafe iterator fingerprint for misuse detection. */
    unsigned long long fingerprint;
} dictIterator;
1
2
3
4
5
6
7
8
9

```c
//第一步：获取迭代器，其实就是新创建一个迭代器
dictIterator *dictGetIterator(dict *d)
{
    dictIterator *iter = zmalloc(sizeof(*iter));

    iter->d = d;
    iter->table = 0;
    iter->index = -1;
    iter->safe = 0;
    iter->entry = NULL;
    iter->nextEntry = NULL;
    return iter;
}
//创建一个安全的迭代器，这里和上面的指纹都是围绕迭代器安不安全来说，所谓的安全：
//就是不对dict做出改变，例如增删节点，rehash等
dictIterator *dictGetSafeIterator(dict *d) {
    dictIterator *i = dictGetIterator(d);

    i->safe = 1;
    return i;
}
//获取下一个节点
dictEntry *dictNext(dictIterator *iter)
{
    while (1) {
        if (iter->entry == NULL) {
            if (iter->index == -1 && iter->table == 0) {
                if (iter->safe)
                //安全的话先暂停rehash
                    dictPauseRehashing(iter->d);
                else
                    iter->fingerprint = dictFingerprint(iter->d);
            }
            iter->index++;
            if (iter->index >= (long) DICTHT_SIZE(iter->d->ht_size_exp[iter->table])) {
                if (dictIsRehashing(iter->d) && iter->table == 0) {
                    iter->table++;
                    iter->index = 0;
                } else {
                    break;
                }
            }
            iter->entry = iter->d->ht_table[iter->table][iter->index];
        } else {
            iter->entry = iter->nextEntry;
        }
        if (iter->entry) {
            /* We need to save the 'next' here, the iterator user
             * may delete the entry we are returning. */
            iter->nextEntry = iter->entry->next;
            return iter->entry;
        }
    }
    return NULL;
}

void dictReleaseIterator(dictIterator *iter)
{
    if (!(iter->index == -1 && iter->table == 0)) {
        if (iter->safe)
            dictResumeRehashing(iter->d);
        else
            assert(iter->fingerprint == dictFingerprint(iter->d));
    }
    zfree(iter);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67

个人感悟

大部分代码读下来，发现dict设计还是非常有意思的，一个dict两个hash表，尺寸以4 8 16…逐级增加，一个hash表空间不够，将节点转移到另一个hash表上（rehash），hash表的每个bucket中节点以链式的形式存放。把握了这些要点，后面的函数就很好理解，有问题可以在评论区留言，欢迎交流~

相关阅读:
vue传递给后端时间格式问题
 开发1-5年的Java程序员，该学习哪些知识实现涨薪30K？
ShareSDK for Flutter
Check for degenerate boxes检查退化框
 开源协议介绍
 Qt ListItem添加右键菜单
 vs2022的下载及安装教程（Visual Studio 2022）
Mysql 面试题总结
 C++ Reference: Standard C++ Library reference: C Library: cstdio: fgets
LLM探索：环境搭建与模型本地部署
原文地址：https://blog.csdn.net/qq_36763031/article/details/125448592