Redis之ziplist源码分析

一、ziplist简介

从上一篇分析我们知道quicklist的底层存储使用了ziplist（压缩列表），由于压缩列表本身也有不少内容，所以重新开了一篇，在正式源码之前，还是先看下ziplist的特点：

ziplist是一种特殊编码的双向列表，特殊编码是为了节省存储空间。
ziplist允许同时存放字符串和整型类型，并且整型数被编码成真实的整型数而不是字符串序列(节省空间)。
ziplist列表支持在头部和尾部进行push和pop操作的时间复杂度都在常量范围O(1)，但是每次操作都涉及内存重新分配，尤其在头部操作时，会涉及大段的内存移动操作，增加了操作的复杂性。

上面粗体部分会在下面的代码分析中一一体现（ziplist.h和ziplist.c）。

二、ziplist数据结构

下面我们先看一下ziplist的结构示意图：

上面示意图展示了ziplist的整体结构，由于ziplist和entry的长度是不定长的，因此代码中也没有这两个接口的定义，这里先给出一个示意结构定义，方便理解：

struct ziplist{

unsigned int zlbytes; // ziplist的长度字节数，包含头部、所有entry和zipend。
unsigned int zloffset; // 从ziplist的头指针到指向最后一个entry的偏移量，用于快速反向查询
unsigned short int zllength; // entry元素个数
T[] entry;              // 元素值
unsigned char zlend;   // ziplist结束符，值固定为0xFF

}

struct entry{
char[var] prevlen; // 前面一个entry的字节长度值。
char[var] encoding; // 元素编码类型
char[] content; // 元素内容
}

代码中对ziplist的变量的读取和赋值都是通过宏来实现的，如下：

define ZIPLIST_BYTES(zl) (((uint32_t)(zl)))

define ZIPLIST_TAIL_OFFSET(zl) (((uint32_t)((zl)+sizeof(uint32_t))))

define ZIPLIST_LENGTH(zl) (((uint16_t)((zl)+sizeof(uint32_t)*2)))

define ZIPLIST_HEADER_SIZE (sizeof(uint32_t)*2+sizeof(uint16_t))

define ZIPLIST_END_SIZE (sizeof(uint8_t))

define ZIPLIST_ENTRY_HEAD(zl) ((zl)+ZIPLIST_HEADER_SIZE)

define ZIPLIST_ENTRY_TAIL(zl) ((zl)+intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl)))

define ZIPLIST_ENTRY_END(zl) ((zl)+intrev32ifbe(ZIPLIST_BYTES(zl))-1)

entry的结构要稍微复杂一些了，里面对prevlen和encoding做了特殊编码以节省空间，ziplist的精髓也正是在这里体现的。

首先看prevlen赋值方法的源代码：

/* Encode the length of the previous entry and write it to "p". Return the

number of bytes needed to encode this length if "p" is NULL.
把前一个entry的长度编码后写入当前entry的prevLen字段，编码规则：
1. 如果len<254，prevLen占用一个字节，并写入当前entry的第一个字节。
2. 如果len>=254，prevLen占用五个字节，第一个字节固定写入254，第二个至第五个字节写入实际的长度。
  */

static unsigned int zipPrevEncodeLength(unsigned char *p, unsigned int len) {

if (p == NULL) { // 此时只是计算len所需的存储长度
    return (len < ZIP_BIGLEN) ? 1 : sizeof(len)+1;
} else {
    if (len < ZIP_BIGLEN) {
        p[0] = len;
        return 1;
    } else {
        p[0] = ZIP_BIGLEN;
        memcpy(p+1,&len,sizeof(len));
        memrev32ifbe(p+1);
        return 1+sizeof(len);
    }
}

}

下面再来分析encoding字段，redis根据存储元素的值做不同的编码（long long类型和String类型），long long类型编码也是为了节省空间，其是在zipTryEncoding方法中进行：

/* Check if string pointed to by 'entry' can be encoded as an integer.

Stores the integer value in 'v' and its encoding in 'encoding'.
当存储内容可以转化为long long类型时，encoding占用一个字节，其中前2位固定都是1，后面6位根据value值大小不同，具体如下：
1. OX11000000 表示content内容是int16，长度是2个字节。
2. OX11010000 表示content内容是int32，长度是4个字节。
3. OX11100000 表示content内容是int64，长度是8个字节。
4. OX11110000 表示content内容是int24，长度是3个字节。
5. OX11111110 表示content内容是int8，长度是1个字节。
6. OX11111111 表示ziplist的结束。
7. 0X1111xxxx 表示极小数，存储0-12的值，由于0000和1111都不能使用，所以它的实际值将是1至13，程序在取得这4位的值之后，还需要减去1，才能计算出正确的值，比如说，如果后4位为0001 = 1，那么程序返回的值将是1-1=0。
  */

static int zipTryEncoding(unsigned char entry, unsigned int entrylen, long long v, unsigned char *encoding) {

long long value;

if (entrylen >= 32 || entrylen == 0) return 0;
if (string2ll((char*)entry,entrylen,&value)) {
    /* Great, the string can be encoded. Check what's the smallest
     * of our encoding types that can hold this value. */
    if (value >= 0 && value <= 12) {
        *encoding = ZIP_INT_IMM_MIN+value;
    } else if (value >= INT8_MIN && value <= INT8_MAX) {
        *encoding = ZIP_INT_8B;
    } else if (value >= INT16_MIN && value <= INT16_MAX) {
        *encoding = ZIP_INT_16B;
    } else if (value >= INT24_MIN && value <= INT24_MAX) {
        *encoding = ZIP_INT_24B;
    } else if (value >= INT32_MIN && value <= INT32_MAX) {
        *encoding = ZIP_INT_32B;
    } else {
        *encoding = ZIP_INT_64B;
    }
    *v = value;
    return 1;
}
return 0;

}

上述方法定义了是否可以编码为long long类型，如果不能，则编码为String类型并赋值，编码代码在zipEncodeLength方法：

/* Encode the length 'rawlen' writing it in 'p'. If p is NULL it just returns

the amount of bytes required to encode such a length.
本方法对encoding是String类型时，进行编码并赋值（如果entry内容可以转化为long long类型，在zipTryEncoding方法中进行编码），并根据不同长度的字符串来编码encoding的值，具体如下：
1. 0X00xxxxxx 前两位00表示最大长度为63的字符串，后面6位表示实际字符串长度，encoding占用1个字节。
2. 0X01xxxxxx xxxxxxxx 前两位01表示中等长度的字符串（大于63小于等于16383），后面14位表示实际长度，encoding占用两个字节。
3. OX10000000 xxxxxxxx xxxxxxxx xxxxxxxx 表示特大字符串，第一个字节固定128(0X80)，后面四个字节存储实际长度，encoding占用5个字节。
  */

static unsigned int zipEncodeLength(unsigned char *p, unsigned char encoding, unsigned int rawlen) {

unsigned char len = 1, buf[5];

if (ZIP_IS_STR(encoding)) {
    /* Although encoding is given it may not be set for strings,
     * so we determine it here using the raw length. */
    if (rawlen <= 0x3f) {
        if (!p) return len;
        buf[0] = ZIP_STR_06B | rawlen;
    } else if (rawlen <= 0x3fff) {
        len += 1;
        if (!p) return len;
        buf[0] = ZIP_STR_14B | ((rawlen >> 8) & 0x3f);
        buf[1] = rawlen & 0xff;
    } else {
        len += 4;
        if (!p) return len;
        buf[0] = ZIP_STR_32B;
        buf[1] = (rawlen >> 24) & 0xff;
        buf[2] = (rawlen >> 16) & 0xff;
        buf[3] = (rawlen >> 8) & 0xff;
        buf[4] = rawlen & 0xff;
    }
} else {
    /* Implies integer encoding, so length is always 1. */
    if (!p) return len;
    buf[0] = encoding;
}

/* Store this length at p */
memcpy(p,buf,len);
return len;

}

三、ziplist增删改查

创建ziplist

在执行lpush命令时，如果当前quicklistNode是新建的，则需要新建一个ziplist：

/* Add new entry to head node of quicklist.
*

Returns 0 if used existing head.
Returns 1 if new head created.
在quicklist的头部节点添加新元素：

如果新元素添加在head中，返回0，否则返回1.
*/
int quicklistPushHead(quicklist quicklist, void value, size_t sz) {

quicklistNode *orig_head = quicklist->head;
// 如果head不为空，且空间大小满足新元素的存储要求，则新元素添加到head中，否则新加一个quicklistNode
if (likely(
        _quicklistNodeAllowInsert(quicklist->head, quicklist->fill, sz))) {
    quicklist->head->zl =
        ziplistPush(quicklist->head->zl, value, sz, ZIPLIST_HEAD);
    quicklistNodeUpdateSz(quicklist->head);
} else {
    // 创建新的quicklistNode
    quicklistNode *node = quicklistCreateNode();
    // 把新元素添加到新建的ziplist中
    node->zl = ziplistPush(ziplistNew(), value, sz, ZIPLIST_HEAD);
    // 更新ziplist的长度到quicklistNode的sz字段
    quicklistNodeUpdateSz(node);
    // 把新node添加到quicklist中，即添加到原head前面
    _quicklistInsertNodeBefore(quicklist, quicklist->head, node);
}
quicklist->count++;
quicklist->head->count++;
return (orig_head != quicklist->head);

}

/ Create a new empty ziplist. /
unsigned char *ziplistNew(void) {

unsigned int bytes = ZIPLIST_HEADER_SIZE+1;
unsigned char *zl = zmalloc(bytes);
ZIPLIST_BYTES(zl) = intrev32ifbe(bytes);
ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(ZIPLIST_HEADER_SIZE);
ZIPLIST_LENGTH(zl) = 0;
zl[bytes-1] = ZIP_END;
return zl;

}

添加entry

添加entry的代码在ziplistPush方法中：

unsigned char ziplistPush(unsigned char zl, unsigned char *s, unsigned int slen, int where) {

unsigned char *p;
p = (where == ZIPLIST_HEAD) ? ZIPLIST_ENTRY_HEAD(zl) : ZIPLIST_ENTRY_END(zl);
return __ziplistInsert(zl,p,s,slen);

}

/ Insert item at "p". zl中添加一个元素 /
static unsigned char __ziplistInsert(unsigned char zl, unsigned char p, unsigned char s, unsigned int slen) {

size_t curlen = intrev32ifbe(ZIPLIST_BYTES(zl)), reqlen;
unsigned int prevlensize, prevlen = 0;
size_t offset;
int nextdiff = 0;
unsigned char encoding = 0;
long long value = 123456789; /* initialized to avoid warning. Using a value
                                that is easy to see if for some reason
                                we use it uninitialized. */
zlentry tail;

/* Find out prevlen for the entry that is inserted. */
if (p[0] != ZIP_END) {
    ZIP_DECODE_PREVLEN(p, prevlensize, prevlen);
} else {
    // 当之前的操作从尾巴删除元素时，ZIPLIST_ENTRY_TAIL指针会向前迁移，此时ptail[0] != ZIP_END
    unsigned char *ptail = ZIPLIST_ENTRY_TAIL(zl);
    if (ptail[0] != ZIP_END) {
        prevlen = zipRawEntryLength(ptail);
    }
}

/* See if the entry can be encoded */
// 检查entry的value是否可以编码为long long类型，如果可以就把值保存在value中，
// 并把所需最小字节长度保存在encoding
if (zipTryEncoding(s,slen,&value,&encoding)) {
    /* 'encoding' is set to the appropriate integer encoding */
    reqlen = zipIntSize(encoding);
} else {
    /* 'encoding' is untouched, however zipEncodeLength will use the
     * string length to figure out how to encode it. */
    reqlen = slen;
}
/* We need space for both the length of the previous entry and
 * the length of the payload. */
reqlen += zipPrevEncodeLength(NULL,prevlen);
reqlen += zipEncodeLength(NULL,encoding,slen);

/* When the insert position is not equal to the tail, we need to
 * make sure that the next entry can hold this entry's length in
 * its prevlen field. */
nextdiff = (p[0] != ZIP_END) ? zipPrevLenByteDiff(p,reqlen) : 0;

// reqlen是zlentry所需大小，nextdiff是待插入位置原entry中prelen与新entry中prelen所需存储空间的大小差值。
/* Store offset because a realloc may change the address of zl. */
offset = p-zl;
zl = ziplistResize(zl,curlen+reqlen+nextdiff);
p = zl+offset;

/* Apply memory move when necessary and update tail offset. */
if (p[0] != ZIP_END) {
    /* Subtract one because of the ZIP_END bytes */
    // 原数据向后移动，腾出空间写入新的zlentry
    memmove(p+reqlen,p-nextdiff,curlen-offset-1+nextdiff);

    /* Encode this entry's raw length in the next entry. */
    // 新entry的长度写入下一个zlentry的prelen
    zipPrevEncodeLength(p+reqlen,reqlen);

    /* Update offset for tail */
    // 更新ZIPLIST_TAIL_OFFSET指向原来的tail entry。
    ZIPLIST_TAIL_OFFSET(zl) =
        intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+reqlen);

    /* When the tail contains more than one entry, we need to take
     * "nextdiff" in account as well. Otherwise, a change in the
     * size of prevlen doesn't have an effect on the *tail* offset. */
    zipEntry(p+reqlen, &tail);
    // 如果原插入位置的entry不是最后的tail元素，需要调整ZIPLIST_TAIL_OFFSET值（增加nextdiff）
    if (p[reqlen+tail.headersize+tail.len] != ZIP_END) {
        ZIPLIST_TAIL_OFFSET(zl) =
            intrev32ifbe(intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl))+nextdiff);
    }
} else {
    /* This element will be the new tail. */
    // ZIPLIST_TAIL_OFFSET指向新加的entry，即新加的entry是tail元素
    ZIPLIST_TAIL_OFFSET(zl) = intrev32ifbe(p-zl);
}

/* When nextdiff != 0, the raw length of the next entry has changed, so
 * we need to cascade the update throughout the ziplist */
if (nextdiff != 0) {
    // 如果nextdiff不为0，需要循环更新后续entry中的prelen，最差情况下，所有entry都需要更新一遍
    offset = p-zl;
    zl = __ziplistCascadeUpdate(zl,p+reqlen);
    p = zl+offset;
}

/* Write the entry */
// 给新加的entry赋值
p += zipPrevEncodeLength(p,prevlen);
p += zipEncodeLength(p,encoding,slen);
if (ZIP_IS_STR(encoding)) {
    memcpy(p,s,slen);
} else {
    zipSaveInteger(p,value,encoding);
}
ZIPLIST_INCR_LENGTH(zl,1);
return zl;

}

从上面代码可以看出，如果是在头部添加元素时，需要把执行memmove方法把当前ziplist中的所有元素后移一段距离，消耗还是比较大的。

删除entry

删除操作在ziplistDelete方法中实现，其逻辑和添加刚刚相反，就不再赘述了。

至此，ziplist的主体代码就分析结束了，从代码可以看到，ziplist的实现非常精妙，尽可能的节省存储空间，但是在头部操作时，会有大量的内存移动操作，消耗挺大，在尾部操作时，无内存移动，效率则要高很多。

本篇内容参考了钱文品的《Redis深度历险：核心原理与应用实践》，特此感谢！

https://github.com/tomliugen
原文地址https://www.cnblogs.com/xinghebuluo/p/12727840.html

Redis之ziplist源码分析

define ZIPLIST_BYTES(zl) (((uint32_t)(zl)))

define ZIPLIST_TAIL_OFFSET(zl) (((uint32_t)((zl)+sizeof(uint32_t))))

define ZIPLIST_LENGTH(zl) (((uint16_t)((zl)+sizeof(uint32_t)*2)))

define ZIPLIST_HEADER_SIZE (sizeof(uint32_t)*2+sizeof(uint16_t))

define ZIPLIST_END_SIZE (sizeof(uint8_t))

define ZIPLIST_ENTRY_HEAD(zl) ((zl)+ZIPLIST_HEADER_SIZE)

define ZIPLIST_ENTRY_TAIL(zl) ((zl)+intrev32ifbe(ZIPLIST_TAIL_OFFSET(zl)))

define ZIPLIST_ENTRY_END(zl) ((zl)+intrev32ifbe(ZIPLIST_BYTES(zl))-1)

热门文章

最新文章

相关课程

相关电子书

相关实验场景