list源码解析(Cpython)

2021-06-18

2021-06-17

本文字数: 2.4k 阅读时长: 11 分钟

阅读次数次

源码文件

https://github.com/python/cpython/blob/main/Objects/listobject.c

PyListObject的定义

在列表对象接口listobject.h中，PyListObject的定义是：

typedef struct {
    PyObject_VAR_HEAD
    /* Vector of pointers to list elements.  list[0] is ob_item[0], etc. */
    PyObject **ob_item;

    /* ob_item contains space for 'allocated' elements.  The number
     * currently in use is ob_size.
     * Invariants:
     *     0 <= ob_size <= allocated
     *     len(list) == ob_size
     *     ob_item == NULL implies ob_size == allocated == 0
     * list.sort() temporarily sets allocated to -1 to detect mutations.
     *
     * Items must normally not be NULL, except during construction when
     * the list is not yet visible outside the function that builds it.
     */
    Py_ssize_t allocated;
} PyListObject;

1.ob_item是指向了元素列表所在的内存块的首地址
2.allocated维护了当前列表中的可容纳的元素的总数
我们知道，用户选用list正是为了可以频繁的执行插入或删除等操作，如果是需要存多少就申请多大的内存，
这种内存管理显然是低效的。那么Python内部是怎么实现的呢？这就与刚才所提到的allocated有关了，我们知道，
在PyObject_VAR_HEAD中有一个ob_size，在PyListObject中，每一次需要申请内存时，总会申请一大块内存存，
这时申请的总内存的大小记录记录在allocated中，而其中实际被使用了的内存的数量则记录在ob_size中。

PyListObject对象的创建与维护

创建

步骤

1.首先，进行溢出检查
2.List对象的创建Python中的list对象实际上是分为两部分
- 1.PyListObject对象本身
- 2.PyListObject对象维护的元素列表，而这两块内存是通过ob_item建立联系的
3.在创建PyListObject对象时，检查缓冲池中free_list是否有可用的对象，如果有，则直接使用，
4.若没有可用对象，则通过PyObject_GC_New在系统堆中申请内存
5.当创建了新的PyListObject对象之后，会根据调用PyList_New是传递的
size参数创建ListObject对象所维护的元素列表。

源码

Python对于创建一个列表，提供了唯一的一条途径，就是PyList_New()

PyObject *
PyList_New(Py_ssize_t size)
{
PyListObject *op;
size_t nbytes;
#ifdef SHOW_ALLOC_COUNT
static int initialized = 0;
if (!initialized) {
Py_AtExit(show_alloc);
initialized = 1;
}
#endif

if (size < 0) {
PyErr_BadInternalCall();
return NULL;
}

//进行溢出检查
if ((size_t)size > PY_SIZE_MAX / sizeof(PyObject *))
return PyErr_NoMemory();
nbytes = size * sizeof(PyObject *);

//为PyListObject对象申请空间，使用到缓冲池技术
if (numfree) {
numfree--;
op = free_list[numfree];
_Py_NewReference((PyObject *)op);
#ifdef SHOW_ALLOC_COUNT
count_reuse++;
#endif
} else {
op = PyObject_GC_New(PyListObject, &PyList_Type);
if (op == NULL)
return NULL;
#ifdef SHOW_ALLOC_COUNT
count_alloc++;
#endif
}

//为PyListObject对象中维护的元素列表申请空间
if (size <= 0)
op->ob_item = NULL;
else {
op->ob_item = (PyObject **) PyMem_MALLOC(nbytes);
if (op->ob_item == NULL) {
Py_DECREF(op);
return PyErr_NoMemory();
}
memset(op->ob_item, 0, nbytes);
}
Py_SIZE(op) = size;
op->allocated = size;
_PyObject_GC_TRACK(op);
return (PyObject *) op;
}

设置元素

元素创建好了，下一步就是向元素中添加元素了,通过PyList_SetItem()实现：

1	list[5] = 1

步骤

1.进行类型检查
2.然后进行索引的有效性检查
3.当类型检查和索引检查均通过的时候，就可以将待加入的Pyobject*指针放在指定的位置。

源码

PyList_SetItem(register PyObject *op, register Py_ssize_t i,
register PyObject *newitem)
{
register PyObject *olditem;
register PyObject **p;
if (!PyList_Check(op)) {
Py_XDECREF(newitem);
PyErr_BadInternalCall();
return -1;
}
//索引检查
if (i < 0 || i >= Py_SIZE(op)) {
Py_XDECREF(newitem);
PyErr_SetString(PyExc_IndexError,
"list assignment index out of range");
return -1;
}
//存放元素
p = ((PyListObject *)op) -> ob_item + i;
olditem = *p;
*p = newitem;
Py_XDECREF(olditem);
return 0;
}

插入元素

插入元素和设置元素的不同在于:设置元素不会将ob_item指向的内存
发生变化，而插入内存可能会导致ob_item指向的内存发生变化。

比如：
1
2
3
4
5
6
a = [0, 0, 0, 0]
a[2] = 3
print(a)
[0, 0, 3, 0]
a.insert(2,3)
[0, 0, 2, 3, 0]
这个插入动作确实导致了元素列表的内存发生变化。关于插入，在列表中有两种操作：insert()和append()。

insert插入步骤

1.参数检查
2.从新调整列表容量通过 list_resize 方法确定是否需要申请内存
3.确定插入点
4.插入元素

insert源码

static int
ins1(PyListObject *self, Py_ssize_t where, PyObject *v)
{
    Py_ssize_t i, n = Py_SIZE(self);
    PyObject **items;
    if (v == NULL) {
        PyErr_BadInternalCall();
        return -1;
    }
    if (n == PY_SSIZE_T_MAX) {
        PyErr_SetString(PyExc_OverflowError,
            "cannot add more objects to list");
        return -1;
    }

    if (list_resize(self, n+1) < 0)
        return -1;

    if (where < 0) {
        where += n;
        if (where < 0)
            where = 0;
    }
    if (where > n)
        where = n;
    items = self->ob_item;
    for (i = n; --i >= where; )
        items[i+1] = items[i];
    Py_INCREF(v);
    items[where] = v;
    return 0;
}

append步骤

1.参数检查
2.容量检查
3.调用 list_resize 方法检查是否需要申请内存
4.添加元素

append源码

int
PyList_Append(PyObject *op, PyObject *newitem)
{
    if (PyList_Check(op) && (newitem != NULL))
        return app1((PyListObject *)op, newitem);
    PyErr_BadInternalCall();
    return -1;
}

列表转元组

其实就是新建一个大小和列表一样大小的数组，并将该列表内的元素添加到元组中

PyObject *
PyList_AsTuple(PyObject *v)
{
    PyObject *w;
    PyObject **p, **q;
    Py_ssize_t n;
    if (v == NULL || !PyList_Check(v)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    n = Py_SIZE(v);
    w = PyTuple_New(n);
    if (w == NULL)
        return NULL;
    p = ((PyTupleObject *)w)->ob_item;
    q = ((PyListObject *)v)->ob_item;
    while (--n >= 0) {
        Py_INCREF(*q);
        *p = *q;
        p++;
        q++;
    }
    return w;
}

列表反转

双向指针反转

static void
reverse_slice(PyObject **lo, PyObject **hi)
{
    assert(lo && hi);

    --hi;
    while (lo < hi) {
        PyObject *t = *lo;
        *lo = *hi;
        *hi = t;
        ++lo;
        --hi;
    }
}

列表extend

extend传说中的鸭子类型（动态类型），不在乎对象的确切形态，只要是可迭代对象就可以操作

if (PyList_CheckExact(iterable) || PyTuple_CheckExact(iterable) ||
                (PyObject *)self == iterable) {
        PyObject **src, **dest;
        iterable = PySequence_Fast(iterable, "argument must be iterable");
        if (!iterable)
            return NULL;

删除元素

对于一个容器而言，除了创建，插入这些操作，肯定是还得有删除操作的。

remove()
a = [1, 2, 3, 4]
print a.remove(3)
[1, 2, 4]

步骤

1.遍历ob_item对比remove的对象
2.返回cmp大于0调用list_ass_slice删除后判断是否还有元素
3.如果本来就没有这个元素直接跳到最后报错
1
ValueError: list.remove(x): x not in list

源码

static PyObject *
list_remove(PyListObject *self, PyObject *value)
/*[clinic end generated code: output=f087e1951a5e30d1 input=2dc2ba5bb2fb1f82]*/
{
    Py_ssize_t i;

    for (i = 0; i < Py_SIZE(self); i++) {
        PyObject *obj = self->ob_item[i];
        Py_INCREF(obj);
        int cmp = PyObject_RichCompareBool(obj, value, Py_EQ);
        Py_DECREF(obj);
        if (cmp > 0) {
            if (list_ass_slice(self, i, i+1,
                               (PyObject *)NULL) == 0)
                Py_RETURN_NONE;
            return NULL;
        }
        else if (cmp < 0)
            return NULL;
    }
    PyErr_SetString(PyExc_ValueError, "list.remove(x): x not in list");
    return NULL;
}

PyListObject对象缓冲池

1.在创建PyListObject对象时，会首先检查缓冲区中的free_lists中是否有可用的对象，如果没有。
- 1.创建PyListObject对象
- 2.然后创建PyListObject对象所维护的元素列表，与之对应
2.在销毁一个list时销毁的过程也是分离的。
- 1.销毁PyListObject所维护的元素列表
- 2.然后释放PyListObject对象自身
3.在删除PylsitObject对象（较小的）自身时
- 1.Python会先检查我们开始提到的那个缓冲池free_list，查看其中缓存的PyListObject的数量是否已经满了
- 2.如果没有，就将待删除的PyListObject对象放到缓冲池中，以备后用

因此，那个在Python启动时空荡荡的缓冲池原来都是被本应该死去的PyListObject对象给填充了，在以后需要创建新的PyListObject的时候，
Python会首先唤醒这些对象，重新分配Pyobject*元素列表占用的内存，重新拥抱新的对象。

list_resize补充

list作为一个可变对象，自然离不开内存的增加和减少(append、extend、insert、remove、pop等等都涉及到)，所以这方面对性能的影响也是有的。

步骤

1.如果allocated/2 <= newsize <= allocated则allocated不变直接把ob_size设置成Py_SET_SIZE(self, newsize)

否则

1 2	allocated = newsize + ((newsize >> 3) + (newsize < 9 ? 3 : 6)) ob_size = newsize

源码

static int
list_resize(PyListObject *self, Py_ssize_t newsize)
{
    PyObject **items;
    size_t new_allocated, num_allocated_bytes;
    Py_ssize_t allocated = self->allocated;

    /* Bypass realloc() when a previous overallocation is large enough
       to accommodate the newsize.  If the newsize falls lower than half
       the allocated size, then proceed with the realloc() to shrink the list.
    */
    if (allocated >= newsize && newsize >= (allocated >> 1)) {
        assert(self->ob_item != NULL || newsize == 0);
        Py_SET_SIZE(self, newsize);
        return 0;
    }

    /* This over-allocates proportional to the list size, making room
     * for additional growth.  The over-allocation is mild, but is
     * enough to give linear-time amortized behavior over a long
     * sequence of appends() in the presence of a poorly-performing
     * system realloc().
     * Add padding to make the allocated size multiple of 4.
     * The growth pattern is:  0, 4, 8, 16, 24, 32, 40, 52, 64, 76, ...
     * Note: new_allocated won't overflow because the largest possible value
     *       is PY_SSIZE_T_MAX * (9 / 8) + 6 which always fits in a size_t.
     */
    new_allocated = ((size_t)newsize + (newsize >> 3) + 6) & ~(size_t)3;
    /* Do not overallocate if the new size is closer to overallocated size
     * than to the old size.
     */
    if (newsize - Py_SIZE(self) > (Py_ssize_t)(new_allocated - newsize))
        new_allocated = ((size_t)newsize + 3) & ~(size_t)3;

    if (newsize == 0)
        new_allocated = 0;
    num_allocated_bytes = new_allocated * sizeof(PyObject *);
    items = (PyObject **)PyMem_Realloc(self->ob_item, num_allocated_bytes);
    if (items == NULL) {
        PyErr_NoMemory();
        return -1;
    }
    self->ob_item = items;
    Py_SET_SIZE(self, newsize);
    self->allocated = new_allocated;
    return 0;
}

python分析

import sys
for i in range(100):
	if sys.getsizeof(a) not in b:
		print(i, sys.getsizeof(a))
		b.add(sys.getsizeof(a))
	a.append(i)

	
0 56
1 88
5 120
9 184
17 256
26 336
36 424
47 520
59 632
73 760
89 904

append&insert&extend性能分析

insert的逻辑中，需要对插入部分后面的数据进行后移操作，
以及对插入位置进行一些范围处理，所以效率上会比append差一些z，
在append里每添加一条数据都需要进行尺寸判断，不够则重新申请，
而extend则是一口气算出最终需要的尺寸，只需要申请一次即可，
所以效率上append会比extend差一些。

python源码解析