前言:
写这个文章的初衷是因为今天手写了一个字典树,然后写字典树以后忽然想到了之前看的技术文章,linux kernel 之前的pid 申请方式已经从 bitmap 变成了 基数树,所以打算写文章再回顾一下这种数据结构算法
一、内核中pid的申请
旧的内核中pid的申请就不多说了,是使用的是bitmap去实现的,新版的换为了基数树。申请pid的内核代码在:
- struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
- size_t set_tid_size)
- {
- .......
- }
继续看代码,申请函数为:
- if (tid) {
- nr = idr_alloc(&tmp->idr, NULL, tid,
- tid + 1, GFP_ATOMIC);
- /*
- * If ENOSPC is returned it means that the PID is
- * alreay in use. Return EEXIST in that case.
- */
- if (nr == -ENOSPC)
- nr = -EEXIST;
- } else {
- int pid_min = 1;
- /*
- * init really needs pid 1, but after reaching the
- * maximum wrap back to RESERVED_PIDS
- */
- if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
- pid_min = RESERVED_PIDS;
-
- /*
- * Store a null pointer so find_pid_ns does not find
- * a partially initialized PID (see below).
- */
- nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
- pid_max, GFP_ATOMIC);
- }
进程最大的pid号为:
-
- int pid_max = PID_MAX_DEFAULT;
-
- /*
- * This controls the default maximum pid allocated to a process
- */
- #define PID_MAX_DEFAULT (CONFIG_BASE_SMALL ? 0x1000 : 0x8000)
这里我发现了一个问题,linux 最大的pid 是 0x8000 而不是 0xFFFF,这是为了和老的内核兼容,我们可以通过
cat /proc/sys/kernel/pid_max
查询当前内核的最大pid。
申请pid 函数我们聚焦在
idr_alloc 和 idr_alloc_cyclic
再进去看 idr_alloc_cyclic 的实现
- int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp)
- {
- u32 id = idr->idr_next;
- int err, max = end > 0 ? end - 1 : INT_MAX;
-
- if ((int)id < start)
- id = start;
-
- err = idr_alloc_u32(idr, ptr, &id, max, gfp);
- if ((err == -ENOSPC) && (id > start)) {
- id = start;
- err = idr_alloc_u32(idr, ptr, &id, max, gfp);
- }
- if (err)
- return err;
-
- idr->idr_next = id + 1;
- return id;
- }
具体实现pid 申请的函数在:
- int idr_alloc_u32(struct idr *idr, void *ptr, u32 *nextid,
- unsigned long max, gfp_t gfp)
- {
- struct radix_tree_iter iter;
- void __rcu **slot;
- unsigned int base = idr->idr_base;
- unsigned int id = *nextid;
-
- if (WARN_ON_ONCE(!(idr->idr_rt.xa_flags & ROOT_IS_IDR)))
- idr->idr_rt.xa_flags |= IDR_RT_MARKER;
-
- id = (id < base) ? 0 : id - base;
- radix_tree_iter_init(&iter, id);
- slot = idr_get_free(&idr->idr_rt, &iter, gfp, max - base);
- if (IS_ERR(slot))
- return PTR_ERR(slot);
-
- *nextid = iter.index + base;
- /* there is a memory barrier inside radix_tree_iter_replace() */
- radix_tree_iter_replace(&idr->idr_rt, &iter, slot, ptr);
- radix_tree_iter_tag_clear(&idr->idr_rt, &iter, IDR_FREE);
-
- return 0;
- }
我们需要重点分析这个函数,重要的数据结构radix_tree_iter,我们看一下这个数据结构:
这是一个基数树的迭代器
- /**
- * struct radix_tree_iter - radix tree iterator state
- *
- * @index: index of current slot
- * @next_index: one beyond the last index for this chunk
- * @tags: bit-mask for tag-iterating
- * @node: node that contains current slot
- *
- * This radix tree iterator works in terms of "chunks" of slots. A chunk is a
- * subinterval of slots contained within one radix tree leaf node. It is
- * described by a pointer to its first slot and a struct radix_tree_iter
- * which holds the chunk's position in the tree and its size. For tagged
- * iteration radix_tree_iter also holds the slots' bit-mask for one chosen
- * radix tree tag.
- */
- struct radix_tree_iter {
- unsigned long index;
- unsigned long next_index;
- unsigned long tags;
- struct radix_tree_node *node;
- };
迭代器中有重要的数据结构
#define radix_tree_node xa_node
也就是替换后 变为了
struct xa_node*
我们再继续看 xa_node* 这个数据结构,这是基数树的节点
- /*
- * @count is the count of every non-NULL element in the ->slots array
- * whether that is a value entry, a retry entry, a user pointer,
- * a sibling entry or a pointer to the next level of the tree.
- * @nr_values is the count of every element in ->slots which is
- * either a value entry or a sibling of a value entry.
- */
- struct xa_node {
- unsigned char shift; /* Bits remaining in each slot */
- unsigned char offset; /* Slot offset in parent */
- unsigned char count; /* Total entry count */
- unsigned char nr_values; /* Value entry count */
- struct xa_node __rcu *parent; /* NULL at top of tree */
- struct xarray *array; /* The array we belong to */
- union {
- struct list_head private_list; /* For tree user */
- struct rcu_head rcu_head; /* Used when freeing node */
- };
- void __rcu *slots[XA_CHUNK_SIZE];
- union {
- unsigned long tags[XA_MAX_MARKS][XA_MARK_LONGS];
- unsigned long marks[XA_MAX_MARKS][XA_MARK_LONGS];
- };
- };
这些宏:
- #ifndef XA_CHUNK_SHIFT
- #define XA_CHUNK_SHIFT (CONFIG_BASE_SMALL ? 4 : 6)
- #endif
- #define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT)
XA_CHUNK_SIZE 是 64 或者16.
#define XA_MAX_MARKS 3
对应的CHUNK是6个位或者 4个位
然后我们看到pid最大是0x8000, 4 *4 为 16 个 槽,这意味着基数树高最大为3((16 / 6 ) + 1)
- #if defined __x86_64__ && !defined __ILP32__
- # define __WORDSIZE 64
- #else
- # define __WORDSIZE 32
- #define __WORDSIZE32_SIZE_ULONG 0
- #define __WORDSIZE32_PTRDIFF_LONG 0
- #endif
-
- #ifndef BITS_PER_LONG
- # define BITS_PER_LONG __WORDSIZE
- #endif
BITS_PER_LONG 为64 或者32
- #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
-
- /*
- * The IDR API does not expose the tagging functionality of the radix tree
- * to users. Use tag 0 to track whether a node has free space below it.
- */
- #define IDR_FREE 0
带入计算就是:
(64 + 64 -1) / 64 = 1
对应的宏
- #define XA_MAX_MARKS 3
- #define XA_MARK_LONGS DIV_ROUND_UP(XA_CHUNK_SIZE, BITS_PER_LONG)
初始化基数树的迭代器:
- /**
- * radix_tree_iter_init - initialize radix tree iterator
- *
- * @iter: pointer to iterator state
- * @start: iteration starting index
- * Returns: NULL
- */
- static __always_inline void __rcu **
- radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start)
- {
- /*
- * Leave iter->tags uninitialized. radix_tree_next_chunk() will fill it
- * in the case of a successful tagged chunk lookup. If the lookup was
- * unsuccessful or non-tagged then nobody cares about ->tags.
- *
- * Set index to zero to bypass next_index overflow protection.
- * See the comment in radix_tree_next_chunk() for details.
- */
- iter->index = 0;
- iter->next_index = start;
- return NULL;
- }
主要看idr_get_free,这个函数要看懂
shift -= RADIX_TREE_MAP_SHIFT;
shift 最大是16, 每次我们移动是6位,对应的是CHUNKSIZE
- if (!tag_get(node, IDR_FREE, offset)) {
- offset = radix_tree_find_next_bit(node, IDR_FREE,
- offset + 1);
- start = next_index(start, node, offset);
- if (start > max || start == 0)
- return ERR_PTR(-ENOSPC);
- while (offset == RADIX_TREE_MAP_SIZE) {
- offset = node->offset + 1;
- node = node->parent;
- if (!node)
- goto grow;
- shift = node->shift;
- }
- child = rcu_dereference_raw(node->slots[offset]);
- }
每次循环如果循环到tag不是IDR_FREE,就会继续找下一个tag,直到找到是IDR_FREE的solt。
在这里退出循环
- else if (!radix_tree_is_internal_node(child))
- break;
最后返回空闲槽
- iter->index = start;
- if (node)
- iter->next_index = 1 + min(max, (start | node_maxindex(node)));
- else
- iter->next_index = 1;
- iter->node = node;
- set_iter_tags(iter, node, offset, IDR_FREE);
-
- return slot;
上面的start 就应该是要申请的pid,我们最后看next_index
- /*
- * The maximum index which can be stored in a radix tree
- */
- static inline unsigned long shift_maxindex(unsigned int shift)
- {
- return (RADIX_TREE_MAP_SIZE << shift) - 1;
- }
-
- static inline unsigned long node_maxindex(const struct radix_tree_node *node)
- {
- return shift_maxindex(node->shift);
- }
-
- static unsigned long next_index(unsigned long index,
- const struct radix_tree_node *node,
- unsigned long offset)
- {
- return (index & ~node_maxindex(node)) + (offset << node->shift);
- }
所以start 就是要申请的pid。