linux控制组: cpuset解析

cpuset提供了一种机制，用于将一组CPU和内存节点分配给一组任务。在这里，“内存节点”是指包含内存的在线节点。

CPU集将任务的CPU和内存放置限制为仅任务当前cpuset中的资源。它们形成虚拟文件系统中可见的嵌套层次结构。这些是管理大型系统上动态作业放置所需的基本钩子，超出了已经存在的钩子。

cpuset使用控制组中描述的通用cgroup子系统。

任务通过sched_setaffinity系统调用把CPU加入其CPU关联掩码的请求，以及通过mbind和set_mempolicy系统调用把内存节点加入其内存策略的请求，都会经过该任务所在的cpuset过滤，不在该cpuset中的CPU或内存节点会被滤除。调度程序不会在任务的cpus_allowed向量不允许的CPU上调度该任务，内核页面分配器也不会在请求任务的mems_allowed向量不允许的节点上分配页面。

用户级代码可以在cgroup虚拟文件系统中按名称创建和销毁cpuset，管理这些cpuset的属性和权限以及分配给每个cpuset的CPU和内存节点，指定和查询任务分配给哪个cpuset，并列出分配给cpuset的任务pid。

cgroup在这里作为一组内存区域空间，通过obj_cgroup数组与它建立存储内存的访问关系。例如通过alloc_percpu分配内存时，得到的是obj_cgroup对象所对应的块：它先从内核静态percpu区域的预留块中查找是否存在满足要求(指定大小，并且按指定大小对齐)的块区域，如果没有找到，再创建新的pcpu块区域，并从顶部分配虚拟区间。

cpuset支持CPU/内存的热插拔事件(注册通知链)，它通过cpuset_track_online_nodes_nb函数检测cpu集的跟踪节点在线状态，当mems_allowed跟踪node_states[N_MEMORY]发生变化时，调度工作队列cpuset_hotplug_work，处理cpuset的CPU/内存热插拔等相关的变化。

内容

1. 函数分析

1.1 cpuset_init

初始化percpu读写信号量、分配顶部cpu集的部分参数

BUG_ON(percpu_init_rwsem(&cpuset_rwsem)); // initialize the per-CPU read-write semaphore cpuset_rwsem; a non-zero result is fatal
        
BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); // allocate the top cpuset's cpus_allowed mask (per the author's note: the user-configured CPUs on which tasks may be scheduled)
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); // allocate the effective_cpus mask (per the author's note: the CPUs actually in effect for running tasks)
BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); // allocate the subparts_cpus mask (CPUs handed out to child partitions)

cpumask_setall(top_cpuset.cpus_allowed); // set every CPU bit in cpus_allowed

DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
percpu_init_rwsem
top_cpuset

1.2 cpuset_init_smp

/*
 * cpuset_init_smp - cpuset setup steps shown in this excerpt.
 *
 * Records the current memory-node mask, seeds the top cpuset's effective
 * CPU/memory masks, and registers the memory-hotplug notifier block.
 */
void __init cpuset_init_smp(void)
{
        /*
         * cpus_allowed/mems_allowed set to v2 values in the initial
         * cpuset_bind() call will be reset to v1 values in another
         * cpuset_bind() call when a v1 cpuset is mounted.
         */
        top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; /* remember the current memory-node mask */

        /* Seed the effective masks from the active CPUs and from the nodes that have memory. */
        cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
        top_cpuset.effective_mems = node_states[N_MEMORY];

        /*
         * Hook cpuset_track_online_nodes_nb into the memory notifier chain.
         * The block is inserted according to its priority.
         */
        register_hotmemory_notifier(&cpuset_track_online_nodes_nb);

        /* The excerpt stops here; anything the real function does afterwards is not shown. */
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14

cpuset_track_online_nodes_nb
register_hotmemory_notifier

2. 源码结构

cpuset_rwsem 一种新的读写信号量设计，针对读取锁定进行了优化

static struct percpu_rw_semaphore cpuset_rwsem = {                           
        .rss = __RCU_SYNC_INITIALIZER(cpuset_rwsem,.rss),                        
        .read_count = &__percpu_rwsem_rc_cpuset_rwsem,                        
        .writer = __RCUWAIT_INITIALIZER(cpuset_rwsem.writer),                   
        .waiters = __WAIT_QUEUE_HEAD_INITIALIZER(cpuset_rwsem.waiters),         
        .block = ATOMIC_INIT(0),                                       
        __PERCPU_RWSEM_DEP_MAP_INIT(cpuset_rwsem)                             
};
1
2
3
4
5
6
7
8

cpuset_track_online_nodes_nb cpu集跟踪节点在线状态

/*
 * Notifier block for tracking node online state; when the chain fires it
 * invokes cpuset_track_online_nodes().
 */
static struct notifier_block cpuset_track_online_nodes_nb = {
        .priority = 10,         /* ??! */
        .notifier_call = cpuset_track_online_nodes,
};
1
2
3
4

cpuset_track_online_nodes

__RWSEM_INITIALIZER 块通知链表头，展开形式

#define __RWSEM_INITIALIZER(memory_chain)	
||
\/
struct blocking_notifier_head memory_chain = { 
		.count =  { (0) },
	  	.owner =  { (0) },		
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
	  	.osq = { (0) },		
#endif	
	  	.wait_lock = {	
					.raw_lock = 1,
#ifdef CONFIG_DEBUG_SPINLOCK
					.magic = 0xdead4ead,		
					.owner_cpu = -1,		
					.owner = ((void *)-1L),
#endif
					.dep_map = {					
							.name = "wait_lock",	
							.wait_type_inner = 2,	
					}
	  	},
	  	.wait_list = LIST_HEAD_INIT((memory_chain).wait_list),
#ifdef CONFIG_DEBUG_RWSEMS
		.magic = &memory_chain,
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
		.dep_map = {				
				.name = "memory_chain",		
				.wait_type_inner = LD_WAIT_SLEEP,
	},
#endif
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32

3. 部分结构定义

percpu_stats 块分配统计

/* Allocation statistics kept for percpu chunks. */
struct percpu_stats {
	u64 nr_alloc;		/* lifetime # of allocations */
	u64 nr_dealloc;		/* lifetime # of deallocations */
	u64 nr_cur_alloc;	/* current # of allocations */
	u64 nr_max_alloc;	/* max # of allocations */
	u32 nr_chunks;		/* current # of live chunks */
	u32 nr_max_chunks;	/* max # of live chunks */
	size_t min_alloc_size;	/* smallest allocation size */
	size_t max_alloc_size;	/* largest allocation size */
};
1
2
3
4
5
6
7
8
9
10

top_cpuset 顶部cpu集

/*
 * The top (root) cpuset. It starts with the CS_ONLINE, CS_CPU_EXCLUSIVE and
 * CS_MEM_EXCLUSIVE flag bits set and with partition state PRS_ROOT.
 */
static struct cpuset top_cpuset = {
        .partition_root_state = PRS_ROOT,
        .flags = (1 << CS_ONLINE)
               | (1 << CS_CPU_EXCLUSIVE)
               | (1 << CS_MEM_EXCLUSIVE),
};
1
2
3
4
5

4. 扩展函数/变量

DEFINE_STATIC_PERCPU_RWSEM 定义新类型读写信号量结构对象cpuset_rwsem

DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem); // 展开内容参考下方跳转
||
\/
__DEFINE_PERCPU_RWSEM(cpuset_rwsem, static)
||
\/
#define __DEFINE_PERCPU_RWSEM(name, is_static)                          \
static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name);          \
is_static struct percpu_rw_semaphore name = {                           \
        .rss = __RCU_SYNC_INITIALIZER(name.rss),                        \
        .read_count = &__percpu_rwsem_rc_##name,                        \
        .writer = __RCUWAIT_INITIALIZER(name.writer),                   \
        .waiters = __WAIT_QUEUE_HEAD_INITIALIZER(name.waiters),         \
        .block = ATOMIC_INIT(0),                                        \
        __PERCPU_RWSEM_DEP_MAP_INIT(name)                               \
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17

cpuset_rwsem
DEFINE_PER_CPU

DEFINE_PER_CPU

static unsigned int __percpu_rwsem_rc_cpuset_rwsem;  
// __percpu_rwsem_rc_cpuset_rwsem定义在.data..percpu 段中，属于cpu局部的内存空间
// 该变量在整个vmlinux的.data..percpu区里的位置，然后通过某个cpu的percpu内存块的起始地址，
// 就可以计算出该cpu对应的该变量的运行时内存地址

/*
 * linux内核在启动时，会先把vmlinux文件加载到内存中，然后根据cpu的个数，
 * 为每个cpu都分配一块用于存放percpu变量的内存区域，
 * 之后把vmlinux中的.data..percpu section里的内容，
 * 拷贝到各个cpu的percpu内存块的static区域里，
 * 最后将各percpu内存块的起始地址放到对应cpu的gs寄存器里
 *
 * 当我们在访问percpu变量时，只需要将gs寄存器里的地址，
 * 加上我们想要访问的percpu变量的地址，就能得到在该cpu上，该percpu变量真实的内存地址
 *  https://zhuanlan.zhihu.com/p/340985476
 */

static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name);
||
\/
DEFINE_PER_CPU_SECTION(unsigned int, __percpu_rwsem_rc_cpuset_rwsem, "")
||
\/
__PCPU_ATTRS("") __typeof__(unsigned int) __percpu_rwsem_rc_cpuset_rwsem
||
\/
#define __PCPU_ATTRS("")                                               \
        __percpu __attribute__((section(PER_CPU_BASE_SECTION "")))     \
        PER_CPU_ATTRIBUTES   // 通用架构(包括x86系列、arm系列等)为空
||
\/
# define __percpu       __attribute__((noderef, address_space(__percpu))) // __percpu在sparse中定义为#define __percpu __attribute__((address_space(3))) ，表示指针不能被解引用(*ptr访问)，3表示cpu局部的内存空间

#define PER_CPU_BASE_SECTION ".data..percpu"
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34

percpu_init_rwsem 定义、初始化percpu读写信号量

/*
 * percpu_init_rwsem - run-time initialization of a per-CPU rw-semaphore.
 *
 * Each expansion site gets its own static lock_class_key. The macro forwards
 * to __percpu_init_rwsem() with the stringified argument as the lock name,
 * and the statement expression evaluates to that call's return value.
 */
#define percpu_init_rwsem(sem)                                  \
({                                                              \
        static struct lock_class_key rwsem_key;                 \
        __percpu_init_rwsem(sem, #sem, &rwsem_key);             \
})
||
\/
 static struct lock_class_key rwsem_key; \
  __percpu_init_rwsem(cpuset_rwsem, "cpuset_rwsem", &rwsem_key);
  ||
  \/
  /*
   * __percpu_init_rwsem - run-time initializer for a per-CPU rw-semaphore.
   * @sem:  semaphore to set up
   * @name: lock name passed to lockdep (only used with CONFIG_DEBUG_LOCK_ALLOC)
   * @key:  lockdep class key (only used with CONFIG_DEBUG_LOCK_ALLOC)
   *
   * Returns 0 on success, or -ENOMEM when the per-CPU reader counter cannot
   * be allocated.
   */
  int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
                        const char *name, struct lock_class_key *key)
{
        /* Per-CPU reader counter, obtained through alloc_percpu(). */
        sem->read_count = alloc_percpu(int);
        if (unlikely(!sem->read_count))
                return -ENOMEM;

        rcu_sync_init(&sem->rss);           /* set up the rcu_sync state and its gp_wait queue */
        rcuwait_init(&sem->writer);         /* no writer task recorded yet (w->task = NULL) */
        init_waitqueue_head(&sem->waiters); /* start with an empty wait queue */
        atomic_set(&sem->block, 0);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        debug_check_no_locks_freed((void *)sem, sizeof(*sem));
        lockdep_init_map(&sem->dep_map, name, key, 0);
#endif
        return 0;
}
EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
1
2
3
4
5
6
7
8
9
10
11

alloc_percpu 分配pcpu块区域，从顶部分配虚拟区间，记录到cgroup数组中，更新cgroup内存统计信息

#define alloc_percpu(type)                                              \
        (typeof(type) __percpu *)__alloc_percpu(sizeof(type),           \
                                                __alignof__(type))
||
\/
(int __percpu *)__alloc_percpu(sizeof(int),  __alignof__(int))
||
\/
/*
 * __alloc_percpu - allocate a dynamic percpu area.
 * @size:  size of the area in bytes
 * @align: requested alignment
 *
 * Forwards to pcpu_alloc() with reserved == false and GFP_KERNEL, and
 * returns whatever pcpu_alloc() returns.
 */
void __percpu *__alloc_percpu(size_t size, size_t align)
{
        void __percpu *area = pcpu_alloc(size, align, false, GFP_KERNEL);

        return area;
}
EXPORT_SYMBOL_GPL(__alloc_percpu);
||
\/
static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
                                 gfp_t gfp)
{
        gfp_t pcpu_gfp;
        bool is_atomic;
        bool do_warn;
        struct obj_cgroup *objcg = NULL;
        static int warn_limit = 10;
        struct pcpu_chunk *chunk, *next;
        const char *err;
        int slot, off, cpu, ret;
        unsigned long flags;
        void __percpu *ptr;
        size_t bits, bit_align;

        gfp = current_gfp_context(gfp);
        // 将每个任务的gfp上下文应用到给定的分配标志
        // 指定PF_MEMALLOC_NOIO，将移除__GFP_IO 和 __GFP_FS标志
        // 指定PF_MEMALLOC_NOFS，将移除__GFP_FS标志
		// 指定PF_MEMALLOC_PIN，将移除__GFP_MOVABLE标志

		/* 可传递给后台分配器的白名单标志 */
        pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
        is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
        do_warn = !(gfp & __GFP_NOWARN);

		/*
		 * 现在存在最小分配大小PCPU_MIN_ALLOC_SIZE，因此对齐值至少要是这么多字节
		 * 一次分配可能因向上取整而产生最多PCPU_MIN_ALLOC_SIZE - 1字节的内部碎片
		 */
		 if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
                align = PCPU_MIN_ALLOC_SIZE;

        size = ALIGN(size, PCPU_MIN_ALLOC_SIZE); // 按PCPU_MIN_ALLOC_SIZE对齐分配，如分配1字节 对齐为4字节，不足4字节将分配4字节，如果为5，将分配8字节
        bits = size >> PCPU_MIN_ALLOC_SHIFT; 
        bit_align = align >> PCPU_MIN_ALLOC_SHIFT; 

		if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
                     !is_power_of_2(align))) { // 最大分配不超过32K，对齐小于页大小，并且为2的幂次数(如二进制10, 100, 1000， n & (n -1) == 0)
                WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
                     size, align); 
                return NULL;
        }

		if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg))) // 从每cpu中的mem_cgroup中获取指定字节(内存)的对象
                return NULL;
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61

pcpu_memcg_pre_alloc_hook

if (!is_atomic) {
                /*
                 * pcpu_balance_workfn()在这个互斥锁下分配内存，它可能会等待内存回收
                 * 允许当前任务成为OOM受害者，以防内存压力
                 */
                if (gfp & __GFP_NOFAIL) { // 内存分配不会失败的标志(堵塞等待，它一直到分配成功为止)
                        mutex_lock(&pcpu_alloc_mutex);
                } else if (mutex_lock_killable(&pcpu_alloc_mutex)) { // 获取互斥锁，可被致命信号中断
                        pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
                        return NULL;
                }
        }

		...
		/* 如果可用，提供来自预留块的预留分配 */
        if (reserved && pcpu_reserved_chunk) { // 内核静态percpu区域的预留块
                chunk = pcpu_reserved_chunk;

                off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic); // 查找要开始搜索的块索引
                if (off < 0) {
                        err = "alloc from reserved chunk failed";
                        goto fail_unlock;
                }

                off = pcpu_alloc_area(chunk, bits, bit_align, off); // pcpu_chunk分配区域，此函数从指定偏移量开始搜索，找到(具有)指定大小的对齐区域
                if (off >= 0)
                        goto area_found;

                err = "alloc from reserved chunk failed";
                goto fail_unlock;
        }

restart:
		/* 在pcpu块链表中搜索区域 */

		...
		/*
		 * 没有剩余空间，创建新区块
		 * 我们不希望多个任务同时创建块
		 * 通过互斥锁串行化，仅当获取互斥锁之后仍然没有空块时才创建
		 */
		if (is_atomic) {
                err = "atomic alloc failed, no space left";
                goto fail;
        }

        if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) {
                chunk = pcpu_create_chunk(pcpu_gfp); // 分配pcpu块区域，从顶部分配虚拟区间
                if (!chunk) {
                        err = "failed to allocate new chunk";
                        goto fail;
                }

                spin_lock_irqsave(&pcpu_lock, flags);
                pcpu_chunk_relocate(chunk, -1); // 将新建的块放入对应的插槽(块链表)
        } else {
                spin_lock_irqsave(&pcpu_lock, flags);
        }

        goto restart;

area_found:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62

pcpu_create_chunk

pcpu_stats_area_alloc(chunk, size); // 加入块统计
1

percpu_stats

/* 如果不是所有页面都已存在，则填充 */
 if (!is_atomic) {
                unsigned int page_end, rs, re;

                rs = PFN_DOWN(off); // 返回对应的物理页号 (off >> PAGE_SHIFT)
                page_end = PFN_UP(off + size); // 返回对应+1的物理页号((off + size) 不能整除PAGE_SHIFT的情况)，如(off + size)余数不足一页或超出一页，按一页补齐，可以整除则返回对应的物理页号

                for_each_clear_bitrange_from(rs, re, chunk->populated, page_end) {
                        WARN_ON(chunk->immutable);

                        ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp); // 填充和映射pcpu_chunk的一个区域

                        spin_lock_irqsave(&pcpu_lock, flags); // 获取自旋锁，禁止抢占
                        if (ret) {
                                pcpu_free_area(chunk, off);
                                err = "failed to populate";
                                goto fail_unlock;
                        }
                        pcpu_chunk_populated(chunk, rs, re); // 填充后记账
                        spin_unlock_irqrestore(&pcpu_lock, flags); // 归还自旋锁
                }
	mutex_unlock(&pcpu_alloc_mutex);
}

	if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
                pcpu_schedule_balance_work(); // 调度工作队列 pcpu_balance_work， 管理空闲块和已填充页面的数量

	/* 清除区域并返回相对于基址的地址 */
	for_each_possible_cpu(cpu)
                memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);

	/* 默认addr<->pcpu_ptr映射 */
	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
        kmemleak_alloc_percpu(ptr, size, gfp); // 注册新分配的__percpu对象

        trace_percpu_alloc_percpu(_RET_IP_, reserved, is_atomic, size, align,
                                  chunk->base_addr, off, ptr,
                                  pcpu_obj_full_size(size), gfp); // 增加trace调试输出

		pcpu_memcg_post_alloc_hook(objcg, chunk, off, size); // 记录到cgroup数组中，更新cgroup内存统计信息

        return ptr;
        ...
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44

pcpu_memcg_pre_alloc_hook 从每cpu中的mem_cgroup中获取指定字节(内存)的对象

/*
 * pcpu_memcg_pre_alloc_hook - charge a percpu allocation before it is made.
 * @size:   requested allocation size
 * @gfp:    allocation flags; accounting happens only when __GFP_ACCOUNT is set
 * @objcgp: on a successful charge, receives the obj_cgroup that was charged
 *
 * Returns true when the allocation may proceed (either nothing needed to be
 * charged or the charge succeeded), false when the charge failed.
 * *objcgp is written only when a charge was actually made.
 */
static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
                                      struct obj_cgroup **objcgp)
{
        struct obj_cgroup *objcg;

        /*
         * memcg_kmem_enabled() is a static-branch test, so this check is
         * nearly free when kmem accounting is off.
         */
        if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT))
                return true;

        /*
         * Take a reference on the current task's obj_cgroup. There may be
         * none to account against, in which case nothing is charged.
         */
        objcg = get_obj_cgroup_from_current();
        if (!objcg)
                return true;

        /* Charge the full per-object size; drop the reference if the charge fails. */
        if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) {
                obj_cgroup_put(objcg);
                return false;
        }

        *objcgp = objcg;
        return true;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19

memcg_kmem_enabled

memcg_kmem_enabled 根据静态分支预测对象memcg_kmem_enabled_key，生成跳转路径，如memcg_kmem_enabled_key为默认值的情况下，直接跳到下面执行代码(跳过true分支)，而这些通过加载时已经确定执行路径(几乎相当于没有进行判断)，用于减少判断带来的开销及提高准确性

/*
 * memcg_kmem_enabled - test the memcg_kmem_enabled_key static branch.
 *
 * Returns the current state of the static key as a bool.
 */
static inline bool memcg_kmem_enabled(void)
{
        return static_branch_likely(&memcg_kmem_enabled_key); // static-branch test: per the author's note, the path is fixed at load time and only changes (at some cost) via static_key_enable()/static_key_disable()
}  
||
\/
_________________________________________________________________________________
/*
 * Many calls to the cache allocation functions are expected to be inlined
 * by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
 * conditional on this static branch, the symbol must also be visible to
 * modules that perform kmem_cache_alloc() and similar operations.
 */
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); /* static key whose enabled value starts at 0 (false branch taken) */
/* static_key_enable() sets enabled to 1 so the true branch is taken. */
/* static_key_disable() sets enabled back to 0 so the false branch is taken. */

EXPORT_SYMBOL(memcg_kmem_enabled_key); /* export the symbol */
/* Once the object file defining it is linked into vmlinux, the symbol can be used kernel-wide. */
/* Users reference it through an extern declaration in the file that needs it. */

#define DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key)   \ 
        struct static_key_false memcg_kmem_enabled_key = 
        		(struct static_key_false) { .key =  { .enabled = { 0 },                                     \
          											{ .type = 0UL } } , }
_________________________________________________________________________________
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25

pcpu_create_chunk 分配pcpu块区域，从顶部分配虚拟区间

/*
 * pcpu_create_chunk - allocate a chunk and reserve its vm areas.
 * @gfp: allocation flags passed to pcpu_alloc_chunk()
 *
 * Returns the new chunk, or NULL when either the chunk itself or its vm
 * areas cannot be obtained.
 */
static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
{
        struct pcpu_chunk *chunk;
        struct vm_struct **vms;

        chunk = pcpu_alloc_chunk(gfp);
        if (!chunk)
                return NULL;

        /*
         * The percpu allocator wants congruent vm areas so that the offsets
         * between percpu areas stay fixed. To avoid interacting with regular
         * vmalloc, these areas are allocated from the top.
         */
        vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
                                pcpu_nr_groups, pcpu_atom_size);
        if (!vms) {
                /* No vm areas: release the chunk instead of dereferencing NULL. */
                pcpu_free_chunk(chunk);
                return NULL;
        }

        chunk->data = vms;
        chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0];

        pcpu_stats_chunk_alloc();                       /* bump the chunk statistics */
        trace_percpu_create_chunk(chunk->base_addr);    /* emit the trace event with base_addr */

        return chunk;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22

register_hotmemory_notifier 根据通知块的优先级插入到通知块链表对应的位置，如果申请的是唯一优先级，不能存在相同优先级的节点

#define register_hotmemory_notifier(nb)         register_memory_notifier(nb)
||
\/
/*
 * register_memory_notifier - add @nb to the memory_chain blocking notifier
 * chain. Returns the result of blocking_notifier_chain_register().
 */
int register_memory_notifier(struct notifier_block *nb)
{
        int ret = blocking_notifier_chain_register(&memory_chain, nb);

        return ret;
}
EXPORT_SYMBOL(register_memory_notifier);
||
\/
int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
                struct notifier_block *n)
{
        return __blocking_notifier_chain_register(nh, n, false);
}       
EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);
||
\/
/*
 * __blocking_notifier_chain_register - insert @n into the chain headed by @nh.
 * @unique_priority: passed through to notifier_chain_register()
 *
 * Returns the result of notifier_chain_register().
 */
static int __blocking_notifier_chain_register(struct blocking_notifier_head *nh,
                                              struct notifier_block *n,
                                              bool unique_priority)
{
        int ret;

        if (unlikely(system_state == SYSTEM_BOOTING)) {
                /*
                 * This code gets used during boot-up, when task switching is
                 * not yet working and interrupts must remain disabled.  At
                 * such times we must not call down_write().
                 */
                ret = notifier_chain_register(&nh->head, n, unique_priority);
        } else {
                /* Normal operation: serialize chain updates with the rwsem. */
                down_write(&nh->rwsem);
                ret = notifier_chain_register(&nh->head, n, unique_priority);
                up_write(&nh->rwsem);
        }

        return ret;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37

memory_chain
notifier_chain_register

memory_chain 块通知链头定义

#define BLOCKING_NOTIFIER_HEAD(memory_chain)				\
	struct blocking_notifier_head memory_chain =			\
		BLOCKING_NOTIFIER_INIT(memory_chain)
||
\/
#define BLOCKING_NOTIFIER_INIT(memory_chain) {				\
		.rwsem = __RWSEM_INITIALIZER((memory_chain).rwsem),	\
		.head = NULL }
||
\/
#define __RWSEM_INITIALIZER(memory_chain)				\
	{ __RWSEM_COUNT_INIT(memory_chain),				\
	  .owner = ATOMIC_LONG_INIT(0),				\
	  __RWSEM_OPT_INIT(memory_chain)				\
	  .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(memory_chain.wait_lock),\
	  .wait_list = LIST_HEAD_INIT((memory_chain).wait_list),	\
	  __RWSEM_DEBUG_INIT(memory_chain)				\
	  __RWSEM_DEP_MAP_INIT(memory_chain) }
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

__RWSEM_INITIALIZER

notifier_chain_register 根据通知块的优先级插入到通知块链表对应的位置，如果申请的是唯一优先级，不能存在相同优先级的节点

/*
 * notifier_chain_register - insert @n into a priority-ordered notifier list.
 * @nl:              address of the list head pointer
 * @n:               block to insert
 * @unique_priority: when true, refuse insertion if a block with the same
 *                   priority is already present
 *
 * The list is walked until the first entry whose priority is lower than
 * @n's, and @n is linked in front of it (or at the tail).
 *
 * Returns 0 on success, -EEXIST if @n is already on the list, or -EBUSY on a
 * priority clash when @unique_priority is set.
 */
static int notifier_chain_register(struct notifier_block **nl,
                                   struct notifier_block *n,
                                   bool unique_priority)
{
        struct notifier_block *cur;

        for (cur = *nl; cur != NULL; nl = &cur->next, cur = *nl) {
                if (unlikely(cur == n)) {
                        WARN(1, "notifier callback %ps already registered",
                             n->notifier_call);
                        return -EEXIST;
                }
                /* Found the first entry with a lower priority: insert here. */
                if (n->priority > cur->priority)
                        break;
                /* Same priority while exclusivity was requested. */
                if (unique_priority && n->priority == cur->priority)
                        return -EBUSY;
        }

        /* Link @n in front of *nl and publish it through the RCU-protected pointer. */
        n->next = *nl;
        rcu_assign_pointer(*nl, n);
        return 0;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

cpuset_track_online_nodes 通知链执行函数

/*
 * 保持top_cpuset
 * mems_allowed跟踪node_states[N_MEMORY]
 * 在node_states[N_MEMORY]更改后随时调用此例程
 * 有关CPU热插拔处理，请参见cpuset_update_active_cpus()
 * /
static int cpuset_track_online_nodes(struct notifier_block *self,
				unsigned long action, void *arg)
{
	schedule_work(&cpuset_hotplug_work); // 调度工作队列cpuset_hotplug_work
	return NOTIFY_OK;
}
||
\/
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15

cpuset_hotplug_workfn

cpuset_hotplug_workfn 处理cpuset的CPU/内存热插拔

/*
 * cpuset_hotplug_workfn - work function run after the CPU or memory
 * configuration has changed; it updates the cpusets accordingly.
 *
 * top_cpuset is always kept in sync with cpu_active_mask and N_MEMORY, which
 * is necessary to make cpusets transparent (of no effect) on systems that
 * actively use CPU hotplug but do not actively use cpusets.
 *
 * Non-root cpusets are only affected by offlining. If any CPUs or memory
 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
 * all descendants.
 *
 * Note that CPU offlining during suspend is ignored; cpusets are not
 * modified at all across suspend/resume cycles.
 */
static void cpuset_hotplug_workfn(struct work_struct *work)
{
	static cpumask_t new_cpus;
	static nodemask_t new_mems;
	bool cpus_updated, mems_updated;
	bool on_dfl = is_in_v2_mode();
	struct tmpmasks tmp, *ptmp = NULL;

	/* Temporary masks are used in v2 mode only; ptmp stays NULL otherwise or on allocation failure. */
	if (on_dfl && !alloc_cpumasks(NULL, &tmp))
		ptmp = &tmp;

	percpu_down_write(&cpuset_rwsem);

	/* Fetch the available cpus/mems and find out which of them changed. */
	cpumask_copy(&new_cpus, cpu_active_mask);
	new_mems = node_states[N_MEMORY];

	/*
	 * effective_cpus is stored with subparts_cpus removed (see below), so
	 * when subparts_cpus is populated this comparison can report a change
	 * even though the CPU list did not change. That costs extra work, but
	 * it is better to be safe.
	 */
	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
	mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);

	/*
	 * In the rare case that hotplug removes all the CPUs in subparts_cpus,
	 * assume that the CPUs were updated.
	 */
	if (!cpus_updated && top_cpuset.nr_subparts_cpus)
		cpus_updated = true;

	/* Synchronize cpus_allowed to cpu_active_mask. */
	if (cpus_updated) {
		spin_lock_irq(&callback_lock);
		if (!on_dfl)
			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
		/*
		 * Make sure that CPUs allocated to child partitions do not
		 * show up in effective_cpus. If no CPU would be left, clear
		 * subparts_cpus and let the child partitions fight for the
		 * CPUs again.
		 */
		if (top_cpuset.nr_subparts_cpus) {
			if (cpumask_subset(&new_cpus,
					   top_cpuset.subparts_cpus)) {
				top_cpuset.nr_subparts_cpus = 0;
				cpumask_clear(top_cpuset.subparts_cpus);
			} else {
				cpumask_andnot(&new_cpus, &new_cpus,
					       top_cpuset.subparts_cpus);
			}
		}
		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
		spin_unlock_irq(&callback_lock);
		/* The cpumasks of tasks in top_cpuset are left alone. */
	}

	/* Synchronize mems_allowed to N_MEMORY. */
	if (mems_updated) {
		spin_lock_irq(&callback_lock);
		if (!on_dfl)
			top_cpuset.mems_allowed = new_mems;
		top_cpuset.effective_mems = new_mems;
		spin_unlock_irq(&callback_lock);
		update_tasks_nodemask(&top_cpuset);
	}

	percpu_up_write(&cpuset_rwsem);

	/* If cpus or mems changed, the change must be propagated to the descendants. */
	if (cpus_updated || mems_updated) {
		struct cpuset *cs;
		struct cgroup_subsys_state *pos_css;

		rcu_read_lock();
		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
			/* Skip the root itself and any cpuset whose css reference cannot be taken. */
			if (cs == &top_cpuset || !css_tryget_online(&cs->css))
				continue;
			/* The css reference keeps cs alive while the RCU read lock is dropped. */
			rcu_read_unlock();

			cpuset_hotplug_update_tasks(cs, ptmp);

			rcu_read_lock();
			css_put(&cs->css);
		}
		rcu_read_unlock();
	}

	/* Rebuild the sched domains if cpus_allowed has changed (or a rebuild was forced). */
	if (cpus_updated || force_rebuild) {
		force_rebuild = false;
		rebuild_sched_domains();
	}

	free_cpumasks(NULL, ptmp);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103

相关阅读:
【算法|动态规划No.8】leetcode面试题 17.16. 按摩师
开源模型应用落地-LangChain高阶-知识图谱助力记忆增强
使用驱动器中的光盘之前需要将其格式化怎么办,使用驱动器中的光盘之前需要将其格式化恢复办法
第十五章 mysql存储过程与存储函数课后练习
H3C AC通过Web平台进行AC软件的升级？
企业日常公关如何抵御负面信息的入侵？
Linux-多路转接-select/poll
智慧社区的魔力：数据可视化的引领之力
【Leetcode】2427. Number of Common Factors
IDEA快捷键

原文地址：https://blog.csdn.net/a29562268/article/details/127605240