Continued from the previous article: What's in the Linux Kernel: What's in the Memory Management Subsystem, Part 4: Small-Memory Allocation (2)
This article draws on the following references:
System Calls and Memory Management (sbrk, brk, mmap, munmap)
《趣谈Linux操作系统》 (Quick Talk on the Linux Operating System), Core Principles, Part 4: Memory Management, by Liu Chao
Many thanks to both!
This installment begins the walkthrough of the sys_brk function.
For easier reading and to reinforce the context, the source is reproduced here once more. The entry point of the brk system call is sys_brk, implemented in mm/mmap.c in the kernel source tree:
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
    unsigned long newbrk, oldbrk, origbrk;
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *brkvma, *next = NULL;
    unsigned long min_brk;
    bool populate;
    bool downgraded = false;
    LIST_HEAD(uf);
    MA_STATE(mas, &mm->mm_mt, 0, 0);

    if (mmap_write_lock_killable(mm))
        return -EINTR;

    origbrk = mm->brk;

#ifdef CONFIG_COMPAT_BRK
    /*
     * CONFIG_COMPAT_BRK can still be overridden by setting
     * randomize_va_space to 2, which will still cause mm->start_brk
     * to be arbitrarily shifted
     */
    if (current->brk_randomized)
        min_brk = mm->start_brk;
    else
        min_brk = mm->end_data;
#else
    min_brk = mm->start_brk;
#endif
    if (brk < min_brk)
        goto out;

    /*
     * Check against rlimit here. If this check is done later after the test
     * of oldbrk with newbrk then it can escape the test and let the data
     * segment grow beyond its set limit the in case where the limit is
     * not page aligned -Ram Gupta
     */
    if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
                  mm->end_data, mm->start_data))
        goto out;

    newbrk = PAGE_ALIGN(brk);
    oldbrk = PAGE_ALIGN(mm->brk);
    if (oldbrk == newbrk) {
        mm->brk = brk;
        goto success;
    }

    /*
     * Always allow shrinking brk.
     * do_brk_munmap() may downgrade mmap_lock to read.
     */
    if (brk <= mm->brk) {
        int ret;

        /* Search one past newbrk */
        mas_set(&mas, newbrk);
        brkvma = mas_find(&mas, oldbrk);
        if (!brkvma || brkvma->vm_start >= oldbrk)
            goto out; /* mapping intersects with an existing non-brk vma. */
        /*
         * mm->brk must be protected by write mmap_lock.
         * do_brk_munmap() may downgrade the lock, so update it
         * before calling do_brk_munmap().
         */
        mm->brk = brk;
        ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf);
        if (ret == 1) {
            downgraded = true;
            goto success;
        } else if (!ret)
            goto success;

        mm->brk = origbrk;
        goto out;
    }

    if (check_brk_limits(oldbrk, newbrk - oldbrk))
        goto out;

    /*
     * Only check if the next VMA is within the stack_guard_gap of the
     * expansion area
     */
    mas_set(&mas, oldbrk);
    next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap);
    if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
        goto out;

    brkvma = mas_prev(&mas, mm->start_brk);
    /* Ok, looks good - let it rip. */
    if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
        goto out;

    mm->brk = brk;

success:
    populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
    if (downgraded)
        mmap_read_unlock(mm);
    else
        mmap_write_unlock(mm);
    userfaultfd_unmap_complete(mm, &uf);
    if (populate)
        mm_populate(oldbrk, newbrk - oldbrk);
    return brk;

out:
    mmap_write_unlock(mm);
    return origbrk;
}
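Before dissecting the function line by line, it may help to see it from the user side. Below is a minimal user-space sketch (added here for illustration; it is not part of the kernel source) that grows the program break by one page with sbrk() and then shrinks it back with brk(), exercising both the expansion path and the "always allow shrinking brk" path of sys_brk:

/* brk_demo.c: illustrative user-space sketch, not kernel source.
 * Build: gcc -o brk_demo brk_demo.c
 */
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    void *start = sbrk(0);              /* current break, i.e. mm->brk */
    printf("initial break:    %p\n", start);

    if (sbrk(4096) == (void *)-1) {     /* grow the heap by one page */
        perror("sbrk");
        return 1;
    }
    printf("after sbrk(4096): %p\n", sbrk(0));

    if (brk(start) != 0) {              /* shrink: the brk <= mm->brk path */
        perror("brk");
        return 1;
    }
    printf("after shrink:     %p\n", sbrk(0));
    return 0;
}

glibc's brk()/sbrk() are thin wrappers around the system call shown above; a successful run prints the break moving up by one page and back down again.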
Now let's start with the first assignment in the function, which is also one of its most important lines:
struct mm_struct *mm = current->mm;
struct mm_struct is defined in include/linux/mm_types.h:
struct mm_struct {
    struct {
        struct maple_tree mm_mt;
#ifdef CONFIG_MMU
        unsigned long (*get_unmapped_area) (struct file *filp,
                unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags);
#endif
        unsigned long mmap_base;        /* base of mmap area */
        unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
        /* Base addresses for compatible mmap() */
        unsigned long mmap_compat_base;
        unsigned long mmap_compat_legacy_base;
#endif
        unsigned long task_size;        /* size of task vm space */
        pgd_t * pgd;

#ifdef CONFIG_MEMBARRIER
        /**
         * @membarrier_state: Flags controlling membarrier behavior.
         *
         * This field is close to @pgd to hopefully fit in the same
         * cache-line, which needs to be touched by switch_mm().
         */
        atomic_t membarrier_state;
#endif

        /**
         * @mm_users: The number of users including userspace.
         *
         * Use mmget()/mmget_not_zero()/mmput() to modify. When this
         * drops to 0 (i.e. when the task exits and there are no other
         * temporary reference holders), we also release a reference on
         * @mm_count (which may then free the &struct mm_struct if
         * @mm_count also drops to 0).
         */
        atomic_t mm_users;

        /**
         * @mm_count: The number of references to &struct mm_struct
         * (@mm_users count as 1).
         *
         * Use mmgrab()/mmdrop() to modify. When this drops to 0, the
         * &struct mm_struct is freed.
         */
        atomic_t mm_count;

#ifdef CONFIG_MMU
        atomic_long_t pgtables_bytes;   /* PTE page table pages */
#endif
        int map_count;                  /* number of VMAs */

        spinlock_t page_table_lock;     /* Protects page tables and some
                                         * counters
                                         */
        /*
         * With some kernel config, the current mmap_lock's offset
         * inside 'mm_struct' is at 0x120, which is very optimal, as
         * its two hot fields 'count' and 'owner' sit in 2 different
         * cachelines, and when mmap_lock is highly contended, both
         * of the 2 fields will be accessed frequently, current layout
         * will help to reduce cache bouncing.
         *
         * So please be careful with adding new fields before
         * mmap_lock, which can easily push the 2 fields into one
         * cacheline.
         */
        struct rw_semaphore mmap_lock;

        struct list_head mmlist;        /* List of maybe swapped mm's. These
                                         * are globally strung together off
                                         * init_mm.mmlist, and are protected
                                         * by mmlist_lock
                                         */


        unsigned long hiwater_rss;      /* High-watermark of RSS usage */
        unsigned long hiwater_vm;       /* High-water virtual memory usage */

        unsigned long total_vm;         /* Total pages mapped */
        unsigned long locked_vm;        /* Pages that have PG_mlocked set */
        atomic64_t    pinned_vm;        /* Refcount permanently increased */
        unsigned long data_vm;          /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
        unsigned long exec_vm;          /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
        unsigned long stack_vm;         /* VM_STACK */
        unsigned long def_flags;

        /**
         * @write_protect_seq: Locked when any thread is write
         * protecting pages mapped by this mm to enforce a later COW,
         * for instance during page table copying for fork().
         */
        seqcount_t write_protect_seq;

        spinlock_t arg_lock;            /* protect the below fields */

        unsigned long start_code, end_code, start_data, end_data;
        unsigned long start_brk, brk, start_stack;
        unsigned long arg_start, arg_end, env_start, env_end;

        unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */

        /*
         * Special counters, in some configurations protected by the
         * page_table_lock, in other configurations by being atomic.
         */
        struct mm_rss_stat rss_stat;

        struct linux_binfmt *binfmt;

        /* Architecture-specific MM context */
        mm_context_t context;

        unsigned long flags;            /* Must use atomic bitops to access */

#ifdef CONFIG_AIO
        spinlock_t ioctx_lock;
        struct kioctx_table __rcu *ioctx_table;
#endif
#ifdef CONFIG_MEMCG
        /*
         * "owner" points to a task that is regarded as the canonical
         * user/owner of this mm. All of the following must be true in
         * order for it to be changed:
         *
         * current == mm->owner
         * current->mm != mm
         * new_owner->mm == mm
         * new_owner->alloc_lock is held
         */
        struct task_struct __rcu *owner;
#endif
        struct user_namespace *user_ns;

        /* store ref to file /proc/<pid>/exe symlink points to */
        struct file __rcu *exe_file;
#ifdef CONFIG_MMU_NOTIFIER
        struct mmu_notifier_subscriptions *notifier_subscriptions;
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
        pgtable_t pmd_huge_pte;         /* protected by page_table_lock */
#endif
#ifdef CONFIG_NUMA_BALANCING
        /*
         * numa_next_scan is the next time that PTEs will be remapped
         * PROT_NONE to trigger NUMA hinting faults; such faults gather
         * statistics and migrate pages to new nodes if necessary.
         */
        unsigned long numa_next_scan;

        /* Restart point for scanning and remapping PTEs. */
        unsigned long numa_scan_offset;

        /* numa_scan_seq prevents two threads remapping PTEs. */
        int numa_scan_seq;
#endif
        /*
         * An operation with batched TLB flushing is going on. Anything
         * that can move process memory needs to flush the TLB when
         * moving a PROT_NONE mapped page.
         */
        atomic_t tlb_flush_pending;
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
        /* See flush_tlb_batched_pending() */
        atomic_t tlb_flush_batched;
#endif
        struct uprobes_state uprobes_state;
#ifdef CONFIG_PREEMPT_RT
        struct rcu_head delayed_drop;
#endif
#ifdef CONFIG_HUGETLB_PAGE
        atomic_long_t hugetlb_usage;
#endif
        struct work_struct async_put_work;

#ifdef CONFIG_IOMMU_SVA
        u32 pasid;
#endif
#ifdef CONFIG_KSM
        /*
         * Represent how many pages of this process are involved in KSM
         * merging.
         */
        unsigned long ksm_merging_pages;
        /*
         * Represent how many pages are checked for ksm merging
         * including merged and not merged.
         */
        unsigned long ksm_rmap_items;
#endif
#ifdef CONFIG_LRU_GEN
        struct {
            /* this mm_struct is on lru_gen_mm_list */
            struct list_head list;
            /*
             * Set when switching to this mm_struct, as a hint of
             * whether it has been used since the last time per-node
             * page table walkers cleared the corresponding bits.
             */
            unsigned long bitmap;
#ifdef CONFIG_MEMCG
            /* points to the memcg of "owner" above */
            struct mem_cgroup *memcg;
#endif
        } lru_gen;
#endif /* CONFIG_LRU_GEN */
    } __randomize_layout;

    /*
     * The mm_cpumask needs to be at the end of mm_struct, because it
     * is dynamically sized based on nr_cpu_ids.
     */
    unsigned long cpu_bitmap[];
};
In the author's view, this structure is the most central one in the Linux memory management subsystem: it is to memory management what struct task_struct is to process management.
A dedicated article will cover this structure in detail; for now it is enough to understand what current->mm means. Linux kernel engineers, especially those familiar with process management, know current well. It is a macro that, simply put, yields a pointer to the currently running task; its type is struct task_struct *. That structure is very long, so it is not reproduced here; only its mm member (defined in include/linux/sched.h) is shown:
struct task_struct {
    ...
    struct mm_struct *mm;
    ...
};
The mm member points to the descriptor of the process address space. For an ordinary user process, mm points to the user-space portion of its virtual address space; for a kernel thread, the pointer is NULL. Since we entered sys_brk from user space, here it points to the user-space portion of the virtual address space of the process (strictly speaking, the thread) that called malloc().
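To make the kernel-thread distinction concrete, here is a minimal, illustrative kernel-module sketch (assuming a standard out-of-tree module build; the file and function names are made up for this example). insmod runs in process context, so at load time current is the insmod process and current->mm is non-NULL, while a kernel thread would see NULL:

/* current_mm_demo.c: illustrative sketch, not from the article's sources. */
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm_types.h>

static int __init current_mm_demo_init(void)
{
    struct mm_struct *mm = current->mm;

    pr_info("current: %s (pid %d)\n", current->comm, current->pid);
    if (mm)
        /* a user process: show the heap range kept in mm (read without
         * taking mmap_lock, which is acceptable only for a demo) */
        pr_info("start_brk=%lx brk=%lx\n", mm->start_brk, mm->brk);
    else
        /* kernel threads have no user address space */
        pr_info("current->mm is NULL: running in a kernel thread\n");
    return 0;
}

static void __exit current_mm_demo_exit(void)
{
}

module_init(current_mm_demo_init);
module_exit(current_mm_demo_exit);
MODULE_LICENSE("GPL");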
Next, struct vm_area_struct, which is also defined in include/linux/mm_types.h:
/*
 * This struct describes a virtual memory area. There is one of these
 * per VM-area/task. A VM area is any part of the process virtual memory
 * space that has a special rule for the page-fault handlers (ie a shared
 * library, the executable area etc).
 */
struct vm_area_struct {
    /* The first cache line has the info for VMA tree walking. */

    unsigned long vm_start;         /* Our start address within vm_mm. */
    unsigned long vm_end;           /* The first byte after our end address
                                       within vm_mm. */

    struct mm_struct *vm_mm;        /* The address space we belong to. */

    /*
     * Access permissions of this VMA.
     * See vmf_insert_mixed_prot() for discussion.
     */
    pgprot_t vm_page_prot;
    unsigned long vm_flags;         /* Flags, see mm.h. */

    /*
     * For areas with an address space and backing store,
     * linkage into the address_space->i_mmap interval tree.
     *
     * For private anonymous mappings, a pointer to a null terminated string
     * containing the name given to the vma, or NULL if unnamed.
     */

    union {
        struct {
            struct rb_node rb;
            unsigned long rb_subtree_last;
        } shared;
        /*
         * Serialized by mmap_sem. Never use directly because it is
         * valid only when vm_file is NULL. Use anon_vma_name instead.
         */
        struct anon_vma_name *anon_name;
    };

    /*
     * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
     * list, after a COW of one of the file pages. A MAP_SHARED vma
     * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
     * or brk vma (with NULL file) can only be in an anon_vma list.
     */
    struct list_head anon_vma_chain; /* Serialized by mmap_lock &
                                      * page_table_lock */
    struct anon_vma *anon_vma;      /* Serialized by page_table_lock */

    /* Function pointers to deal with this struct. */
    const struct vm_operations_struct *vm_ops;

    /* Information about our backing store: */
    unsigned long vm_pgoff;         /* Offset (within vm_file) in PAGE_SIZE
                                       units */
    struct file * vm_file;          /* File we map to (can be NULL). */
    void * vm_private_data;         /* was vm_pte (shared mem) */

#ifdef CONFIG_SWAP
    atomic_long_t swap_readahead_info;
#endif
#ifndef CONFIG_MMU
    struct vm_region *vm_region;    /* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
    struct mempolicy *vm_policy;    /* NUMA policy for the VMA */
#endif
    struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;
As the comment in the source explains, vm_area_struct describes a virtual memory area (VMA), and there is one of these per VM area per task. A VM area is any part of the process's virtual memory space that has a special rule for the page-fault handlers (e.g. a shared library, the executable area, and so on).
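Each line of /proc/&lt;pid&gt;/maps is rendered from one vm_area_struct: the two addresses are vm_start and vm_end, and the permission column reflects vm_flags. The following user-space sketch (illustrative, not from the kernel source) creates a fresh anonymous VMA with mmap() and then dumps the process's VMA list so the new area can be spotted:

/* vma_demo.c: illustrative sketch. Build: gcc -o vma_demo vma_demo.c */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
    /* create a new anonymous, private VMA one page long */
    void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    printf("new VMA should cover %p\n", p);

    /* print the kernel's view: one line per vm_area_struct */
    char cmd[64];
    snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid());
    return system(cmd);
}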
A detailed analysis of this structure and its members will follow in the next installment.