Continued from the previous article: What's in the Linux Kernel: What's in the Memory Management Subsystem, Part 4: Small-Memory Allocation (2)
This article draws on the following references:
System Calls and Memory Management (sbrk, brk, mmap, munmap)
《趣谈Linux操作系统》 (Quick Talk on the Linux Operating System), Core Principles, Part 4: Memory Management, by Liu Chao
Many thanks to both!
This installment begins the walkthrough of the sys_brk function.
For easier reading and to reinforce the context, the source is reproduced here once more. The entry point of the brk system call is sys_brk, implemented in mm/mmap.c in the kernel source tree:
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
    unsigned long newbrk, oldbrk, origbrk;
    struct mm_struct *mm = current->mm;
    struct vm_area_struct *brkvma, *next = NULL;
    unsigned long min_brk;
    bool populate;
    bool downgraded = false;
    LIST_HEAD(uf);
    MA_STATE(mas, &mm->mm_mt, 0, 0);

    if (mmap_write_lock_killable(mm))
        return -EINTR;

    origbrk = mm->brk;

#ifdef CONFIG_COMPAT_BRK
    /*
     * CONFIG_COMPAT_BRK can still be overridden by setting
     * randomize_va_space to 2, which will still cause mm->start_brk
     * to be arbitrarily shifted
     */
    if (current->brk_randomized)
        min_brk = mm->start_brk;
    else
        min_brk = mm->end_data;
#else
    min_brk = mm->start_brk;
#endif
    if (brk < min_brk)
        goto out;

    /*
     * Check against rlimit here. If this check is done later after the test
     * of oldbrk with newbrk then it can escape the test and let the data
     * segment grow beyond its set limit the in case where the limit is
     * not page aligned -Ram Gupta
     */
    if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
                  mm->end_data, mm->start_data))
        goto out;

    newbrk = PAGE_ALIGN(brk);
    oldbrk = PAGE_ALIGN(mm->brk);
    if (oldbrk == newbrk) {
        mm->brk = brk;
        goto success;
    }

    /*
     * Always allow shrinking brk.
     * do_brk_munmap() may downgrade mmap_lock to read.
     */
    if (brk <= mm->brk) {
        int ret;

        /* Search one past newbrk */
        mas_set(&mas, newbrk);
        brkvma = mas_find(&mas, oldbrk);
        if (!brkvma || brkvma->vm_start >= oldbrk)
            goto out; /* mapping intersects with an existing non-brk vma. */
        /*
         * mm->brk must be protected by write mmap_lock.
         * do_brk_munmap() may downgrade the lock, so update it
         * before calling do_brk_munmap().
         */
        mm->brk = brk;
        ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf);
        if (ret == 1) {
            downgraded = true;
            goto success;
        } else if (!ret)
            goto success;

        mm->brk = origbrk;
        goto out;
    }

    if (check_brk_limits(oldbrk, newbrk - oldbrk))
        goto out;

    /*
     * Only check if the next VMA is within the stack_guard_gap of the
     * expansion area
     */
    mas_set(&mas, oldbrk);
    next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap);
    if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
        goto out;

    brkvma = mas_prev(&mas, mm->start_brk);
    /* Ok, looks good - let it rip. */
    if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
        goto out;

    mm->brk = brk;

success:
    populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
    if (downgraded)
        mmap_read_unlock(mm);
    else
        mmap_write_unlock(mm);
    userfaultfd_unmap_complete(mm, &uf);
    if (populate)
        mm_populate(oldbrk, newbrk - oldbrk);
    return brk;

out:
    mmap_write_unlock(mm);
    return origbrk;
}
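Before dissecting the function line by line, it may help to see it from the user side. Below is a minimal user-space sketch (added here for illustration; it is not part of the kernel source) that grows the program break by one page with sbrk() and then shrinks it back with brk(), exercising both the expansion path and the "always allow shrinking brk" path of sys_brk:

/* brk_demo.c: illustrative user-space sketch, not kernel source.
 * Build: gcc -o brk_demo brk_demo.c
 */
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    void *start = sbrk(0);              /* current break, i.e. mm->brk */
    printf("initial break:    %p\n", start);

    if (sbrk(4096) == (void *)-1) {     /* grow the heap by one page */
        perror("sbrk");
        return 1;
    }
    printf("after sbrk(4096): %p\n", sbrk(0));

    if (brk(start) != 0) {              /* shrink: the brk <= mm->brk path */
        perror("brk");
        return 1;
    }
    printf("after shrink:     %p\n", sbrk(0));
    return 0;
}

glibc's brk()/sbrk() are thin wrappers around the system call shown above; a successful run prints the break moving up by one page and back down again.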
Now let's start with the first assignment in the function, which is also one of its most important lines:
struct mm_struct *mm = current->mm;
struct mm_struct is defined in include/linux/mm_types.h:
struct mm_struct {
    struct {
        struct maple_tree mm_mt;
#ifdef CONFIG_MMU
        unsigned long (*get_unmapped_area) (struct file *filp,
                unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags);
#endif
        unsigned long mmap_base;        /* base of mmap area */
        unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
        /* Base addresses for compatible mmap() */
        unsigned long mmap_compat_base;
        unsigned long mmap_compat_legacy_base;
#endif
        unsigned long task_size;        /* size of task vm space */
        pgd_t * pgd;

#ifdef CONFIG_MEMBARRIER
        /**
         * @membarrier_state: Flags controlling membarrier behavior.
         *
         * This field is close to @pgd to hopefully fit in the same
         * cache-line, which needs to be touched by switch_mm().
         */
        atomic_t membarrier_state;
#endif

        /**
         * @mm_users: The number of users including userspace.
         *
         * Use mmget()/mmget_not_zero()/mmput() to modify. When this
         * drops to 0 (i.e. when the task exits and there are no other
         * temporary reference holders), we also release a reference on
         * @mm_count (which may then free the &struct mm_struct if
         * @mm_count also drops to 0).
         */
        atomic_t mm_users;

        /**
         * @mm_count: The number of references to &struct mm_struct
         * (@mm_users count as 1).
         *
         * Use mmgrab()/mmdrop() to modify. When this drops to 0, the
         * &struct mm_struct is freed.
         */
        atomic_t mm_count;

#ifdef CONFIG_MMU
        atomic_long_t pgtables_bytes;   /* PTE page table pages */
#endif
        int map_count;                  /* number of VMAs */

        spinlock_t page_table_lock;     /* Protects page tables and some
                                         * counters
                                         */
        /*
         * With some kernel config, the current mmap_lock's offset
         * inside 'mm_struct' is at 0x120, which is very optimal, as
         * its two hot fields 'count' and 'owner' sit in 2 different
         * cachelines, and when mmap_lock is highly contended, both
         * of the 2 fields will be accessed frequently, current layout
         * will help to reduce cache bouncing.
         *
         * So please be careful with adding new fields before
         * mmap_lock, which can easily push the 2 fields into one
         * cacheline.
         */
        struct rw_semaphore mmap_lock;

        struct list_head mmlist;        /* List of maybe swapped mm's. These
                                         * are globally strung together off
                                         * init_mm.mmlist, and are protected
                                         * by mmlist_lock
                                         */


        unsigned long hiwater_rss;      /* High-watermark of RSS usage */
        unsigned long hiwater_vm;       /* High-water virtual memory usage */

        unsigned long total_vm;         /* Total pages mapped */
        unsigned long locked_vm;        /* Pages that have PG_mlocked set */
        atomic64_t    pinned_vm;        /* Refcount permanently increased */
        unsigned long data_vm;          /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
        unsigned long exec_vm;          /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
        unsigned long stack_vm;         /* VM_STACK */
        unsigned long def_flags;

        /**
         * @write_protect_seq: Locked when any thread is write
         * protecting pages mapped by this mm to enforce a later COW,
         * for instance during page table copying for fork().
         */
        seqcount_t write_protect_seq;

        spinlock_t arg_lock;            /* protect the below fields */

        unsigned long start_code, end_code, start_data, end_data;
        unsigned long start_brk, brk, start_stack;
        unsigned long arg_start, arg_end, env_start, env_end;

        unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */

        /*
         * Special counters, in some configurations protected by the
         * page_table_lock, in other configurations by being atomic.
         */
        struct mm_rss_stat rss_stat;

        struct linux_binfmt *binfmt;

        /* Architecture-specific MM context */
        mm_context_t context;

        unsigned long flags;            /* Must use atomic bitops to access */

#ifdef CONFIG_AIO
        spinlock_t ioctx_lock;
        struct kioctx_table __rcu *ioctx_table;
#endif
#ifdef CONFIG_MEMCG
        /*
         * "owner" points to a task that is regarded as the canonical
         * user/owner of this mm. All of the following must be true in
         * order for it to be changed:
         *
         * current == mm->owner
         * current->mm != mm
         * new_owner->mm == mm
         * new_owner->alloc_lock is held
         */
        struct task_struct __rcu *owner;
#endif
        struct user_namespace *user_ns;

        /* store ref to file /proc/<pid>/exe symlink points to */
        struct file __rcu *exe_file;
#ifdef CONFIG_MMU_NOTIFIER
        struct mmu_notifier_subscriptions *notifier_subscriptions;
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
        pgtable_t pmd_huge_pte;         /* protected by page_table_lock */
#endif
#ifdef CONFIG_NUMA_BALANCING
        /*
         * numa_next_scan is the next time that PTEs will be remapped
         * PROT_NONE to trigger NUMA hinting faults; such faults gather
         * statistics and migrate pages to new nodes if necessary.
         */
        unsigned long numa_next_scan;

        /* Restart point for scanning and remapping PTEs. */
        unsigned long numa_scan_offset;

        /* numa_scan_seq prevents two threads remapping PTEs. */
        int numa_scan_seq;
#endif
        /*
         * An operation with batched TLB flushing is going on. Anything
         * that can move process memory needs to flush the TLB when
         * moving a PROT_NONE mapped page.
         */
        atomic_t tlb_flush_pending;
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
        /* See flush_tlb_batched_pending() */
        atomic_t tlb_flush_batched;
#endif
        struct uprobes_state uprobes_state;
#ifdef CONFIG_PREEMPT_RT
        struct rcu_head delayed_drop;
#endif
#ifdef CONFIG_HUGETLB_PAGE
        atomic_long_t hugetlb_usage;
#endif
        struct work_struct async_put_work;

#ifdef CONFIG_IOMMU_SVA
        u32 pasid;
#endif
#ifdef CONFIG_KSM
        /*
         * Represent how many pages of this process are involved in KSM
         * merging.
         */
        unsigned long ksm_merging_pages;
        /*
         * Represent how many pages are checked for ksm merging
         * including merged and not merged.
         */
        unsigned long ksm_rmap_items;
#endif
#ifdef CONFIG_LRU_GEN
        struct {
            /* this mm_struct is on lru_gen_mm_list */
            struct list_head list;
            /*
             * Set when switching to this mm_struct, as a hint of
             * whether it has been used since the last time per-node
             * page table walkers cleared the corresponding bits.
             */
            unsigned long bitmap;
#ifdef CONFIG_MEMCG
            /* points to the memcg of "owner" above */
            struct mem_cgroup *memcg;
#endif
        } lru_gen;
#endif /* CONFIG_LRU_GEN */
    } __randomize_layout;

    /*
     * The mm_cpumask needs to be at the end of mm_struct, because it
     * is dynamically sized based on nr_cpu_ids.
     */
    unsigned long cpu_bitmap[];
};
In the author's view, this structure is the most central one in the Linux memory management subsystem: it is to memory management what struct task_struct is to process management.
A dedicated article will cover this structure in detail; for now it is enough to understand what current->mm means. Linux kernel engineers, especially those familiar with process management, know current well. It is a macro that, simply put, yields a pointer to the currently running task; its type is struct task_struct *. That structure is very long, so it is not reproduced here; only its mm member (defined in include/linux/sched.h) is shown:
struct task_struct {
    ...
    struct mm_struct *mm;
    ...
};
The mm member points to the descriptor of the process address space. For an ordinary user process, mm points to the user-space portion of its virtual address space; for a kernel thread, the pointer is NULL. Since we entered sys_brk from user space, here it points to the user-space portion of the virtual address space of the process (strictly speaking, the thread) that called malloc().
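To make the kernel-thread distinction concrete, here is a minimal, illustrative kernel-module sketch (assuming a standard out-of-tree module build; the file and function names are made up for this example). insmod runs in process context, so at load time current is the insmod process and current->mm is non-NULL, while a kernel thread would see NULL:

/* current_mm_demo.c: illustrative sketch, not from the article's sources. */
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm_types.h>

static int __init current_mm_demo_init(void)
{
    struct mm_struct *mm = current->mm;

    pr_info("current: %s (pid %d)\n", current->comm, current->pid);
    if (mm)
        /* a user process: show the heap range kept in mm (read without
         * taking mmap_lock, which is acceptable only for a demo) */
        pr_info("start_brk=%lx brk=%lx\n", mm->start_brk, mm->brk);
    else
        /* kernel threads have no user address space */
        pr_info("current->mm is NULL: running in a kernel thread\n");
    return 0;
}

static void __exit current_mm_demo_exit(void)
{
}

module_init(current_mm_demo_init);
module_exit(current_mm_demo_exit);
MODULE_LICENSE("GPL");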
Next, struct vm_area_struct, which is also defined in include/linux/mm_types.h:
/*
 * This struct describes a virtual memory area. There is one of these
 * per VM-area/task. A VM area is any part of the process virtual memory
 * space that has a special rule for the page-fault handlers (ie a shared
 * library, the executable area etc).
 */
struct vm_area_struct {
    /* The first cache line has the info for VMA tree walking. */

    unsigned long vm_start;         /* Our start address within vm_mm. */
    unsigned long vm_end;           /* The first byte after our end address
                                       within vm_mm. */

    struct mm_struct *vm_mm;        /* The address space we belong to. */

    /*
     * Access permissions of this VMA.
     * See vmf_insert_mixed_prot() for discussion.
     */
    pgprot_t vm_page_prot;
    unsigned long vm_flags;         /* Flags, see mm.h. */

    /*
     * For areas with an address space and backing store,
     * linkage into the address_space->i_mmap interval tree.
     *
     * For private anonymous mappings, a pointer to a null terminated string
     * containing the name given to the vma, or NULL if unnamed.
     */

    union {
        struct {
            struct rb_node rb;
            unsigned long rb_subtree_last;
        } shared;
        /*
         * Serialized by mmap_sem. Never use directly because it is
         * valid only when vm_file is NULL. Use anon_vma_name instead.
         */
        struct anon_vma_name *anon_name;
    };

    /*
     * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
     * list, after a COW of one of the file pages. A MAP_SHARED vma
     * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
     * or brk vma (with NULL file) can only be in an anon_vma list.
     */
    struct list_head anon_vma_chain; /* Serialized by mmap_lock &
                                      * page_table_lock */
    struct anon_vma *anon_vma;      /* Serialized by page_table_lock */

    /* Function pointers to deal with this struct. */
    const struct vm_operations_struct *vm_ops;

    /* Information about our backing store: */
    unsigned long vm_pgoff;         /* Offset (within vm_file) in PAGE_SIZE
                                       units */
    struct file * vm_file;          /* File we map to (can be NULL). */
    void * vm_private_data;         /* was vm_pte (shared mem) */

#ifdef CONFIG_SWAP
    atomic_long_t swap_readahead_info;
#endif
#ifndef CONFIG_MMU
    struct vm_region *vm_region;    /* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
    struct mempolicy *vm_policy;    /* NUMA policy for the VMA */
#endif
    struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;
As the comment in the source explains, vm_area_struct describes a virtual memory area (VMA), and there is one of these per VM area per task. A VM area is any part of the process's virtual memory space that has a special rule for the page-fault handlers (e.g. a shared library, the executable area, and so on).
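Each line of /proc/&lt;pid&gt;/maps is rendered from one vm_area_struct: the two addresses are vm_start and vm_end, and the permission column reflects vm_flags. The following user-space sketch (illustrative, not from the kernel source) creates a fresh anonymous VMA with mmap() and then dumps the process's VMA list so the new area can be spotted:

/* vma_demo.c: illustrative sketch. Build: gcc -o vma_demo vma_demo.c */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
    /* create a new anonymous, private VMA one page long */
    void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    printf("new VMA should cover %p\n", p);

    /* print the kernel's view: one line per vm_area_struct */
    char cmd[64];
    snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid());
    return system(cmd);
}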
A detailed analysis of this structure and its members will follow in the next installment.