What's in the Linux Kernel, Memory Management Subsystem, Part 5: Small Memory Allocation (3)


    Continued from the previous article: What's in the Linux Kernel, Memory Management Subsystem, Part 4: Small Memory Allocation (2)

    This article draws on the following references:

    Memory Allocation Demystified: A Deep Dive into the Implementation Principles and Mechanisms of malloc

    System Calls and Memory Management (sbrk, brk, mmap, munmap)

    Linux System Calls in Detail (An Analysis of Their Implementation)

    Linux Study Notes (10): Memory Mapping in Memory Management [repost]

    "A Casual Talk on the Linux Operating System", Core Principles, Part 4: Memory Management, by Liu Chao

    Many thanks to their authors!

    II. Small Memory Allocation: brk and sbrk

    This installment begins the analysis of the code of the sys_brk function.
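
    Before reading the kernel side, it may help to see how the program break behaves from user space. Below is a minimal sketch (my own illustration, not from the original article; glibc on Linux is assumed) that reads the current break with sbrk(0), grows it, and resets it with brk(); each of these calls ends up in the sys_brk function discussed below:

    /*
     * Minimal user-space sketch of moving the program break.
     * sbrk(0) returns the current break; sbrk(n)/brk() move it.
     */
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        void *start = sbrk(0);              /* current program break */
        printf("break before: %p\n", start);

        if (sbrk(4096) == (void *)-1) {     /* grow the heap by one page */
            perror("sbrk");
            return 1;
        }
        printf("break after : %p\n", sbrk(0));

        if (brk(start) != 0) {              /* shrink back to the start */
            perror("brk");
            return 1;
        }
        printf("break reset : %p\n", sbrk(0));
        return 0;
    }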

    1. brk Source Code Analysis

    For ease of reference, the source is reproduced here once more. The entry point of the brk system call is the sys_brk function, implemented in mm/mmap.c of the kernel source tree:

    SYSCALL_DEFINE1(brk, unsigned long, brk)
    {
        unsigned long newbrk, oldbrk, origbrk;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *brkvma, *next = NULL;
        unsigned long min_brk;
        bool populate;
        bool downgraded = false;
        LIST_HEAD(uf);
        MA_STATE(mas, &mm->mm_mt, 0, 0);

        if (mmap_write_lock_killable(mm))
            return -EINTR;

        origbrk = mm->brk;

    #ifdef CONFIG_COMPAT_BRK
        /*
         * CONFIG_COMPAT_BRK can still be overridden by setting
         * randomize_va_space to 2, which will still cause mm->start_brk
         * to be arbitrarily shifted
         */
        if (current->brk_randomized)
            min_brk = mm->start_brk;
        else
            min_brk = mm->end_data;
    #else
        min_brk = mm->start_brk;
    #endif
        if (brk < min_brk)
            goto out;

        /*
         * Check against rlimit here. If this check is done later after the test
         * of oldbrk with newbrk then it can escape the test and let the data
         * segment grow beyond its set limit the in case where the limit is
         * not page aligned -Ram Gupta
         */
        if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
                              mm->end_data, mm->start_data))
            goto out;

        newbrk = PAGE_ALIGN(brk);
        oldbrk = PAGE_ALIGN(mm->brk);
        if (oldbrk == newbrk) {
            mm->brk = brk;
            goto success;
        }

        /*
         * Always allow shrinking brk.
         * do_brk_munmap() may downgrade mmap_lock to read.
         */
        if (brk <= mm->brk) {
            int ret;

            /* Search one past newbrk */
            mas_set(&mas, newbrk);
            brkvma = mas_find(&mas, oldbrk);
            if (!brkvma || brkvma->vm_start >= oldbrk)
                goto out; /* mapping intersects with an existing non-brk vma. */
            /*
             * mm->brk must be protected by write mmap_lock.
             * do_brk_munmap() may downgrade the lock, so update it
             * before calling do_brk_munmap().
             */
            mm->brk = brk;
            ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf);
            if (ret == 1) {
                downgraded = true;
                goto success;
            } else if (!ret)
                goto success;

            mm->brk = origbrk;
            goto out;
        }

        if (check_brk_limits(oldbrk, newbrk - oldbrk))
            goto out;

        /*
         * Only check if the next VMA is within the stack_guard_gap of the
         * expansion area
         */
        mas_set(&mas, oldbrk);
        next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap);
        if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
            goto out;

        brkvma = mas_prev(&mas, mm->start_brk);
        /* Ok, looks good - let it rip. */
        if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
            goto out;

        mm->brk = brk;

    success:
        populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
        if (downgraded)
            mmap_read_unlock(mm);
        else
            mmap_write_unlock(mm);
        userfaultfd_unmap_complete(mm, &uf);
        if (populate)
            mm_populate(oldbrk, newbrk - oldbrk);
        return brk;

    out:
        mmap_write_unlock(mm);
        return origbrk;
    }
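
    One detail worth noting before the walkthrough: sys_brk compares PAGE_ALIGN(brk) with PAGE_ALIGN(mm->brk), so a request that stays within the same page only updates mm->brk without mapping or unmapping anything. Here is a small sketch of that rounding (my own illustration; a 4 KiB page size and a user-space re-definition of the PAGE_ALIGN macro are assumed, the addresses are hypothetical):

    /*
     * Sketch of the page-rounding used by sys_brk. PAGE_ALIGN here is a
     * user-space stand-in for the kernel macro; 4 KiB pages assumed.
     */
    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_ALIGN(addr) (((addr) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

    int main(void)
    {
        unsigned long cur = 0x804b004;  /* hypothetical current mm->brk */
        unsigned long req = 0x804b100;  /* request within the same page */

        /* both round up to 0x804c000, so sys_brk just records the new brk */
        printf("PAGE_ALIGN(%#lx) = %#lx\n", cur, PAGE_ALIGN(cur));
        printf("PAGE_ALIGN(%#lx) = %#lx\n", req, PAGE_ALIGN(req));
        return 0;
    }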

    Let's start with the first assignment in the code, which is also a central one:

    struct mm_struct *mm = current->mm;

    struct mm_struct is defined in include/linux/mm_types.h as follows:

    struct mm_struct {
        struct {
            struct maple_tree mm_mt;
    #ifdef CONFIG_MMU
            unsigned long (*get_unmapped_area) (struct file *filp,
                    unsigned long addr, unsigned long len,
                    unsigned long pgoff, unsigned long flags);
    #endif
            unsigned long mmap_base;        /* base of mmap area */
            unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */
    #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
            /* Base addresses for compatible mmap() */
            unsigned long mmap_compat_base;
            unsigned long mmap_compat_legacy_base;
    #endif
            unsigned long task_size;        /* size of task vm space */
            pgd_t * pgd;

    #ifdef CONFIG_MEMBARRIER
            /**
             * @membarrier_state: Flags controlling membarrier behavior.
             *
             * This field is close to @pgd to hopefully fit in the same
             * cache-line, which needs to be touched by switch_mm().
             */
            atomic_t membarrier_state;
    #endif

            /**
             * @mm_users: The number of users including userspace.
             *
             * Use mmget()/mmget_not_zero()/mmput() to modify. When this
             * drops to 0 (i.e. when the task exits and there are no other
             * temporary reference holders), we also release a reference on
             * @mm_count (which may then free the &struct mm_struct if
             * @mm_count also drops to 0).
             */
            atomic_t mm_users;

            /**
             * @mm_count: The number of references to &struct mm_struct
             * (@mm_users count as 1).
             *
             * Use mmgrab()/mmdrop() to modify. When this drops to 0, the
             * &struct mm_struct is freed.
             */
            atomic_t mm_count;
    #ifdef CONFIG_MMU
            atomic_long_t pgtables_bytes;   /* PTE page table pages */
    #endif
            int map_count;                  /* number of VMAs */

            spinlock_t page_table_lock;     /* Protects page tables and some
                                             * counters
                                             */
            /*
             * With some kernel config, the current mmap_lock's offset
             * inside 'mm_struct' is at 0x120, which is very optimal, as
             * its two hot fields 'count' and 'owner' sit in 2 different
             * cachelines, and when mmap_lock is highly contended, both
             * of the 2 fields will be accessed frequently, current layout
             * will help to reduce cache bouncing.
             *
             * So please be careful with adding new fields before
             * mmap_lock, which can easily push the 2 fields into one
             * cacheline.
             */
            struct rw_semaphore mmap_lock;

            struct list_head mmlist;        /* List of maybe swapped mm's. These
                                             * are globally strung together off
                                             * init_mm.mmlist, and are protected
                                             * by mmlist_lock
                                             */

            unsigned long hiwater_rss;      /* High-watermark of RSS usage */
            unsigned long hiwater_vm;       /* High-water virtual memory usage */

            unsigned long total_vm;         /* Total pages mapped */
            unsigned long locked_vm;        /* Pages that have PG_mlocked set */
            atomic64_t    pinned_vm;        /* Refcount permanently increased */
            unsigned long data_vm;          /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
            unsigned long exec_vm;          /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
            unsigned long stack_vm;         /* VM_STACK */
            unsigned long def_flags;

            /**
             * @write_protect_seq: Locked when any thread is write
             * protecting pages mapped by this mm to enforce a later COW,
             * for instance during page table copying for fork().
             */
            seqcount_t write_protect_seq;

            spinlock_t arg_lock; /* protect the below fields */

            unsigned long start_code, end_code, start_data, end_data;
            unsigned long start_brk, brk, start_stack;
            unsigned long arg_start, arg_end, env_start, env_end;

            unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */

            /*
             * Special counters, in some configurations protected by the
             * page_table_lock, in other configurations by being atomic.
             */
            struct mm_rss_stat rss_stat;

            struct linux_binfmt *binfmt;

            /* Architecture-specific MM context */
            mm_context_t context;

            unsigned long flags; /* Must use atomic bitops to access */

    #ifdef CONFIG_AIO
            spinlock_t ioctx_lock;
            struct kioctx_table __rcu *ioctx_table;
    #endif
    #ifdef CONFIG_MEMCG
            /*
             * "owner" points to a task that is regarded as the canonical
             * user/owner of this mm. All of the following must be true in
             * order for it to be changed:
             *
             * current == mm->owner
             * current->mm != mm
             * new_owner->mm == mm
             * new_owner->alloc_lock is held
             */
            struct task_struct __rcu *owner;
    #endif
            struct user_namespace *user_ns;

            /* store ref to file /proc/<pid>/exe symlink points to */
            struct file __rcu *exe_file;
    #ifdef CONFIG_MMU_NOTIFIER
            struct mmu_notifier_subscriptions *notifier_subscriptions;
    #endif
    #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
            pgtable_t pmd_huge_pte; /* protected by page_table_lock */
    #endif
    #ifdef CONFIG_NUMA_BALANCING
            /*
             * numa_next_scan is the next time that PTEs will be remapped
             * PROT_NONE to trigger NUMA hinting faults; such faults gather
             * statistics and migrate pages to new nodes if necessary.
             */
            unsigned long numa_next_scan;

            /* Restart point for scanning and remapping PTEs. */
            unsigned long numa_scan_offset;

            /* numa_scan_seq prevents two threads remapping PTEs. */
            int numa_scan_seq;
    #endif
            /*
             * An operation with batched TLB flushing is going on. Anything
             * that can move process memory needs to flush the TLB when
             * moving a PROT_NONE mapped page.
             */
            atomic_t tlb_flush_pending;
    #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
            /* See flush_tlb_batched_pending() */
            atomic_t tlb_flush_batched;
    #endif
            struct uprobes_state uprobes_state;
    #ifdef CONFIG_PREEMPT_RT
            struct rcu_head delayed_drop;
    #endif
    #ifdef CONFIG_HUGETLB_PAGE
            atomic_long_t hugetlb_usage;
    #endif
            struct work_struct async_put_work;

    #ifdef CONFIG_IOMMU_SVA
            u32 pasid;
    #endif
    #ifdef CONFIG_KSM
            /*
             * Represent how many pages of this process are involved in KSM
             * merging.
             */
            unsigned long ksm_merging_pages;
            /*
             * Represent how many pages are checked for ksm merging
             * including merged and not merged.
             */
            unsigned long ksm_rmap_items;
    #endif
    #ifdef CONFIG_LRU_GEN
            struct {
                /* this mm_struct is on lru_gen_mm_list */
                struct list_head list;
                /*
                 * Set when switching to this mm_struct, as a hint of
                 * whether it has been used since the last time per-node
                 * page table walkers cleared the corresponding bits.
                 */
                unsigned long bitmap;
    #ifdef CONFIG_MEMCG
                /* points to the memcg of "owner" above */
                struct mem_cgroup *memcg;
    #endif
            } lru_gen;
    #endif /* CONFIG_LRU_GEN */
        } __randomize_layout;

        /*
         * The mm_cpumask needs to be at the end of mm_struct, because it
         * is dynamically sized based on nr_cpu_ids.
         */
        unsigned long cpu_bitmap[];
    };

    In the author's view this is the single most central structure of the Linux memory management subsystem: it is to memory management what struct task_struct is to process management.

    A dedicated article will cover this core structure in detail; here it is enough to understand what current->mm means. Linux kernel engineers, especially those familiar with process management, will know current well: it is a macro that, simply put, yields a pointer to the currently executing task, so it evaluates to a struct task_struct *. That structure is very long and is not reproduced here; only the definition of its mm member (in include/linux/sched.h) is shown:

    struct task_struct {
        ...
        struct mm_struct *mm;
        ...
    };

    The mm member is a pointer to the process address-space descriptor. For an ordinary user process, mm points to the user-space portion of its virtual address space; for a kernel thread, the pointer is NULL. Since we entered the kernel from user space here, it points to the user-space portion of the virtual address space of the process (more precisely, the thread) that called malloc().
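
    To make this concrete, here is a minimal kernel-module sketch (my own illustration; the module and message names are hypothetical, not from the article) that reads current at load time. insmod runs in process context, so current->mm is non-NULL there, whereas a kernel thread would see NULL:

    /*
     * Sketch of a module that inspects current->mm when loaded.
     */
    #include <linux/module.h>
    #include <linux/sched.h>
    #include <linux/mm_types.h>

    static int __init brk_peek_init(void)
    {
        struct mm_struct *mm = current->mm; /* insmod: process context */

        if (mm)     /* a kernel thread would have mm == NULL */
            pr_info("%s: start_brk=%lx brk=%lx\n",
                    current->comm, mm->start_brk, mm->brk);
        else
            pr_info("%s: kernel thread, no user address space\n",
                    current->comm);
        return 0;
    }

    static void __exit brk_peek_exit(void)
    {
    }

    module_init(brk_peek_init);
    module_exit(brk_peek_exit);
    MODULE_LICENSE("GPL");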

    Next, struct vm_area_struct is likewise defined in include/linux/mm_types.h:

    /*
     * This struct describes a virtual memory area. There is one of these
     * per VM-area/task. A VM area is any part of the process virtual memory
     * space that has a special rule for the page-fault handlers (ie a shared
     * library, the executable area etc).
     */
    struct vm_area_struct {
        /* The first cache line has the info for VMA tree walking. */

        unsigned long vm_start;     /* Our start address within vm_mm. */
        unsigned long vm_end;       /* The first byte after our end address
                                       within vm_mm. */

        struct mm_struct *vm_mm;    /* The address space we belong to. */

        /*
         * Access permissions of this VMA.
         * See vmf_insert_mixed_prot() for discussion.
         */
        pgprot_t vm_page_prot;
        unsigned long vm_flags;     /* Flags, see mm.h. */

        /*
         * For areas with an address space and backing store,
         * linkage into the address_space->i_mmap interval tree.
         *
         * For private anonymous mappings, a pointer to a null terminated string
         * containing the name given to the vma, or NULL if unnamed.
         */
        union {
            struct {
                struct rb_node rb;
                unsigned long rb_subtree_last;
            } shared;
            /*
             * Serialized by mmap_sem. Never use directly because it is
             * valid only when vm_file is NULL. Use anon_vma_name instead.
             */
            struct anon_vma_name *anon_name;
        };

        /*
         * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
         * list, after a COW of one of the file pages. A MAP_SHARED vma
         * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack
         * or brk vma (with NULL file) can only be in an anon_vma list.
         */
        struct list_head anon_vma_chain; /* Serialized by mmap_lock &
                                          * page_table_lock */
        struct anon_vma *anon_vma;       /* Serialized by page_table_lock */

        /* Function pointers to deal with this struct. */
        const struct vm_operations_struct *vm_ops;

        /* Information about our backing store: */
        unsigned long vm_pgoff;     /* Offset (within vm_file) in PAGE_SIZE
                                       units */
        struct file * vm_file;      /* File we map to (can be NULL). */
        void * vm_private_data;     /* was vm_pte (shared mem) */

    #ifdef CONFIG_SWAP
        atomic_long_t swap_readahead_info;
    #endif
    #ifndef CONFIG_MMU
        struct vm_region *vm_region;    /* NOMMU mapping region */
    #endif
    #ifdef CONFIG_NUMA
        struct mempolicy *vm_policy;    /* NUMA policy for the VMA */
    #endif
        struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
    } __randomize_layout;

    As the comment explains, vm_area_struct describes one virtual memory area (VMA), and there is one of these structures per VM area per task. A VM area is any part of the process's virtual memory space that has a special rule for the page-fault handlers (e.g. a shared library, the executable area, and so on).
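
    Each VMA can be observed from user space through /proc/<pid>/maps, where the brk-managed area appears as [heap]. The following sketch (my own illustration; Linux with procfs assumed) prints the heap VMA before and after growing the break, so you can watch its end address, i.e. vm_end, move:

    /*
     * Sketch: print the [heap] line of /proc/self/maps before and after
     * growing the program break by 1 MiB.
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    static void show_heap(const char *tag)
    {
        char line[256];
        FILE *f = fopen("/proc/self/maps", "r");

        if (!f)
            return;
        while (fgets(line, sizeof(line), f))
            if (strstr(line, "[heap]"))
                printf("%s: %s", tag, line);
        fclose(f);
    }

    int main(void)
    {
        void *p = malloc(1);    /* make sure the heap exists */
        (void)p;

        show_heap("before");
        sbrk(1 << 20);          /* grow the break via sys_brk */
        show_heap("after ");
        return 0;
    }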

    A detailed analysis of this structure and its members will follow in the next installment.

    Original article: https://blog.csdn.net/phmatthaus/article/details/134318769