读懂Linux内核汇编代码对理解ARMv8架构和指令集有很大裨益。下面将分析从内核汇编入口到C语言入口start_kernel()函数之间的一大段汇编代码。
<arch/arm64/kernel/vmlinux.lds.S>
OUTPUT_ARCH(aarch64)
ENTRY(_text)
SECTIONS
{
. = KIMAGE_VADDR + TEXT_OFFSET;
.text 段(代码段)
.rodata 段(只读数据段)
.init 段(初始化数据段)
.data 段(数据段)
.bss 段
}
#define VA_BITS (CONFIG_ARM64_VA_BITS)
#define _PAGE_OFFSET(va) (-(UL(1) << (va)))
#define PAGE_OFFSET (_PAGE_OFFSET(VA_BITS))
#define KIMAGE_VADDR (MODULES_END)
#define BPF_JIT_REGION_START (KASAN_SHADOW_END)
#define BPF_JIT_REGION_SIZE (SZ_128M)
#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)
#define MODULES_END (MODULES_VADDR + MODULES_VSIZE)
#define MODULES_VADDR (BPF_JIT_REGION_END)
#define MODULES_VSIZE (SZ_128M)
TEXT_OFFSET := 0x00080000
所以Linux内核的链接地址就是0xFFFF_0000_1008_0000,该地址也是Linux内核代码的起始地址。
.head.text : {
_text = .;
HEAD_TEXT
}
.text : { /* Real text segment */
_stext = .; /* Text and read-only data */
...
TEXT_TEXT
...
. = ALIGN(16);
*(.got) /* Global offset table */
}
. = ALIGN(SEGMENT_ALIGN);
_etext = .; /* End of text section */
代码段从_stext标记开始,到_etext标记结束。
RO_DATA(PAGE_SIZE) /* everything from this point to */
EXCEPTION_TABLE(8) /* __init_begin will be marked RO NX */
NOTES
. = ALIGN(SEGMENT_ALIGN);
__init_begin = .;
其中RO_DATA()是一个宏,它实现在include/asm-generic/vmlinux.lds.h头文件中。
#define RO_DATA(align) RO_DATA_SECTION(align)
#define RO_DATA_SECTION(align) \
. = ALIGN((align)); \
.rodata : AT(ADDR(.rodata) - LOAD_OFFSET) { \
VMLINUX_SYMBOL(__start_rodata) = .; \
*(.rodata) *(.rodata.*) \
...
从上面的定义可知,只读数据段以PAGE_SIZE大小对齐。只读数据段从__start_rodata标记开始,到__init_begin标记结束。只读数据段里包含了系统PGD页表swapper_pg_dir、为软件PAN功能准备的特殊页表reserved_ttbr0等信息。
__init_begin = .;
__inittext_begin = .;
...
. = ALIGN(PAGE_SIZE);
__inittext_end = .;
__initdata_begin = .;
.init.data : {
INIT_DATA
INIT_SETUP(16)
INIT_CALLS
CON_INITCALL
INIT_RAM_FS
*(.init.rodata.* .init.bss) /* from the EFI stub */
}
.exit.data : {
ARM_EXIT_KEEP(EXIT_DATA)
}
...
. = ALIGN(SEGMENT_ALIGN);
__initdata_end = .;
__init_end = .;
.init段包含一些系统初始化时的数据,如模块加载函数core_initcall()或者module_init()函数。.init段从__init_begin标记开始,到__init_end标记结束。
_data = .;
_sdata = .;
RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
...
PECOFF_EDATA_PADDING
_edata = .;
数据段从_sdata标记开始,到_edata标记结束。
BSS_SECTION(0, 0, 0)
#define BSS_SECTION(sbss_align, bss_align, stop_align) \
. = ALIGN(sbss_align); \
VMLINUX_SYMBOL(__bss_start) = .; \
SBSS(sbss_align) \
BSS(bss_align) \
. = ALIGN(stop_align); \
VMLINUX_SYMBOL(__bss_stop) = .;
该段从__bss_start标记开始,到__bss_stop标记结束。
名称 | 区间范围 | 说明 |
---|---|---|
代码段 | _stext~_etext | 存放内核的代码段 |
只读数据段 | __start_rodata~__init_begin | 存放只读数据,包括PGD页表swapper_pg_dir、特殊页表reserved_ttbr0等信息 |
init段 | __init_begin~__init_end | 存放内核初始化数据 |
数据段 | _sdata~_edata | 存放可读/可写的数据 |
未初始化数据段 | __bss_start~__bss_stop | 存放初始化为0的数据以及未初始化的全局变量和静态变量 |
启动引导程序会做必要的初始化,如内存设备初始化、磁盘设备初始化以及将内核映像文件加载到运行地址等,然后跳转到Linux内核入口。
ARMv8架构处理器支持虚拟化扩展的EL2和安全模式的EL3,这些异常等级都可以引导(切换)Linux内核的运行。Linux内核运行在EL1。总的来说,启动引导程序会做如下的一些引导动作。
arch/arm64/kernel/head.S文件中第48~62行的注释里描述了Linux内核中关于入口的约定
48 /*
49 * Kernel startup entry point.
50 * ---------------------------
51 *
52 * The requirements are:
53 * MMU = off, D-cache = off, I-cache = on or off,
54 * x0 = physical address to the FDT blob.
55 *
56 * This code is mostly position independent so you call this at
57 * __pa(PAGE_OFFSET + TEXT_OFFSET).
58 *
59 * Note that the callee-saved registers are used for storing variables
60 * that are useful before the MMU is enabled. The allocations are described
61 * in the entry routines.
62 */
相关约定如下:
stext函数包括如下几个重要的函数。
129 preserve_boot_args:
130 mov x21, x0 // x21=FDT
131
132 adr_l x0, boot_args // record the contents of
133 stp x21, x1, [x0] // x0 .. x3 at kernel entry
134 stp x2, x3, [x0, #16]
135
136 dmb sy // needed before dc ivac with
137 // MMU off
138
139 mov x1, #0x20 // 4 x 8 bytes
140 b __inval_dcache_area // tail call
141 ENDPROC(preserve_boot_args)
preserve_boot_args函数主要用于把从启动引导程序传递过来的4个参数X0~X3保存到boot_args[]数组中。
el2_setup函数实现在arch/arm64/kernel/head.S文件中。
489 ENTRY(el2_setup)
490 msr SPsel, #1 // We want to use SP_EL{1,2}
491 mrs x0, CurrentEL
492 cmp x0, #CurrentEL_EL2
493 b.eq 1f
494 mov_q x0, (SCTLR_EL1_RES1 | ENDIAN_SET_EL1)
495 msr sctlr_el1, x0
496 mov w0, #BOOT_CPU_MODE_EL1 // This cpu booted in EL1
497 isb
498 ret
499
500 1: mov_q x0, (SCTLR_EL2_RES1 | ENDIAN_SET_EL2)
501 msr sctlr_el2, x0
502 ...
从Linux内核关于入口的约定可知,处理器的状态有两种——或者处于EL2,或者处于非安全模式的EL1。因此el2_setup函数会做判断,然后进行相应的处理。按照约定,若处理器不处于EL2,则应处于非安全模式的EL1。
656 set_cpu_boot_mode_flag:
657 adr_l x1, __boot_cpu_mode
658 cmp w0, #BOOT_CPU_MODE_EL2
659 b.ne 1f
660 add x1, x1, #4
661 1: str w0, [x1] // This CPU has booted in EL1
662 dmb sy
663 dc ivac, x1 // Invalidate potentially stale cache line
664 ret
665 ENDPROC(set_cpu_boot_mode_flag)
set_cpu_boot_mode_flag函数用来设置__boot_cpu_mode[]变量。系统定义了一个全局变量__boot_cpu_mode[]来记录处理器是在哪个异常等级启动的。该全局变量声明在arch/arm64/include/asm/virt.h头文件中。
为了降低启动代码的复杂性,进入Linux内核入口时MMU是关闭的。关闭了MMU意味着不能利用高速缓存的性能。因此我们在初始化的某个阶段需要把MMU打开并且使能数据高速缓存,以获得更高的性能。但是,如何打开MMU?我们需要小心,否则会发生意想不到的问题。
279 __create_page_tables:
280 mov x28, lr
281
282 /*
283 * Invalidate the init page tables to avoid potential dirty cache lines
284 * being evicted. Other page tables are allocated in rodata as part of
285 * the kernel image, and thus are clean to the PoC per the boot
286 * protocol.
287 */
288 adrp x0, init_pg_dir
289 adrp x1, init_pg_end
290 sub x1, x1, x0
291 bl __inval_dcache_area
292
293 /*
294 * Clear the init page tables.
295 */
296 adrp x0, init_pg_dir
297 adrp x1, init_pg_end
298 sub x1, x1, x0
299 1: stp xzr, xzr, [x0], #16
300 stp xzr, xzr, [x0], #16
301 stp xzr, xzr, [x0], #16
302 stp xzr, xzr, [x0], #16
303 subs x1, x1, #64
304 b.ne 1b
305
306 mov x7, SWAPPER_MM_MMUFLAGS
307
308 /*
309 * Create the identity mapping.
310 */
311 adrp x0, idmap_pg_dir
312 adrp x3, __idmap_text_start // __pa(__idmap_text_start)
313
314 #ifdef CONFIG_ARM64_VA_BITS_52
315 mrs_s x6, SYS_ID_AA64MMFR2_EL1
316 and x6, x6, #(0xf << ID_AA64MMFR2_LVA_SHIFT)
317 mov x5, #52
318 cbnz x6, 1f
319 #endif
320 mov x5, #VA_BITS_MIN
321 1:
322 adr_l x6, vabits_actual
323 str x5, [x6]
324 dmb sy
325 dc ivac, x6 // Invalidate potentially stale cache line
326
327 /*
328 * VA_BITS may be too small to allow for an ID mapping to be created
329 * that covers system RAM if that is located sufficiently high in the
330 * physical address space. So for the ID map, use an extended virtual
331 * range in that case, and configure an additional translation level
332 * if needed.
333 *
334 * Calculate the maximum allowed value for TCR_EL1.T0SZ so that the
335 * entire ID map region can be mapped. As T0SZ == (64 - #bits used),
336 * this number conveniently equals the number of leading zeroes in
337 * the physical address of __idmap_text_end.
338 */
339 adrp x5, __idmap_text_end
340 clz x5, x5
341 cmp x5, TCR_T0SZ(VA_BITS) // default T0SZ small enough?
342 b.ge 1f // .. then skip VA range extension
343
344 adr_l x6, idmap_t0sz
345 str x5, [x6]
346 dmb sy
347 dc ivac, x6 // Invalidate potentially stale cache line
348
349 #if (VA_BITS < 48)
350 #define EXTRA_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3)
351 #define EXTRA_PTRS (1 << (PHYS_MASK_SHIFT - EXTRA_SHIFT))
352
353 /*
354 * If VA_BITS < 48, we have to configure an additional table level.
355 * First, we have to verify our assumption that the current value of
356 * VA_BITS was chosen such that all translation levels are fully
357 * utilised, and that lowering T0SZ will always result in an additional
358 * translation level to be configured.
359 */
360 #if VA_BITS != EXTRA_SHIFT
361 #error "Mismatch between VA_BITS and page size/number of translation levels"
362 #endif
363
364 mov x4, EXTRA_PTRS
365 create_table_entry x0, x3, EXTRA_SHIFT, x4, x5, x6
366 #else
367 /*
368 * If VA_BITS == 48, we don't have to configure an additional
369 * translation level, but the top-level table has more entries.
370 */
371 mov x4, #1 << (PHYS_MASK_SHIFT - PGDIR_SHIFT)
372 str_l x4, idmap_ptrs_per_pgd, x5
373 #endif
374 1:
375 ldr_l x4, idmap_ptrs_per_pgd
376 mov x5, x3 // __pa(__idmap_text_start)
377 adr_l x6, __idmap_text_end // __pa(__idmap_text_end)
378
379 map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14
arch/arm64/kernel/vmlinux.lds.S
. = ALIGN(PAGE_SIZE);
init_pg_dir = .;
. += INIT_DIR_SIZE;
init_pg_end = .;
arch/arm64/include/asm/kernel-pgtable.h
#define SWAPPER_PMD_FLAGS (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S)
#define SWAPPER_MM_MMUFLAGS (PMD_ATTRINDX(MT_NORMAL) | SWAPPER_PMD_FLAGS)
. = ALIGN(PAGE_SIZE);
idmap_pg_dir = .;
. += IDMAP_DIR_SIZE;
idmap_pg_end = .;
arch/arm64/include/asm/kernel-pgtable.h
#define IDMAP_DIR_SIZE (IDMAP_PGTABLE_LEVELS * PAGE_SIZE)
map_memory宏也实现在arch/arm64/kernel/head.S文件中。
245 .macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, sv
246 add \rtbl, \tbl, #PAGE_SIZE
247 mov \sv, \rtbl
248 mov \count, #0
249 compute_indices \vstart, \vend, #PGDIR_SHIFT, \pgds, \istart, \iend, \count
250 populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
251 mov \tbl, \sv
252 mov \sv, \rtbl
253
254 #if SWAPPER_PGTABLE_LEVELS > 3
255 compute_indices \vstart, \vend, #PUD_SHIFT, #PTRS_PER_PUD, \istart, \iend, \count
256 populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
257 mov \tbl, \sv
258 mov \sv, \rtbl
259 #endif
260
261 #if SWAPPER_PGTABLE_LEVELS > 2
262 compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #PTRS_PER_PMD, \istart, \iend, \count
263 populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
264 mov \tbl, \sv
265 #endif
266
267 compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #PTRS_PER_PTE, \istart, \iend, \count
268 bic \count, \phys, #SWAPPER_BLOCK_SIZE - 1
269 populate_entries \tbl, \count, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp
270 .endm
下面来看一下compute_indices宏的实现。
210 .macro compute_indices, vstart, vend, shift, ptrs, istart, iend, count
211 lsr \iend, \vend, \shift
212 mov \istart, \ptrs
213 sub \istart, \istart, #1
214 and \iend, \iend, \istart // iend = (vend >> shift) & (ptrs - 1)
215 mov \istart, \ptrs
216 mul \istart, \istart, \count
217 add \iend, \iend, \istart // iend += (count - 1) * ptrs
218 // our entries span multiple tables
219
220 lsr \istart, \vstart, \shift
221 mov \count, \ptrs
222 sub \count, \count, #1
223 and \istart, \istart, \count
224
225 sub \count, \iend, \istart
226 .endm
182 .macro populate_entries, tbl, rtbl, index, eindex, flags, inc, tmp1
183 .Lpe\@: phys_to_pte \tmp1, \rtbl
184 orr \tmp1, \tmp1, \flags // tmp1 = table entry
185 str \tmp1, [\tbl, \index, lsl #3]
186 add \rtbl, \rtbl, \inc // rtbl = pa next level
187 add \index, \index, #1
188 cmp \index, \eindex
189 b.ls .Lpe\@
190 .endm
综合上述分析可知,__create_page_tables函数的第379行中,map_memory创建了一个恒等映射,把.idmap.text段的虚拟地址映射到了相同的物理地址上,这个映射的页表存放在idmap_pg_dir中,如下图所示,物理内存的起始地址是从0x4000_0000开始的。
恒等映射的成员都链接到了.idmap.text段中,究竟有哪些成员呢?恒等映射的起始地址为__idmap_text_start,结束地址为__idmap_text_end。我们可以从System.map文件中找出哪些函数在.idmap.text段里。
ffff0000105da000 t __idmap_text_start
ffff0000105da000 t kimage_vaddr
ffff0000105da008 t el2_setup
ffff0000105da060 t set_hcr
ffff0000105da130 t install_el2_stub
ffff0000105da184 t set_cpu_boot_mode_flag
ffff0000105da1a8 t secondary_holding_pen
ffff0000105da1cc t pen
ffff0000105da1e0 t secondary_entry
ffff0000105da1ec t secondary_startup
ffff0000105da204 t __secondary_switched
ffff0000105da240 t __secondary_too_slow
ffff0000105da24c t __enable_mmu
ffff0000105da2a4 t __cpu_secondary_check52bitva
ffff0000105da2a8 t __no_granule_support
ffff0000105da2cc t __relocate_kernel
ffff0000105da314 t __primary_switch
ffff0000105da388 t cpu_resume
ffff0000105da3a8 t __cpu_soft_restart
ffff0000105da3e8 t cpu_do_resume
ffff0000105da478 t idmap_cpu_replace_ttbr1
ffff0000105da4ac t __idmap_kpti_flag
ffff0000105da4b0 t idmap_kpti_install_ng_mappings
ffff0000105da4ec t do_pgd
ffff0000105da504 t next_pgd
ffff0000105da514 t skip_pgd
ffff0000105da554 t walk_puds
ffff0000105da55c t next_pud
ffff0000105da560 t walk_pmds
ffff0000105da568 t do_pmd
ffff0000105da580 t next_pmd
ffff0000105da590 t skip_pmd
ffff0000105da5a0 t walk_ptes
ffff0000105da5a8 t do_pte
ffff0000105da5cc t skip_pte
ffff0000105da5dc t __idmap_kpti_secondary
ffff0000105da624 t __cpu_setup
ffff0000105da700 t __idmap_text_end
arch/arm64/kernel/head.S
476 .section ".idmap.text","awx"
最后我们再来看内核映像的页表映射,继续看__create_page_tables函数的实现。
384 adrp x0, init_pg_dir
385 mov_q x5, KIMAGE_VADDR + TEXT_OFFSET // compile time __va(_text)
386 add x5, x5, x23 // add KASLR displacement
387 mov x4, PTRS_PER_PGD
388 adrp x6, _end // runtime __pa(_end)
389 adrp x3, _text // runtime __pa(_text)
390 sub x6, x6, x3 // _end - _text
391 add x6, x6, x5 // runtime __va(_end)
392
393 map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x13, x14
394
395 /*
396 * Since the page tables have been populated with non-cacheable
397 * accesses (MMU disabled), invalidate those tables again to
398 * remove any speculatively loaded cache lines.
399 */
400 dmb sy
401
402 adrp x0, idmap_pg_dir
403 adrp x1, idmap_pg_end
404 sub x1, x1, x0
405 bl __inval_dcache_area
406
407 adrp x0, init_pg_dir
408 adrp x1, init_pg_end
409 sub x1, x1, x0
410 bl __inval_dcache_area
411
412 ret x28
413 ENDPROC(__create_page_tables)
__cpu_setup函数为打开MMU做一些与处理器相关的初始化,它的代码实现在arch/arm64/mm/proc.S文件中。
arch/arm64/mm/proc.S
422 .pushsection ".idmap.text", "awx"
423 ENTRY(__cpu_setup)
424 tlbi vmalle1 // Invalidate local TLB
425 dsb nsh
426
427 mov x0, #3 << 20
428 msr cpacr_el1, x0 // Enable FP/ASIMD
429 mov x0, #1 << 12 // Reset mdscr_el1 and disable
430 msr mdscr_el1, x0 // access to the DCC from EL0
431 isb // Unmask debug exceptions now,
432 enable_dbg // since this is per-cpu
433 reset_pmuserenr_el0 x0 // Disable PMU access from EL0
434 reset_amuserenr_el0 x0 // Disable AMU access from EL0
435
436 /*
437 * Memory region attributes for LPAE:
438 *
439 * n = AttrIndx[2:0]
440 * n MAIR
441 * DEVICE_nGnRnE 000 00000000
442 * DEVICE_nGnRE 001 00000100
443 * DEVICE_GRE 010 00001100
444 * NORMAL_NC 011 01000100
445 * NORMAL 100 11111111
446 * NORMAL_WT 101 10111011
447 */
448 ldr x5, =MAIR(0x00, MT_DEVICE_nGnRnE) | \
449 MAIR(0x04, MT_DEVICE_nGnRE) | \
450 MAIR(0x0c, MT_DEVICE_GRE) | \
451 MAIR(0x44, MT_NORMAL_NC) | \
452 MAIR(0xff, MT_NORMAL) | \
453 MAIR(0xbb, MT_NORMAL_WT)
454 msr mair_el1, x5
455 /*
456 * Prepare SCTLR
457 */
458 mov_q x0, SCTLR_EL1_SET
459 /*
460 * Set/prepare TCR and TTBR. We use 512GB (39-bit) address range for
461 * both user and kernel.
462 */
463 ldr x10, =TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \
464 TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \
465 TCR_TBI0 | TCR_A1 | TCR_KASAN_FLAGS
466 tcr_clear_errata_bits x10, x9, x5
467
468 #ifdef CONFIG_ARM64_VA_BITS_52
469 ldr_l x9, vabits_actual
470 sub x9, xzr, x9
471 add x9, x9, #64
472 tcr_set_t1sz x10, x9
473 #else
474 ldr_l x9, idmap_t0sz
475 #endif
476 tcr_set_t0sz x10, x9
477
478 /*
479 * Set the IPS bits in TCR_EL1.
480 */
481 tcr_compute_pa_size x10, #TCR_IPS_SHIFT, x5, x6
482 #ifdef CONFIG_ARM64_HW_AFDBM
483 /*
484 * Enable hardware update of the Access Flags bit.
485 * Hardware dirty bit management is enabled later,
486 * via capabilities.
487 */
488 mrs x9, ID_AA64MMFR1_EL1
489 and x9, x9, #0xf
490 cbz x9, 1f
491 orr x10, x10, #TCR_HA // hardware Access flag update
492 1:
493 #endif /* CONFIG_ARM64_HW_AFDBM */
494 msr tcr_el1, x10
495 ret // return to head.S
496 ENDPROC(__cpu_setup)
#define SCTLR_EL1_SET (SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_SA |\
SCTLR_EL1_SA0 | SCTLR_EL1_SED | SCTLR_ELx_I |\
SCTLR_EL1_DZE | SCTLR_EL1_UCT |\
SCTLR_EL1_NTWE | SCTLR_ELx_IESB | SCTLR_EL1_SPAN |\
ENDIAN_SET_EL1 | SCTLR_EL1_UCI | SCTLR_EL1_RES1)
952 __primary_switch:
953 #ifdef CONFIG_RANDOMIZE_BASE
954 mov x19, x0 // preserve new SCTLR_EL1 value
955 mrs x20, sctlr_el1 // preserve old SCTLR_EL1 value
956 #endif
957
958 adrp x1, init_pg_dir
959 bl __enable_mmu
960 #ifdef CONFIG_RELOCATABLE
961 #ifdef CONFIG_RELR
962 mov x24, #0 // no RELR displacement yet
963 #endif
964 bl __relocate_kernel
965 #ifdef CONFIG_RANDOMIZE_BASE
966 ldr x8, =__primary_switched
967 adrp x0, __PHYS_OFFSET
968 blr x8
969
970 /*
971 * If we return here, we have a KASLR displacement in x23 which we need
972 * to take into account by discarding the current kernel mapping and
973 * creating a new one.
974 */
975 pre_disable_mmu_workaround
976 msr sctlr_el1, x20 // disable the MMU
977 isb
978 bl __create_page_tables // recreate kernel mapping
979
980 tlbi vmalle1 // Remove any stale TLB entries
981 dsb nsh
982
983 msr sctlr_el1, x19 // re-enable the MMU
984 isb
985 ic iallu // flush instructions fetched
986 dsb nsh // via old mapping
987 isb
988
989 bl __relocate_kernel
990 #endif
991 #endif
992 ldr x8, =__primary_switched
993 adrp x0, __PHYS_OFFSET
994 br x8
995 ENDPROC(__primary_switch)
arch/arm64/kernel/head.S
790 ENTRY(__enable_mmu)
791 mrs x2, ID_AA64MMFR0_EL1
792 ubfx x2, x2, #ID_AA64MMFR0_TGRAN_SHIFT, 4
793 cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED
794 b.ne __no_granule_support
795 update_early_cpu_boot_status 0, x2, x3
796 adrp x2, idmap_pg_dir
797 phys_to_ttbr x1, x1
798 phys_to_ttbr x2, x2
799 msr ttbr0_el1, x2 // load TTBR0
800 offset_ttbr1 x1, x3
801 msr ttbr1_el1, x1 // load TTBR1
802 isb
803 msr sctlr_el1, x0
804 isb
805 /*
806 * Invalidate the local I-cache so that any instructions fetched
807 * speculatively from the PoC are discarded, since they may have
808 * been dynamically patched at the PoU.
809 */
810 ic iallu
811 dsb nsh
812 isb
813 ret
814 ENDPROC(__enable_mmu)
前面提到__enable_mmu函数传递了两个参数。X0表示SCTLR_EL1的值,X1表示init页表的PGD页表基地址。
421 __primary_switched:
422 adrp x4, init_thread_union
423 add sp, x4, #THREAD_SIZE
424 adr_l x5, init_task
425 msr sp_el0, x5 // Save thread_info
426
427 adr_l x8, vectors // load VBAR_EL1 with virtual
428 msr vbar_el1, x8 // vector table address
429 isb
430
431 stp xzr, x30, [sp, #-16]!
432 mov x29, sp
433
434 #ifdef CONFIG_SHADOW_CALL_STACK
435 adr_l x18, init_shadow_call_stack // Set shadow call stack
436 #endif
437
438 str_l x21, __fdt_pointer, x5 // Save FDT pointer
439
440 ldr_l x4, kimage_vaddr // Save the offset between
441 sub x4, x4, x0 // the kernel virtual and
442 str_l x4, kimage_voffset, x5 // physical mappings
443
444 // Clear BSS
445 adr_l x0, __bss_start
446 mov x1, xzr
447 adr_l x2, __bss_stop
448 sub x2, x2, x0
449 bl __pi_memset
450 dsb ishst // Make zero page visible to PTW
451
452 #ifdef CONFIG_KASAN
453 bl kasan_early_init
454 #endif
455 #ifdef CONFIG_RANDOMIZE_BASE
456 tst x23, ~(MIN_KIMG_ALIGN - 1) // already running randomized?
457 b.ne 0f
458 mov x0, x21 // pass FDT address in x0
459 bl kaslr_early_init // parse FDT for KASLR options
460 cbz x0, 0f // KASLR disabled? just proceed
461 orr x23, x23, x0 // record KASLR offset
462 ldp x29, x30, [sp], #16 // we must enable KASLR, return
463 ret // to __primary_switch()
464 0:
465 #endif
466 add sp, sp, #16
467 mov x29, #0
468 mov x30, #0
469 b start_kernel
470 ENDPROC(__primary_switched)
__primary_switched函数传递一个参数__PHYS_OFFSET,它的值为KERNEL_START - TEXT_OFFSET。
include/linux/sched/task.h
40 extern union thread_union init_thread_union;
include/linux/sched.h
1664 union thread_union {
1665 #ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK
1666 struct task_struct task;
1667 #endif
1668 #ifndef CONFIG_THREAD_INFO_IN_TASK
1669 struct thread_info thread_info;
1670 #endif
1671 unsigned long stack[THREAD_SIZE/sizeof(long)];
1672 };
thread_union存储在内核映像的数据段里。
init/init_task.c
struct task_struct init_task
= {
.state = 0,
.stack = init_stack,
.usage = REFCOUNT_INIT(2),
.flags = PF_KTHREAD,
.prio = MAX_PRIO - 20,
.static_prio = MAX_PRIO - 20,
.normal_prio = MAX_PRIO - 20,
...
};
参考文献:《奔跑吧Linux内核》