ARM64 Memory Management Part 2: Creating the Boot-Stage Page Tables

The previous article covered the preserve_boot_args, el2_setup, set_cpu_boot_mode_flag and __vet_fdt routines invoked from stext:

  • preserve_boot_args: per the arm64 boot protocol, x0 holds the DTB physical address and x1-x3 hold zero; the routine saves these registers into the boot_args area

  • el2_setup: resets the sctlr_elx register and determines which exception level the CPU is running at, ... (to be completed)

  • set_cpu_boot_mode_flag: saves the exception level into the global variable __boot_cpu_mode for use after the system has booted

  • __vet_fdt: checks that the FDT meets the arm64 requirements; see Documentation/arm64/booting.txt

Assumptions for this article: ARM64, 48-bit VA, 4K page size. During address translation, a virtual address is therefore split into 9 (level 0) + 9 (level 1) + 9 (level 2) + 9 (level 3) + 12 (page offset) bits.

The article describes how the initial page tables are built during the ARM64 boot process, so that the Linux kernel code can keep executing correctly once the MMU is turned on.

In general, for the kernel to run one of our programs we only have to tell it two things: 1. the address and length of the code; 2. the address and length of the program's arguments. The Linux kernel is itself just a special program, so before turning on the MMU we have to prepare, for use after the MMU is on, the address and length of the Linux image and the address and length of the parameters passed to the kernel.

Therefore, three address ranges need to be mapped during the initialization stage:

  • identity mapping: maps physical addresses directly onto themselves, purely to absorb the impact the CPU pipeline sees across the MMU-off/MMU-on transition; see the ARM manual for the details

  • kernel image mapping: sets up the addresses the Linux image will use once the MMU is on

  • blob memory mapping: sets up the address the DTB parameters will use once the MMU is on

Everything below uses ARM64 with 48-bit VAs as the example; the width is configured by #define VA_BITS (CONFIG_ARM64_VA_BITS).

Virtual address space layout

See linux-4.1.10/Documentation/arm64/memory.txt

The translation process

  1. The most significant bits of the virtual address determine whether it belongs to user space or kernel space, selecting TTBR0_EL1 or TTBR1_EL1 respectively. The selected register holds the PGD base address; every entry in the PGD is a descriptor (a table descriptor, block descriptor or page descriptor), identified by flag bits in the entry.

  2. If the entry is a block descriptor, translation ends there; otherwise it points to the next level's translation table (the PUD), and the same procedure repeats level by level (see the index-extraction sketch below).
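
To make the split concrete, here is a minimal C sketch (plain C, not kernel code; the names are made up for illustration) that extracts the four table indices and the page offset from a 48-bit VA with 4K pages:

/*
 * How a 48-bit VA with 4K pages splits into 9 + 9 + 9 + 9 + 12 bits.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT  12
#define LEVEL_BITS  9          /* 512 eight-byte entries per 4K table */

int main(void)
{
    uint64_t va = 0xffff800000080000ULL;   /* an arbitrary kernel VA */

    unsigned pgd = (va >> (PAGE_SHIFT + 3 * LEVEL_BITS)) & 0x1ff; /* level 0 */
    unsigned pud = (va >> (PAGE_SHIFT + 2 * LEVEL_BITS)) & 0x1ff; /* level 1 */
    unsigned pmd = (va >> (PAGE_SHIFT + 1 * LEVEL_BITS)) & 0x1ff; /* level 2 */
    unsigned pte = (va >> PAGE_SHIFT) & 0x1ff;                    /* level 3 */
    unsigned off = va & ((1u << PAGE_SHIFT) - 1);                 /* page offset */

    printf("pgd=%u pud=%u pmd=%u pte=%u offset=0x%x\n",
           pgd, pud, pmd, pte, off);
    return 0;
}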

section mapping

The ARM64_SWAPPER_USES_SECTION_MAPS macro determines whether the swapper/idmap page tables use section maps. So what is a section mapping?

Let's use a concrete example. With 48-bit VAs and a 4K page size, an address is split during translation into 9 (level 0) + 9 (level 1) + 9 (level 2) + 9 (level 3) + 12 (page offset) bits. For a big block memory region such as the kernel image, mapping with 4K pages is more trouble than it is worth; instead, the level 2 translation table entry can point directly to a 2M memory region rather than to a next-level translation table. A section map is precisely this: mapping in units of 2M.

Of course, a section map cannot be used in every case. The kernel image's start address is 2M aligned, so things only work out when the block size is 2M. With a 16K PAGE_SIZE a block descriptor points to a 32M memory block, and with a 64K PAGE_SIZE to a 512M block; therefore a section map can only be enabled with a 4K page size.
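
A quick check of those numbers in plain C: one level above the leaf, each block descriptor covers page_size * (page_size / 8) bytes, i.e. 1 << (2 * PAGE_SHIFT - 3):

#include <stdio.h>

int main(void)
{
    int shifts[] = { 12, 14, 16 };                 /* 4K, 16K, 64K pages */
    for (int i = 0; i < 3; i++) {
        unsigned long long block = 1ULL << (2 * shifts[i] - 3);
        printf("page size %3lluK -> block size %3lluM\n",
               (1ULL << shifts[i]) >> 10, block >> 20);  /* 2M, 32M, 512M */
    }
    return 0;
}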

Code analysis: __create_page_tables

Taking 48-bit VAs (9+9+9+9+12) and a 4K page size as the example: with a section map, the PGD (level 0), PUD (level 1) and PMD (level 2) translation tables each hold 512 entries, and each descriptor is 8 bytes, so each translation table is 4KB, exactly one page; three pages in total.
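
The arithmetic is easy to sanity-check in plain C (the macro names below mirror the kernel's for a 48-bit/4K section-map configuration, but the values are simply recomputed here):

#include <stdio.h>

#define PAGE_SIZE               4096
#define PTRS_PER_TABLE          (PAGE_SIZE / 8)   /* 512 descriptors of 8 bytes */
#define SWAPPER_PGTABLE_LEVELS  3                 /* PGD + PUD + PMD */
#define SWAPPER_DIR_SIZE        (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE)

int main(void)
{
    printf("entries per table: %d\n", PTRS_PER_TABLE);        /* 512 */
    printf("one table: %d bytes\n", PTRS_PER_TABLE * 8);      /* 4096 */
    printf("SWAPPER_DIR_SIZE: %d bytes = %d pages\n",
           SWAPPER_DIR_SIZE, SWAPPER_DIR_SIZE / PAGE_SIZE);   /* 3 pages */
    return 0;
}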


/*
 * Setup the initial page tables. We only setup the barest amount which is
 * required to get the kernel running. The following sections are required:
 *   - identity mapping to enable the MMU (low address, TTBR0)
 *   - first few MB of the kernel linear mapping to jump to once the MMU has
 *     been enabled, including the FDT blob (TTBR1)
 *   - pgd entry for fixed mappings (TTBR1)
 * . = ALIGN(PAGE_SIZE); 
 * idmap_pg_dir = .; 
 * . += IDMAP_DIR_SIZE; // with section maps, IDMAP_DIR_SIZE = 3 pages
 * swapper_pg_dir = .; 
 * . += SWAPPER_DIR_SIZE;   // with section maps, SWAPPER_DIR_SIZE = 3 pages
 */
__create_page_tables:
	adrp	x25, idmap_pg_dir    // physical address of idmap_pg_dir (the MMU is still off); adrp computes the offset from the runtime PC to the symbol, 4K aligned
	adrp	x26, swapper_pg_dir  // physical address of swapper_pg_dir
	mov	x27, lr                  // save lr

	/*
	 * Invalidate the idmap and swapper page tables to avoid potential
	 * dirty cache lines being evicted.
	 * Per the boot protocol, by the time execution reaches this point only
	 * the cache lines covering the kernel image are required to be clean to
	 * the PoC. The page-table area at idmap_pg_dir and swapper_pg_dir is not
	 * part of the kernel image, so its cache lines may well contain stale,
	 * invalid data that must be cleaned out.
	 */
	mov	x0, x25                      // start address of the range to invalidate
	add	x1, x26, #SWAPPER_DIR_SIZE   // end address of the range to invalidate
	bl	__inval_cache_range            // invalidate the cache lines covering the idmap and swapper page tables

	/*
	 * Clear the idmap and swapper page tables.
	 * That is, zero out the physical pages backing the idmap and swapper tables.
	 */
	mov	x0, x25
	add	x6, x26, #SWAPPER_DIR_SIZE   // x6 = swapper_pg_dir + SWAPPER_DIR_SIZE
1:	stp	xzr, xzr, [x0], #16          // store two zeros (xzr is the zero register); x0 += 16
	stp	xzr, xzr, [x0], #16
	stp	xzr, xzr, [x0], #16
	stp	xzr, xzr, [x0], #16
	cmp	x0, x6
	b.lo	1b

	ldr	x7, =MM_MMUFLAGS           // keep MM_MMUFLAGS in x7; it is ORed into every block descriptor created below

	/*
	 * Create the identity mapping.
	 * The identity mapping covers the kernel image, from KERNEL_START to KERNEL_END.
	 */
	mov	x0, x25				// idmap_pg_dir
	adrp	x3, KERNEL_START		// __pa(KERNEL_START)
	/*  the code elided here is shown in the appendix  */
	/* x0: idmap_pg_dir (the PGD base address)
	 * x3: KERNEL_START (the address to create descriptors for)
	 * x5: scratch
	 * x6: scratch
	 * Creates one descriptor in the PGD and, where a next-level translation
	 * table is needed, sets that table up as well, so every intermediate
	 * level's translation table gets built (in fact, one descriptor per level).
	 * Since this is an identity mapping, x3 is a physical address that also
	 * serves as the virtual address.
	 * create_pgd_entry, tbl, virt, tmp1, tmp2
	*/
	create_pgd_entry x0, x3, x5, x6
	mov	x5, x3				// __pa(KERNEL_START)
	adr_l	x6, KERNEL_END			// __pa(KERNEL_END)
	/* 
	 * x0: the last-level table returned by create_pgd_entry (the PMD when
	 *     section maps are used)
	 * x7: MM_MMUFLAGS
	 * x3: KERNEL_START (physical address to map)
	 * x5: KERNEL_START (start of the virtual range)
	 * x6: KERNEL_END (end of the virtual range)
	 * Fills block descriptors into the translation table given by tbl to
	 * complete the address mapping; x3 == x5 is what makes this an identity
	 * mapping.
	 * create_block_map, tbl, flags, phys, start, end
	*/
	create_block_map x0, x7, x3, x5, x6

	/*
	 * Map the kernel image (starting with PHYS_OFFSET).
	 * With the identity mapping done, now map the kernel image.
	 */
	mov	x0, x26				// swapper_pg_dir
	mov	x5, #PAGE_OFFSET
	/* 
	 * x0: swapper_pg_dir, the PGD base address of the kernel address space
	 *     (the swapper process's address space)
	 * x5: PAGE_OFFSET, the first virtual address of the kernel linear map;
	 *     for 48-bit VAs with 4K pages this is 0xffff8000_00000000
	 * 
	 * Creates the PGD and PUD descriptors for PAGE_OFFSET (the virtual start
	 * of the kernel image); on return, x0 holds the PMD address.
	*/
	create_pgd_entry x0, x5, x3, x6
	ldr	x6, =KERNEL_END			// __va(KERNEL_END); note ldr with =symbol yields the link-time virtual address, unlike adrp/adr, which yield the runtime physical address
	mov	x3, x24				// phys offset
	/*
	 * Create the block descriptors in the PMD. x24 holds __PHYS_OFFSET, which is in fact the kernel image's physical start address.
	 */
	create_block_map x0, x7, x3, x5, x6

	/*
	 * Map the FDT blob (maximum 2MB; must be within 512MB of
	 * PHYS_OFFSET).
	 * Since the DTB image must not cross a 2MB boundary, a single PMD entry
	 * is enough. Keeping it within 512MB of PHYS_OFFSET guarantees it falls
	 * under the same PUD entry as the kernel; keeping it within 2MB
	 * guarantees only one extra PMD entry needs to be mapped.
	 */
	mov	x3, x21				// FDT phys address
	and	x3, x3, #~((1 << 21) - 1)	// 2MB aligned
	mov	x6, #PAGE_OFFSET            // x6 = virtual start address of kernel space
	sub	x5, x3, x24			// subtract PHYS_OFFSET; on ARM64 the FDT's physical address sits right after the kernel image, kept clear of the text_offset area and the boot args
	tst	x5, #~((1 << 29) - 1)		// within 512MB?
	csel	x21, xzr, x21, ne		// zero the FDT pointer if the check failed
	b.ne	1f
	add	x5, x5, x6			// __va(FDT blob): fdt va = kernel va + (fdt pa - kernel pa)
	add	x6, x5, #1 << 21		// 2MB for the FDT blob
	sub	x6, x6, #1			// inclusive range
	create_block_map x0, x7, x3, x5, x6
1:
	/*
	 * Since the page tables have been populated with non-cacheable
	 * accesses (MMU disabled), invalidate the idmap and swapper page
	 * tables again to remove any speculatively loaded cache lines.
	 */
	mov	x0, x25             // invalidate once more the cache lines covering the page tables populated above
	add	x1, x26, #SWAPPER_DIR_SIZE
	dmb	sy
	bl	__inval_cache_range

	mov	lr, x27             // restore lr
	ret
ENDPROC(__create_page_tables)
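
The address arithmetic in the FDT mapping near the end of the listing is compact, so here it is as a standalone C sketch; fdt_phys and phys_offset stand in for the values held in x21 and x24, and both numbers below are hypothetical:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t phys_offset = 0x40000000ULL;           /* hypothetical PHYS_OFFSET */
    uint64_t fdt_phys    = 0x48080000ULL;           /* hypothetical FDT address */
    uint64_t page_offset = 0xffff800000000000ULL;   /* PAGE_OFFSET, 48-bit VA */

    uint64_t fdt_aligned = fdt_phys & ~((1ULL << 21) - 1);  /* 2MB aligned */
    uint64_t delta       = fdt_aligned - phys_offset;       /* sub PHYS_OFFSET */

    if (delta & ~((1ULL << 29) - 1)) {              /* outside 512MB? */
        puts("FDT pointer zeroed, blob not mapped");
    } else {
        uint64_t fdt_va = page_offset + delta;      /* __va(FDT blob) */
        printf("map one 2MB block: VA 0x%llx..0x%llx -> PA 0x%llx\n",
               (unsigned long long)fdt_va,
               (unsigned long long)(fdt_va + (1ULL << 21) - 1),
               (unsigned long long)fdt_aligned);
    }
    return 0;
}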


/*
 * Macro to create a table entry to the next page.
 *
 * Preserves:	virt
 * Corrupts:	tmp1, tmp2
 * Returns:	tbl -> next level table page address
 */
	.macro	create_table_entry, tbl, virt, shift, ptrs, tmp1, tmp2
	lsr	\tmp1, \virt, #\shift        // tmp1 = virt >> shift: locate virt's slot for this level
	and	\tmp1, \tmp1, #\ptrs - 1	// table index: tmp1 &= ptrs - 1; tmp1 is now virt's index within tbl
	add	\tmp2, \tbl, #PAGE_SIZE      // tmp2 = tbl + PAGE_SIZE; each translation table occupies one 4K page, so tmp2 is the next-level table's address
	orr	\tmp2, \tmp2, #PMD_TYPE_TABLE	// address of next table and entry type; PMD_TYPE_TABLE = (_AT(pmdval_t, 3) << 0) marks the descriptor as a valid table descriptor
	str	\tmp2, [\tbl, \tmp1, lsl #3]    // store tmp2 at tbl + (tmp1 << 3)
	add	\tbl, \tbl, #PAGE_SIZE		// advance tbl to the next-level table page (e.g. the PUD), ready to build the next level
	.endm
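
In C, the macro does the following (a sketch, not kernel code; host pointers stand in for the physical addresses the real descriptors hold, and the static array below is not page-aligned the way idmap_pg_dir is). As in head.S, the next-level table is assumed to sit exactly one page after the current one:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE       4096
#define PMD_TYPE_TABLE  3ULL     /* bits[1:0] = 0b11: valid table descriptor */

/* Returns the next-level table, exactly as the macro leaves it in \tbl. */
static uint64_t *create_table_entry(uint64_t *tbl, uint64_t virt,
                                    unsigned shift, unsigned ptrs)
{
    uint64_t index = (virt >> shift) & (ptrs - 1);          /* table index */
    uint64_t *next = (uint64_t *)((char *)tbl + PAGE_SIZE); /* tbl + PAGE_SIZE */

    tbl[index] = (uint64_t)next | PMD_TYPE_TABLE;   /* write table descriptor */
    return next;
}

int main(void)
{
    static uint64_t tables[3 * PAGE_SIZE / 8];      /* PGD + PUD + PMD */
    uint64_t *pgd = tables;
    uint64_t virt = 0xffff800000000000ULL;          /* PAGE_OFFSET */

    /* create_pgd_entry == one call at PGDIR_SHIFT, one at TABLE_SHIFT */
    uint64_t *pud = create_table_entry(pgd, virt, 39, 512); /* PGDIR_SHIFT */
    uint64_t *pmd = create_table_entry(pud, virt, 30, 512); /* TABLE_SHIFT */

    printf("pgd[%u] = 0x%llx, pmd table at %p\n",
           (unsigned)((virt >> 39) & 511),
           (unsigned long long)pgd[(virt >> 39) & 511], (void *)pmd);
    return 0;
}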
/*
 * Macro to populate the PGD (and possibly PUD) for the corresponding
 * block entry in the next level (tbl) for the given virtual address.
 *
 * Preserves:	tbl, next, virt
 * Corrupts:	tmp1, tmp2
 */
	.macro	create_pgd_entry, tbl, virt, tmp1, tmp2
	create_table_entry \tbl, \virt, PGDIR_SHIFT, PTRS_PER_PGD, \tmp1, \tmp2
#if SWAPPER_PGTABLE_LEVELS == 3
	create_table_entry \tbl, \virt, TABLE_SHIFT, PTRS_PER_PTE, \tmp1, \tmp2
#endif
	.endm
	


/*
 * Macro to populate block entries in the page table for the start..end
 * virtual range (inclusive).
 * 
 * Preserves:	tbl, flags
 * Corrupts:	phys, start, end, pstate
 */
	.macro	create_block_map, tbl, flags, phys, start, end
	lsr	\phys, \phys, #BLOCK_SHIFT          // phys >>= BLOCK_SHIFT (21 for 2MB sections)
	lsr	\start, \start, #BLOCK_SHIFT        // start >>= BLOCK_SHIFT
	and	\start, \start, #PTRS_PER_PTE - 1	// table index: start &= 0x1ff
	orr	\phys, \flags, \phys, lsl #BLOCK_SHIFT	// table entry: phys = flags | (phys << BLOCK_SHIFT), i.e. the 2MB-aligned output address plus the attribute flags
	lsr	\end, \end, #BLOCK_SHIFT                // end >>= BLOCK_SHIFT
	and	\end, \end, #PTRS_PER_PTE - 1		// table end index: end &= 0x1ff
9999:	str	\phys, [\tbl, \start, lsl #3]		// store the entry: *(tbl + (start << 3)) = phys
	add	\start, \start, #1			// next entry
	add	\phys, \phys, #BLOCK_SIZE		// next block
	cmp	\start, \end
	b.ls	9999b
	.endm
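
create_block_map translates to C just as directly (again a sketch; the 0x711 flags value below is only a stand-in for MM_MMUFLAGS, and all addresses are hypothetical). It fills 2MB block descriptors for the inclusive start..end range, like the b.ls loop above:

#include <stdint.h>
#include <stdio.h>

#define BLOCK_SHIFT   21                    /* 2MB sections */
#define BLOCK_SIZE    (1ULL << BLOCK_SHIFT)
#define PTRS_PER_PTE  512

static void create_block_map(uint64_t *tbl, uint64_t flags,
                             uint64_t phys, uint64_t start, uint64_t end)
{
    /* 2MB-aligned output address plus attribute flags */
    uint64_t entry = flags | ((phys >> BLOCK_SHIFT) << BLOCK_SHIFT);
    uint64_t s = (start >> BLOCK_SHIFT) & (PTRS_PER_PTE - 1); /* start index */
    uint64_t e = (end   >> BLOCK_SHIFT) & (PTRS_PER_PTE - 1); /* end index   */

    for (; s <= e; s++, entry += BLOCK_SIZE)    /* inclusive, like b.ls */
        tbl[s] = entry;
}

int main(void)
{
    static uint64_t pmd[PTRS_PER_PTE];
    /* hypothetical values: map 4MB of kernel at PA 0x40080000 */
    create_block_map(pmd, 0x711ULL, 0x40080000ULL,
                     0xffff800000080000ULL, 0xffff8000003fffffULL);
    printf("pmd[0] = 0x%llx, pmd[1] = 0x%llx\n",
           (unsigned long long)pmd[0], (unsigned long long)pmd[1]);
    return 0;
}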

References

http://www.wowotech.net/armv8a_arch/create_page_tables.html

Appendix

#ifndef CONFIG_ARM64_VA_BITS_48
#define EXTRA_SHIFT	(PGDIR_SHIFT + PAGE_SHIFT - 3)
#define EXTRA_PTRS	(1 << (48 - EXTRA_SHIFT))

	/*
	 * If VA_BITS < 48, it may be too small to allow for an ID mapping to be
	 * created that covers system RAM if that is located sufficiently high
	 * in the physical address space. So for the ID map, use an extended
	 * virtual range in that case, by configuring an additional translation
	 * level.
	 * First, we have to verify our assumption that the current value of
	 * VA_BITS was chosen such that all translation levels are fully
	 * utilised, and that lowering T0SZ will always result in an additional
	 * translation level to be configured.
	 */
#if VA_BITS != EXTRA_SHIFT
#error "Mismatch between VA_BITS and page size/number of translation levels"
#endif

	/*
	 * Calculate the maximum allowed value for TCR_EL1.T0SZ so that the
	 * entire kernel image can be ID mapped. As T0SZ == (64 - #bits used),
	 * this number conveniently equals the number of leading zeroes in
	 * the physical address of KERNEL_END.
	 * clz Rd, Rm: count leading zeros.
	 * This checks whether the kernel image can be ID-mapped as-is: if the
	 * KERNEL_END address lies within 0..2^VA_BITS it can be mapped directly;
	 * otherwise the extra translation level (EXTRA_SHIFT) must be used.
	 */
	adrp	x5, KERNEL_END
	clz	x5, x5
	cmp	x5, TCR_T0SZ(VA_BITS)	// default T0SZ small enough?
	b.ge	1f			// .. then skip additional level

	adr_l	x6, idmap_t0sz
	str	x5, [x6]
	dmb	sy
	dc	ivac, x6		// Invalidate potentially stale cache line

	create_table_entry x0, x3, EXTRA_SHIFT, EXTRA_PTRS, x5, x6
1:
#endif
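
To see the appendix logic with numbers, here is a C sketch for VA_BITS = 39 (4K pages, three levels); clz64() is a stand-in for the clz instruction, and the KERNEL_END value is hypothetical:

#include <stdint.h>
#include <stdio.h>

#define VA_BITS      39
#define PAGE_SHIFT   12
#define PGDIR_SHIFT  30                               /* 9+9+12 below the PGD */
#define EXTRA_SHIFT  (PGDIR_SHIFT + PAGE_SHIFT - 3)   /* == 39 == VA_BITS */
#define EXTRA_PTRS   (1ULL << (48 - EXTRA_SHIFT))     /* 512 entries */

static int clz64(uint64_t x)            /* stand-in for the clz instruction */
{
    int n = 0;
    for (uint64_t bit = 1ULL << 63; bit && !(x & bit); bit >>= 1)
        n++;
    return n;
}

int main(void)
{
    uint64_t kernel_end = 0x8040000000ULL;  /* hypothetical high RAM PA */
    int t0sz_default = 64 - VA_BITS;        /* TCR_T0SZ(VA_BITS) == 25 */
    int lz = clz64(kernel_end);

    if (lz >= t0sz_default)                 /* same test as cmp + b.ge */
        puts("default T0SZ is enough, no extra level");
    else
        printf("extra level needed: idmap_t0sz = %d, EXTRA_PTRS = %llu\n",
               lz, (unsigned long long)EXTRA_PTRS);
    return 0;
}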