In the previous article, we covered the functions called from the stext entry point: preserve_boot_args, el2_setup, set_cpu_boot_mode_flag, and __vet_fdt, where
- preserve_boot_args: per the arm64 boot protocol, saves x0-x3 (x0 holds the DTB physical address, x1-x3 must be zero) into the boot_args array
- el2_setup: resets the sctlr_elx register and determines which exception level the CPU is running at, ..... (to be filled in)
- set_cpu_boot_mode_flag: saves the exception level to the global variable __boot_cpu_mode, for use after the system is up
- __vet_fdt: checks that the FDT meets the arm64 requirements, see Documentation/arm64/booting.txt
Scenario for this article: ARM64, 48-bit VA, 4K page size. Under this configuration a virtual address is split into 9 (level 0) + 9 (level 1) + 9 (level 2) + 9 (level 3) + 12 (page offset) bits during translation.
This article mainly describes how the initial page tables are built during the ARM64 boot process, so that the Linux kernel code can keep executing correctly once the MMU is turned on.
For an ordinary program, the kernel only needs to know two things to run it: 1. the address and length of the code; 2. the address and length of the program's arguments. The Linux kernel is itself just a special program, so before turning on the MMU we must prepare the post-MMU mappings for the Linux image (its address and length) and for the parameters passed to the kernel (their address and length).
Therefore, during the init phase we need to map three address ranges (see the sketch after this list):
- identity mapping: maps physical addresses directly onto themselves, purely to absorb the impact of the CPU pipeline across the MMU on/off transition; see the ARM manual for details
- kernel image mapping: prepares the post-MMU address of the Linux image
- blob memory mapping: prepares the post-MMU address of the DTB parameters
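To make these mappings concrete, here is a minimal user-space C sketch of the address arithmetic behind the identity and linear mappings (not kernel code; the PHYS_OFFSET value below is a made-up RAM base for illustration):
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_OFFSET 0xffff000000000000ULL /* virtual start of kernel space, 48-bit VA */
#define PHYS_OFFSET 0x0000000080000000ULL /* hypothetical physical RAM base */

/* identity mapping: VA == PA, only needed around the moment the MMU is switched on */
static uint64_t idmap_va(uint64_t pa) { return pa; }

/* kernel linear mapping: VA = PA - PHYS_OFFSET + PAGE_OFFSET */
static uint64_t kernel_va(uint64_t pa) { return pa - PHYS_OFFSET + PAGE_OFFSET; }

int main(void)
{
    uint64_t kernel_pa = PHYS_OFFSET + 0x80000; /* TEXT_OFFSET is 0x80000 on arm64 */
    printf("identity VA: 0x%016" PRIx64 "\n", idmap_va(kernel_pa));
    printf("kernel   VA: 0x%016" PRIx64 "\n", kernel_va(kernel_pa));
    return 0;
}
The identity map is what lets the instructions immediately after "MMU on" still fetch correctly, since the PC at that moment still holds a physical address.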
The rest of this article uses ARM64 with a 48-bit VA as the example; the width is configured by #define VA_BITS (CONFIG_ARM64_VA_BITS).
Virtual address space layout
See linux-4.1.10/Documentation/arm64/memory.txt
Translation process
- The top bit of the virtual address determines whether it belongs to userspace or kernel space, and thereby selects TTBR0_EL1 or TTBR1_EL1. That register holds the PGD base address; every entry in the PGD is a descriptor, which may be a table descriptor, a block descriptor, or a page descriptor, distinguished by the flag bits in the entry
- If it is a block descriptor, translation ends there; otherwise the entry points to the next-level translation table (the PUD), and so on down the levels (see the sketch below)
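As a sanity check of the 9+9+9+9+12 split, a small C sketch that pulls each level's table index out of a virtual address (pure bit arithmetic, no real page tables; the example address is arbitrary):
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT   12  /* 4K pages */
#define PTRS_PER_PTE 512 /* 9 index bits per level */
/* level 0 uses VA bits [47:39], level 1 [38:30], level 2 [29:21], level 3 [20:12] */
#define LVL_SHIFT(l) (PAGE_SHIFT + 9 * (3 - (l)))

int main(void)
{
    uint64_t va = 0xffff000000080000ULL; /* hypothetical kernel-space address */
    /* the top bit selects the base register: 0 -> TTBR0_EL1, 1 -> TTBR1_EL1 */
    printf("TTBR%d_EL1\n", (int)(va >> 63));
    for (int level = 0; level <= 3; level++)
        printf("level %d index: %" PRIu64 "\n", level,
               (va >> LVL_SHIFT(level)) & (PTRS_PER_PTE - 1));
    printf("page offset: 0x%" PRIx64 "\n", va & ((1ULL << PAGE_SHIFT) - 1));
    return 0;
}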
section mapping
The macro ARM64_SWAPPER_USES_SECTION_MAPS
determines whether the swapper/idmap page tables use section maps. So what is section mapping?
Let's describe it with a concrete example. Assume a 48-bit VA and a 4K page size; then, during translation, an address is split into 9 (level 0) + 9 (level 1) + 9 (level 2) + 9 (level 3) + 12 (page offset) bits. For a big block memory region such as the kernel image, mapping with 4K pages is not worth the overhead. In that case, a level 2 translation table entry can be made to point at a 2M memory region instead of at a next-level translation table. Section mapping simply means mapping in 2M units.
Of course, section maps cannot be used in every configuration. For the kernel image, whose start address is 2M aligned, things only work out when the block size is 2M. With a 16K page size a block descriptor points at a 32M memory block, and with a 64K page size it points at a 512M block; therefore section maps can only be enabled with a 4K page size (the sketch below shows where those sizes come from).
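The per-granule block sizes quoted above follow from each translation table holding PAGE_SIZE/8 descriptors, i.e. each level resolving PAGE_SHIFT - 3 bits of the address; a quick sketch:
#include <stdio.h>

/* a level-2 block covers one full last-level table worth of pages:
 * BLOCK_SIZE = PAGE_SIZE << (PAGE_SHIFT - 3) */
static unsigned long block_size(unsigned page_shift)
{
    return (1UL << page_shift) << (page_shift - 3);
}

int main(void)
{
    printf("4K  granule: %lu MB\n", block_size(12) >> 20); /* 2 MB   */
    printf("16K granule: %lu MB\n", block_size(14) >> 20); /* 32 MB  */
    printf("64K granule: %lu MB\n", block_size(16) >> 20); /* 512 MB */
    return 0;
}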
Code analysis: __create_page_tables
Taking 48-bit (9+9+9+9+12) with a 4K page size as the example: in the section map case, the PGD (level 0), PUD (level 1) and PMD (level 2) translation tables each hold 512 entries, and each descriptor is 8 bytes, so each translation table is 4KB, exactly one page; three pages in total.
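A tiny sketch of that arithmetic (names mirror the kernel macros, but the values here are computed locally):
#include <stdio.h>

#define PAGE_SIZE              4096UL
#define SWAPPER_PGTABLE_LEVELS 3 /* PGD + PUD + PMD when section maps are used */
#define SWAPPER_DIR_SIZE       (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE)

int main(void)
{
    printf("one table: %lu bytes\n", 512UL * 8); /* 4096: exactly one page */
    printf("SWAPPER_DIR_SIZE: %lu bytes = %lu pages\n",
           SWAPPER_DIR_SIZE, SWAPPER_DIR_SIZE / PAGE_SIZE); /* 12288 = 3 pages */
    return 0;
}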
/*
* Setup the initial page tables. We only setup the barest amount which is
* required to get the kernel running. The following sections are required:
* - identity mapping to enable the MMU (low address, TTBR0)
* - first few MB of the kernel linear mapping to jump to once the MMU has
* been enabled, including the FDT blob (TTBR1)
* - pgd entry for fixed mappings (TTBR1)
* . = ALIGN(PAGE_SIZE);
* idmap_pg_dir = .;
* . += IDMAP_DIR_SIZE; // with section maps, IDMAP_DIR_SIZE = 3 * PAGE_SIZE
* swapper_pg_dir = .;
* . += SWAPPER_DIR_SIZE; // with section maps, SWAPPER_DIR_SIZE = 3 * PAGE_SIZE
*/
__create_page_tables:
adrp x25, idmap_pg_dir //physical address of idmap_pg_dir (the MMU is still off); adrp computes the 4K-aligned address of the symbol relative to the runtime PC
adrp x26, swapper_pg_dir //physical address of swapper_pg_dir
mov x27, lr //save lr
/*
* Invalidate the idmap and swapper page tables to avoid potential
* dirty cache lines being evicted.
* Per the boot protocol, by the time execution reaches here only the cache lines covering the kernel image are guaranteed clean to the PoC. The page-table area for idmap_pg_dir and swapper_pg_dir is not part of the kernel image, so its cache lines may well contain stale, invalid data, which must be cleaned out.
*/
mov x0, x25 //start address of the range to invalidate
add x1, x26, #SWAPPER_DIR_SIZE //end address of the range to invalidate
bl __inval_cache_range //invalidate the cache lines covering the idmap and swapper page-table area
/*
* Clear the idmap and swapper page tables.
* Zero the physical memory backing the idmap and swapper pages.
*/
mov x0, x25
add x6, x26, #SWAPPER_DIR_SIZE //x6 = swapper_pg_dir + SWAPPER_DIR_SIZE
1: stp xzr, xzr, [x0], #16 //*x0 = 0 (xzr is the zero register), x0 += 16
stp xzr, xzr, [x0], #16
stp xzr, xzr, [x0], #16
stp xzr, xzr, [x0], #16
cmp x0, x6
b.lo 1b
ldr x7, =MM_MMUFLAGS //keep MM_MMUFLAGS in x7, used to fill the PGD etc.
/*
* Create the identity mapping.
* The identity mapping covers the kernel image, from KERNEL_START to KERNEL_END.
*/
mov x0, x25 // idmap_pg_dir
adrp x3, KERNEL_START // __pa(KERNEL_START)
/* the code in between is covered in the appendix */
/* x0: idmap_pg_dir (pgd address)
* x3: KERNEL_START (the address we need to create descriptors for)
* x5: scratch
* x6: scratch
* Creates one descriptor in the PGD and, where a next-level translation table is needed, sets it up as well, guaranteeing that every intermediate level's translation table gets built (in fact, one descriptor is created in each level's table)
* Since this is the identity mapping, x3 holds a physical address that simultaneously serves as the virtual address
* create_pgd_entry, tbl, virt, tmp1, tmp2
*/
create_pgd_entry x0, x3, x5, x6
mov x5, x3 // __pa(KERNEL_START)
adr_l x6, KERNEL_END // __pa(KERNEL_END)
/*
* x0: last-level (PMD) table address
* x7: MM_MMUFLAGS
* x3: KERNEL_START
* x5: KERNEL_START
* x6: KERNEL_END
* Fills block descriptors into the translation table given by tbl to complete the address mapping
* x3 == x5 guarantees an identity mapping
* create_block_map, tbl, flags, phys, start, end
*/
create_block_map x0, x7, x3, x5, x6
/*
* Map the kernel image (starting with PHYS_OFFSET).
* With the identity mapping done, move on to the kernel image mapping.
*/
mov x0, x26 // swapper_pg_dir
mov x5, #PAGE_OFFSET
/*
* x0: swapper_pg_dir, the address space of the swapper process, i.e. the PGD base address of the kernel address space
* x5: PAGE_OFFSET, the first virtual address of kernel space; for 48-bit VA / 4K pages this is 0xffff000000000000
*
* Creates the PGD and PUD descriptors for PAGE_OFFSET (the virtual start of the kernel image); on return, x0 holds the next-level (PMD) table address
*/
create_pgd_entry x0, x5, x3, x6
ldr x6, =KERNEL_END // __va(KERNEL_END); note the difference from adr here: ldr =sym loads the link-time (virtual) address from a literal pool, while adr/adrp yield the runtime (currently physical) address
mov x3, x24 // phys offset
/*
* Create the PMD descriptors. x24 holds __PHYS_OFFSET, which is in fact the physical start address of the kernel image.
*/
create_block_map x0, x7, x3, x5, x6
/*
* Map the FDT blob (maximum 2MB; must be within 512MB of
* PHYS_OFFSET).
* Since the dtb image must not cross a 2MB boundary, a single PMD entry is enough.
* The within-512MB requirement guarantees the FDT falls under the same PUD entry;
* the within-2MB requirement guarantees only one more PMD entry needs to be mapped.
*/
mov x3, x21 // FDT phys address
and x3, x3, #~((1 << 21) - 1) // 2MB aligned
mov x6, #PAGE_OFFSET // x6 = the virtual start address of kernel space
sub x5, x3, x24 // subtract PHYS_OFFSET; on ARM64 the FDT physical address sits right after the kernel image, placed so as not to clash with the text_offset region or boot_args
tst x5, #~((1 << 29) - 1) // within 512MB?
csel x21, xzr, x21, ne // zero the FDT pointer if the check failed
b.ne 1f
add x5, x5, x6 // __va(FDT blob): fdt va = kernel va + (fdt pa - kernel pa)
add x6, x5, #1 << 21 // 2MB for the FDT blob
sub x6, x6, #1 // inclusive range
create_block_map x0, x7, x3, x5, x6
1:
/*
* Since the page tables have been populated with non-cacheable
* accesses (MMU disabled), invalidate the idmap and swapper page
* tables again to remove any speculatively loaded cache lines.
*/
mov x0, x25 // invalidate again the cache lines covering the page-table memory populated above
add x1, x26, #SWAPPER_DIR_SIZE
dmb sy
bl __inval_cache_range
mov lr, x27 //restore lr
ret
ENDPROC(__create_page_tables)
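The FDT portion of the code above can be restated in C. Here is a sketch of the same three steps, align down to 2MB, check that the offset from PHYS_OFFSET stays within 512MB, then form the virtual address, using made-up addresses:
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_OFFSET 0xffff000000000000ULL
#define SZ_2M       (1ULL << 21)
#define SZ_512M     (1ULL << 29)

/* returns the mapped VA of the FDT, or 0 if it violates the boot protocol
 * (mirrors the csel that zeroes x21 in the asm) */
static uint64_t map_fdt(uint64_t fdt_pa, uint64_t phys_offset)
{
    uint64_t base = fdt_pa & ~(SZ_2M - 1);     /* 2MB-align down */
    if ((base - phys_offset) & ~(SZ_512M - 1)) /* within 512MB of PHYS_OFFSET? */
        return 0;
    return PAGE_OFFSET + (base - phys_offset) + (fdt_pa & (SZ_2M - 1));
}

int main(void)
{
    uint64_t phys_offset = 0x80000000ULL;           /* hypothetical RAM base */
    uint64_t fdt_pa      = phys_offset + 0x8000000; /* 128MB in: passes the check */
    printf("fdt va: 0x%016" PRIx64 "\n", map_fdt(fdt_pa, phys_offset));
    return 0;
}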
/*
* Macro to create a table entry to the next page.
*
* Preserves: virt
* Corrupts: tmp1, tmp2
* Returns: tbl -> next level table page address
*/
.macro create_table_entry, tbl, virt, shift, ptrs, tmp1, tmp2
lsr \tmp1, \virt, #\shift //tmp1 = virt >> shift: isolate virt's index bits for this table
and \tmp1, \tmp1, #\ptrs - 1 // table index: tmp1 &= ptrs - 1, tmp1 is now virt's index into tbl
add \tmp2, \tbl, #PAGE_SIZE //tmp2 = tbl + PAGE_SIZE; each translation table is 4K, so tmp2 is the address of the next-level table
orr \tmp2, \tmp2, #PMD_TYPE_TABLE // address of next table and entry type; PMD_TYPE_TABLE = (_AT(pmdval_t, 3) << 0) marks the descriptor valid and of table type
str \tmp2, [\tbl, \tmp1, lsl #3] //write tmp2 to tbl + (tmp1 << 3)
add \tbl, \tbl, #PAGE_SIZE // next level table page; tbl now points at the PUD (next-level) table, ready to build the level below
.endm
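For reference, a C rendering of what create_table_entry does (a sketch, using a uint64_t pointer in place of the raw table register; PMD_TYPE_TABLE = 3 as in the kernel source):
#include <stdint.h>

#define PAGE_SIZE      4096UL
#define PMD_TYPE_TABLE 3UL /* valid bit | table-descriptor type */

/* write one table descriptor at tbl[index(virt)] pointing at the next page
 * (the tables are allocated contiguously), then return that next page,
 * mirroring "add \tbl, \tbl, #PAGE_SIZE" */
uint64_t *create_table_entry(uint64_t *tbl, uint64_t virt,
                             unsigned shift, unsigned ptrs)
{
    uint64_t index = (virt >> shift) & (ptrs - 1);
    uint64_t next  = (uint64_t)(uintptr_t)tbl + PAGE_SIZE;
    tbl[index] = next | PMD_TYPE_TABLE; /* descriptor: next table address + type bits */
    return (uint64_t *)(uintptr_t)next;
}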
/*
* Macro to populate the PGD (and possibly PUD) for the corresponding
* block entry in the next level (tbl) for the given virtual address.
*
* Preserves: tbl, next, virt
* Corrupts: tmp1, tmp2
*/
.macro create_pgd_entry, tbl, virt, tmp1, tmp2
create_table_entry \tbl, \virt, PGDIR_SHIFT, PTRS_PER_PGD, \tmp1, \tmp2
#if SWAPPER_PGTABLE_LEVELS == 3
create_table_entry \tbl, \virt, TABLE_SHIFT, PTRS_PER_PTE, \tmp1, \tmp2
#endif
.endm
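create_pgd_entry is then just one or two such calls, depending on SWAPPER_PGTABLE_LEVELS; a sketch continuing the one above (shift values are for 48-bit VA / 4K pages):
#include <stdint.h>

#define PGDIR_SHIFT  39  /* level 0 (PGD) index bits */
#define TABLE_SHIFT  30  /* level 1 (PUD) index bits */
#define PTRS_PER_PGD 512
#define PTRS_PER_PTE 512
#define SWAPPER_PGTABLE_LEVELS 3 /* section maps: PGD + PUD + PMD */

uint64_t *create_table_entry(uint64_t *tbl, uint64_t virt,
                             unsigned shift, unsigned ptrs); /* from the sketch above */

static uint64_t *create_pgd_entry(uint64_t *tbl, uint64_t virt)
{
    tbl = create_table_entry(tbl, virt, PGDIR_SHIFT, PTRS_PER_PGD); /* PGD entry */
#if SWAPPER_PGTABLE_LEVELS == 3
    tbl = create_table_entry(tbl, virt, TABLE_SHIFT, PTRS_PER_PTE); /* PUD entry */
#endif
    return tbl; /* now the PMD table, ready for create_block_map */
}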
/*
* Macro to populate block entries in the page table for the start..end
* virtual range (inclusive).
*
* Preserves: tbl, flags
* Corrupts: phys, start, end, pstate
*/
.macro create_block_map, tbl, flags, phys, start, end
lsr \phys, \phys, #BLOCK_SHIFT //phys >>= BLOCK_SHIFT (21 for 2MB blocks)
lsr \start, \start, #BLOCK_SHIFT //start >>= BLOCK_SHIFT
and \start, \start, #PTRS_PER_PTE - 1 // table index: start &= 0x1ff
orr \phys, \flags, \phys, lsl #BLOCK_SHIFT // table entry: phys = flags | (phys << BLOCK_SHIFT)
lsr \end, \end, #BLOCK_SHIFT // end >>= BLOCK_SHIFT
and \end, \end, #PTRS_PER_PTE - 1 // table end index: end &= 0x1ff
9999: str \phys, [\tbl, \start, lsl #3] // store the entry: *(tbl + (start << 3)) = phys
add \start, \start, #1 // next entry,
add \phys, \phys, #BLOCK_SIZE // next block
cmp \start, \end
b.ls 9999b
.endm
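And a C rendering of create_block_map (a sketch; like the asm loop, it writes the end index inclusively and stores at least one entry):
#include <stdint.h>

#define BLOCK_SHIFT  21 /* 2MB sections */
#define BLOCK_SIZE   (1UL << BLOCK_SHIFT)
#define PTRS_PER_PTE 512

void create_block_map(uint64_t *tbl, uint64_t flags, uint64_t phys,
                      uint64_t start, uint64_t end)
{
    uint64_t entry = flags | ((phys >> BLOCK_SHIFT) << BLOCK_SHIFT); /* aligned PA | flags */
    uint64_t s = (start >> BLOCK_SHIFT) & (PTRS_PER_PTE - 1); /* first table index */
    uint64_t e = (end   >> BLOCK_SHIFT) & (PTRS_PER_PTE - 1); /* last table index */
    do {
        tbl[s] = entry;       /* store one block descriptor */
        entry += BLOCK_SIZE;  /* next 2MB block */
    } while (s++ < e);
}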
References
http://www.wowotech.net/armv8a_arch/create_page_tables.html
Appendix
#ifndef CONFIG_ARM64_VA_BITS_48
#define EXTRA_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3)
#define EXTRA_PTRS (1 << (48 - EXTRA_SHIFT))
/*
* If VA_BITS < 48, it may be too small to allow for an ID mapping to be
* created that covers system RAM if that is located sufficiently high
* in the physical address space. So for the ID map, use an extended
* virtual range in that case, by configuring an additional translation
* level.
* First, we have to verify our assumption that the current value of
* VA_BITS was chosen such that all translation levels are fully
* utilised, and that lowering T0SZ will always result in an additional
* translation level to be configured.
*/
#if VA_BITS != EXTRA_SHIFT
#error "Mismatch between VA_BITS and page size/number of translation levels"
#endif
/*
* Calculate the maximum allowed value for TCR_EL1.T0SZ so that the
* entire kernel image can be ID mapped. As T0SZ == (64 - #bits used),
* this number conveniently equals the number of leading zeroes in
* the physical address of KERNEL_END.
* clz Rd, Rm: count leading zeros
* Check whether the kernel image can be ID mapped: if the KERNEL_END address lies within 0..2^VA_BITS it can be mapped directly;
* otherwise EXTRA_SHIFT must be used.
*/
adrp x5, KERNEL_END
clz x5, x5
cmp x5, TCR_T0SZ(VA_BITS) // default T0SZ small enough?
b.ge 1f // .. then skip additional level
adr_l x6, idmap_t0sz
str x5, [x6]
dmb sy
dc ivac, x6 // Invalidate potentially stale cache line
create_table_entry x0, x3, EXTRA_SHIFT, EXTRA_PTRS, x5, x6
1:
#endif
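The clz test at the top of this fragment can be sketched in C: if the leading-zero count of __pa(KERNEL_END) is at least TCR_T0SZ(VA_BITS), the default T0SZ already covers the whole image and the extra level is skipped (the kernel_end value below is illustrative):
#include <stdint.h>
#include <stdio.h>

#define VA_BITS     48
#define TCR_T0SZ(x) (64 - (x))

int main(void)
{
    uint64_t kernel_end_pa = 0x81000000ULL;    /* hypothetical __pa(KERNEL_END) */
    int t0sz = __builtin_clzll(kernel_end_pa); /* leading zeros == usable T0SZ */
    if (t0sz >= TCR_T0SZ(VA_BITS))
        printf("default T0SZ (%d) is enough, skip the extra level\n", TCR_T0SZ(VA_BITS));
    else
        printf("idmap_t0sz = %d, configure an extra translation level\n", t0sz);
    return 0;
}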