ARM64内存管理:mmap

前言

往篇回顾

本篇主要内容

mmap

  • mmap系统调用并不完全是为了共享内存而设计的,它本身提供了不同于一般对普通文件的访问方式,进程可以像读写内存一样对普通文件操作

  • mmap并不分配空间,只是将文件映射到调用进程的地址空间里(占用虚拟地址空间), 然后就可以使用

映射关系分类

  • 文件映射:将磁盘文件映射到进程的虚拟地址空间,使用文件内容初始化物理内存

  • 匿名映射:初始化全为0的内存空间

  • 私有映射:多进程间数据不共享,修改不反映到磁盘实际文件,是一个copy-on-write(写时复制)的映射方式

  • 共享映射:多进程间数据共享,修改反映到磁盘实际文件中

基本流程

  • 调用mmap系统调用,传入相关参数

  • 内核在用户空间mmap区域分配一个空闲的 vm_area_struct 对象

  • 修改页目录表项把对象的地址和设备的内存对应起来

代码分析

sys_mmap_pgoff

/*
 * mmap_pgoff system call entry point.
 *
 * addr:  if non-NULL, the address at which the caller would like the mapping;
 *        otherwise a suitable virtual address is chosen.
 * len:   size of the region to map into the process address space.
 * prot:  read/write/execute attributes requested for the region.
 * flags: mapping attributes (shared, private, anonymous, file-backed, ...).
 * fd:    descriptor of the file to map; only looked up when MAP_ANONYMOUS
 *        is not set (anonymous callers conventionally pass -1).
 * pgoff: offset into the file for file mappings; forwarded unchanged to
 *        vm_mmap_pgoff().
 *
 * Returns whatever vm_mmap_pgoff() returns, or -EBADF when a file mapping
 * was requested and fd does not resolve to an open file.
 */
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
		unsigned long, prot, unsigned long, flags,
		unsigned long, fd, unsigned long, pgoff)
{
	struct file *backing = NULL;
	unsigned long result;

	audit_mmap_fd(fd, flags);

	/* A non-anonymous mapping needs the file behind fd. */
	if (!(flags & MAP_ANONYMOUS)) {
		backing = fget(fd);
		if (!backing)
			return -EBADF;
	}

	/* Strip MAP_EXECUTABLE and MAP_DENYWRITE from the caller's flags. */
	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);

	result = vm_mmap_pgoff(backing, addr, len, prot, flags, pgoff);

	/* Drop the reference taken by fget() above, if any. */
	if (backing)
		fput(backing);

	return result;
}

vm_mmap_pgoff

/*
 * Run the security check for the mapping and, if it passes, perform the
 * mapping through do_mmap_pgoff() with mm->mmap_sem held for writing.
 *
 * Returns the non-zero result of security_mmap_file() if the check fails,
 * otherwise the value returned by do_mmap_pgoff().
 */
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long pgoff)
{
	struct mm_struct *mm = current->mm;
	unsigned long populate;
	unsigned long ret;

	/* Permission check for this mapping; bail out on refusal. */
	ret = security_mmap_file(file, prot, flag);
	if (ret)
		return ret;

	/* do_mmap_pgoff() must be called with mm->mmap_sem write-locked. */
	down_write(&mm->mmap_sem);
	ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, &populate);
	up_write(&mm->mmap_sem);

	/*
	 * do_mmap_pgoff() zeroes populate on entry and sets it to the mapping
	 * length when the range should be populated; do that after unlocking.
	 */
	if (populate)
		mm_populate(ret, populate);

	return ret;
}

do_mmap_pgoff

/*
 * Validate an mmap request, translate prot/flags into vm_flags and hand the
 * request to mmap_region().
 *
 * The caller must hold down_write(&current->mm->mmap_sem).
 *
 * file:     file to map, or NULL for an anonymous mapping.
 * addr:     requested address (a hint unless MAP_FIXED is set).
 * len:      requested length; page-aligned here before use.
 * prot:     PROT_* bits requested by the caller.
 * flags:    MAP_* bits requested by the caller.
 * pgoff:    offset of the mapping; overwritten for anonymous mappings.
 * populate: out parameter; cleared on entry and set to len when the new
 *           mapping is VM_LOCKED or MAP_POPULATE was given without
 *           MAP_NONBLOCK.
 *
 * Returns the value of mmap_region() on success, or a negative error code
 * (-EINVAL, -ENOMEM, -EOVERFLOW, -EPERM, -EAGAIN, -EACCES, -ENODEV) cast to
 * unsigned long when a check fails.
 */
unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
			unsigned long len, unsigned long prot,
			unsigned long flags, unsigned long pgoff,
			unsigned long *populate)
{
	struct mm_struct *mm = current->mm;
	vm_flags_t vm_flags;

	*populate = 0;

	/*
	 * Does the application expect PROT_READ to imply PROT_EXEC?
	 *
	 * (the exception is when the underlying filesystem is noexec
	 *  mounted, in which case we dont add PROT_EXEC.)
	 */
	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
		if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
			prot |= PROT_EXEC;

	if (!len)
		return -EINVAL;
	/*
	 * Only when MAP_FIXED is NOT set is addr treated as a hint and passed
	 * through round_hint_to_min() (presumably adjusts it against the
	 * minimum mappable address - confirm in that helper).
	 */
	if (!(flags & MAP_FIXED))
		addr = round_hint_to_min(addr);

	/* Page-align len; a result of 0 is rejected with -ENOMEM */
	len = PAGE_ALIGN(len);
	if (!len)
		return -ENOMEM;

	/* offset overflow? */
	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
		return -EOVERFLOW;

	/* Refuse when the mm already has more than sysctl_max_map_count mappings */
	if (mm->map_count > sysctl_max_map_count)
		return -ENOMEM;

	/* Obtain the address to map at; a non-page-aligned result is an error code */
	addr = get_unmapped_area(file, addr, len, pgoff, flags);
	if (addr & ~PAGE_MASK)
		return addr;// a value that is not page aligned is returned to the caller as the error

	/* calc_vm_prot_bits: folds the mmap prot argument into vm_flags
	 * 		(presumably PROT_READ/PROT_WRITE/PROT_EXEC -> VM_READ/VM_WRITE/VM_EXEC)
	 * calc_vm_flag_bits: folds part of the mmap flags argument into vm_flags
	 * 		(presumably MAP_GROWSDOWN etc. -> VM_GROWSDOWN etc.)
	 * The mm default flags and all three VM_MAY* bits are added as well.
	 */
	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
			
	/* If MAP_LOCKED is requested, the caller must be allowed to lock memory
	 * (can_do_mlock()), otherwise fail with -EPERM.
	 * mlock_future_check() is then consulted for every request; presumably it
	 * checks the locked-memory limit of the process - confirm in that helper.
	 */
	if (flags & MAP_LOCKED)
		if (!can_do_mlock())
			return -EPERM;

	if (mlock_future_check(mm, vm_flags, len))
		return -EAGAIN;
	/* File-backed mapping */
	if (file) {
		struct inode *inode = file_inode(file);/* inode backing the file */

		switch (flags & MAP_TYPE) {
		case MAP_SHARED:// shared file mapping
			/* Writable mapping requested but the file was not opened for writing */
			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
				return -EACCES;

			/* Refuse an append-only inode when the file is open for writing */
			if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
				return -EACCES;

			/* Refuse when locks_verify_locked() reports a conflict on the file */
			if (locks_verify_locked(file))
				return -EAGAIN;

			vm_flags |= VM_SHARED | VM_MAYSHARE;
			/* File not writable: drop VM_MAYWRITE and VM_SHARED again */
			if (!(file->f_mode & FMODE_WRITE))
				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);

			/* fall through */
		case MAP_PRIVATE:// private file mapping (also reached from MAP_SHARED)
			if (!(file->f_mode & FMODE_READ))
				return -EACCES;// file not opened for reading
			/* noexec mount: reject VM_EXEC and forbid gaining it later */
			if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
				if (vm_flags & VM_EXEC)
					return -EPERM;
				vm_flags &= ~VM_MAYEXEC;
			}

			/* The file must provide an mmap operation */
			if (!file->f_op->mmap)
				return -ENODEV;
			/* Growable mappings are not allowed for file mappings */
			if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
				return -EINVAL;
			break;

		default:
			return -EINVAL;
		}
	} else {// no file: anonymous mapping
		switch (flags & MAP_TYPE) {
		case MAP_SHARED:// shared anonymous mapping
			if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
				return -EINVAL;
			/*
			 * Ignore pgoff.
			 */
			pgoff = 0;
			vm_flags |= VM_SHARED | VM_MAYSHARE;
			break;
		case MAP_PRIVATE:// private anonymous mapping
			/* Derive pgoff from the chosen address */
			pgoff = addr >> PAGE_SHIFT;
			break;
		default:
			return -EINVAL;
		}
	}

	/*
	 * Set 'VM_NORESERVE' if we should not account for the
	 * memory use of this mapping.
	 */
	if (flags & MAP_NORESERVE) {
		/* Honour MAP_NORESERVE unless overcommit policy is OVERCOMMIT_NEVER */
		if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
			vm_flags |= VM_NORESERVE;

		/* hugetlb applies strict overcommit unless MAP_NORESERVE */
		/* For a hugepage file, VM_NORESERVE is set regardless of the policy */
		if (file && is_file_hugepages(file))
			vm_flags |= VM_NORESERVE;
	}

	addr = mmap_region(file, addr, len, vm_flags, pgoff);
	/*
	 * On success, ask the caller to populate the range when it is locked,
	 * or when MAP_POPULATE was given without MAP_NONBLOCK.
	 */
	if (!IS_ERR_VALUE(addr) &&
	    ((vm_flags & VM_LOCKED) ||
	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
		*populate = len;
	return addr;
}

mmap_region

/*
 * Create the mapping [addr, addr + len) in current->mm.
 *
 * Any existing mappings in the range are removed with do_munmap(), the
 * request is either merged into a neighbouring vma via vma_merge() or a new
 * vm_area_struct is allocated, initialised and linked into the mm.
 *
 * file:     backing file, or NULL for an anonymous mapping.
 * addr:     start address of the mapping.
 * len:      length of the mapping in bytes.
 * vm_flags: VM_* flags for the new area.
 * pgoff:    value stored in vma->vm_pgoff.
 *
 * Returns the mapped address on success (taken from vma->vm_start for newly
 * created file mappings), or a negative error code cast to unsigned long.
 */
unsigned long mmap_region(struct file *file, unsigned long addr,
		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	int error;
	struct rb_node **rb_link, *rb_parent;
	unsigned long charged = 0;
	
	/* Check whether the mm may grow by len >> PAGE_SHIFT pages
	 * (presumably the address-space resource limit - confirm in may_expand_vm())
	 */
	if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
		unsigned long nr_pages;

		/* Not enough room: fail unless this is a fixed-address request.
		 * NOTE(review): this tests a MAP_* constant against a vm_flags_t
		 * value - looks suspicious, confirm the bit really means MAP_FIXED here.
		 */
		if (!(vm_flags & MAP_FIXED))
			return -ENOMEM;
		/* Count the pages in [addr, addr + len) that are already mapped;
		 * they will be unmapped below, so they do not count as growth.
		 */
		nr_pages = count_vma_pages_range(mm, addr, addr + len);
		/* Re-check the limit with the overlapping pages subtracted */
		if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
			return -ENOMEM;
	}

	/* While find_vma_links() reports an existing mapping in [addr, addr+len),
	 * unmap the range and look again; on exit prev/rb_link/rb_parent describe
	 * where the new vma is to be linked.
	 */
	error = -ENOMEM;
	while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
			      &rb_parent)) {
		if (do_munmap(mm, addr, len))
			return -ENOMEM;
	}

	/*
	 * Private writable mapping: check memory availability
	 */
	if (accountable_mapping(file, vm_flags)) {
		charged = len >> PAGE_SHIFT;
		if (security_vm_enough_memory_mm(mm, charged))
			return -ENOMEM;
		vm_flags |= VM_ACCOUNT;
	}

	/* Try to merge the new range into an adjacent vma
	 * (presumably requires compatible flags and adjacent addresses)
	 */
	vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
			NULL);
	if (vma)/* merged: use the resulting vma and skip allocation */
		goto out;

	/* No merge possible: allocate a zeroed vm_area_struct */
	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
	if (!vma) {
		error = -ENOMEM;
		goto unacct_error;
	}
	/* Initialise the new vma */
	vma->vm_mm = mm;
	vma->vm_start = addr;
	vma->vm_end = addr + len;
	vma->vm_flags = vm_flags;
	vma->vm_page_prot = vm_get_page_prot(vm_flags);
	vma->vm_pgoff = pgoff;
	INIT_LIST_HEAD(&vma->anon_vma_chain);

	if (file) {/* file-backed mapping */
		if (vm_flags & VM_DENYWRITE) {
			/* Temporarily deny write access to the file; undone after vma_link() */
			error = deny_write_access(file);
			if (error)
				goto free_vma;
		}
		if (vm_flags & VM_SHARED) {/* shared: temporarily mark the address_space as mapped writable */
			error = mapping_map_writable(file->f_mapping);
			if (error)
				goto allow_write_and_free_vma;
		}

		/* ->mmap() can change vma->vm_file, but must guarantee that
		 * vma_link() below can deny write-access if VM_DENYWRITE is set
		 * and map writably if VM_SHARED is set. This usually means the
		 * new file must not have been exposed to user-space, yet.
		 */
		vma->vm_file = get_file(file);/* take a file reference for the vma */
		/* Call the file's own mmap operation
		 * (for regular files this is presumably generic_file_mmap(),
		 * which installs the vma's vm_ops - confirm per filesystem)
		 */
		error = file->f_op->mmap(file, vma);
		if (error)
			goto unmap_and_free_vma;

		/* Can addr have changed??
		 *
		 * Answer: Yes, several device drivers can do it in their
		 *         f_op->mmap method. -DaveM
		 * Bug: If addr is changed, prev, rb_link, rb_parent should
		 *      be updated for vma_link()
		 */
		WARN_ON_ONCE(addr != vma->vm_start);

		/* Pick up whatever ->mmap() left in the vma */
		addr = vma->vm_start;
		vm_flags = vma->vm_flags;
	} else if (vm_flags & VM_SHARED) {
		/* Shared anonymous mapping: set it up through shmem_zero_setup()
		 * (presumably backs it with a /dev/zero style shmem file - confirm)
		 */
		error = shmem_zero_setup(vma);
		if (error)
			goto free_vma;
	}
	/* Link the new vma into the mm */
	vma_link(mm, vma, prev, rb_link, rb_parent);
	/* Once vma denies write, undo our temporary denial count */
	if (file) {
		if (vm_flags & VM_SHARED)
			mapping_unmap_writable(file->f_mapping);
		if (vm_flags & VM_DENYWRITE)
			allow_write_access(file);
	}
	/* From here on use the file the vma actually references */
	file = vma->vm_file;
out:
	perf_event_mmap(vma);/* report the new mapping to perf */
	/* The virtual range is now established; update the mm's accounting */
	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
	if (vm_flags & VM_LOCKED) {
		/* Account the pages as locked, unless the vma is special, hugetlb
		 * or the gate vma - in which case VM_LOCKED is cleared instead.
		 */
		if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
					vma == get_gate_vma(current->mm)))
			mm->locked_vm += (len >> PAGE_SHIFT);
		else
			vma->vm_flags &= ~VM_LOCKED;
	}

	if (file)
		uprobe_mmap(vma);

	/*
	 * New (or expanded) vma always get soft dirty status.
	 * Otherwise user-space soft-dirty page tracker won't
	 * be able to distinguish situation when vma area unmapped,
	 * then new mapped in-place (which must be aimed as
	 * a completely new data area).
	 */
	vma->vm_flags |= VM_SOFTDIRTY;

	vma_set_page_prot(vma);

	return addr;

	/*
	 * Error unwinding: each label undoes one step and falls through to
	 * the labels below it.
	 */
unmap_and_free_vma:
	/* Drop the reference taken by get_file() above.
	 * NOTE(review): file is still dereferenced below; this presumably relies
	 * on the caller holding its own reference - confirm.
	 */
	vma->vm_file = NULL;
	fput(file);

	/* Undo any partial mapping done by a device driver. */
	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
	/* Zero charged so that vm_unacct_memory() below is skipped on this path */
	charged = 0;
	if (vm_flags & VM_SHARED)
		mapping_unmap_writable(file->f_mapping);
allow_write_and_free_vma:
	if (vm_flags & VM_DENYWRITE)
		allow_write_access(file);
free_vma:
	kmem_cache_free(vm_area_cachep, vma);
unacct_error:
	/* Return the pages charged by security_vm_enough_memory_mm(), if any */
	if (charged)
		vm_unacct_memory(charged);
	return error;
}

sys_mmap_pgoff

参考资料

get_unmapped_area函数