前言
往篇回顾
本篇主要内容
mmap
-
mmap系统调用并不完全是为了共享内存而设计的,它本身提供了不同于一般对普通文件的访问方式,进程可以像读写内存一样对普通文件操作
-
mmap并不分配空间,只是将文件映射到调用进程的地址空间里(占用虚拟地址空间), 然后就可以使用
映射关系分类
-
文件映射:磁盘文件映射进程的虚拟地址空间,使用文件内容初始化物理内存
-
匿名映射:初始化全为0的内存空间
-
私有映射:多进程间数据不共享,修改不反映到磁盘实际文件,是一个copy-on-write(写时复制)的映射方式
-
共享映射:多进程间数据共享,修改反映到磁盘实际文件中
基本流程
-
调用mmap系统调用,传入相关参数
-
内核在用户空间mmap区域分配一个空闲的 vm_area_struct 对象
-
修改页目录表项把对象的地址和设备的内存对应起来
代码分析
sys_mmap_pgoff
/*
 * mmap_pgoff system call: resolve the optional backing file and forward
 * the request to vm_mmap_pgoff().
 *
 * addr:  requested address, forwarded unchanged
 * len:   length of the mapping, forwarded unchanged
 * prot:  protection bits, forwarded unchanged
 * flags: mapping flags; when MAP_ANONYMOUS is set no file is looked up.
 *        MAP_EXECUTABLE and MAP_DENYWRITE are masked off before the
 *        request is forwarded.
 * fd:    descriptor of the file to map; only resolved when MAP_ANONYMOUS
 *        is not set
 * pgoff: file offset, forwarded unchanged (page units judging by the
 *        name -- confirm against vm_mmap_pgoff())
 *
 * Returns the result of vm_mmap_pgoff(), or -EBADF when fget() finds no
 * file for fd.
 */
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
		unsigned long, prot, unsigned long, flags,
		unsigned long, fd, unsigned long, pgoff)
{
	struct file *filp = NULL;
	unsigned long ret;

	audit_mmap_fd(fd, flags);

	/* A file-backed request needs a valid, referenced struct file. */
	if (!(flags & MAP_ANONYMOUS)) {
		filp = fget(fd);
		if (!filp)
			return -EBADF;
	}

	/* These two flags are never passed further down. */
	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);

	ret = vm_mmap_pgoff(filp, addr, len, prot, flags, pgoff);

	/* Drop the reference taken by fget() above. */
	if (filp)
		fput(filp);

	return ret;
}
vm_mmap_pgoff
/*
 * Run the security hook for the request and, when it allows the mapping,
 * perform it through do_mmap_pgoff() with mm->mmap_sem held for writing.
 *
 * Returns the non-zero result of security_mmap_file() if the hook rejects
 * the request, otherwise whatever do_mmap_pgoff() returned.
 */
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long pgoff)
{
	struct mm_struct *mm = current->mm;
	unsigned long populate_len;
	unsigned long result;

	/* Permission check first; bail out without touching the mm. */
	result = security_mmap_file(file, prot, flag);
	if (result)
		return result;

	/* do_mmap_pgoff() must be called with mmap_sem held for writing. */
	down_write(&mm->mmap_sem);
	result = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
			       &populate_len);
	up_write(&mm->mmap_sem);

	/* Populate only after the lock is released, and only if asked to. */
	if (populate_len)
		mm_populate(result, populate_len);

	return result;
}
do_mmap_pgoff
/*
 * do_mmap_pgoff - validate an mmap request, derive its vm_flags and hand it
 * to mmap_region().
 *
 * The caller must hold down_write(&current->mm->mmap_sem).
 *
 * file:     backing file, or NULL for an anonymous mapping
 * addr:     requested address; replaced by the result of get_unmapped_area()
 * len:      requested length in bytes; rounded up with PAGE_ALIGN()
 * prot:     PROT_* protection bits
 * flags:    MAP_* flags
 * pgoff:    offset of the mapping; overwritten for anonymous mappings
 * populate: out parameter; set to 0 on entry and to len when the new mapping
 *           should be populated by the caller
 *
 * Returns the address produced by mmap_region(), or an error code
 * (-EINVAL, -ENOMEM, -EOVERFLOW, -EPERM, -EAGAIN, -EACCES, -ENODEV, or
 * whatever get_unmapped_area()/mmap_region() report).
 */
unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flags, unsigned long pgoff,
unsigned long *populate)
{
	struct mm_struct *mm = current->mm;
	vm_flags_t vm_flags;
	*populate = 0;
	/*
	 * Does the application expect PROT_READ to imply PROT_EXEC?
	 *
	 * (the exception is when the underlying filesystem is noexec
	 * mounted, in which case we dont add PROT_EXEC.)
	 */
	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
		if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
			prot |= PROT_EXEC;
	if (!len)
		return -EINVAL;
	/*
	 * Without MAP_FIXED the address is passed through round_hint_to_min()
	 * (presumably raising it to the minimum mappable address -- confirm).
	 */
	if (!(flags & MAP_FIXED))
		addr = round_hint_to_min(addr);
	/*
	 * Round len up to whole pages. len was non-zero above, so a zero
	 * result here is rejected with -ENOMEM.
	 */
	len = PAGE_ALIGN(len);
	if (!len)
		return -ENOMEM;
	/* offset overflow? */
	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
		return -EOVERFLOW;
	/* Refuse once the mm holds more than sysctl_max_map_count mappings. */
	if (mm->map_count > sysctl_max_map_count)
		return -ENOMEM;
	/* Obtain the address at which the new mapping will be placed. */
	addr = get_unmapped_area(file, addr, len, pgoff, flags);
	if (addr & ~PAGE_MASK)
		return addr; /* not page-aligned: treated as an error code */
	/*
	 * Build the internal vm_flags from the helper-translated prot and
	 * flags bits, the mm's default flags, and all three VM_MAY* bits.
	 * (calc_vm_prot_bits()/calc_vm_flag_bits() are defined elsewhere;
	 * presumably PROT_READ/PROT_WRITE/PROT_EXEC map to
	 * VM_READ/VM_WRITE/VM_EXEC and MAP_GROWSDOWN etc. to VM_GROWSDOWN etc.)
	 */
	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
	mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
	/*
	 * MAP_LOCKED is only allowed when can_do_mlock() succeeds.
	 * Note that mlock_future_check() below is a separate statement and
	 * runs for every request, not only for MAP_LOCKED ones; it presumably
	 * enforces the locked-memory limit -- confirm in its definition.
	 */
	if (flags & MAP_LOCKED)
		if (!can_do_mlock())
			return -EPERM;
	if (mlock_future_check(mm, vm_flags, len))
		return -EAGAIN;
	/* File-backed mapping */
	if (file) {
		struct inode *inode = file_inode(file); /* inode behind the file */
		switch (flags & MAP_TYPE) {
		case MAP_SHARED: /* shared file mapping */
			/* PROT_WRITE requested but the file is not open for writing */
			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
				return -EACCES;
			/* Refuse a writable file whose inode is append-only */
			if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
				return -EACCES;
			/* Fail if locks_verify_locked() objects to the file */
			if (locks_verify_locked(file))
				return -EAGAIN;
			vm_flags |= VM_SHARED | VM_MAYSHARE;
			/* A file not open for writing loses VM_MAYWRITE and VM_SHARED */
			if (!(file->f_mode & FMODE_WRITE))
				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
			/* fall through */
		case MAP_PRIVATE: /* private file mapping (also reached from MAP_SHARED) */
			if (!(file->f_mode & FMODE_READ))
				return -EACCES; /* file not open for reading */
			/* noexec mount: reject VM_EXEC and drop VM_MAYEXEC */
			if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
				if (vm_flags & VM_EXEC)
					return -EPERM;
				vm_flags &= ~VM_MAYEXEC;
			}
			/* The file must provide an mmap handler */
			if (!file->f_op->mmap)
				return -ENODEV;
			/* Growing mappings are not accepted for files */
			if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
				return -EINVAL;
			break;
		default:
			return -EINVAL;
		}
	} else { /* no file: anonymous mapping */
		switch (flags & MAP_TYPE) {
		case MAP_SHARED: /* shared anonymous mapping */
			if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
				return -EINVAL;
			/*
			 * Ignore pgoff.
			 */
			pgoff = 0;
			vm_flags |= VM_SHARED | VM_MAYSHARE;
			break;
		case MAP_PRIVATE: /* private anonymous mapping */
			/* pgoff is derived from the chosen address */
			pgoff = addr >> PAGE_SHIFT;
			break;
		default:
			return -EINVAL;
		}
	}
	/*
	 * Set 'VM_NORESERVE' if we should not account for the
	 * memory use of this mapping.
	 */
	if (flags & MAP_NORESERVE) {
		/* Honour MAP_NORESERVE unless sysctl_overcommit_memory is OVERCOMMIT_NEVER */
		if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
			vm_flags |= VM_NORESERVE;
		/* hugetlb applies strict overcommit unless MAP_NORESERVE */
		/* For a hugepage file the flag is set regardless of the sysctl */
		if (file && is_file_hugepages(file))
			vm_flags |= VM_NORESERVE;
	}
	addr = mmap_region(file, addr, len, vm_flags, pgoff);
	/*
	 * On success, report len through *populate when the mapping is
	 * VM_LOCKED or MAP_POPULATE was given without MAP_NONBLOCK.
	 */
	if (!IS_ERR_VALUE(addr) &&
	((vm_flags & VM_LOCKED) ||
	(flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
		*populate = len;
	return addr;
}
mmap_region
/*
 * mmap_region - create (or merge into) a vma covering [addr, addr + len).
 *
 * file:     backing file, or NULL for an anonymous mapping
 * addr:     start address of the mapping
 * len:      length of the mapping in bytes
 * vm_flags: VM_* flags for the new area
 * pgoff:    offset stored in the vma / used for merging
 *
 * Returns the start address of the mapping on success, or a negative error
 * code. On failure every step taken so far is undone through the labels at
 * the end of the function.
 */
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	int error;
	struct rb_node **rb_link, *rb_parent;
	unsigned long charged = 0;
	/* Ask may_expand_vm() whether the mm may grow by this many pages. */
	if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
		unsigned long nr_pages;
		/*
		 * Growth was refused: fail unless the MAP_FIXED bit is set.
		 * NOTE(review): MAP_FIXED is an mmap() flag, while vm_flags
		 * holds VM_* bits -- this comparison looks suspicious; confirm
		 * against the upstream kernel history.
		 */
		if (!(vm_flags & MAP_FIXED))
			return -ENOMEM;
		/*
		 * Count the pages of [addr, addr + len) reported by
		 * count_vma_pages_range() (presumably pages already covered by
		 * existing mappings, which the unmap loop below would remove).
		 */
		nr_pages = count_vma_pages_range(mm, addr, addr + len);
		/* Retry the limit check with those pages subtracted. */
		if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
			return -ENOMEM;
	}
	/*
	 * As long as find_vma_links() returns non-zero for the range
	 * (presumably because an existing mapping overlaps it), unmap the
	 * range and look again. On exit prev/rb_link/rb_parent describe where
	 * the new vma is to be linked.
	 */
	error = -ENOMEM;
	while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
	&rb_parent)) {
		if (do_munmap(mm, addr, len))
			return -ENOMEM;
	}
	/*
	 * Private writable mapping: check memory availability
	 */
	if (accountable_mapping(file, vm_flags)) {
		charged = len >> PAGE_SHIFT;
		if (security_vm_enough_memory_mm(mm, charged))
			return -ENOMEM;
		vm_flags |= VM_ACCOUNT;
	}
	/*
	 * Try to merge the new range into a neighbouring vma instead of
	 * allocating a new one.
	 */
	vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
	NULL);
	if (vma) /* merged: use the resulting vma and skip the setup below */
		goto out;
	/* No merge possible: allocate a zeroed vma. */
	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
	if (!vma) {
		error = -ENOMEM;
		goto unacct_error;
	}
	/* Initialise the new vma. */
	vma->vm_mm = mm;
	vma->vm_start = addr;
	vma->vm_end = addr + len;
	vma->vm_flags = vm_flags;
	vma->vm_page_prot = vm_get_page_prot(vm_flags);
	vma->vm_pgoff = pgoff;
	INIT_LIST_HEAD(&vma->anon_vma_chain);
	if (file) { /* file-backed mapping */
		if (vm_flags & VM_DENYWRITE) {
			/* Temporarily deny write access; undone after vma_link() or on error */
			error = deny_write_access(file);
			if (error)
				goto free_vma;
		}
		if (vm_flags & VM_SHARED) { /* shared: register a writable mapping on the file's address_space */
			error = mapping_map_writable(file->f_mapping);
			if (error)
				goto allow_write_and_free_vma;
		}
		/* ->mmap() can change vma->vm_file, but must guarantee that
		 * vma_link() below can deny write-access if VM_DENYWRITE is set
		 * and map writably if VM_SHARED is set. This usually means the
		 * new file must not have been exposed to user-space, yet.
		 */
		vma->vm_file = get_file(file); /* extra file reference held by the vma; the error path drops it with fput() */
		/*
		 * Call the mmap handler from the file's f_op table.
		 * (Original note: this is commonly generic_file_mmap(), which
		 * installs generic_file_vm_ops -- not visible here.)
		 */
		error = file->f_op->mmap(file, vma);
		if (error)
			goto unmap_and_free_vma;
		/* Can addr have changed??
		 *
		 * Answer: Yes, several device drivers can do it in their
		 * f_op->mmap method. -DaveM
		 * Bug: If addr is changed, prev, rb_link, rb_parent should
		 * be updated for vma_link()
		 */
		WARN_ON_ONCE(addr != vma->vm_start);
		addr = vma->vm_start;
		vm_flags = vma->vm_flags;
	} else if (vm_flags & VM_SHARED) {
		/*
		 * Anonymous shared mapping: set up through shmem_zero_setup()
		 * (original note: this backs the area with /dev/zero -- confirm).
		 */
		error = shmem_zero_setup(vma);
		if (error)
			goto free_vma;
	}
	/* Link the new vma into the mm. */
	vma_link(mm, vma, prev, rb_link, rb_parent);
	/* Once vma denies write, undo our temporary denial count */
	if (file) {
		if (vm_flags & VM_SHARED)
			mapping_unmap_writable(file->f_mapping);
		if (vm_flags & VM_DENYWRITE)
			allow_write_access(file);
	}
	file = vma->vm_file;
out:
	perf_event_mmap(vma); /* report the new mapping to perf */
	/* The mapping exists now: account its pages in the mm statistics. */
	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
	if (vm_flags & VM_LOCKED) { /* locked mapping: count it in locked_vm, or drop VM_LOCKED for special/hugetlb/gate vmas */
		if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
		vma == get_gate_vma(current->mm)))
			mm->locked_vm += (len >> PAGE_SHIFT);
		else
			vma->vm_flags &= ~VM_LOCKED;
	}
	if (file)
		uprobe_mmap(vma);
	/*
	 * New (or expanded) vma always get soft dirty status.
	 * Otherwise user-space soft-dirty page tracker won't
	 * be able to distinguish situation when vma area unmapped,
	 * then new mapped in-place (which must be aimed as
	 * a completely new data area).
	 */
	vma->vm_flags |= VM_SOFTDIRTY;
	vma_set_page_prot(vma);
	return addr;
/* Error unwinding: each label undoes one more step, then falls through. */
unmap_and_free_vma:
	vma->vm_file = NULL;
	fput(file);
	/* Undo any partial mapping done by a device driver. */
	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
	charged = 0;
	if (vm_flags & VM_SHARED)
		mapping_unmap_writable(file->f_mapping);
allow_write_and_free_vma:
	if (vm_flags & VM_DENYWRITE)
		allow_write_access(file);
free_vma:
	kmem_cache_free(vm_area_cachep, vma);
unacct_error:
	/* Return the memory charge taken for an accountable mapping. */
	if (charged)
		vm_unacct_memory(charged);
	return error;
}
sys_mmap_pgoff