前沿
往篇回顾
在上一篇中,主要学习了lru及lruvec机制;lru用于内存回收,lruvec则用于辅助lru,避免频繁加锁访问lru
-
lru链表有5种不同类型,分别装5种不同状态的page
-
lruvec则是根据lru的操作进行分类的
-
将不处于lru链表的新页放入到lru链表中
-
将非活动lru链表中的页移动到非活动lru链表尾部
-
将处于活动lru链表尾部的页移动到非活动lru链表
-
将处于非活动lru链表的页移动到活动lru链表头
-
内存压缩基本知识
-
内存压缩主要为解决系统长时间运行后内存碎片化问题,主要方法:
-
从内存区段的前面扫描可移动页框,再从内存区段后面扫描空闲页框;
-
扫描结束后,尝试将可移动的页框内容迁移入空闲页框中;
-
内存压缩模式migrate_mode
-
kswapd中只进行异步压缩,在非kswapd的场景,当推迟次数(compact_considered)超过了阀值(compact_defer_shift)后,会将该block的PB_migrate_skip清除
-
轻同步模式是在异步压缩无法解决情况下可能触发(例如是内核线程触发的压缩)
内存压缩发生时机
-
内核从伙伴系统以Min阀值获取连续页框,但是无法成功
-
当需要从指定地方获取连续页框,但是中间有页框正在使用,直接触发同步压缩
-
应内存短缺导致kswapd被唤醒,在进行内存回收后会进行内存压缩
-
将1写入/etc/sysfs/vm/compact_memory时,触发对所有zone进行内存压缩
内存压缩停止时机compact_finished
当内存压缩启动后,在如下情况下会停止
-
可移动页框扫描位置超过空闲页框扫描位置; 此时相当于对zone进行了一次完整的内存压缩,此时会重置zone中compact_cached_free_pfn、compact_cached_migrate_pfn
-
zone的空闲页框足够完成分配(zone的low阀值 + 1«order + zone的保留页框)
-
判断伙伴系统中是否有比order值大的空闲连续页框块,有则结束压缩,如果order为-1,则忽略此条件
本篇主要内容
有了lru做铺垫,本篇主要分析内存回收之内存压缩的基本过程
代码解析
try_to_compact_pages
/*
* 遍历zonelist中每个zone:
* 1. 判断此zone的内存压缩是否需要推迟 compaction_deferred()
* 2. 调用compact_zone_order->compact_zone()进行内存压缩
* a. 如果压缩后能分配到内存,则停止对其余zone的内存压缩
* b. 如果压缩不成功则需要进行收尾处理 defer_compaction(), 减少后续对此zone的压缩频率,因为很可能还是无法成功
* compact_considered: 内存压缩推迟计数器;compact_defer_shift: 内存压缩推迟阀值; compact_order_failed: 内存压缩失败最大order值
*/
unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
int alloc_flags, const struct alloc_context *ac,
enum migrate_mode mode, int *contended)
{
/* 用于检查是否能使用文件系统的IO操作 */
int may_enter_fs = gfp_mask & __GFP_FS;
/* 用于检查是否能使用磁盘IO操作 */
int may_perform_io = gfp_mask & __GFP_IO;
struct zoneref *z;
struct zone *zone;
int rc = COMPACT_DEFERRED;
int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
*contended = COMPACT_CONTENDED_NONE;
/* 如果order = 0或者不允许使用文件系统IO和磁盘IO,则跳过本次压缩,因为不使用IO有可能导致死锁 */
if (!order || !may_enter_fs || !may_perform_io)
return COMPACT_SKIPPED;
trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode);
/* Compact each zone in the list */
/* 遍历zonelist中的每一个zone */
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
int status;
int zone_contended;
/* 检查是否跳过此次压缩,两种情况:
* 1. zone->compact_considered < (1<<zone->compact_defer_shift),由于压缩耗费资源,要限制压缩的频次
* 2. order>=zone->compact_order_failed
*/
if (compaction_deferred(zone, order))
continue;
/* 对遍历到的zone进行内存压缩 */
status = compact_zone_order(zone, order, gfp_mask, mode,
&zone_contended, alloc_flags,
ac->classzone_idx);
rc = max(status, rc);
/*
* It takes at least one zone that wasn't lock contended
* to clear all_zones_contended.
*/
all_zones_contended &= zone_contended;
/* If a normal allocation would succeed, stop compacting */
/* 判断压缩后此zone分配1 << order个页框后剩余的页框数量, 是否 大于 此zone的低阀值 + 保留的页框数量 */
if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
ac->classzone_idx, alloc_flags)) {
/* 当zone内存满足low阀值 + (1 << order) + 保留的内存 时,则将compact_order_failed设置为本次压缩的order + 1
* 因为这里还不确定内存压缩是否成功了,只是此zone的剩余页框满足了要求,因此第三个参数传入false
*/
compaction_defer_reset(zone, order, false);
/* 异步情况下需要被调度时会设置 */
if (zone_contended == COMPACT_CONTENDED_SCHED)
*contended = COMPACT_CONTENDED_SCHED;
/* 当zone中空闲内存达到 low阀值 + (1 << order) + 保留的内存 时,就不对下面的zone进行内存压缩了 */
goto break_loop;
}
/* 本zone压缩失败,则认为后续在此zone中分配也会失败,为避免多余尝试,进行如下处理:
* 如果是同步压缩或者轻同步压缩,则增加推迟计数器阀值zone->compact_defer_shift, zone->compact_considered=0 */
if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) {
/*
* We think that allocation won't succeed in this zone
* so we defer compaction there. If it ends up
* succeeding after all, it will be reset.
*/
defer_compaction(zone, order);
}
/*
* We might have stopped compacting due to need_resched() in
* async compaction, or due to a fatal signal detected. In that
* case do not try further zones and signal need_resched()
* contention.
*/
if ((zone_contended == COMPACT_CONTENDED_SCHED)
|| fatal_signal_pending(current)) {
*contended = COMPACT_CONTENDED_SCHED;
goto break_loop;
}
continue;
break_loop:
/*
* We might not have tried all the zones, so be conservative
* and assume they are not all lock contended.
*/
all_zones_contended = 0;
break;
}
/*
* If at least one zone wasn't deferred or skipped, we report if all
* zones that were tried were lock contended.
*/
if (rc > COMPACT_SKIPPED && all_zones_contended)
*contended = COMPACT_CONTENDED_LOCK;
return rc;
}
compact_zone_order
/*
* This kswapd start function will be called by init and node-hot-add.
* On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
*/
static unsigned long compact_zone_order(struct zone *zone, int order,
gfp_t gfp_mask, enum migrate_mode mode, int *contended,
int alloc_flags, int classzone_idx)
{
unsigned long ret;
struct compact_control cc = {
/* 压缩结束后空闲页框数量 */
.nr_freepages = 0,
/* 压缩结束后移动的页框数量 */
.nr_migratepages = 0,
.order = order,
/* 表示需要移动的页框类型,有movable和reclaimable两种,可以同时设置 */
.gfp_mask = gfp_mask,
/* 管理区 */
.zone = zone,
/* 同步或异步 */
.mode = mode,
.alloc_flags = alloc_flags,
.classzone_idx = classzone_idx,
};
/* 初始化一个空闲页框链表头 */
INIT_LIST_HEAD(&cc.freepages);
/* 初始化一个movable页框链表头 */
INIT_LIST_HEAD(&cc.migratepages);
/* 进行内存压缩 */
ret = compact_zone(zone, &cc);
VM_BUG_ON(!list_empty(&cc.freepages));
VM_BUG_ON(!list_empty(&cc.migratepages));
*contended = cc.contended;
return ret;
}
compact_zone
/*
* zone压缩主函数:
* 1. 首先通过compaction_suitable()判断此zone是否适合压缩;通过compaction_restarting()判断是否需要需要重置defer参数
* 2. 从zone中获取migrate page及free page的起始页
* 3. 通过isolate_migratepages()按pageblock维度扫描zone
其中通过isolate_migratepages_block()按page维度扫描pageblock,找出migrate page,存在cc->migratepages链表中
* 4. 在migrate_pages()中对cc->migratepages链表中的pages进行迁移
* 5. 如果迁移失败,则使用putback_movable_pages()释放收集的migrate page
* 6. 使用compact_finished()判断是否满足结束compact,不满足再次循环2,3,4,5
* 7. 在compact结束后,如果cc->freepages链表非空,则使用release_freepages->__free_page()将多余的free page释放回伙伴系统
a. order=0的单页:free_hot_cold_page(),将单页添加到per_cpu_pages响应的migrate类型的链表中
b. order>=1: __free_pages_ok->free_one_page->__free_one_page()将多个页释放到伙伴系统
*/
static int compact_zone(struct zone *zone, struct compact_control *cc)
{
int ret;
unsigned long start_pfn = zone->zone_start_pfn;/* 管理区开始页框号 */
unsigned long end_pfn = zone_end_pfn(zone);/* 管理区结束页框号 */
/* 获取可进行移动的页框类型(__GFP_RECLAIMABLE、__GFP_MOVABLE) */
const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
const bool sync = cc->mode != MIGRATE_ASYNC;
unsigned long last_migrated_pfn = 0;
/* 判断是否进行内存压缩
* 1. compact_partical: 此zone内存足够分配2^order个页框,不用进行内存压缩
* 2. COMPACT_SKIPPED:
a. 此zone内存不足以进行内存压缩,即zone的空闲页框数量少于 zone的低阀值 + (2 << order)
b. 经fragmentation_index()计算出此zone的内存碎片指数太高
* 3. COMPACT_CONTINUE: 此zone可以进行内存压缩
* fragmentation_index()会计算此zone内所有free_pages, free_blocks_total, free_blocks_suitable
*/
ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
cc->classzone_idx);
switch (ret) {
case COMPACT_PARTIAL:
case COMPACT_SKIPPED:
/* Compaction is likely to fail */
return ret;
case COMPACT_CONTINUE:
/* Fall through to compaction */
;
}
/* 如果不是在kswapd线程中并且此zone的内存压缩推迟次数>=COMPACT_MAX_DEFER_SHIFT
* 则通过__reset_isolation_suitable重置compact参数
*/
if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
/*
* zone->compact_cached_migrate_pfn[sync/async]设置为此zone的起始页框,compact_cached_free_pfn设置为此zone的结束页框
* zone->compact_blockskip_flush = false
* 将zone中所有pageblock的PB_migrate_skip清空
*/
__reset_isolation_suitable(zone);
/* 将可移动页框扫描起始页框号设为zone->compact_cached_migrate_pfn[sync/async] */
cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
/* 空闲页框扫描起始页框号设置为zone->compact_cached_free_pfn */
cc->free_pfn = zone->compact_cached_free_pfn;
/* 检查cc->free_pfn,如果空闲页框扫描起始页框不在zone的范围内,则将空闲页框扫描起始页框设置为zone的最后一个页框
* 并且也会将zone->compact_cached_free_pfn设置为zone的最后一个页框
*/
if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
zone->compact_cached_free_pfn = cc->free_pfn;
}
/* 同上,检查cc->migrate_pfn,如果可移动页框扫描起始页框不在zone的范围内,则将可移动页框扫描起始页框设置为zone的第一个页框
* 并且也会将zone->compact_cached_free_pfn设置为zone的第一个页框
*/
if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
cc->migrate_pfn = start_pfn;
zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
}
trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
cc->free_pfn, end_pfn, sync);
/* 通过pagevec的三种操作,将处于pagevec中的页都放回原本所属的lru中,这一步很重要 */
migrate_prep_local();
/* 判断是否结束本次内存压缩
* 1.可移动页框扫描的位置>=空闲页框扫描的位置,且重置zone->compact_cached_free_pfn和zone->compact_cached_migrate_pfn
* 2.判断zone的空闲页框数量是否达到标准,如果没达到zone的low阀值标准则继续
* 3.判断伙伴系统中是否有比order值大的空闲连续页框块,有则结束压缩
* 如果是管理员写入到/proc/sys/vm/compact_memory进行强制内存压缩的情况,则判断条件只有第1条
*/
while ((ret = compact_finished(zone, cc, migratetype)) ==
COMPACT_CONTINUE) {
int err;
unsigned long isolate_start_pfn = cc->migrate_pfn;
/* 将可移动页(MOVABLE和CMA和RECLAIMABLE)从zone->lru隔离出来,存到cc->migratepages这个链表
* 一个一个pageblock进行扫描,当一个pageblock扫描成功获取到可移动页后就返回
* 一次扫描最多32*1024个页框
*/
/* 异步不处理RECLAIMABLE页 */
switch (isolate_migratepages(zone, cc)) {
case ISOLATE_ABORT:
/* 失败,把这些页放回到lru或者原来的地方 */
ret = COMPACT_PARTIAL;
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
goto out;
case ISOLATE_NONE:
/* 没有隔离出任何页面 */
goto check_drain;
case ISOLATE_SUCCESS:
;
}
/* 将隔离出来的页进行迁移,如果到这里,cc->migratepages中最多也只有一个pageblock的页框数量,并且这些页框都是可移动的
* 空闲页框会在compaction_alloc中获取
* 也就是把隔离出来的一个pageblock中可移动页进行移动
*/
err = migrate_pages(&cc->migratepages, compaction_alloc,
compaction_free, (unsigned long)cc, cc->mode,
MR_COMPACTION);
trace_mm_compaction_migratepages(cc->nr_migratepages, err,
&cc->migratepages);
/* 设置所有可移动页框为0 */
cc->nr_migratepages = 0;
if (err) {
/* 将剩余的可移动页框返回原来的位置 */
putback_movable_pages(&cc->migratepages);
/*
* migrate_pages() may return -ENOMEM when scanners meet
* and we want compact_finished() to detect it
*/
if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
ret = COMPACT_PARTIAL;
goto out;
}
}
/*
* Record where we could have freed pages by migration and not
* yet flushed them to buddy allocator. We use the pfn that
* isolate_migratepages() started from in this loop iteration
* - this is the lowest page that could have been isolated and
* then freed by migration.
*/
if (!last_migrated_pfn)
last_migrated_pfn = isolate_start_pfn;
check_drain:
/*
* Has the migration scanner moved away from the previous
* cc->order aligned block where we migrated from? If yes,
* flush the pages that were freed, so that they can merge and
* compact_finished() can detect immediately if allocation
* would succeed.
*/
if (cc->order > 0 && last_migrated_pfn) {
int cpu;
unsigned long current_block_start =
cc->migrate_pfn & ~((1UL << cc->order) - 1);
if (last_migrated_pfn < current_block_start) {
cpu = get_cpu();
lru_add_drain_cpu(cpu);
drain_local_pages(zone);
put_cpu();
/* No more flushing until we migrate again */
last_migrated_pfn = 0;
}
}
}
out:
/*
* Release free pages and update where the free scanner should restart,
* so we don't leave any returned pages behind in the next attempt.
*/
if (cc->nr_freepages > 0) {
/* 将剩余的空闲页框放回伙伴系统 */
unsigned long free_pfn = release_freepages(&cc->freepages);
cc->nr_freepages = 0;
VM_BUG_ON(free_pfn == 0);
/* The cached pfn is always the first in a pageblock */
free_pfn &= ~(pageblock_nr_pages-1);
/*
* Only go back, not forward. The cached pfn might have been
* already reset to zone end in compact_finished()
*/
if (free_pfn > zone->compact_cached_free_pfn)
zone->compact_cached_free_pfn = free_pfn;
}
trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
cc->free_pfn, end_pfn, sync, ret);
return ret;
}
__compact_finished
/* 判断是否结束本次内存压缩
* 1.可移动页框扫描的位置>=空闲页框扫描的位置,且重置zone->compact_cached_free_pfn和zone->compact_cached_migrate_pfn
* 2.判断zone的空闲页框数量是否达到标准,如果没达到zone的low阀值标准则继续
* 3.判断伙伴系统中是否有比order值大的空闲连续页框块,有则结束压缩
* 如果是管理员写入到/proc/sys/vm/compact_memory进行强制内存压缩的情况,则判断条件只有第1条
*/
static int __compact_finished(struct zone *zone, struct compact_control *cc,
const int migratetype)
{
unsigned int order;
unsigned long watermark;
if (cc->contended || fatal_signal_pending(current))
return COMPACT_PARTIAL;
/* 可移动页框扫描的位置>=空闲页框扫描的位置,说明已经完成对此zone的一次完整扫描了 */
if (cc->free_pfn <= cc->migrate_pfn) {
/* 重置zone->compact_cached_free_pfn和zone->compact_cached_migrate_pfn. */
zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
zone->compact_cached_free_pfn = zone_end_pfn(zone);
/*
* Mark that the PG_migrate_skip information should be cleared
* by kswapd when it goes to sleep. kswapd does not set the
* flag itself as the decision to be clear should be directly
* based on an allocation request.
*/
if (!current_is_kswapd())
zone->compact_blockskip_flush = true;
return COMPACT_COMPLETE;
}
/*
* order == -1 is expected when compacting via
* /proc/sys/vm/compact_memory
*/
if (cc->order == -1)
return COMPACT_CONTINUE;
/* Compaction run is not finished if the watermark is not met */
/* 检测水位 */
watermark = low_wmark_pages(zone);
/* 判断zone的空闲页框分配2^order个页框后是否达到标准,如果没达到zone的low阀值标准则继续 */
if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
cc->alloc_flags))
return COMPACT_CONTINUE;
/* Direct compactor: Is a suitable page free? */
/* 判断伙伴系统中是否有比order值大的空闲连续页框块,有则结束压缩 */
for (order = cc->order; order < MAX_ORDER; order++) {
struct free_area *area = &zone->free_area[order];
bool can_steal;
/* Job done if page is free of the right migratetype */
if (!list_empty(&area->free_list[migratetype]))
return COMPACT_PARTIAL;
#ifdef CONFIG_CMA
/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
if (migratetype == MIGRATE_MOVABLE &&
!list_empty(&area->free_list[MIGRATE_CMA]))
return COMPACT_PARTIAL;
#endif
/*
* Job done if allocation would steal freepages from
* other migratetype buddy lists.
* 检测是否可以从fallback列表中的area获取到足够的空闲页面
*/
if (find_suitable_fallback(area, order, migratetype,
true, &can_steal) != -1)
return COMPACT_PARTIAL;
}
return COMPACT_NO_SUITABLE_PAGE;
}
isolate_migratepages
/*
* 从cc->migrate_pfn(保存的是扫描可移动页框指针所在的页框号)开始到第一个获取到可移动页框的pageblock结束,获取可移动页框,并放入到cc->migratepages
* 1. 从migrate_pfn获取搜索此zone的第一个pageblock = low_pfn
* 2. 遍历从low_pfn开始本zone中的所有pageblock
* 3. 使用isolate_migratepages_block遍历此pageblock中所有的page
*/
static isolate_migrate_t isolate_migratepages(struct zone *zone,
struct compact_control *cc)
{
unsigned long low_pfn, end_pfn;
struct page *page;
/* 保存同步/异步方式,只有异步的情况下能进行移动页框 ISOLATE_ASYNC_MIGRATE */
const isolate_mode_t isolate_mode =
(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
(cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
/* 扫描起始页框 */
low_pfn = cc->migrate_pfn;
/* Only scan within a pageblock boundary */
/* 以pageblock大小对齐 */
end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
/*
* Iterate over whole pageblocks until we find the first suitable.
* Do not cross the free scanner.
* 按pageblock迭代搜索可转移page
*/
for (; end_pfn <= cc->free_pfn;
low_pfn = end_pfn, end_pfn += pageblock_nr_pages) {
/*
* This can potentially iterate a massively long zone with
* many pageblocks unsuitable, so periodically check if we
* need to schedule, or even abort async compaction.
*/
/* 执行时间过长则睡眠,32个1024页框休眠一下,异步的情况还需要判断运行进程是否需要被调度 */
if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
&& compact_should_abort(cc))
break;
/* 获取第一个页框,需要检查是否属于此zone */
page = pageblock_pfn_to_page(low_pfn, end_pfn, zone);
if (!page)
continue;
/* If isolation recently failed, do not retry */
/* 获取页框的PB_migrate_skip标志,如果设置了则跳过这个1024个页框 */
if (!isolation_suitable(cc, page))
continue;
/* 异步情况,如果不是MIGRATE_MOVABLE或MIGRATE_CMA类型则跳过这段页框块 */
/* 异步不处理RECLAIMABLE的页 */
if (cc->mode == MIGRATE_ASYNC &&
!migrate_async_suitable(get_pageblock_migratetype(page)))
continue;
/* 执行完隔离,将low_pfn到end_pfn中正在使用的页框从zone->lru中取出来,返回的是可移动页扫描扫描到的页框号
* 而UNMOVABLE类型的页框是不会处于lru链表中的,所以所有不在lru链表中的页都会被跳过
* 返回的是扫描到的最后的页
*/
low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
isolate_mode);
/* low_pfn == 0 则说明隔离pages出现异常停止;一般为异步compact碰到阻塞 */
if (!low_pfn || cc->contended) {
acct_isolated(zone, cc);
return ISOLATE_ABORT;
}
/* 跳出,说明这里如果成功只会扫描一个pageblock */
break;
}
/* 统计,遍历cc中所有收集到的可移动页,统计其中匿名页、文件页的数量 */
acct_isolated(zone, cc);
/*
* Record where migration scanner will be restarted. If we end up in
* the same pageblock as the free scanner, make the scanners fully
* meet so that compact_finished() terminates compaction.
*/
/* 可移动页扫描到的页框修正 */
cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn;
return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}
isolate_migratepages_block
/*
* 将一个pageblock中所有可以移动的页框隔离出来,并放入struct compact_control->migratepages链表中
* 1. 检查本zone是否已经隔离了很多page
* 2. 遍历pageblock中pages,将通过__isolate_lru_page检查的页面隔离出来
*/
static unsigned long
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
unsigned long end_pfn, isolate_mode_t isolate_mode)
{
struct zone *zone = cc->zone;
unsigned long nr_scanned = 0, nr_isolated = 0;
/* 待移动的页框链表 */
struct list_head *migratelist = &cc->migratepages;
struct lruvec *lruvec;
unsigned long flags = 0;
bool locked = false;
struct page *page = NULL, *valid_page = NULL;
unsigned long start_pfn = low_pfn;
/*
* Ensure that there are not too many pages isolated from the LRU
* list by either parallel reclaimers or compaction. If there are,
* delay for some time until fewer pages are isolated
*/
/* 如果LRU链表的isolated>=(inactive + active) / 2,表示已经将许多页框隔离出来 */
while (unlikely(too_many_isolated(zone))) {
/* 如果是异步的,则停止隔离 */
if (cc->mode == MIGRATE_ASYNC)
return 0;
/* 进行100ms的休眠,等待设备没那么繁忙 */
congestion_wait(BLK_RW_ASYNC, HZ/10);
if (fatal_signal_pending(current))
return 0;
}
/*
* 如果进程需要调度
* 如果异步compact,则直接abort,不再继续执行隔离
* 否则,等待调度结束
*/
if (compact_should_abort(cc))
return 0;
/* Time to isolate some pages for migration */
/* 遍历每一个页框,执行搜索隔离任务 */
for (; low_pfn < end_pfn; low_pfn++) {
/* 这里会释放掉zone->lru_lock这个锁 */
if (!(low_pfn % SWAP_CLUSTER_MAX)
&& compact_unlock_should_abort(&zone->lru_lock, flags,
&locked, cc))
break;
if (!pfn_valid_within(low_pfn))
continue;
/* 扫描次数++ */
nr_scanned++;
/* 根据页框号获取页描述符 */
page = pfn_to_page(low_pfn);
/* 设置valid_page */
if (!valid_page)
valid_page = page;
/* 检查此页是否处于伙伴系统中,主要是通过page->_mapcount判断
* 如果在伙伴系统中,则跳过以本page为起始的空闲内存块
*/
if (PageBuddy(page)) {
/* 获取这个页开始的2^order为伙伴系统的一块内存 */
unsigned long freepage_order = page_order_unsafe(page);
/*
* Without lock, we cannot be sure that what we got is
* a valid page order. Consider only values in the
* valid order range to prevent low_pfn overflow.
*/
if (freepage_order > 0 && freepage_order < MAX_ORDER)
low_pfn += (1UL << freepage_order) - 1;
continue;
}
/* 以下处理此页不在伙伴系统中的情况,表明此页是在使用的页*/
/* 如果页不处于lru中的处理,isolated的页是不处于lru中的,用于balloon的页也不处于lru中?
* 可移动的页都会在LRU中,不在LRU中的页都会被跳过,这里就把UNMOVABLE进行跳过
*/
if (!PageLRU(page)) {
if (unlikely(balloon_page_movable(page))) {
if (balloon_page_isolate(page)) {
/* Successfully isolated */
goto isolate_success;
}
}
continue;
}
/* 如果此页是透明大页的处理,也是跳过。透明大页是在系统活动时可以实时配置,不需要重启生效 */
if (PageTransHuge(page)) {
if (!locked)
low_pfn = ALIGN(low_pfn + 1,
pageblock_nr_pages) - 1;
else
low_pfn += (1 << compound_order(page)) - 1;
continue;
}
/* 如果是一个匿名页,并且被引用次数大于page->_mapcount,则跳过此页,注释说此页很有可能被锁定在内存中不允许换出,但不知道如何判断的 ?*/
if (!page_mapping(page) &&
page_count(page) > page_mapcount(page))
continue;
/* If we already hold the lock, we can skip some rechecking */
/* 检查是否有上锁,这个锁是zone->lru_lock */
if (!locked) {
/* 尝试加锁 */
locked = compact_trylock_irqsave(&zone->lru_lock,
&flags, cc);
if (!locked)
break;
/* Recheck PageLRU and PageTransHuge under lock */
/* 没上锁的情况,需要检查是否处于LRU中 */
if (!PageLRU(page))
continue;
/* 如果在lru中,检查是否是大页,做个对齐,防止low_pfn不是页首 */
if (PageTransHuge(page)) {
low_pfn += (1 << compound_order(page)) - 1;
continue;
}
}
lruvec = mem_cgroup_page_lruvec(page, zone);
/* 将此页从lru中隔离出来 */
if (__isolate_lru_page(page, isolate_mode) != 0)
continue;
VM_BUG_ON_PAGE(PageTransCompound(page), page);
/* 如果在cgroup的lru缓冲区,则将此页从lru缓冲区中拿出来 */
del_page_from_lru_list(page, lruvec, page_lru(page));
isolate_success:
/* 将此页加入到本次压缩需要移动页链表中 */
list_add(&page->lru, migratelist);
/* 需要移动的页框数量++ */
cc->nr_migratepages++;
/* 隔离数量++ */
nr_isolated++;
/* Avoid isolating too much */
/* COMPACT_CLUSTER_MAX代表每次内存压缩所能移动的最大页框数量 */
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
++low_pfn;
break;
}
}
/*
* The PageBuddy() check could have potentially brought us outside
* the range to be scanned.
*/
if (unlikely(low_pfn > end_pfn))
low_pfn = end_pfn;
/* 解锁 */
if (locked)
spin_unlock_irqrestore(&zone->lru_lock, flags);
/* 如果全部的页框块都扫描过了,并且没有隔离任何一个页,则标记最后这个页所在的pageblock为PB_migrate_skip,然后
* if (pfn > zone->compact_cached_migrate_pfn[0])
zone->compact_cached_migrate_pfn[0] = pfn;
if (cc->mode != MIGRATE_ASYNC &&
pfn > zone->compact_cached_migrate_pfn[1])
zone->compact_cached_migrate_pfn[1] = pfn;
*
*/
if (low_pfn == end_pfn)
update_pageblock_skip(cc, valid_page, nr_isolated, true);
trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
nr_scanned, nr_isolated);
/* 统计 */
count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
if (nr_isolated)
count_compact_events(COMPACTISOLATED, nr_isolated);
return low_pfn;
}
migrate_pages
/*
* migrate_pages - migrate the pages specified in a list, to the free pages
* supplied as the target for the page migration
* 经过可移动页框隔离后,使用本函数对隔离的页框执行迁移操作
* 其中get_new_page==compaction_alloc, put_new_page==compaction_free为两个函数指针
* 将from中的页移到新页上,新页来自于get_new_page
*/
int migrate_pages(struct list_head *from, new_page_t get_new_page,
free_page_t put_new_page, unsigned long private,
enum migrate_mode mode, int reason)
{
int retry = 1;
int nr_failed = 0;
int nr_succeeded = 0;
int pass = 0;
struct page *page;
struct page *page2;
/* 获取当前进程是否允许将页写到swap */
int swapwrite = current->flags & PF_SWAPWRITE;
int rc;
/* 如果当前进程不支持将页写到swap,要强制其支持 */
if (!swapwrite)
current->flags |= PF_SWAPWRITE;
for(pass = 0; pass < 10 && retry; pass++) {
retry = 0;
/* page是主要遍历的页,page2是page在from中的下一个页 */
list_for_each_entry_safe(page, page2, from, lru) {
cond_resched();
if (PageHuge(page))
rc = unmap_and_move_huge_page(get_new_page,
put_new_page, private, page,
pass > 2, mode);
else
/* 迁移页框关键函数 */
rc = unmap_and_move(get_new_page, put_new_page,
private, page, pass > 2, mode);
switch(rc) {
case -ENOMEM:
goto out;
case -EAGAIN:
retry++;
break;
case MIGRATEPAGE_SUCCESS:
nr_succeeded++;
break;
default:
/*
* Permanent failure (-EBUSY, -ENOSYS, etc.):
* unlike -EAGAIN case, the failed page is
* removed from migration page list and not
* retried in the next outer loop.
*/
nr_failed++;
break;
}
}
}
rc = nr_failed + retry;
out:
if (nr_succeeded)
count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
if (nr_failed)
count_vm_events(PGMIGRATE_FAIL, nr_failed);
trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
/* 恢复PF_SWAPWRITE标记 */
if (!swapwrite)
current->flags &= ~PF_SWAPWRITE;
return rc;
}
compaction_alloc
/*
* 获取用于转移页面的空闲页;
* 1. 如果cc->freepages不为空,则从中获取一个空闲页并返回;
* 2. 如果为空,则使用isolate_freepage() 从pageblock中扫描一个空闲页用于转移;
*/
static struct page *compaction_alloc(struct page *migratepage,
unsigned long data,
int **result)
{
struct compact_control *cc = (struct compact_control *)data;
struct page *freepage;
/* 如果cc中的空闲页框链表为空 */
if (list_empty(&cc->freepages)) {
/* 并且cc->contended没有记录错误代码 */
if (!cc->contended)
/* 从cc->free_pfn开始向前获取空闲页,主要功能函数 */
isolate_freepages(cc);
/* 如果经过isolate_freepages还是没有收集到空闲页,则任务失败 */
if (list_empty(&cc->freepages))
return NULL;
}
/* 从cc->freepages链表中拿出一个空闲page */
freepage = list_entry(cc->freepages.next, struct page, lru);
list_del(&freepage->lru);
cc->nr_freepages--;
/* 返回空闲页框 */
return freepage;
}
isolate_freepages
/*
* 按pageblock维度,从后向前扫描zone,获取足够的free pages
*/
static void isolate_freepages(struct compact_control *cc)
{
struct zone *zone = cc->zone;
struct page *page;
unsigned long block_start_pfn; /* start of current pageblock */
unsigned long isolate_start_pfn; /* exact pfn we start at */
unsigned long block_end_pfn; /* end of current pageblock */
unsigned long low_pfn; /* lowest pfn scanner is able to scan */
struct list_head *freelist = &cc->freepages;
isolate_start_pfn = cc->free_pfn;
/* 获取开始扫描页框所在的pageblock */
block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
/* 获取此pageblock的最后pfn */
block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
zone_end_pfn(zone));
/* 按pageblock_nr_pages对齐,low_pfn=可迁移页框扫描所在的页框号,这是扫描的截止位置 */
low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
/* 循环条件,
* 1. 扫描截止位置:low_pfn,扫描可迁移页框停止位置
* 2. 扫描截止条件:cc->nr_migratepages <= cc->nr_freepages, 即获取到足够的free page迁移数据
* 循环步长:pageblock
*/
for (; block_start_pfn >= low_pfn &&
cc->nr_migratepages > cc->nr_freepages;
block_end_pfn = block_start_pfn,
block_start_pfn -= pageblock_nr_pages,
isolate_start_pfn = block_start_pfn) {
/* 执行时间过长则睡眠,32个1024页框休眠一下,异步的情况还需要判断运行进程是否需要被调度 */
if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
&& compact_should_abort(cc))
break;
/* 检查block_start_pfn和block_end_pfn,
* 如果没问题,返回block_start_pfn所指的页描述符,也就是pageblock第一页描述符 */
page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
zone);
if (!page)
continue;
/* Check the block is suitable for migration */
/* 判断是否能够用于异步迁移页框
* 判断条件1: 如果处于伙伴系统中,它所代表的这段连续页框的order值必须小于pageblock的order值
* 判断条件2: 此pageblock必须为MIGRATE_MOVABLE或者MIGRATE_CMA类型,而为MIGRATE_RECLAIMABLE类型的pageblock则跳过
*/
if (!suitable_migration_target(page))
continue;
/* 判断本pageblock是否标记了PB_migrate_skip标志 */
if (!isolation_suitable(cc, page))
continue;
/* 按page维度扫描从isolate_start_pfn到block_end_pfn的空闲页框
*
* 并把它们放入到freelist中,返回此pageblock中总共获得的空闲页框数量
* 第一轮扫描可能会跳过,应该第一次isolate_start_pfn是等于zone最后一个页框的
*/
isolate_freepages_block(cc, &isolate_start_pfn,
block_end_pfn, freelist, false);
/*
* 下次循环开始的页框;
* 在isolate_freepages_block中会实时更新isolate_start_pfn;
* 如果isolate_start_pfn < block_end_pfn,则说明本pageblock还未扫描完
*/
cc->free_pfn = (isolate_start_pfn < block_end_pfn) ?
isolate_start_pfn :
block_start_pfn - pageblock_nr_pages;
/* 检查contended,此用于表明是否需要终止 */
if (cc->contended)
break;
}
/* 设置页表项,设置为内核使用 */
map_pages(freelist);
/*
* free_pfn: 扫描freepage的起始pfn;migrate_pfn: 扫描migratable page的起始页
* 如果是由于block_start_pfn < low_pfn导致的循环结束:
* 保证free_pfn不超过migrate_pfn
*/
if (block_start_pfn < low_pfn)
cc->free_pfn = cc->migrate_pfn;
}
isolate_freepages_block
/* 按page维度扫描从start_pfn到end_pfn的空闲页框,一般都是一个pageblock的开始页框ID和结束页框ID
* 并把它们放入到freelist中,返回此pageblock中总共获得的空闲页框数量
* 扫描后会修改start_pfn,传出函数,作为下次扫描起始点
*/
static unsigned long isolate_freepages_block(struct compact_control *cc,
unsigned long *start_pfn,
unsigned long end_pfn,
struct list_head *freelist,
bool strict)
{
int nr_scanned = 0, total_isolated = 0;
struct page *cursor, *valid_page = NULL;
unsigned long flags = 0;
bool locked = false;
unsigned long blockpfn = *start_pfn;
/* 获取起始页框指针 */
cursor = pfn_to_page(blockpfn);
/* Isolate free pages. */
/* 从pageblock的start向end进行扫描 */
for (; blockpfn < end_pfn; blockpfn++, cursor++) {
int isolated, i;
struct page *page = cursor;
/* 执行时间过长则睡眠,32个页框休眠一下,异步的情况还需要判断运行进程是否需要被调度 */
if (!(blockpfn % SWAP_CLUSTER_MAX)
&& compact_unlock_should_abort(&cc->zone->lock, flags,
&locked, cc))
break;
nr_scanned++;
/* 检查此页框号是否正确 */
if (!pfn_valid_within(blockpfn))
goto isolate_fail;
if (!valid_page)
valid_page = page;
/* 检查此页是否在伙伴系统中,不在说明是正在使用的页框,则跳过 */
if (!PageBuddy(page))
goto isolate_fail;
if (!locked) {
locked = compact_trylock_irqsave(&cc->zone->lock,
&flags, cc);
if (!locked)
break;
/* Recheck this is a buddy page under lock */
if (!PageBuddy(page))
goto isolate_fail;
}
/* 将page开始的连续空闲页框拆分为连续的单个页框,返回数量,order值会在page的页描述符中,这里有可能会设置pageblock的类型 */
isolated = split_free_page(page);
/* 更新总共隔离的空闲页框数量;这里可能会导致收集的free pages>需要的 */
total_isolated += isolated;
/* 将isolated数量个单个页框放入freelist中 */
for (i = 0; i < isolated; i++) {
list_add(&page->lru, freelist);
page++;
}
/* 如果有分割出空闲页框,则判断目前已有的空闲页框是否足够
* 如果足够则跳出循环,否则继续
*/
if (isolated) {
cc->nr_freepages += isolated;
if (!strict &&
cc->nr_migratepages <= cc->nr_freepages) {
blockpfn += isolated;
break;
}
blockpfn += isolated - 1;
cursor += isolated - 1;
continue;
}
isolate_fail:
if (strict)
break;
else
continue;
}
trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
nr_scanned, total_isolated);
/* Record how far we have got within the block */
*start_pfn = blockpfn;
/*
* If strict isolation is requested by CMA then check that all the
* pages requested were isolated. If there were any failures, 0 is
* returned and CMA will fail.
*/
if (strict && blockpfn < end_pfn)
total_isolated = 0;
if (locked)
spin_unlock_irqrestore(&cc->zone->lock, flags);
/* Update the pageblock-skip if the whole pageblock was scanned */
/* 扫描完了此pageblock,如果此pageblock中没有隔离出空闲页框,则标记此pageblock为跳过 */
if (blockpfn == end_pfn)
update_pageblock_skip(cc, valid_page, total_isolated, false);
count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
if (total_isolated)
count_compact_events(COMPACTISOLATED, total_isolated);
/* 返回总共获得的空闲页框 */
return total_isolated;
}
compaction_free
/* 当从compaction_alloc()获取一个空闲页框用于移动时,但由于某些原因失败,就要把这个空闲页框重新放回cc->freepages链表中 */
static void compaction_free(struct page *page, unsigned long data)
{
struct compact_control *cc = (struct compact_control *)data;
list_add(&page->lru, &cc->freepages);
cc->nr_freepages++;
}
unmap_and_move
/*
* 将old page的页描述符数据和页内数据移动到new page上;
* 1. new page来源于compaction_alloc()扫描空闲页框获得
* 2. 调用迁移核心函数__unmap_and_move()进行页面数据转移
* 2. 如果迁移成功则将Old page释放到伙伴系统中的每CPU页高速缓存中
* 3. 如果不成功,则根据page类型不同情况处理
*/
static ICE_noinline int unmap_and_move(new_page_t get_new_page,
free_page_t put_new_page,
unsigned long private, struct page *page,
int force, enum migrate_mode mode)
{
int rc = 0;
int *result = NULL;
/* 获取一个空闲页,具体见compaction_alloc() */
struct page *newpage = get_new_page(page, private, &result);
if (!newpage)
return -ENOMEM;
/* 如果页的page_count == 1,说明此页必定是非文件页而且没有进程映射了此页,此页可以直接释放掉 */
if (page_count(page) == 1) {
/* page was freed from under us. So we are done. */
goto out;
}
/* 此页是透明大页则跳过 */
if (unlikely(PageTransHuge(page)))
if (unlikely(split_huge_page(page)))
goto out;
/* 将page取消映射,并把page的数据复制到newpage中 */
rc = __unmap_and_move(page, newpage, force, mode);
out:
if (rc != -EAGAIN) {
/* 迁移成功,将旧的page放回到LRU链表中,因为旧的page已经是一个空闲的page了 */
list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
/* 在这里会把新页放回到原来的地方 */
putback_lru_page(page);
}
/* 迁移不成功 */
if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
ClearPageSwapBacked(newpage);
/* 将新页放回到cc的空闲页链表中,具体见compaction_free() */
put_new_page(newpage, private);
} else if (unlikely(__is_movable_balloon_page(newpage))) {
/* drop our reference, page already in the balloon */
/* 如果新页是属于balloon使用的页,则放回到相应地方 */
put_page(newpage);
} else
/* 在这里会把新页放回到原来的地方 */
putback_lru_page(newpage);
if (result) {
if (rc)
*result = rc;
else
*result = page_to_nid(newpage);
}
return rc;
}
附录
compact_control
struct compact_control {
/* 扫描到的空闲页的页的链表 */
struct list_head freepages; /* List of free pages to migrate to */
/* 扫描到的可移动的页的链表 */
struct list_head migratepages; /* List of pages being migrated */
/* 空闲页链表中的页数量 */
unsigned long nr_freepages; /* Number of isolated free pages */
/* 可移动页链表中的页数量 */
unsigned long nr_migratepages; /* Number of pages to migrate */
/* 空闲页框扫描所在页框号 */
unsigned long free_pfn; /* isolate_freepages search base */
/* 可移动页框扫描所在页框号 */
unsigned long migrate_pfn; /* isolate_migratepages search base */
/* 内存压缩使用的模式: 同步,轻同步,异步 */
enum migrate_mode mode; /* Async or sync migration mode */
/* 是否忽略pageblock的PB_migrate_skip标志对需要跳过的pageblock进行扫描 ,并且也不会对pageblock设置跳过
* 只有两种情况会使用
* 1.调用alloc_contig_range()尝试分配一段指定了开始页框号和结束页框号的连续页框时;
* 2.通过写入1到sysfs中的/vm/compact_memory文件手动实现同步内存压缩。
*/
bool ignore_skip_hint; /* Scan blocks even if marked skip */
/* 本次内存压缩是否隔离到了空闲页框,会影响zone的空闲页扫描起始位置 */
bool finished_update_free; /* True when the zone cached pfns are
* no longer being updated
*/
/* 本次内存压缩是否隔离到了可移动页框,会影响zone的可移动页扫描起始位置 */
bool finished_update_migrate;
/* 申请内存时需要的页框的order值 */
int order; /* order a direct compactor needs */
const gfp_t gfp_mask; /* gfp mask of a direct compactor */
/* 扫描的管理区 */
struct zone *zone;
/* 保存结果,比如异步模式下是否因为需要阻塞而结束了本次内存压缩 */
int contended; /* Signal need_sched() or lock
* contention detected during
* compaction
*/
};
### migrate_mode
``` c++
enum migrate_mode {
/*
* 异步模式:遇到阻塞和需要调度的时候直接返回,返回前会把隔离出来的页框放回去;不会增加推迟计数器阀值
* 仅处理MIGRATE_MOVABLE、MIGRATE_CMA类型的页
*/
MIGRATE_ASYNC,
/*
* 轻同步模式:在异步压缩无法解决问题时,有可能进行同步模式;允许大多数的阻塞操作;会增加推迟计数器阀值
* 但不会阻塞等待正在回写的页结束,也不会对脏页进行回写
* 仅处理MIGRATE_MOVABLE、MIGRATE_CMA、MIRGATE_RECLAIMABLE类型的页
*/
MIGRATE_SYNC_LIGHT,
/*
* 同步模式:忽略pageblock的PB_migrate_skip, 会对隔离出来需要移动的脏页进行回写操作,等待回写结束;会增加推迟计数器阀值
* 有三种情况触发:
* 1. cma分配
* 2. 通过alloc_contig_range()尝试分配一段指定了开始页框号和结束页框号的连续页框时
* 3. 将1写入sysfs中的vm/compact_memory
*/
MIGRATE_SYNC,
};