ARM64内存管理二十一：内存回收入口

前沿

往篇回顾

在上两篇中内存压缩算法主要流程内存压缩算法之数据同步基本说明白了内存压缩的基本过程；

在内存压缩算法主要流程中，主要是分析了如何从zone中隔离出合适的migrate pages以及相应的从zone中隔离出足够数量的free pages,然后调用unmap_and_move函数进行迁移页面的数据同步
在内存压缩算法之数据同步中，负责进行页面迁移，过程中涉及细节处理较多，大部分内容其实并不要细看，了解大致内容即可
1. 在__unmap_and_move函数对单个page进行迁移中，首先会将old page置位旧业的PG_locked
2. 新建一个swap类型的页表项数据，映射到old page上
3. try_to_unmap中通过RMAP将new swap entry数据写入到所有映射了此页的进程页表项中(此时开始，再无法访问Old Page, 如果访问就会被加入一个等待队列中)
4. move_to_new_page中将old page数据和参数复制到new page中
5. 新建一个常规的页表项数据，用于映射new page
6. 再次通过RMAP将新的常规页表项数据写入到所有映射了旧业的进程页表项中
7. 清除旧页的PG_lock标志，唤醒所有访问此页的进程

内存回收主要内容(lru链表中页的内存回收)

回收的页都是非活动匿名页lru链表或者非活动文件页lru链表上的页，包括: 进程堆、栈、匿名mmap共享内存映射、shmem共享内存映射使用的页、映射磁盘文件页
根本方法：将page->_count降到0；然后就根据不同的情况分别处理：
1. 干净页，并且映射了磁盘文件的页，直接回收
2. 没有进程映射，并且没有映射磁盘文件的页，直接回收
3. 文件页：脏页(PG_dirty置位)，回写到对应磁盘文件中，然后回收
4. 匿名页：有进程映射，并且没有映射磁盘文件的页，回写到swap分区中，然后回收
触发路径：但无论哪种路径，最后的回收入口函数均为shrink_zone,都是着眼于node维度
1. 被动式：由内核线程kswapd直接调用内存回收逻辑进行回收，对每个zone从低到高的方向扫描
2. 主动式：在从伙伴系统申请内存(__alloc_pages)时进入分配逻辑时进行内存回收，从高到低扫描

本篇主要内容

内存回收分为两篇来描述，本篇主要说明触发内存回收的不同入口，以及他们在启动内存回收前的处理措施

代码分析

由kswapd触发内核内存回收路径

kswapd_init

setup_arch()->paging_init()->bootmem_init()->zone_sizes_init()->free_area_init_node()->free_area_init_core()->init_waitqueue_head(&pgdat->kswapd_wait)

static int __init kswapd_init(void)
{
	int nid;

	swap_setup();
	for_each_node_state(nid, N_MEMORY)
 		kswapd_run(nid);
	hotcpu_notifier(cpu_callback, 0);
	return 0;
}

kswapd_run

int kswapd_run(int nid)
{
	/* 获取内存节点对应的pg_data_t指针 */
	pg_data_t *pgdat = NODE_DATA(nid);
	int ret = 0;

	if (pgdat->kswapd)
		return 0;
	/* kswapd函数，pgdat作为参数传入kswapd函数 */
	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
	if (IS_ERR(pgdat->kswapd)) {
		/* failure at boot is fatal */
		BUG_ON(system_state == SYSTEM_BOOTING);
		pr_err("Failed to start kswapd on node %d\n", nid);
		ret = PTR_ERR(pgdat->kswapd);
		pgdat->kswapd = NULL;
	}
	return ret;
}

kswapd

/*
 * kswapd线程每隔一段时间就自动平衡空闲页面，从低到高扫描每个zone，使每个zone的空闲页面维持在一定水平
 * 
 */
static int kswapd(void *p)
{
	unsigned long order, new_order;
	unsigned balanced_order;
	int classzone_idx, new_classzone_idx;
	int balanced_classzone_idx;
	/* 每个node一个kswapd线程，对应一个pg_data_t结构体 */
	pg_data_t *pgdat = (pg_data_t*)p;
	struct task_struct *tsk = current;

	struct reclaim_state reclaim_state = {
		.reclaimed_slab = 0,
	};
	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

	lockdep_set_current_reclaim_state(GFP_KERNEL);

	if (!cpumask_empty(cpumask))
		set_cpus_allowed_ptr(tsk, cpumask);
	current->reclaim_state = &reclaim_state;

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
	set_freezable();

	order = new_order = 0;
	balanced_order = 0;
	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
	balanced_classzone_idx = classzone_idx;
	for ( ; ; ) {
		bool ret;

		/*
		 * If the last balance_pgdat was unsuccessful it's unlikely a
		 * new request of a similar or harder type will succeed soon
		 * so consider going to sleep on the basis we reclaimed at
		 */
		if (balanced_classzone_idx >= new_classzone_idx &&
					balanced_order == new_order) {
			new_order = pgdat->kswapd_max_order;
			new_classzone_idx = pgdat->classzone_idx;
			pgdat->kswapd_max_order =  0;
			pgdat->classzone_idx = pgdat->nr_zones - 1;
		}

		if (order < new_order || classzone_idx > new_classzone_idx) {
			/*
			 * Don't sleep if someone wants a larger 'order'
			 * allocation or has tigher zone constraints
			 */
			order = new_order;
			classzone_idx = new_classzone_idx;
		} else {
			/* 在此处睡眠，等待wakeup_kswapd来唤醒 */
			kswapd_try_to_sleep(pgdat, balanced_order,
						balanced_classzone_idx);
			/* pgdata->kswapd_max_order和pgdat->classzone_id已经在wakeup_kswapd中进行了更新 */
			order = pgdat->kswapd_max_order;
			classzone_idx = pgdat->classzone_idx;
			new_order = order;
			new_classzone_idx = classzone_idx;
			pgdat->kswapd_max_order = 0;
			pgdat->classzone_idx = pgdat->nr_zones - 1;
		}
		/* 内核进程冻结技术，详细介绍参考文献
		 * 1. 内核为每个进程在适当的时候调用try_to_freeze来设置PF_FREEZE标志，
		 * 2. 当系统要suspend的时候，系统那边会调用freeze_process函数来将所有可冷冻的任务的TIF_FREEZE标志置位
		 * 3. 然后所有有TIF_FREEZE标志的进程会睡眠
		 */
		ret = try_to_freeze();
		/* 判断内核线程是否应该停止，当有人调用kthread_stop()时，此处为真 */
		if (kthread_should_stop())
			break;

		/*
		 * We can speed up thawing tasks if we don't call balance_pgdat
		 * after returning from the refrigerator
		 */
		if (!ret) {
			trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
			balanced_classzone_idx = classzone_idx;
			/* 启动内存回收具体任务 */
			balanced_order = balance_pgdat(pgdat, order,
						&balanced_classzone_idx);
		}
	}
	/* 当kthread被停止，修改进程标识，具体什么意思？ */
	tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
	current->reclaim_state = NULL;
	lockdep_clear_current_reclaim_state();

	return 0;
}

balance_pgdat

/* 对node中zone，从低往高进行内存释放 
 * 以默认优先级12，执行多次遍历node内存释放；只有当在当前优先优先级无法回收到内存情况下，才进行sc->priority--
 * 1. 从高到底找出第一不平衡的zone
 * 2. 循环对第一个不平衡的其下面的zone进行内存回收
		检测node是否需要内存压缩：只要有一个zone能以low level分配order页框则不需要
		mem_cgroup_soft_limit_reclaim：对zone内memcg中超过soft_limit_in_bytes的进程内存进行软回收
		kswapd_shrink_zone->shrink_zone: 进行内存回收(如果内存平衡则不需要进行回收)，回收不成功则降低回收优先级
		pfmemalloc_watermark_ok判断OK，则唤醒pfmemalloc_wait队列中的等待进程
		如果要进行内存压缩的话，则启动
 * 本node内存balance(pgdat_balanced) || sc->priority==0 停止对node循环内存回收
*/
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
							int *classzone_idx)
{
	int i;
	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.order = order,
/* 优先级使用默认为的12，会执行多次遍历node(并不是node中的所有zone)，但并不会每次遍历都进行sc->priority--，当能够回收的内存时，才进行sc->priority-- */
		.priority = DEF_PRIORITY,
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.may_swap = 1,
	};
	count_vm_event(PAGEOUTRUN);

	do {
		unsigned long nr_attempted = 0;
		bool raise_priority = true;
		/* 如果要求收集的内存order>0,则在回收内存后，启动内存压缩，整理碎片 */
		bool pgdat_needs_compaction = (order > 0);

		sc.nr_reclaimed = 0;

		/*
		 * 确定内存释放区的zone id上限：
		 * 1. 通过age_active_anon检查该zone lru链表中inactive_list中page是否足够
		 * 2. 为什么上限是第一个buffer_header，或不需要进行内存平衡的区域？
		 */
		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
			struct zone *zone = pgdat->node_zones + i;
			/* 如果zone中没有page，则跳过本zone */
			if (!populated_zone(zone))
				continue;
			/* ？？ */
			if (sc.priority != DEF_PRIORITY &&
			    !zone_reclaimable(zone))
				continue;

			/* 通过inactive_anon_is_low()判断本zone的lru inactive链表中页面数量是否足够
			 * 如果不够，则使用shrink_active_list()释放更多active页面到inactive链表
			 * shrink_active_list、shrink_zone等更多的收缩流程在下一篇中介绍
			 */
			age_active_anon(zone, &sc);

			/*
			 * If the number of buffer_heads in the machine
			 * exceeds the maximum allowed level and this node
			 * has a highmem zone, force kswapd to reclaim from
			 * it to relieve lowmem pressure.
			 */
			if (buffer_heads_over_limit && is_highmem_idx(i)) {
				end_zone = i;
				break;
			}

			if (!zone_balanced(zone, order, 0, 0)) {
				/* 找到第一个不平衡的zone，平衡条件
				 * 1.  此zone分配页框后剩余的页框数量 > 此zone的high阀值 + 此zone保留的页框数量
				 * 2. 申请的order阶内存，可以通过内存压缩达到
				 */
				end_zone = i;
				break;
			} else {
				/*
				 * If balanced, clear the dirty and congested
				 * flags
				 */
				clear_bit(ZONE_CONGESTED, &zone->flags);
				clear_bit(ZONE_DIRTY, &zone->flags);
			}
		}

		if (i < 0)
			goto out;

		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;

			if (!populated_zone(zone))
				continue;

			/* 如果内存回收后，足够申请order阶的新内存，则不启动内存压缩
			 * 所有需要扫描的低端zone只有有一个满足上述条件即可
			 */
			if (pgdat_needs_compaction &&
					zone_watermark_ok(zone, order,
						low_wmark_pages(zone),
						*classzone_idx, 0))
				pgdat_needs_compaction = false;
		}

		/*
		 * If we're getting trouble reclaiming, start doing writepage
		 * even in laptop mode.
		 */
		/* 优先级越低，则要求回收的内存越多 */
		if (sc.priority < DEF_PRIORITY - 2)
			sc.may_writepage = 1;

		/*
		 * 接下来从低到高扫描zone，停止在end_zone上
		 */
		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;

			if (!populated_zone(zone))
				continue;

			if (sc.priority != DEF_PRIORITY &&
			    !zone_reclaimable(zone))
				continue;

			sc.nr_scanned = 0;

			nr_soft_scanned = 0;
			/*
			 * 回收该zone上超过soft_limit最多的mem_cgroup在该zone上mem_cgroup_per_zone对应的lru链表
			 * 直接回收、间接回收都会调用该函数，是调用shrink_zone()前的必备动作
			 * 通过全局mem group结构体soft_limit_tree->rb_tree_per_node->rb_tree_per_zone->rb_root找到mem_cgroup_per_zone
			 */
			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
							order, sc.gfp_mask,
							&nr_soft_scanned);
			sc.nr_reclaimed += nr_soft_reclaimed;

			/*
			 * There should be no need to raise the scanning
			 * priority if enough pages are already being scanned
			 * that that high watermark would be met at 100%
			 * efficiency.
			 */
			 /* 如果回收成功则不需要降低回收优先级 */
			if (kswapd_shrink_zone(zone, end_zone,
					       &sc, &nr_attempted))
				raise_priority = false;
		}

		/*
		 * If the low watermark is met there is no need for processes
		 * to be throttled on pfmemalloc_wait as they should not be
		 * able to safely make forward progress. Wake them
		 */
		/* 执行完一轮内存回收后，唤醒挂在pfmemalloc_wait队列中等待进行内存分配的任务
		 * 1. 通过waitqueue_active判断pfmemalloc_wait队列中是否有成员
		 * 2. pfmemalloc_watermark_ok判断本zone是否有足够空闲内存用于分配，如果没有会唤醒kswapd线程回收内存
		 */
		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
				pfmemalloc_watermark_ok(pgdat))
			wake_up_all(&pgdat->pfmemalloc_wait);

		/*
		 * Fragmentation may mean that the system cannot be rebalanced
		 * for high-order allocations in all zones. If twice the
		 * allocation size has been reclaimed and the zones are still
		 * not balanced then recheck the watermarks at order-0 to
		 * prevent kswapd reclaiming excessively. Assume that a
		 * process requested a high-order can direct reclaim/compact.
		 */
		if (order && sc.nr_reclaimed >= 2UL << order)
			order = sc.order = 0;

		/* Check if kswapd should be suspending */
		if (try_to_freeze() || kthread_should_stop())
			break;

		/* 如果需要内存压缩则启动内存压缩 */
		if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
			compact_pgdat(pgdat, order);

		/*
		 * Raise priority if scanning rate is too low or there was no
		 * progress in reclaiming pages
		 */
		if (raise_priority || !sc.nr_reclaimed)
			sc.priority--;
	} while (sc.priority >= 1 &&
		 !pgdat_balanced(pgdat, order, *classzone_idx));

out:
	/*
	 * Return the order we were reclaiming at so prepare_kswapd_sleep()
	 * makes a decision on the order we were last reclaiming at. However,
	 * if another caller entered the allocator slow path while kswapd
	 * was awake, order will remain at the higher level
	 */
	/* 记录本次均衡操作的最终zone id */
	*classzone_idx = end_zone;
	return order;
}

kswapd_shrink_zone

/*
 * 对zone执行内存回收：
 * 1. 确定一个zone需要回收的内存数量
 * 2. 检测目前zone的内存压力，尽量少执行内存回收
 * 3. 通过shrink_zone()实际执行内存回收
 */
static bool kswapd_shrink_zone(struct zone *zone,
			       int classzone_idx,
			       struct scan_control *sc,
			       unsigned long *nr_attempted)
{
	int testorder = sc->order;
	unsigned long balance_gap;
	bool lowmem_pressure;

	/* 确定本zone需要回收的页框数量；目标是要使水位回到high levle. */
	sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));

	/*
	 * Kswapd reclaims only single pages with compaction enabled. Trying
	 * too hard to reclaim until contiguous free pages have become
	 * available can hurt performance by evicting too much useful data
	 * from memory. Do not reclaim more than needed for compaction.
	 */
	 /* 如果可以内存压缩可以解决需求order，只要回收单个页面即可.避免连续回收页面对系统性能造成影响 */
	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
			compaction_suitable(zone, sc->order, 0, classzone_idx)
							!= COMPACT_SKIPPED)
		testorder = 0;

	/*
	 * We put equal pressure on every zone, unless one zone has way too
	 * many pages free already. The "too many pages" is defined as the
	 * high wmark plus a "gap" where the gap is either the low
	 * watermark or 1% of the zone, whichever is smaller.
	 */
	balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
			zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));

	/*
	 * If there is no low memory pressure or the zone is balanced then no
	 * reclaim is necessary
	 */
	lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
	/* 如果低端内存压力不大，则不需要进行内存回收 */
	if (!lowmem_pressure && zone_balanced(zone, testorder,
						balance_gap, classzone_idx))
		return true;
	/* 内存回收入口函数 */
	shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);

	/* 计算尝试回收的各个zone内存数量总和 */
	*nr_attempted += sc->nr_to_reclaim;

	clear_bit(ZONE_WRITEBACK, &zone->flags);

	/*
	 * If a zone reaches its high watermark, consider it to be no longer
	 * congested. It's possible there are dirty pages backed by congested
	 * BDIs but as pressure is relieved, speculatively avoid congestion
	 * waits.
	 */
	if (zone_reclaimable(zone) &&
	    zone_balanced(zone, testorder, 0, classzone_idx)) {
		clear_bit(ZONE_CONGESTED, &zone->flags);
		clear_bit(ZONE_DIRTY, &zone->flags);
	}

	return sc->nr_scanned >= sc->nr_to_reclaim;
}

总结

开始标志：zonelist的所有zone都不能通过min阀值获取到页框时，会唤醒所有node的kswapd内核线程，然后在kswapd中会对不满足 zone分配页框后剩余的页框数量 > 此zone的high阀值 + 此zone保留的页框数量的zone进行内存回收
结束标志：node中所有zone都满足 zone分配页框后剩余的页框数量 > 此zone的high阀值 + 此zone保留的页框数量(可能会进行多次shrink_zone()的调用)
回收对象：超过所在memcg的soft_limit_in_bytes的进程的内存、zone的可回收页框、slab

由__alloc_pages()触发内核内存回收路径

快速分配触发内存回收路径

__alloc_pages()->__alloc_pages_nodemask()->get_page_from_freelist()->zone_reclaim->__zone_reclaim()->shrink_zone()

__zone_reclaim

/*
 * 快速内存分配失败后触发本函数进行内存回收；但是并不能保证回收后必然可以分配order阶内存块？
 * 1. 根据目前zone_reclaim_mode确定内存回收sc控制结构体
 * 2. zone_pagecache_reclaimable:根据zone_reclaim_mdoe状态计算可供回收的页面数量
 * 3. 对不同priority级别进行循环内存回收，直到回收到足够的页面
 */
static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
	/* Minimum pages needed in order to stay on node */
	const unsigned long nr_pages = 1 << order;
	struct task_struct *p = current;
	struct reclaim_state reclaim_state;
	struct scan_control sc = {
/* 最少一次回收SWAP_CLUSTER_MAX，最多一次回收1 << order个，应该是1024个 */
		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
/* 当前进程明确禁止分配内存的IO操作(禁止__GFP_IO，__GFP_FS标志)，那么则清除__GFP_IO，__GFP_FS标志，表示不进行IO操作 */
		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
		.order = order,
/* 优先级为4，默认是12，会比12一次扫描更多lru链表中的页框，而且扫描次数会比优先级为12的少，并且如果回收过程中回收到了足够页框，就会返回 */
		.priority = ZONE_RECLAIM_PRIORITY,
		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),/* 根据zone_reclaim_mdoe判断，允许file页回写磁盘操作 */
		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),/* 根据zone_reclaim_mdoe判断，允许unmap操作 */
		.may_swap = 1,/* 允许匿名页swap */
	};

	cond_resched();
	/*
	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
	 * and we also need to be able to write out pages for RECLAIM_WRITE
	 * and RECLAIM_SWAP.
	 */
	p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
	lockdep_set_current_reclaim_state(gfp_mask);
	reclaim_state.reclaimed_slab = 0;
	p->reclaim_state = &reclaim_state;
	/*
	 * 根据目前的zone_reclaim_mode计算有多少潜在的page可以被回收
	 * 0x0:只会对最有zone附近的几个需要进行内存回收的zone进行内存回收；其余的会对所有zone回收
	 * 0x1:开启zone的内存回收
	 * 0x2:允许回写
	 * 0x4:允许unmap操作
	 */
	if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
		/*
		 * Free memory by calling shrink zone with increasing
		 * priorities until we have enough memory freed.
		 */
		do {
			/* 内存回收入口函数，对本zone进行循环内存回收，直到收集到足够的页面 */
			shrink_zone(zone, &sc, true);
/* 最多进行4次调用shrink_zone()，并且每次调用shrink_zone()扫描的页框会越来越多，直到回收到了1<<order个页框为止 */
		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
	}

	p->reclaim_state = NULL;
	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
	lockdep_clear_current_reclaim_state();
	return sc.nr_reclaimed >= nr_pages;
}

总结

开始标志是：此zone分配后剩余的页框数量 > 此zone的阀值 + 此zone的保留页框数量(阀值可能是:min,low,high其中一个)
结束标志是：对此zone回收到了本次分配时需要的页框数量或者 sc->priority降为0(可能会进行多次shrink_zone()的调用)
回收对象：zone的可回收页框、slab

慢速分配触发内存回收路径

__alloc_pages()->__alloc_pages_nodemask()->__alloc_pages_slowpath()->__alloc_pages_direct_reclaim()->__perform_reclaim()->try_to_free_pages()->do_try_to_free_pages()->shrink_zone()

try_to_free_pages

/* 初始化内存回收控制结构体，并检测当前进程是否需要进入pgdat->pfmemalloc_wait等待队列
 * 1. 初始化sc，回收数量目标是SWAP_CLUSTER_MAX？
 * 2. throttle_direct_reclaim：判断当前内存申请进程是否要进入等待队列
 * 3. do_try_to_free_pages：触发内存回收；直到回收到目标数量page或内存压缩可以解决问题
 */
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
				gfp_t gfp_mask, nodemask_t *nodemask)
{
	unsigned long nr_reclaimed;
	struct scan_control sc = {
		.nr_to_reclaim = SWAP_CLUSTER_MAX,/* 只打算回收的页框为32个? */
		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
		.order = order,
		.nodemask = nodemask,
		.priority = DEF_PRIORITY,
		/* 与/proc/sys/vm/laptop_mode文件有关
         * laptop_mode为0，则允许进行回写操作，即使允许回写，直接内存回收也不能对脏文件页进行回写
         * 不过允许回写时，可以对非文件页进行回写
         */
		.may_writepage = !laptop_mode,
		.may_unmap = 1,/* 允许进行unmap操作 */
		.may_swap = 1,/* 允许进行非文件页的操作 */
	};

	/* 判断node是否平衡，如果平衡，则继续进行内存回收
	 * 如果不平衡，如果此node的kswapd没有被唤醒，则唤醒，并且这里唤醒kswapd只会对ZONE_NORMAL以下的zone进行内存回收
	 * 此node的ZONE_DMA和ZONE_NORMAL的总共空闲页框数量 > 此node的ZONE_DMA和ZONE_NORMAL的平均min阀值数量
	 */
	if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
		return 1;

	trace_mm_vmscan_direct_reclaim_begin(order,
				sc.may_writepage,
				gfp_mask);

	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);

	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);

	return nr_reclaimed;
}

throttle_direct_reclaim

/*
 * 判断当前进程是否加入到pgdat->pfmemalloc_wait等待队列中
 * 1. 如果为内核进程，则返回
 * 2. 如果当前进程已接收到kill信号，则返回
 * 3. 判断最优node(最优zone所属的node)是否pfmemalloc balance，否则加入pgdat->pfmemalloc_wait等待队列中
		a. 有__GFP_FS: 加入队列没有超时限制，并且进程状态为TASK_KILLABLE
		b. 无__GFP_FS: 超时时间为1s，且状态为TASK_INTERRUPTABLE
 */
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
					nodemask_t *nodemask)
{
	struct zoneref *z;
	struct zone *zone;
	pg_data_t *pgdat = NULL;

	 /* 如果标记了PF_KTHREAD，表示此进程是一个内核线程，则不会往下执行 */
	if (current->flags & PF_KTHREAD)
		goto out;

	/*
	 * If a fatal signal is pending, this process should not throttle.
	 * It should return quickly so it can exit and free its memory
	 */
	 /* 此进程已经接收到了kill信号，准备要被杀掉了 */
	if (fatal_signal_pending(current))
		goto out;

	/* 遍历zonelist，但是里面只会在获取到第一个pgdat时就跳出
	 * 但不能是zone_highmem，因为有可能pfmemalloc队列中等待的任务有GFP_KERNEL分配标志，不能从zone_highmem分配内存
	 */
	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					gfp_zone(gfp_mask), nodemask) {
		/* 只遍历ZONE_NORMAL和ZONE_DMA区 */
		if (zone_idx(zone) > ZONE_NORMAL)
			continue;

		/* 获取zone对应的node */
		pgdat = zone->zone_pgdat;
		/* 判断node是否平衡，如果平衡，则返回真
         * 如果不平衡，如果此node的kswapd没有被唤醒，则唤醒，并且这里唤醒kswapd只会对ZONE_NORMAL以下的zone进行内存回收
         * node是否平衡的判断标准是:
         * 此node的ZONE_DMA和ZONE_NORMAL的总共空闲页框数量 > 此node的ZONE_DMA和ZONE_NORMAL的平均min阀值数量
         */
		if (pfmemalloc_watermark_ok(pgdat))
			goto out;
		break;
	}

	/* If no zone was usable by the allocation flags then do not throttle */
	if (!pgdat)
		goto out;

	/* Account for the throttling */
	count_vm_event(PGSCAN_DIRECT_THROTTLE);

	/* 如果分配标志禁止了文件系统操作，则将要进行内存回收的进程设置为TASK_INTERRUPTIBLE状态，然后加入到node的pgdat->pfmemalloc_wait，并且会设置超时时间为1s 
	 * 1.pfmemalloc_watermark_ok(pgdat)为真时被唤醒，而1s没超时，返回剩余timeout(jiffies)
	 * 2.睡眠超过1s时会唤醒，而pfmemalloc_watermark_ok(pgdat)此时为真，返回1
	 * 3.睡眠超过1s时会唤醒，而pfmemalloc_watermark_ok(pgdat)此时为假，返回0
	 * 4.接收到信号被唤醒，返回-ERESTARTSYS
	 */
	if (!(gfp_mask & __GFP_FS)) {
		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
			pfmemalloc_watermark_ok(pgdat), HZ);

		goto check_pending;
	}

    /* 如果分配标志没有禁止了文件系统操作
	 * 将本进程加入到node的pgdat->pfmemalloc_wait，并设置为TASK_KILLABLE状态，表示允许 TASK_UNINTERRUPTIBLE 响应致命信号的状态 
     * 这些进程在两种情况下被唤醒
     * 1.pfmemalloc_watermark_ok(pgdat)为真时
     * 2.接收到致命信号时
     */
	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
		pfmemalloc_watermark_ok(pgdat));

check_pending:
    /* 如果加入到了pgdat->pfmemalloc_wait后被唤醒，就会执行到这
     * 唤醒后再次检查当前进程是否接受到了kill信号，准备退出 
	 */
	if (fatal_signal_pending(current))
		return true;

out:
	return false;
}

do_try_to_free_pages

/*
 * 尝试对所有优先级别进行内存回收，直到以下两个条件或成立
 * 1. 已经回收到足够的页面
 * 2. 回收到的页面已经足够，只要启动内存压缩即可满足要求
 * 3. 期间如果累计扫描过的页面较多，会触发所有bdi设备启动flush回写线程
 */
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
					  struct scan_control *sc)
{
	int initial_priority = sc->priority;
	unsigned long total_scanned = 0;
	unsigned long writeback_threshold;
	bool zones_reclaimable;
retry:
	delayacct_freepages_start();

	if (global_reclaim(sc))
		count_vm_event(ALLOCSTALL);
	/*
	 * 尝试对所有优先级别进行内存回收，直到以下两个条件或成立
	 * 1. 已经回收到足够的页面
	 * 2. 回收到的页面已经足够，只要启动内存压缩即可满足要求
	 */
	do {
		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
				sc->priority);
		sc->nr_scanned = 0;
		zones_reclaimable = shrink_zones(zonelist, sc);

		total_scanned += sc->nr_scanned;
		/* 回收到足够的page，则退出 */
		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
			break;

		if (sc->compaction_ready)
			break;

		/*
		 * If we're getting trouble reclaiming, start doing
		 * writepage even in laptop mode.
		 */
		if (sc->priority < DEF_PRIORITY - 2)
			sc->may_writepage = 1;

		/*
		 * 如果扫描过的page数量>需要回收页面数量*1.5，还没有完成回收任务，则触发强制回写dirty page到磁盘
		 * 在default_bdi_init()中会初始化两个线程 sysnc_supers(周期性同步所有superblocks), bdi_default(在必要时create,start,stop flusher线程)
		 * 系统每新增一个bdi设备,会通过bdi_register()注册到全局bdi_list链表中，每个bdi设备都会有自己的flusher线程，用来回写本设备的脏页
		 */
		writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
		if (total_scanned > writeback_threshold) {
			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
						WB_REASON_TRY_TO_FREE_PAGES);
			sc->may_writepage = 1;
		}

	} while (--sc->priority >= 0);

	delayacct_freepages_end();
	/* 如果已经回收到page，则返回成功回收的page num */
	if (sc->nr_reclaimed)
		return sc->nr_reclaimed;

	/* Aborted reclaim to try compaction? don't OOM, then */
	/* ready则表示已经回收到足够页面，但需要内存压缩来整理碎片才能满足内存分配要求 */
	if (sc->compaction_ready)
		return 1;

	/* Untapped cgroup reserves?  Don't OOM, retry. */
	if (!sc->may_thrash) {
		sc->priority = initial_priority;
		sc->may_thrash = 1;
		goto retry;
	}

	/* Any of the zones still reclaimable?  Don't OOM. */
	if (zones_reclaimable)
		return 1;

	return 0;
}

shrink_zones

/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
 * Because:
 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
 *    allocation or
 * b) The target zone may be at high_wmark_pages(zone) but the lower zones
 *    must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
 *    zone defense algorithm.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 *
 * Returns true if a zone was reclaimable.
 */
static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
	struct zoneref *z;
	struct zone *zone;
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;
	gfp_t orig_mask;
	enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
	bool reclaimable = false;

	/*
	 * If the number of buffer_heads in the machine exceeds the maximum
	 * allowed level, force direct reclaim to scan the highmem zone as
	 * highmem pages could be pinning lowmem pages storing buffer_heads
	 */
	orig_mask = sc->gfp_mask;
	if (buffer_heads_over_limit)
		sc->gfp_mask |= __GFP_HIGHMEM;

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					requested_highidx, sc->nodemask) {
		enum zone_type classzone_idx;

		if (!populated_zone(zone))
			continue;

		classzone_idx = requested_highidx;
		while (!populated_zone(zone->zone_pgdat->node_zones +
							classzone_idx))
			classzone_idx--;

		/* 判断是zone级别的还是mem group级别的内存回收 */
		if (global_reclaim(sc)) {
			if (!cpuset_zone_allowed(zone,
						 GFP_KERNEL | __GFP_HARDWALL))
				continue;

			if (sc->priority != DEF_PRIORITY &&
			    !zone_reclaimable(zone))
				continue;	/* Let kswapd poll it */

			/*
			 * If we already have plenty of memory free for
			 * compaction in this zone, don't free any more.
			 * Even though compaction is invoked for any
			 * non-zero order, only frequent costly order
			 * reclamation is disruptive enough to become a
			 * noticeable problem, like transparent huge
			 * page allocations.
			 */
			/* 
			 * 检测此zone目前是否能通过内存压缩满足内存分配要求：
			 * 1. 通过内存压缩能满足order阶内存的分配
			 * 2. 不用内存压缩也能满足order阶内存的分配
			 */
			if (IS_ENABLED(CONFIG_COMPACTION) &&
			    sc->order > PAGE_ALLOC_COSTLY_ORDER &&
			    zonelist_zone_idx(z) <= requested_highidx &&
			    compaction_ready(zone, sc->order)) {
				sc->compaction_ready = true;
				continue;
			}

			/*
			 * 回收该zone上超过soft_limit最多的mem_cgroup在该zone上mem_cgroup_per_zone对应的lru链表
			 * 直接回收、间接回收都会调用该函数，是调用shrink_zone()前的必备动作
			 * 通过全局mem group结构体soft_limit_tree->rb_tree_per_node->rb_tree_per_zone->rb_root找到mem_cgroup_per_zone
			 */
			nr_soft_scanned = 0;
			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
						sc->order, sc->gfp_mask,
						&nr_soft_scanned);
			sc->nr_reclaimed += nr_soft_reclaimed;
			sc->nr_scanned += nr_soft_scanned;
			if (nr_soft_reclaimed)
				reclaimable = true;
			/* need some check for avoid more shrink_zone() */
		}
		/* 如果是zone级别的内存回收，则直接调用shrink_zone(); 只要回收到了内存就返回true */
		if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx))
			reclaimable = true;
		
		if (global_reclaim(sc) &&
		    !reclaimable && zone_reclaimable(zone))
			reclaimable = true;
	}

	/*
	 * Restore to original mask to avoid the impact on the caller if we
	 * promoted it to __GFP_HIGHMEM.
	 */
	sc->gfp_mask = orig_mask;

	return reclaimable;
}

总结

慢速分配中，首先唤醒所有node节点的kswap内核线程
然后调用get_page_from_freelist()尝试用min阀值从zonelist的zone中获取连续页框
如果失败则对zonelist的zone进行异步压缩，然后调用get_page_from_freelist再次以min阀值尝试分配内存
如果还是失败则进入内存回收__alloc_pages_direct_reclaim
在进行直接内存回收时，进程是有可能加入到node的pgdat->pfmemalloc_wait这个等待队列中，当kswapd进行内存回收后如果node空闲内存达到平衡，那么就会唤醒pgdat->pfmemalloc_wait中的进程
开始标志：进程申请内存时，zonelist的所有zone都不能通过min阀值获取到页框时
结束标志：回收到32个页框，或者sc->priority降到0，或者空闲页框足够进行内存压缩了(可能会进行多次shrink_zone()的调用)
回收对象：超过所在memcg的soft_limit_in_bytes的进程的内存、zone的可回收页框、slab

附录

PG flag说明

PG_lru：表示页在lru链表中
PG_referenced: 表示页最近被访问(只有文件页使用)
PG_dirty：页为脏页，文件页被修改，以及非文件页加入到swap cache后，就会被标记为脏页。在此页回写前会被清除，但是回写失败时又会被置位
PG_active：页为活动页，配合PG_lru就可以得出页是处于非活动页lru链表还是活动页lru链表
PG_private：页描述符中的page->private保存有数据
PG_writeback：页正在进行回写
PG_swapbacked：此页可写入swap分区，一般用于表示此页是非文件页
PG_swapcache：页已经加入到了swap cache中(只有非文件页使用)
PG_reclaim：页正在进行回收，只有在内存回收时才会对需要回收的页进行此标记
PG_mlocked：页被锁在内存中

三个影响内存回收的全局参数

`/proc/sys/vm/zone_reclaim_mode`

只影响快速内存回收

0x0：快速内存回收只会对最优zone附近的几个需要进行内存回收的zone进行内存回收
0x1：开启zone的内存回收
0x2：开启zone的内存回收，并且允许回写
0x4：开启zone的内存回收，允许进行unmap操作

`/proc/sys/vm/laptop_mode`

影响所有内存回收

0：允许直接内存回收对匿名页lru链表中的页进行回写操作，并且允许直接内存回收唤醒flush内核线程
非0：直接内存回收不会对匿名页lru链表中的页进行回写操作

`/proc/sys/vm/swapiness`

影响所有内存回收,在shrink_zones中用于确定扫描匿名页lru链表和文件页lru链表的比例，范围是0~200，系统默认是30；

接近0：进行内存回收时，更多地去扫描文件页lru链表，如果为0，那么就不会去扫描匿名页lru链表。
接近200：进行内存回收时，更多地去扫描匿名页lru链表。

scan_control

/* 扫描控制结构，用于内存回收和内存压缩 */
struct scan_control {
    /* 需要回收的页框数量 */
    unsigned long nr_to_reclaim;

    /* 申请内存时使用的分配标志 */
    gfp_t gfp_mask;

    /* 申请内存时使用的order值，因为只有申请内存，然后内存不足时才会进行扫描 */
    int order;

    /* 允许执行扫描的node结点掩码 */
    nodemask_t    *nodemask;

    /* 目标memcg，如果是针对整个zone进行的，则此为NULL */
    struct mem_cgroup *target_mem_cgroup;

    /* 扫描优先级，一次扫描(total_size >> priority)个页框 
     * 优先级越低，一次扫描的页框数量就越多
     * 优先级越高，一次扫描的数量就越少
     * 默认优先级为12
     */
    int priority;

    /* 是否能够进行回写操作(与分配标志的__GFP_IO和__GFP_FS有关) */
    unsigned int may_writepage:1;

    /* 能否进行unmap操作，就是将所有映射了此页的页表项清空 */
    unsigned int may_unmap:1;

    /* 是否能够进行swap交换，如果不能，在内存回收时则不扫描匿名页lru链表 */
    unsigned int may_swap:1;

    unsigned int hibernation_mode:1;

    /* 扫描结束后会标记，用于内存回收判断是否需要进行内存压缩 */
    unsigned int compaction_ready:1;

    /* 已经扫描的页框数量 */
    unsigned long nr_scanned;
    /* 已经回收的页框数量 */
    unsigned long nr_reclaimed;
};

参考资料

linux内核冻结技术

cgroup-memory子系统分析1