前沿

往篇回顾

在内存压缩算法主要流程中，主要是分析了如何从zone中隔离出合适的migrate pages以及相应的从zone中隔离出足够数量的free pages,然后调用unmap_and_move函数进行迁移页面的数据同步
在内存压缩算法之数据同步中，负责进行页面迁移，过程中涉及细节处理较多，大部分内容其实并不要细看，了解大致内容即可
1. 在__unmap_and_move函数对单个page进行迁移中，首先会将old page置位旧业的PG_locked
2. 新建一个swap类型的页表项数据，映射到old page上
3. try_to_unmap中通过RMAP将new swap entry数据写入到所有映射了此页的进程页表项中(此时开始，再无法访问Old Page, 如果访问就会被加入一个等待队列中)
4. move_to_new_page中将old page数据和参数复制到new page中
5. 新建一个常规的页表项数据，用于映射new page
6. 再次通过RMAP将新的常规页表项数据写入到所有映射了旧业的进程页表项中
7. 清除旧页的PG_lock标志，唤醒所有访问此页的进程

内存回收主要内容(lru链表中页的内存回收)

回收的页都是非活动匿名页lru链表或者非活动文件页lru链表上的页，包括: 进程堆、栈、匿名mmap共享内存映射、shmem共享内存映射使用的页、映射磁盘文件页
根本方法：将page->_count降到0；然后就根据不同的情况分别处理：
1. 干净页，并且映射了磁盘文件的页，直接回收
2. 没有进程映射，并且没有映射磁盘文件的页，直接回收
3. 文件页：脏页(PG_dirty置位)，回写到对应磁盘文件中，然后回收
4. 匿名页：有进程映射，并且没有映射磁盘文件的页，回写到swap分区中，然后回收
触发路径：但无论哪种路径，最后的回收入口函数均为shrink_zone,都是着眼于node维度
1. 被动式：由内核线程kswapd直接调用内存回收逻辑进行回收，对每个zone从低到高的方向扫描
2. 主动式：在从伙伴系统申请内存(__alloc_pages)时进入分配逻辑时进行内存回收，从高到低扫描

本篇主要内容

内存回收分为两篇来描述，本篇主要说明触发内存回收的不同入口，以及他们在启动内存回收前的处理措施

代码分析

由kswapd触发内核内存回收路径

kswapd_init

setup_arch()->paging_init()->bootmem_init()->zone_sizes_init()->free_area_init_node()->free_area_init_core()->init_waitqueue_head(&pgdat->kswapd_wait)
static int __init kswapd_init(void)
{
	int nid;

	swap_setup();
	for_each_node_state(nid, N_MEMORY)
 		kswapd_run(nid);
	hotcpu_notifier(cpu_callback, 0);
	return 0;
}

kswapd_run

int kswapd_run(int nid)
{
	/* 获取内存节点对应的pg_data_t指针 */
	pg_data_t *pgdat = NODE_DATA(nid);
	int ret = 0;

	if (pgdat->kswapd)
		return 0;
	/* kswapd函数，pgdat作为参数传入kswapd函数 */
	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
	if (IS_ERR(pgdat->kswapd)) {
		/* failure at boot is fatal */
		BUG_ON(system_state == SYSTEM_BOOTING);
		pr_err("Failed to start kswapd on node %d\n", nid);
		ret = PTR_ERR(pgdat->kswapd);
		pgdat->kswapd = NULL;
	}
	return ret;
}

kswapd

/*
 * kswapd线程每隔一段时间就自动平衡空闲页面，从低到高扫描每个zone，使每个zone的空闲页面维持在一定水平
 * 
 */
static int kswapd(void *p)
{
	unsigned long order, new_order;
	unsigned balanced_order;
	int classzone_idx, new_classzone_idx;
	int balanced_classzone_idx;
	/* 每个node一个kswapd线程，对应一个pg_data_t结构体 */
	pg_data_t *pgdat = (pg_data_t*)p;
	struct task_struct *tsk = current;

	struct reclaim_state reclaim_state = {
		.reclaimed_slab = 0,
	};
	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

	lockdep_set_current_reclaim_state(GFP_KERNEL);

	if (!cpumask_empty(cpumask))
		set_cpus_allowed_ptr(tsk, cpumask);
	current->reclaim_state = &reclaim_state;

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
	set_freezable();

	order = new_order = 0;
	balanced_order = 0;
	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
	balanced_classzone_idx = classzone_idx;
	for ( ; ; ) {
		bool ret;

		/*
		 * If the last balance_pgdat was unsuccessful it's unlikely a
		 * new request of a similar or harder type will succeed soon
		 * so consider going to sleep on the basis we reclaimed at
		 */
		if (balanced_classzone_idx >= new_classzone_idx &&
					balanced_order == new_order) {
			new_order = pgdat->kswapd_max_order;
			new_classzone_idx = pgdat->classzone_idx;
			pgdat->kswapd_max_order =  0;
			pgdat->classzone_idx = pgdat->nr_zones - 1;
		}

		if (order < new_order || classzone_idx > new_classzone_idx) {
			/*
			 * Don't sleep if someone wants a larger 'order'
			 * allocation or has tigher zone constraints
			 */
			order = new_order;
			classzone_idx = new_classzone_idx;
		} else {
			/* 在此处睡眠，等待wakeup_kswapd来唤醒 */
			kswapd_try_to_sleep(pgdat, balanced_order,
						balanced_classzone_idx);
			/* pgdata->kswapd_max_order和pgdat->classzone_id已经在wakeup_kswapd中进行了更新 */
			order = pgdat->kswapd_max_order;
			classzone_idx = pgdat->classzone_idx;
			new_order = order;
			new_classzone_idx = classzone_idx;
			pgdat->kswapd_max_order = 0;
			pgdat->classzone_idx = pgdat->nr_zones - 1;
		}
		/* 内核进程冻结技术，详细介绍参考文献
		 * 1. 内核为每个进程在适当的时候调用try_to_freeze来设置PF_FREEZE标志，
		 * 2. 当系统要suspend的时候，系统那边会调用freeze_process函数来将所有可冷冻的任务的TIF_FREEZE标志置位
		 * 3. 然后所有有TIF_FREEZE标志的进程会睡眠
		 */
		ret = try_to_freeze();
		/* 判断内核线程是否应该停止，当有人调用kthread_stop()时，此处为真 */
		if (kthread_should_stop())
			break;

		/*
		 * We can speed up thawing tasks if we don't call balance_pgdat
		 * after returning from the refrigerator
		 */
		if (!ret) {
			trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
			balanced_classzone_idx = classzone_idx;
			/* 启动内存回收具体任务 */
			balanced_order = balance_pgdat(pgdat, order,
						&balanced_classzone_idx);
		}
	}
	/* 当kthread被停止，修改进程标识，具体什么意思？ */
	tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
	current->reclaim_state = NULL;
	lockdep_clear_current_reclaim_state();

	return 0;
}

balance_pgdat

/* 对node中zone，从低往高进行内存释放 
 * 以默认优先级12，执行多次遍历node内存释放；只有当在当前优先优先级无法回收到内存情况下，才进行sc->priority--
 * 1. 从高到底找出第一不平衡的zone
 * 2. 循环对第一个不平衡的其下面的zone进行内存回收
		检测node是否需要内存压缩：只要有一个zone能以low level分配order页框则不需要
		mem_cgroup_soft_limit_reclaim：对zone内memcg中超过soft_limit_in_bytes的进程内存进行软回收
		kswapd_shrink_zone->shrink_zone: 进行内存回收(如果内存平衡则不需要进行回收)，回收不成功则降低回收优先级
		pfmemalloc_watermark_ok判断OK，则唤醒pfmemalloc_wait队列中的等待进程
		如果要进行内存压缩的话，则启动
 * 本node内存balance(pgdat_balanced) || sc->priority==0 停止对node循环内存回收
*/
static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
							int *classzone_idx)
{
	int i;
	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.order = order,
/* 优先级使用默认为的12，会执行多次遍历node(并不是node中的所有zone)，但并不会每次遍历都进行sc->priority--，当能够回收的内存时，才进行sc->priority-- */
		.priority = DEF_PRIORITY,
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.may_swap = 1,
	};
	count_vm_event(PAGEOUTRUN);

	do {
		unsigned long nr_attempted = 0;
		bool raise_priority = true;
		/* 如果要求收集的内存order>0,则在回收内存后，启动内存压缩，整理碎片 */
		bool pgdat_needs_compaction = (order > 0);

		sc.nr_reclaimed = 0;

		/*
		 * 确定内存释放区的zone id上限：
		 * 1. 通过age_active_anon检查该zone lru链表中inactive_list中page是否足够
		 * 2. 为什么上限是第一个buffer_header，或不需要进行内存平衡的区域？
		 */
		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
			struct zone *zone = pgdat->node_zones + i;
			/* 如果zone中没有page，则跳过本zone */
			if (!populated_zone(zone))
				continue;
			/* ？？ */
			if (sc.priority != DEF_PRIORITY &&
			    !zone_reclaimable(zone))
				continue;

			/* 通过inactive_anon_is_low()判断本zone的lru inactive链表中页面数量是否足够
			 * 如果不够，则使用shrink_active_list()释放更多active页面到inactive链表
			 * shrink_active_list、shrink_zone等更多的收缩流程在下一篇中介绍
			 */
			age_active_anon(zone, &sc);

			/*
			 * If the number of buffer_heads in the machine
			 * exceeds the maximum allowed level and this node
			 * has a highmem zone, force kswapd to reclaim from
			 * it to relieve lowmem pressure.
			 */
			if (buffer_heads_over_limit && is_highmem_idx(i)) {
				end_zone = i;
				break;
			}

			if (!zone_balanced(zone, order, 0, 0)) {
				/* 找到第一个不平衡的zone，平衡条件
				 * 1.  此zone分配页框后剩余的页框数量 > 此zone的high阀值 + 此zone保留的页框数量
				 * 2. 申请的order阶内存，可以通过内存压缩达到
				 */
				end_zone = i;
				break;
			} else {
				/*
				 * If balanced, clear the dirty and congested
				 * flags
				 */
				clear_bit(ZONE_CONGESTED, &zone->flags);
				clear_bit(ZONE_DIRTY, &zone->flags);
			}
		}

		if (i < 0)
			goto out;

		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;

			if (!populated_zone(zone))
				continue;

			/* 如果内存回收后，足够申请order阶的新内存，则不启动内存压缩
			 * 所有需要扫描的低端zone只有有一个满足上述条件即可
			 */
			if (pgdat_needs_compaction &&
					zone_watermark_ok(zone, order,
						low_wmark_pages(zone),
						*classzone_idx, 0))
				pgdat_needs_compaction = false;
		}

		/*
		 * If we're getting trouble reclaiming, start doing writepage
		 * even in laptop mode.
		 */
		/* 优先级越低，则要求回收的内存越多 */
		if (sc.priority < DEF_PRIORITY - 2)
			sc.may_writepage = 1;

		/*
		 * 接下来从低到高扫描zone，停止在end_zone上
		 */
		for (i = 0; i <= end_zone; i++) {
			struct zone *zone = pgdat->node_zones + i;

			if (!populated_zone(zone))
				continue;

			if (sc.priority != DEF_PRIORITY &&
			    !zone_reclaimable(zone))
				continue;

			sc.nr_scanned = 0;

			nr_soft_scanned = 0;
			/*
			 * 回收该zone上超过soft_limit最多的mem_cgroup在该zone上mem_cgroup_per_zone对应的lru链表
			 * 直接回收、间接回收都会调用该函数，是调用shrink_zone()前的必备动作
			 * 通过全局mem group结构体soft_limit_tree->rb_tree_per_node->rb_tree_per_zone->rb_root找到mem_cgroup_per_zone
			 */
			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
							order, sc.gfp_mask,
							&nr_soft_scanned);
			sc.nr_reclaimed += nr_soft_reclaimed;

			/*
			 * There should be no need to raise the scanning
			 * priority if enough pages are already being scanned
			 * that that high watermark would be met at 100%
			 * efficiency.
			 */
			 /* 如果回收成功则不需要降低回收优先级 */
			if (kswapd_shrink_zone(zone, end_zone,
					       &sc, &nr_attempted))
				raise_priority = false;
		}

		/*
		 * If the low watermark is met there is no need for processes
		 * to be throttled on pfmemalloc_wait as they should not be
		 * able to safely make forward progress. Wake them
		 */
		/* 执行完一轮内存回收后，唤醒挂在pfmemalloc_wait队列中等待进行内存分配的任务
		 * 1. 通过waitqueue_active判断pfmemalloc_wait队列中是否有成员
		 * 2. pfmemalloc_watermark_ok判断本zone是否有足够空闲内存用于分配，如果没有会唤醒kswapd线程回收内存
		 */
		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
				pfmemalloc_watermark_ok(pgdat))
			wake_up_all(&pgdat->pfmemalloc_wait);

		/*
		 * Fragmentation may mean that the system cannot be rebalanced
		 * for high-order allocations in all zones. If twice the
		 * allocation size has been reclaimed and the zones are still
		 * not balanced then recheck the watermarks at order-0 to
		 * prevent kswapd reclaiming excessively. Assume that a
		 * process requested a high-order can direct reclaim/compact.
		 */
		if (order && sc.nr_reclaimed >= 2UL << order)
			order = sc.order = 0;

		/* Check if kswapd should be suspending */
		if (try_to_freeze() || kthread_should_stop())
			break;

		/* 如果需要内存压缩则启动内存压缩 */
		if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
			compact_pgdat(pgdat, order);

		/*
		 * Raise priority if scanning rate is too low or there was no
		 * progress in reclaiming pages
		 */
		if (raise_priority || !sc.nr_reclaimed)
			sc.priority--;
	} while (sc.priority >= 1 &&
		 !pgdat_balanced(pgdat, order, *classzone_idx));

out:
	/*
	 * Return the order we were reclaiming at so prepare_kswapd_sleep()
	 * makes a decision on the order we were last reclaiming at. However,
	 * if another caller entered the allocator slow path while kswapd
	 * was awake, order will remain at the higher level
	 */
	/* 记录本次均衡操作的最终zone id */
	*classzone_idx = end_zone;
	return order;
}

kswapd_shrink_zone

/*
 * 对zone执行内存回收：
 * 1. 确定一个zone需要回收的内存数量
 * 2. 检测目前zone的内存压力，尽量少执行内存回收
 * 3. 通过shrink_zone()实际执行内存回收
 */
static bool kswapd_shrink_zone(struct zone *zone,
			       int classzone_idx,
			       struct scan_control *sc,
			       unsigned long *nr_attempted)
{
	int testorder = sc->order;
	unsigned long balance_gap;
	bool lowmem_pressure;

	/* 确定本zone需要回收的页框数量；目标是要使水位回到high levle. */
	sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));

	/*
	 * Kswapd reclaims only single pages with compaction enabled. Trying
	 * too hard to reclaim until contiguous free pages have become
	 * available can hurt performance by evicting too much useful data
	 * from memory. Do not reclaim more than needed for compaction.
	 */
	 /* 如果可以内存压缩可以解决需求order，只要回收单个页面即可.避免连续回收页面对系统性能造成影响 */
	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
			compaction_suitable(zone, sc->order, 0, classzone_idx)
							!= COMPACT_SKIPPED)
		testorder = 0;

	/*
	 * We put equal pressure on every zone, unless one zone has way too
	 * many pages free already. The "too many pages" is defined as the
	 * high wmark plus a "gap" where the gap is either the low
	 * watermark or 1% of the zone, whichever is smaller.
	 */
	balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
			zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));

	/*
	 * If there is no low memory pressure or the zone is balanced then no
	 * reclaim is necessary
	 */
	lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
	/* 如果低端内存压力不大，则不需要进行内存回收 */
	if (!lowmem_pressure && zone_balanced(zone, testorder,
						balance_gap, classzone_idx))
		return true;
	/* 内存回收入口函数 */
	shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);

	/* 计算尝试回收的各个zone内存数量总和 */
	*nr_attempted += sc->nr_to_reclaim;

	clear_bit(ZONE_WRITEBACK, &zone->flags);

	/*
	 * If a zone reaches its high watermark, consider it to be no longer
	 * congested. It's possible there are dirty pages backed by congested
	 * BDIs but as pressure is relieved, speculatively avoid congestion
	 * waits.
	 */
	if (zone_reclaimable(zone) &&
	    zone_balanced(zone, testorder, 0, classzone_idx)) {
		clear_bit(ZONE_CONGESTED, &zone->flags);
		clear_bit(ZONE_DIRTY, &zone->flags);
	}

	return sc->nr_scanned >= sc->nr_to_reclaim;
}

总结

开始标志：zonelist的所有zone都不能通过min阀值获取到页框时，会唤醒所有node的kswapd内核线程，然后在kswapd中会对不满足 zone分配页框后剩余的页框数量 > 此zone的high阀值 + 此zone保留的页框数量的zone进行内存回收
结束标志：node中所有zone都满足 zone分配页框后剩余的页框数量 > 此zone的high阀值 + 此zone保留的页框数量(可能会进行多次shrink_zone()的调用)
回收对象：超过所在memcg的soft_limit_in_bytes的进程的内存、zone的可回收页框、slab

由__alloc_pages()触发内核内存回收路径

快速分配触发内存回收路径

__alloc_pages()->__alloc_pages_nodemask()->get_page_from_freelist()->zone_reclaim->__zone_reclaim()->shrink_zone()

__zone_reclaim

/*
 * 快速内存分配失败后触发本函数进行内存回收；但是并不能保证回收后必然可以分配order阶内存块？
 * 1. 根据目前zone_reclaim_mode确定内存回收sc控制结构体
 * 2. zone_pagecache_reclaimable:根据zone_reclaim_mdoe状态计算可供回收的页面数量
 * 3. 对不同priority级别进行循环内存回收，直到回收到足够的页面
 */
static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
	/* Minimum pages needed in order to stay on node */
	const unsigned long nr_pages = 1 << order;
	struct task_struct *p = current;
	struct reclaim_state reclaim_state;
	struct scan_control sc = {
/* 最少一次回收SWAP_CLUSTER_MAX，最多一次回收1 << order个，应该是1024个 */
		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
/* 当前进程明确禁止分配内存的IO操作(禁止__GFP_IO，__GFP_FS标志)，那么则清除__GFP_IO，__GFP_FS标志，表示不进行IO操作 */
		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
		.order = order,
/* 优先级为4，默认是12，会比12一次扫描更多lru链表中的页框，而且扫描次数会比优先级为12的少，并且如果回收过程中回收到了足够页框，就会返回 */
		.priority = ZONE_RECLAIM_PRIORITY,
		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),/* 根据zone_reclaim_mdoe判断，允许file页回写磁盘操作 */
		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),/* 根据zone_reclaim_mdoe判断，允许unmap操作 */
		.may_swap = 1,/* 允许匿名页swap */
	};

	cond_resched();
	/*
	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
	 * and we also need to be able to write out pages for RECLAIM_WRITE
	 * and RECLAIM_SWAP.
	 */
	p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
	lockdep_set_current_reclaim_state(gfp_mask);
	reclaim_state.reclaimed_slab = 0;
	p->reclaim_state = &reclaim_state;
	/*
	 * 根据目前的zone_reclaim_mode计算有多少潜在的page可以被回收
	 * 0x0:只会对最有zone附近的几个需要进行内存回收的zone进行内存回收；其余的会对所有zone回收
	 * 0x1:开启zone的内存回收
	 * 0x2:允许回写
	 * 0x4:允许unmap操作
	 */
	if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
		/*
		 * Free memory by calling shrink zone with increasing
		 * priorities until we have enough memory freed.
		 */
		do {
			/* 内存回收入口函数，对本zone进行循环内存回收，直到收集到足够的页面 */
			shrink_zone(zone, &sc, true);
/* 最多进行4次调用shrink_zone()，并且每次调用shrink_zone()扫描的页框会越来越多，直到回收到了1<<order个页框为止 */
		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
	}

	p->reclaim_state = NULL;
	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
	lockdep_clear_current_reclaim_state();
	return sc.nr_reclaimed >= nr_pages;
}

总结

开始标志是：此zone分配后剩余的页框数量 > 此zone的阀值 + 此zone的保留页框数量(阀值可能是:min,low,high其中一个)
结束标志是：对此zone回收到了本次分配时需要的页框数量或者 sc->priority降为0(可能会进行多次shrink_zone()的调用)
回收对象：zone的可回收页框、slab

慢速分配触发内存回收路径

__alloc_pages()->__alloc_pages_nodemask()->__alloc_pages_slowpath()->__alloc_pages_direct_reclaim()->__perform_reclaim()->try_to_free_pages()->do_try_to_free_pages()->shrink_zone()

try_to_free_pages

/* 初始化内存回收控制结构体，并检测当前进程是否需要进入pgdat->pfmemalloc_wait等待队列
 * 1. 初始化sc，回收数量目标是SWAP_CLUSTER_MAX？
 * 2. throttle_direct_reclaim：判断当前内存申请进程是否要进入等待队列
 * 3. do_try_to_free_pages：触发内存回收；直到回收到目标数量page或内存压缩可以解决问题
 */
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
				gfp_t gfp_mask, nodemask_t *nodemask)
{
	unsigned long nr_reclaimed;
	struct scan_control sc = {
		.nr_to_reclaim = SWAP_CLUSTER_MAX,/* 只打算回收的页框为32个? */
		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
		.order = order,
		.nodemask = nodemask,
		.priority = DEF_PRIORITY,
		/* 与/proc/sys/vm/laptop_mode文件有关
         * laptop_mode为0，则允许进行回写操作，即使允许回写，直接内存回收也不能对脏文件页进行回写
         * 不过允许回写时，可以对非文件页进行回写
         */
		.may_writepage = !laptop_mode,
		.may_unmap = 1,/* 允许进行unmap操作 */
		.may_swap = 1,/* 允许进行非文件页的操作 */
	};

	/* 判断node是否平衡，如果平衡，则继续进行内存回收
	 * 如果不平衡，如果此node的kswapd没有被唤醒，则唤醒，并且这里唤醒kswapd只会对ZONE_NORMAL以下的zone进行内存回收
	 * 此node的ZONE_DMA和ZONE_NORMAL的总共空闲页框数量 > 此node的ZONE_DMA和ZONE_NORMAL的平均min阀值数量
	 */
	if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
		return 1;

	trace_mm_vmscan_direct_reclaim_begin(order,
				sc.may_writepage,
				gfp_mask);

	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);

	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);

	return nr_reclaimed;
}

throttle_direct_reclaim

/*
 * 判断当前进程是否加入到pgdat->pfmemalloc_wait等待队列中
 * 1. 如果为内核进程，则返回
 * 2. 如果当前进程已接收到kill信号，则返回
 * 3. 判断最优node(最优zone所属的node)是否pfmemalloc balance，否则加入pgdat->pfmemalloc_wait等待队列中
		a. 有__GFP_FS: 加入队列没有超时限制，并且进程状态为TASK_KILLABLE
		b. 无__GFP_FS: 超时时间为1s，且状态为TASK_INTERRUPTABLE
 */
static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
					nodemask_t *nodemask)
{
	struct zoneref *z;
	struct zone *zone;
	pg_data_t *pgdat = NULL;

	 /* 如果标记了PF_KTHREAD，表示此进程是一个内核线程，则不会往下执行 */
	if (current->flags & PF_KTHREAD)
		goto out;

	/*
	 * If a fatal signal is pending, this process should not throttle.
	 * It should return quickly so it can exit and free its memory
	 */
	 /* 此进程已经接收到了kill信号，准备要被杀掉了 */
	if (fatal_signal_pending(current))
		goto out;

	/* 遍历zonelist，但是里面只会在获取到第一个pgdat时就跳出
	 * 但不能是zone_highmem，因为有可能pfmemalloc队列中等待的任务有GFP_KERNEL分配标志，不能从zone_highmem分配内存
	 */
	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					gfp_zone(gfp_mask), nodemask) {
		/* 只遍历ZONE_NORMAL和ZONE_DMA区 */
		if (zone_idx(zone) > ZONE_NORMAL)
			continue;

		/* 获取zone对应的node */
		pgdat = zone->zone_pgdat;
		/* 判断node是否平衡，如果平衡，则返回真
         * 如果不平衡，如果此node的kswapd没有被唤醒，则唤醒，并且这里唤醒kswapd只会对ZONE_NORMAL以下的zone进行内存回收
         * node是否平衡的判断标准是:
         * 此node的ZONE_DMA和ZONE_NORMAL的总共空闲页框数量 > 此node的ZONE_DMA和ZONE_NORMAL的平均min阀值数量
         */
		if (pfmemalloc_watermark_ok(pgdat))
			goto out;
		break;
	}

	/* If no zone was usable by the allocation flags then do not throttle */
	if (!pgdat)
		goto out;

	/* Account for the throttling */
	count_vm_event(PGSCAN_DIRECT_THROTTLE);

	/* 如果分配标志禁止了文件系统操作，则将要进行内存回收的进程设置为TASK_INTERRUPTIBLE状态，然后加入到node的pgdat->pfmemalloc_wait，并且会设置超时时间为1s 
	 * 1.pfmemalloc_watermark_ok(pgdat)为真时被唤醒，而1s没超时，返回剩余timeout(jiffies)
	 * 2.睡眠超过1s时会唤醒，而pfmemalloc_watermark_ok(pgdat)此时为真，返回1
	 * 3.睡眠超过1s时会唤醒，而pfmemalloc_watermark_ok(pgdat)此时为假，返回0
	 * 4.接收到信号被唤醒，返回-ERESTARTSYS
	 */
	if (!(gfp_mask & __GFP_FS)) {
		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
			pfmemalloc_watermark_ok(pgdat), HZ);

		goto check_pending;
	}

    /* 如果分配标志没有禁止了文件系统操作
	 * 将本进程加入到node的pgdat->pfmemalloc_wait，并设置为TASK_KILLABLE状态，表示允许 TASK_UNINTERRUPTIBLE 响应致命信号的状态 
     * 这些进程在两种情况下被唤醒
     * 1.pfmemalloc_watermark_ok(pgdat)为真时
     * 2.接收到致命信号时
     */
	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
		pfmemalloc_watermark_ok(pgdat));

check_pending:
    /* 如果加入到了pgdat->pfmemalloc_wait后被唤醒，就会执行到这
     * 唤醒后再次检查当前进程是否接受到了kill信号，准备退出 
	 */
	if (fatal_signal_pending(current))
		return true;

out:
	return false;
}

do_try_to_free_pages

/*
 * 尝试对所有优先级别进行内存回收，直到以下两个条件或成立
 * 1. 已经回收到足够的页面
 * 2. 回收到的页面已经足够，只要启动内存压缩即可满足要求
 * 3. 期间如果累计扫描过的页面较多，会触发所有bdi设备启动flush回写线程
 */
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
					  struct scan_control *sc)
{
	int initial_priority = sc->priority;
	unsigned long total_scanned = 0;
	unsigned long writeback_threshold;
	bool zones_reclaimable;
retry:
	delayacct_freepages_start();

	if (global_reclaim(sc))
		count_vm_event(ALLOCSTALL);
	/*
	 * 尝试对所有优先级别进行内存回收，直到以下两个条件或成立
	 * 1. 已经回收到足够的页面
	 * 2. 回收到的页面已经足够，只要启动内存压缩即可满足要求
	 */
	do {
		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
				sc->priority);
		sc->nr_scanned = 0;
		zones_reclaimable = shrink_zones(zonelist, sc);

		total_scanned += sc->nr_scanned;
		/* 回收到足够的page，则退出 */
		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
			break;

		if (sc->compaction_ready)
			break;

		/*
		 * If we're getting trouble reclaiming, start doing
		 * writepage even in laptop mode.
		 */
		if (sc->priority < DEF_PRIORITY - 2)
			sc->may_writepage = 1;

		/*
		 * 如果扫描过的page数量>需要回收页面数量*1.5，还没有完成回收任务，则触发强制回写dirty page到磁盘
		 * 在default_bdi_init()中会初始化两个线程 sysnc_supers(周期性同步所有superblocks), bdi_default(在必要时create,start,stop flusher线程)
		 * 系统每新增一个bdi设备,会通过bdi_register()注册到全局bdi_list链表中，每个bdi设备都会有自己的flusher线程，用来回写本设备的脏页
		 */
		writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
		if (total_scanned > writeback_threshold) {
			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
						WB_REASON_TRY_TO_FREE_PAGES);
			sc->may_writepage = 1;
		}

	} while (--sc->priority >= 0);

	delayacct_freepages_end();
	/* 如果已经回收到page，则返回成功回收的page num */
	if (sc->nr_reclaimed)
		return sc->nr_reclaimed;

	/* Aborted reclaim to try compaction? don't OOM, then */
	/* ready则表示已经回收到足够页面，但需要内存压缩来整理碎片才能满足内存分配要求 */
	if (sc->compaction_ready)
		return 1;

	/* Untapped cgroup reserves?  Don't OOM, retry. */
	if (!sc->may_thrash) {
		sc->priority = initial_priority;
		sc->may_thrash = 1;
		goto retry;
	}

	/* Any of the zones still reclaimable?  Don't OOM. */
	if (zones_reclaimable)
		return 1;

	return 0;
}

shrink_zones

/*
 * This is the direct reclaim path, for page-allocating processes.  We only
 * try to reclaim pages from zones which will satisfy the caller's allocation
 * request.
 *
 * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
 * Because:
 * a) The caller may be trying to free *extra* pages to satisfy a higher-order
 *    allocation or
 * b) The target zone may be at high_wmark_pages(zone) but the lower zones
 *    must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
 *    zone defense algorithm.
 *
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 *
 * Returns true if a zone was reclaimable.
 */
static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
{
	struct zoneref *z;
	struct zone *zone;
	unsigned long nr_soft_reclaimed;
	unsigned long nr_soft_scanned;
	gfp_t orig_mask;
	enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
	bool reclaimable = false;

	/*
	 * If the number of buffer_heads in the machine exceeds the maximum
	 * allowed level, force direct reclaim to scan the highmem zone as
	 * highmem pages could be pinning lowmem pages storing buffer_heads
	 */
	orig_mask = sc->gfp_mask;
	if (buffer_heads_over_limit)
		sc->gfp_mask |= __GFP_HIGHMEM;

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					requested_highidx, sc->nodemask) {
		enum zone_type classzone_idx;

		if (!populated_zone(zone))
			continue;

		classzone_idx = requested_highidx;
		while (!populated_zone(zone->zone_pgdat->node_zones +
							classzone_idx))
			classzone_idx--;

		/* 判断是zone级别的还是mem group级别的内存回收 */
		if (global_reclaim(sc)) {
			if (!cpuset_zone_allowed(zone,
						 GFP_KERNEL | __GFP_HARDWALL))
				continue;

			if (sc->priority != DEF_PRIORITY &&
			    !zone_reclaimable(zone))
				continue;	/* Let kswapd poll it */

			/*
			 * If we already have plenty of memory free for
			 * compaction in this zone, don't free any more.
			 * Even though compaction is invoked for any
			 * non-zero order, only frequent costly order
			 * reclamation is disruptive enough to become a
			 * noticeable problem, like transparent huge
			 * page allocations.
			 */
			/* 
			 * 检测此zone目前是否能通过内存压缩满足内存分配要求：
			 * 1. 通过内存压缩能满足order阶内存的分配
			 * 2. 不用内存压缩也能满足order阶内存的分配
			 */
			if (IS_ENABLED(CONFIG_COMPACTION) &&
			    sc->order > PAGE_ALLOC_COSTLY_ORDER &&
			    zonelist_zone_idx(z) <= requested_highidx &&
			    compaction_ready(zone, sc->order)) {
				sc->compaction_ready = true;
				continue;
			}

			/*
			 * 回收该zone上超过soft_limit最多的mem_cgroup在该zone上mem_cgroup_per_zone对应的lru链表
			 * 直接回收、间接回收都会调用该函数，是调用shrink_zone()前的必备动作
			 * 通过全局mem group结构体soft_limit_tree->rb_tree_per_node->rb_tree_per_zone->rb_root找到mem_cgroup_per_zone
			 */
			nr_soft_scanned = 0;
			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
						sc->order, sc->gfp_mask,
						&nr_soft_scanned);
			sc->nr_reclaimed += nr_soft_reclaimed;
			sc->nr_scanned += nr_soft_scanned;
			if (nr_soft_reclaimed)
				reclaimable = true;
			/* need some check for avoid more shrink_zone() */
		}
		/* 如果是zone级别的内存回收，则直接调用shrink_zone(); 只要回收到了内存就返回true */
		if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx))
			reclaimable = true;
		
		if (global_reclaim(sc) &&
		    !reclaimable && zone_reclaimable(zone))
			reclaimable = true;
	}

	/*
	 * Restore to original mask to avoid the impact on the caller if we
	 * promoted it to __GFP_HIGHMEM.
	 */
	sc->gfp_mask = orig_mask;

	return reclaimable;
}

总结

慢速分配中，首先唤醒所有node节点的kswap内核线程
然后调用get_page_from_freelist()尝试用min阀值从zonelist的zone中获取连续页框
如果失败则对zonelist的zone进行异步压缩，然后调用get_page_from_freelist再次以min阀值尝试分配内存
如果还是失败则进入内存回收__alloc_pages_direct_reclaim
在进行直接内存回收时，进程是有可能加入到node的pgdat->pfmemalloc_wait这个等待队列中，当kswapd进行内存回收后如果node空闲内存达到平衡，那么就会唤醒pgdat->pfmemalloc_wait中的进程
开始标志：进程申请内存时，zonelist的所有zone都不能通过min阀值获取到页框时
结束标志：回收到32个页框，或者sc->priority降到0，或者空闲页框足够进行内存压缩了(可能会进行多次shrink_zone()的调用)
回收对象：超过所在memcg的soft_limit_in_bytes的进程的内存、zone的可回收页框、slab

附录

PG flag说明

PG_lru：表示页在lru链表中
PG_referenced: 表示页最近被访问(只有文件页使用)
PG_dirty：页为脏页，文件页被修改，以及非文件页加入到swap cache后，就会被标记为脏页。在此页回写前会被清除，但是回写失败时又会被置位
PG_active：页为活动页，配合PG_lru就可以得出页是处于非活动页lru链表还是活动页lru链表
PG_private：页描述符中的page->private保存有数据
PG_writeback：页正在进行回写
PG_swapbacked：此页可写入swap分区，一般用于表示此页是非文件页
PG_swapcache：页已经加入到了swap cache中(只有非文件页使用)
PG_reclaim：页正在进行回收，只有在内存回收时才会对需要回收的页进行此标记
PG_mlocked：页被锁在内存中

三个影响内存回收的全局参数

`/proc/sys/vm/zone_reclaim_mode`

只影响快速内存回收

0x0：快速内存回收只会对最优zone附近的几个需要进行内存回收的zone进行内存回收
0x1：开启zone的内存回收
0x2：开启zone的内存回收，并且允许回写
0x4：开启zone的内存回收，允许进行unmap操作

`/proc/sys/vm/laptop_mode`

影响所有内存回收

0：允许直接内存回收对匿名页lru链表中的页进行回写操作，并且允许直接内存回收唤醒flush内核线程
非0：直接内存回收不会对匿名页lru链表中的页进行回写操作

`/proc/sys/vm/swapiness`

影响所有内存回收,在shrink_zones中用于确定扫描匿名页lru链表和文件页lru链表的比例，范围是0~200，系统默认是30；

接近0：进行内存回收时，更多地去扫描文件页lru链表，如果为0，那么就不会去扫描匿名页lru链表。
接近200：进行内存回收时，更多地去扫描匿名页lru链表。

scan_control

/* 扫描控制结构，用于内存回收和内存压缩 */
struct scan_control {
    /* 需要回收的页框数量 */
    unsigned long nr_to_reclaim;

    /* 申请内存时使用的分配标志 */
    gfp_t gfp_mask;

    /* 申请内存时使用的order值，因为只有申请内存，然后内存不足时才会进行扫描 */
    int order;

    /* 允许执行扫描的node结点掩码 */
    nodemask_t    *nodemask;

    /* 目标memcg，如果是针对整个zone进行的，则此为NULL */
    struct mem_cgroup *target_mem_cgroup;

    /* 扫描优先级，一次扫描(total_size >> priority)个页框 
     * 优先级越低，一次扫描的页框数量就越多
     * 优先级越高，一次扫描的数量就越少
     * 默认优先级为12
     */
    int priority;

    /* 是否能够进行回写操作(与分配标志的__GFP_IO和__GFP_FS有关) */
    unsigned int may_writepage:1;

    /* 能否进行unmap操作，就是将所有映射了此页的页表项清空 */
    unsigned int may_unmap:1;

    /* 是否能够进行swap交换，如果不能，在内存回收时则不扫描匿名页lru链表 */
    unsigned int may_swap:1;

    unsigned int hibernation_mode:1;

    /* 扫描结束后会标记，用于内存回收判断是否需要进行内存压缩 */
    unsigned int compaction_ready:1;

    /* 已经扫描的页框数量 */
    unsigned long nr_scanned;
    /* 已经回收的页框数量 */
    unsigned long nr_reclaimed;
};

参考资料

linux内核冻结技术

cgroup-memory子系统分析1

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

内存管理系列二十一：内存回收入口.md

内存管理系列二十一：内存回收入口.md

前沿

往篇回顾

内存回收主要内容(lru链表中页的内存回收)

本篇主要内容

代码分析

由kswapd触发内核内存回收路径

kswapd_init

kswapd_run

kswapd

balance_pgdat

kswapd_shrink_zone

总结

由__alloc_pages()触发内核内存回收路径

快速分配触发内存回收路径

__zone_reclaim

总结

慢速分配触发内存回收路径

try_to_free_pages

throttle_direct_reclaim

do_try_to_free_pages

shrink_zones

总结

附录

PG flag说明

三个影响内存回收的全局参数

`/proc/sys/vm/zone_reclaim_mode`

`/proc/sys/vm/laptop_mode`

`/proc/sys/vm/swapiness`

scan_control

参考资料

Files

内存管理系列二十一：内存回收入口.md

Latest commit

History

内存管理系列二十一：内存回收入口.md

File metadata and controls

前沿

往篇回顾

内存回收主要内容(lru链表中页的内存回收)

本篇主要内容

代码分析

由kswapd触发内核内存回收路径

kswapd_init

kswapd_run

kswapd

balance_pgdat

kswapd_shrink_zone

总结

由__alloc_pages()触发内核内存回收路径

快速分配触发内存回收路径

__zone_reclaim

总结

慢速分配触发内存回收路径

try_to_free_pages

throttle_direct_reclaim

do_try_to_free_pages

shrink_zones

总结

附录

PG flag说明

三个影响内存回收的全局参数

/proc/sys/vm/zone_reclaim_mode

/proc/sys/vm/laptop_mode

/proc/sys/vm/swapiness

scan_control

参考资料

`/proc/sys/vm/zone_reclaim_mode`

`/proc/sys/vm/laptop_mode`

`/proc/sys/vm/swapiness`