Linux内核-内存管理: Out Of Memory Management 源码分析

我们在此前的文章中描述了Linux内核在 Out Of Memory 场景下的相关管理策略,接下来我们将对 Linux 5.0 内核的 OOM 部分进行源码分析。

一. 关键数据结构

针对源码部分,我们首先需要阐述oom_kill部分的核心数据结构。

文件路径:/linux/include/linux/oom.h

1.1 oom_control 结构体

首先,我们给出具体的内核定义:

/*
 * Details of the page allocation that triggered the oom killer that are used to
 * determine what should be killed.
 */
struct oom_control {
	/* Used to determine cpuset */
	struct zonelist *zonelist;

	/* Used to determine mempolicy */
	nodemask_t *nodemask;

	/* Memory cgroup in which oom is invoked, or NULL for global oom */
	struct mem_cgroup *memcg;

	/* Used to determine cpuset and node locality requirement */
	const gfp_t gfp_mask;

	/*
	 * order == -1 means the oom kill is required by sysrq, otherwise only
	 * for display purposes.
	 */
	const int order;

	/* Used by oom implementation, do not set */
	/* Page total that badness scores are scaled against. */
	unsigned long totalpages;
	/*
	 * Task picked by the selection scan: NULL if nothing was picked,
	 * (void *)-1UL if the scan was aborted.
	 */
	struct task_struct *chosen;
	/* Badness score recorded together with ->chosen. */
	unsigned long chosen_points;

	/* Used to print the constraint info. */
	enum oom_constraint constraint;
};

上面这个结构体中,着重强调以下几个参数:

  1. gfp_mask:触发 OOM 的那次内存分配所使用的 GFP(Get Free Pages)分配标志掩码;在 oom 阶段用于确定 cpuset 和节点局部性要求,并用来判断本次分配是否允许文件系统操作(__GFP_FS)。
  2. totalpages : 总共的内存页。
  3. chosen :被选择待kill的进程task结构。
  4. chosen_points:被选择待kill的进程分数。
  5. constraint:触发 OOM 的内存分配所受约束的类型(枚举类型)。

二. out_of_memory 函数

当操作系统内存不够使用的时候,这个函数会选择并杀死“最佳进程”;这种做法并非最优解,只是尽量让系统继续良好运行。

文件路径:/linux/mm/oom_kill.c

这部分内核源码如下:

/*
 * out_of_memory - try to relieve an out-of-memory condition, killing a task
 * if necessary.
 * @oc: describes the allocation that hit OOM. ->nodemask may be cleared here,
 *      and ->chosen / ->chosen_points may be filled in during victim selection.
 *
 * Return: false when the OOM killer is disabled, or when no victim was chosen
 * (oc->chosen still NULL at the end, which is only reachable for sysrq or
 * memcg OOM because the global case panics); true on every other path,
 * including the early exits that return without killing anything.
 */
1033  bool out_of_memory(struct oom_control *oc)
1034  {
1035  	unsigned long freed = 0;
1036  	enum oom_constraint constraint = CONSTRAINT_NONE;
1037  
1038  	if (oom_killer_disabled)
1039  		return false;
1040  
	/* Global (non-memcg) OOM: run the oom_notify_list callbacks first. */
1041  	if (!is_memcg_oom(oc)) {
1042  		blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
1043  		if (freed > 0)
1044  			/* Got some memory back in the last second. */
1045  			return true;
1046  	}
1047  
1048  	/*
1049  	 * If current has a pending SIGKILL or is exiting, then automatically
1050  	 * select it.  The goal is to allow it to allocate so that it may
1051  	 * quickly exit and free its memory.
1052  	 */
1053  	if (task_will_free_mem(current)) {
1054  		mark_oom_victim(current);
1055  		wake_oom_reaper(current);
1056  		return true;
1057  	}
1058  
1059  	/*
1060  	 * The OOM killer does not compensate for IO-less reclaim.
1061  	 * pagefault_out_of_memory lost its gfp context so we have to
1062  	 * make sure exclude 0 mask - all other users should have at least
1063  	 * ___GFP_DIRECT_RECLAIM to get here.
1064  	 */
	/* A non-zero mask without __GFP_FS returns true without killing. */
1065  	if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS))
1066  		return true;
1067  
1068  	/*
1069  	 * Check if there were limitations on the allocation (only relevant for
1070  	 * NUMA and memcg) that may require different handling.
1071  	 */
1072  	constraint = constrained_alloc(oc);
	/* Keep the nodemask only for mempolicy-constrained allocations. */
1073  	if (constraint != CONSTRAINT_MEMORY_POLICY)
1074  		oc->nodemask = NULL;
1075  	check_panic_on_oom(oc, constraint);
1076  
	/*
	 * sysctl_oom_kill_allocating_task: kill the allocating task itself
	 * instead of scanning, provided this is a global OOM, current has an
	 * mm, is not unkillable and its oom_score_adj is not OOM_SCORE_ADJ_MIN.
	 */
1077  	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
1078  	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
1079  	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
		/*
		 * Take a reference before publishing current as the victim.
		 * NOTE(review): presumably dropped inside oom_kill_process();
		 * not visible in this excerpt.
		 */
1080  		get_task_struct(current);
1081  		oc->chosen = current;
1082  		oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
1083  		return true;
1084  	}
1085  
1086  	select_bad_process(oc);
1087  	/* Found nothing?!?! */
1088  	if (!oc->chosen) {
1089  		dump_header(oc, NULL);
1090  		pr_warn("Out of memory and no killable processes...\n");
1091  		/*
1092  		 * If we got here due to an actual allocation at the
1093  		 * system level, we cannot survive this and will enter
1094  		 * an endless loop in the allocator. Bail out now.
1095  		 */
1096  		if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
1097  			panic("System is deadlocked on memory\n");
1098  	}
	/*
	 * (void *)-1UL is the "scan aborted" marker stored by
	 * oom_evaluate_task(); it is not a task, so nothing is killed for it.
	 */
1099  	if (oc->chosen && oc->chosen != (void *)-1UL)
1100  		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
1101  				 "Memory cgroup out of memory");
	/* The abort marker is non-NULL, so an aborted scan also yields true. */
1102  	return !!oc->chosen;
1103  }

针对上面的代码,进行相关阐述如下:

三. select_bad_process、oom_evaluate_task、oom_badness函数

3.1 select_bad_process函数

这部分源代码如下:

/*
 * Annotation: for a memcg OOM the candidates are visited through
 * mem_cgroup_scan_tasks(); otherwise every process is walked with
 * for_each_process() under rcu_read_lock(). In both cases
 * oom_evaluate_task() does the scoring and updates oc->chosen and
 * oc->chosen_points.
 */
366  /*
367   * Simple selection loop. We choose the process with the highest number of
368   * 'points'. In case scan was aborted, oc->chosen is set to -1.
369   */
370  static void select_bad_process(struct oom_control *oc)
371  {
372  	if (is_memcg_oom(oc))
373  		mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
374  	else {
375  		struct task_struct *p;
376  
377  		rcu_read_lock();
		/* A non-zero return from the callback stops the walk early. */
378  		for_each_process(p)
379  			if (oom_evaluate_task(p, oc))
380  				break;
381  		rcu_read_unlock();
382  	}
383  
	/*
	 * Rescale the winning score to thousandths of oc->totalpages.
	 * NOTE(review): divides by oc->totalpages; this excerpt does not show
	 * where it is set or that it is non-zero - confirm against the caller.
	 */
384  	oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
385  }

3.2 oom_evaluate_task 函数

这部分源代码如下:

/*
 * oom_evaluate_task - scan callback that scores one task and records it in
 * the oom_control if it is the best candidate seen so far.
 * @task: task being examined.
 * @arg:  the struct oom_control of the current OOM, passed as void *.
 *
 * oc->chosen holds a reference taken with get_task_struct(); the reference
 * on a previously chosen task is dropped whenever it is replaced.
 *
 * Return: 0 to continue the scan, 1 to abort it. On abort oc->chosen is set
 * to (void *)-1UL.
 */
315  static int oom_evaluate_task(struct task_struct *task, void *arg)
316  {
317  	struct oom_control *oc = arg;
318  	unsigned long points;
319  
320  	if (oom_unkillable_task(task, NULL, oc->nodemask))
321  		goto next;
322  
323  	/*
324  	 * This task already has access to memory reserves and is being killed.
325  	 * Don't allow any other task to have access to the reserves unless
326  	 * the task has MMF_OOM_SKIP because chances that it would release
327  	 * any memory is quite low.
328  	 */
	/* Not applied to sysrq-triggered OOM. */
329  	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
330  		if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
331  			goto next;
332  		goto abort;
333  	}
334  
335  	/*
336  	 * If task is allocating a lot of memory and has been marked to be
337  	 * killed first if it triggers an oom, then select it.
338  	 */
339  	if (oom_task_origin(task)) {
340  		points = ULONG_MAX;
341  		goto select;
342  	}
343  
344  	points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
	/* Skip tasks that score 0 or below the current best. */
345  	if (!points || points < oc->chosen_points)
346  		goto next;
347  
348  	/* Prefer thread group leaders for display purposes */
	/*
	 * On a tie the current choice is kept if it is a thread group leader.
	 * NOTE(review): oc->chosen is dereferenced here; this presumably
	 * relies on chosen_points starting at 0 so that a tie with a
	 * non-zero score implies an earlier selection - confirm.
	 */
349  	if (points == oc->chosen_points && thread_group_leader(oc->chosen))
350  		goto next;
351  select:
	/* Swap the held reference from the old candidate to the new one. */
352  	if (oc->chosen)
353  		put_task_struct(oc->chosen);
354  	get_task_struct(task);
355  	oc->chosen = task;
356  	oc->chosen_points = points;
357  next:
358  	return 0;
359  abort:
	/* Release the current candidate, then store the abort marker. */
360  	if (oc->chosen)
361  		put_task_struct(oc->chosen);
362  	oc->chosen = (void *)-1UL;
363  	return 1;
364  }

针对上面的代码,进行相关阐述如下:

3.3 oom_badness 函数

这个函数的功能是对进程进行不良评分,函数的源代码如下:

/*
 * Annotation - return value: 0 if the task must not be considered
 * (oom_unkillable_task() is true, find_lock_task_mm() returns NULL,
 * oom_score_adj == OOM_SCORE_ADJ_MIN, MMF_OOM_SKIP is set, or the task is in
 * vfork); otherwise rss + swap entries + page-table pages, shifted by the
 * scaled oom_score_adj and never less than 1.
 */
192  /**
193   * oom_badness - heuristic function to determine which candidate task to kill
194   * @p: task struct of which task we should calculate
195   * @memcg: task's memory controller, if constrained
196   * @nodemask: nodemask passed to page allocator for mempolicy ooms
197   * @totalpages: total present RAM allowed for page allocation
198   *
199   * The heuristic for determining which task to kill is made to be as simple and
200   * predictable as possible.  The goal is to return the highest value for the
201   * task consuming the most memory to avoid subsequent oom failures.
202   */
203  unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
204  			  const nodemask_t *nodemask, unsigned long totalpages)
205  {
206  	long points;
207  	long adj;
208  
209  	if (oom_unkillable_task(p, memcg, nodemask))
210  		return 0;
211  
	/*
	 * p is replaced by the task returned here.
	 * NOTE(review): presumably returned with its task lock held, which the
	 * task_unlock() calls below release - confirm in find_lock_task_mm().
	 */
212  	p = find_lock_task_mm(p);
213  	if (!p)
214  		return 0;
215  
216  	/*
217  	 * Do not even consider tasks which are explicitly marked oom
218  	 * unkillable or have been already oom reaped or they are in
219  	 * the middle of vfork
220  	 */
221  	adj = (long)p->signal->oom_score_adj;
222  	if (adj == OOM_SCORE_ADJ_MIN ||
223  			test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
224  			in_vfork(p)) {
225  		task_unlock(p);
226  		return 0;
227  	}
228  
229  	/*
230  	 * The baseline for the badness score is the proportion of RAM that each
231  	 * task's rss, pagetable and swap space use.
232  	 */
	/* Page-table usage is converted from bytes to pages before summing. */
233  	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
234  		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
235  	task_unlock(p);
236  
237  	/* Normalize to oom_score_adj units: scale adj by totalpages / 1000 so it carries weight when added to points */
238  	adj *= totalpages / 1000;
239  	points += adj;
240  
241  	/*
242  	 * Never return 0 for an eligible task regardless of the root bonus and
243  	 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
244  	 */
	/*
	 * NOTE(review): no root bonus is applied anywhere in this version of
	 * the function; the mention above appears to be a leftover.
	 */
245  	return points > 0 ? points : 1;
246  }

针对上面的代码,进行相关阐述如下:

特别注意:Linux 3.10 内核会把系统管理员进程(拥有 CAP_SYS_ADMIN 权限的进程)的不良分数降低 3%,相当于优先避免 kill 系统管理员进程。

	/*
	 * Root processes get 3% bonus, just like the __vm_enough_memory()
	 * implementation used by LSMs.
	 */
	if (has_capability_noaudit(p, CAP_SYS_ADMIN))
		points -= (points * 3) / 100;