kk Blog —— 通用基础

date [-d @int|str] [+%s|"+%F %T"]

kernel 3.10内核源码分析--Out of Memory(OOM)处理流程

http://blog.chinaunix.net/uid-20671208-id-4440249.html

Out Of Memory(OOM),即内存耗尽,当系统中内存耗尽时,如果不做处理,将处于崩溃的边缘,因为无内核资源可用,而系统运行时刻都可能需要申请内存。这时,内核需要采取一定的措施来防止系统崩溃,这就是我们熟知的OOM流程,其实就是要回收一些内存,而走到OOM流程,已经基本说明其它的回收内存的手段都已经尝试过了(比如回收cache),这里通常只能通过kill进程来回收内存了,而选择被kill进程的标准就比较简单直接了,总体就是:谁用的多,就kill谁。

OOM处理的基本流程简单描述如下:

1、检查是否配置了/proc/sys/kernel/panic_on_oom,如果是则直接触发panic。

2、检查是否配置了oom_kill_allocating_task,即是否需要kill current进程来回收内存,如果是,且current进程是killable的,则kill current进程。

3、根据既定策略选择需要kill的process,基本策略为:通过进程的内存占用情况计算“点数”,点数最高者被选中。

4、如果没有选出来可kill的进程,那么直接panic(通常不会走到这个流程,但也有例外,比如,当被选中的进程处于D状态,或者正在被kill)

5、kill掉被选中的进程,以释放内存。

代码注释如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
/*
  * OOM处理的主流程,上面的注释应该比较清楚了。
  */
void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
		int order, nodemask_t *nodemask, bool force_kill)
{
	const nodemask_t *mpol_mask;
	struct task_struct *p;
	unsigned long totalpages;
	unsigned long freed = 0;
	unsigned int uninitialized_var(points);
	enum oom_constraint constraint = CONSTRAINT_NONE;
	int killed = 0;

	// 调用block通知链oom_nofify_list中的函数
	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);

	if (freed > 0)
		/* Got some memory back in the last second. */
		return;

	/*
	 * If current has a pending SIGKILL or is exiting, then automatically
	 * select it. The goal is to allow it to allocate so that it may
	 * quickly exit and free its memory.
	 */
	/*
	 * 如果当前进程有pending的SIGKILL(9)信号,或者正在退出,则选择当前进程来kill,
	 * 这样可以最快的达到释放内存的目的。
	 */
	if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
		set_thread_flag(TIF_MEMDIE);
		return;
	}

	/*
	 * Check if there were limitations on the allocation (only relevant for
	 * NUMA) that may require different handling.
	 */
	/*
	 * 检查是否有限制,有几种不同的限制策略,仅用于NUMA场景
	 */
	constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
						&totalpages);
	mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
	// 检查是否配置了/proc/sys/kernel/panic_on_oom,如果是则直接触发panic
	check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);

	/*
	 * 检查是否配置了oom_kill_allocating_task,即是否需要kill current进程来
	 * 回收内存,如果是,且current进程是killable的,则kill current进程。
	 */
	if (sysctl_oom_kill_allocating_task && current->mm &&
	 !oom_unkillable_task(current, NULL, nodemask) &&
	 current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
		get_task_struct(current);
		// kill被选中的进程。
		oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
				 nodemask,
				 "Out of memory (oom_kill_allocating_task)");
		goto out;
	}

	// 根据既定策略选择需要kill的process。
	p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
	/* Found nothing?!?! Either we hang forever, or we panic. */
	/*
	 * 如果没有选出来,即没有可kill的进程,那么直接panic
	 * 通常不会走到这个流程,但也有例外,比如,当被选中的进程处于D状态,或者正在被kill
	 */
	if (!p) {
		dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
		panic("Out of memory and no killable processes...\n");
	}
	// kill掉被选中的进程,以释放内存。
	if (PTR_ERR(p) != -1UL) {
		oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
				 nodemask, "Out of memory");
		killed = 1;
	}
out:
	/*
	 * Give the killed threads a good chance of exiting before trying to
	 * allocate memory again.
	 */
	/*
	 * 在重新分配内存之前,给被kill的进程1s的时间完成exit相关处理,通常情况
	 * 下,1s应该够了。
	 */
	if (killed)
		schedule_timeout_killable(1);
}

out_of_memory->select_bad_process

通过select_bad_process函数选择被kill的进程,其基本流程为:

1、遍历系统中的所有进程,进行"点数"计算

2、进行一些特殊情况的处理,比如: 优先选择触发OOM的进程、不处理正在exit的进程等。

3、计算"点数",选择点数最大的进程。通过函数oom_badness()

代码注释和分析如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
/*
  * OOM流程中,用来选择被kill的进程的函数
  * @ppoints:点数,用来计算每个进程被"选中"可能性,点数越高,越可能被"选中"
  */
static struct task_struct *select_bad_process(unsigned int *ppoints,
		unsigned long totalpages, const nodemask_t *nodemask,
		bool force_kill)
{
	struct task_struct *g, *p;
	struct task_struct *chosen = NULL;
	unsigned long chosen_points = 0;

	rcu_read_lock();
	// 遍历系统中的所有进程,进行"点数"计算
	do_each_thread(g, p) {
		unsigned int points;

		/*
		 * 进行一些特殊情况的处理,比如: 优先选择触发OOM的进程、不处理
		 * 正在exit的进程等。
		 */        
		switch (oom_scan_process_thread(p, totalpages, nodemask,
						force_kill)) {
		case OOM_SCAN_SELECT:
			chosen = p;
			chosen_points = ULONG_MAX;
			/* fall through */
		case OOM_SCAN_CONTINUE:
			continue;
		case OOM_SCAN_ABORT:
			rcu_read_unlock();
			return ERR_PTR(-1UL);
		case OOM_SCAN_OK:
			break;
		};
		// 计算"点数",选择点数最大的进程。
		points = oom_badness(p, NULL, nodemask, totalpages);
		if (points > chosen_points) {
			chosen = p;
			chosen_points = points;
		}
	} while_each_thread(g, p);
	if (chosen)
		get_task_struct(chosen);
	rcu_read_unlock();

	*ppoints = chosen_points * 1000 / totalpages;
	return chosen;
}

out_of_memory->select_bad_process->oom_scan_process_thread

oom_scan_process_thread函数的分析和注释如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
		unsigned long totalpages, const nodemask_t *nodemask,
		bool force_kill)
{
	// 如果进程正在exit
	if (task->exit_state)
		return OOM_SCAN_CONTINUE;
	/*
	 * 如果进程不能被kill,比如: init进程或进程在nodemask对应的节点上,
	 * 没有可以释放的内存。
	 */
	if (oom_unkillable_task(task, NULL, nodemask))
		return OOM_SCAN_CONTINUE;

	/*
	 * This task already has access to memory reserves and is being killed.
	 * Don't allow any other task to have access to the reserves.
	 */
	/*
	 * 如果有进程正在被OOM流程kill,那么应该有内存可以释放了,就不需要再kill
	 * 其它进程了,此时返回abort,结束oom kill流程。
	 */
	if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
		if (unlikely(frozen(task)))
			__thaw_task(task);
		if (!force_kill)
			return OOM_SCAN_ABORT;
	}
	// 如果不存在mm了(可能进程刚退出了)
	if (!task->mm)
		return OOM_SCAN_CONTINUE;

	/*
	 * If task is allocating a lot of memory and has been marked to be
	 * killed first if it triggers an oom, then select it.
	 */
	// 优先选择触发OOM的进程。
	if (oom_task_origin(task))
		return OOM_SCAN_SELECT;

	if (task->flags & PF_EXITING && !force_kill) {
		/*
		 * If this task is not being ptraced on exit, then wait for it
		 * to finish before killing some other task unnecessarily.
		 */
		if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
			return OOM_SCAN_ABORT;
	}
	return OOM_SCAN_OK;
}

out_of_memory->select_bad_process->oom_badness

oom_badness用于计算进程的“点数”,点数最高者被选中,代码注释和分析如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
/*
 * 计算进程"点数"(代表进程被选中的可能性)的函数,点数根据进程占用的物理内存来计算
 * 物理内存占用越多,被选中的可能性越大。root processes有3%的bonus。
 */
unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
			 const nodemask_t *nodemask, unsigned long totalpages)
{
	long points;
	long adj;

	if (oom_unkillable_task(p, memcg, nodemask))
		return 0;
	// 确认进程是否还存在
	p = find_lock_task_mm(p);
	if (!p)
		return 0;

	adj = (long)p->signal->oom_score_adj;
	if (adj == OOM_SCORE_ADJ_MIN) {
		task_unlock(p);
		return 0;
	}

	/*
	 * The baseline for the badness score is the proportion of RAM that each
	 * task's rss, pagetable and swap space use.
	 */
	// 点数=rss(驻留内存/占用物理内存)+pte数+交换分区用量
	points = get_mm_rss(p->mm) + p->mm->nr_ptes +
		 get_mm_counter(p->mm, MM_SWAPENTS);
	task_unlock(p);

	/*
	 * Root processes get 3% bonus, just like the __vm_enough_memory()
	 * implementation used by LSMs.
	 */
	/*
	 * root用户启动的进程,有总 内存*3% 的bonus,就是说可以使用比其它进程多3%的内存
	 * 3%=30/1000
	 */
	if (has_capability_noaudit(p, CAP_SYS_ADMIN))
		adj -= 30;

	/* Normalize to oom_score_adj units */
	// 归一化"点数"单位
	adj *= totalpages / 1000;
	points += adj;

	/*
	 * Never return 0 for an eligible task regardless of the root bonus and
	 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
	 */
	return points > 0 ? points : 1;
}

out_of_memory->oom_kill_process

oom_kill_process()函数用于:kill被选中的进程,其实就是给指定进程发送SIGKILL信号,待被选中进程返回用户态时,进行信号处理。

相关代码注释和分析如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
/*
  * kill被选中的进程,在OOM流程中被调用
  */
void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
		 unsigned int points, unsigned long totalpages,
		 struct mem_cgroup *memcg, nodemask_t *nodemask,
		 const char *message)
{
	struct task_struct *victim = p;
	struct task_struct *child;
	struct task_struct *t = p;
	struct mm_struct *mm;
	unsigned int victim_points = 0;
	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
					 DEFAULT_RATELIMIT_BURST);

	/*
	 * If the task is already exiting, don't alarm the sysadmin or kill
	 * its children or threads, just set TIF_MEMDIE so it can die quickly
	 */
	/*
	 * 如果进程正在exiting,就没有必要再kill它了,直接设置TIF_MEMDIE,然后返回。
	*/
	if (p->flags & PF_EXITING) {
		set_tsk_thread_flag(p, TIF_MEMDIE);
		put_task_struct(p);
		return;
	}

	if (__ratelimit(&oom_rs))
		dump_header(p, gfp_mask, order, memcg, nodemask);

	task_lock(p);
	pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
		message, task_pid_nr(p), p->comm, points);
	task_unlock(p);

	/*
	 * If any of p's children has a different mm and is eligible for kill,
	 * the one with the highest oom_badness() score is sacrificed for its
	 * parent. This attempts to lose the minimal amount of work done while
	 * still freeing memory.
	 */
	/*
	 * 如果被选中的进程的子进程,不跟其共享mm(通常是这样),且膐om_badness的
	 * 得分更高,那么重新选择该子进程为被kill的进程。
	 */
	read_lock(&tasklist_lock);
	do {
		// 遍历被选中进程的所有子进程
		list_for_each_entry(child, &t->children, sibling) {
			unsigned int child_points;

			// 如果不共享mm
			if (child->mm == p->mm)
				continue;
			/*
			 * oom_badness() returns 0 if the thread is unkillable
			 */
			// 计算child?om_badness得分
			child_points = oom_badness(child, memcg, nodemask,
								totalpages);
			// 如果child得分更高,则将被选中进程换成child
			if (child_points > victim_points) {
				put_task_struct(victim);
				victim = child;
				victim_points = child_points;
				get_task_struct(victim);
			}
		}
	} while_each_thread(p, t);
	read_unlock(&tasklist_lock);

	rcu_read_lock();
	/*
	 * 遍历确认被选中进程的线程组,判断是否还存在task_struct->mm,如果不存在
	 * (有可能这个时候进程退出了,或释放了mm),就没必要再kill了。
	 * 如果存在则选择线程组中的进程。
	 */
	p = find_lock_task_mm(victim);
	if (!p) {
		rcu_read_unlock();
		put_task_struct(victim);
		return;
	// 如果新选择的进程跟之前的不是同一个,那么更新victim。
	} else if (victim != p) {
		get_task_struct(p);
		put_task_struct(victim);
		victim = p;
	}

	/* mm cannot safely be dereferenced after task_unlock(victim) */
	mm = victim->mm;
	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
		task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
		K(get_mm_counter(victim->mm, MM_ANONPAGES)),
		K(get_mm_counter(victim->mm, MM_FILEPAGES)));
	task_unlock(victim);

	/*
	 * Kill all user processes sharing victim->mm in other thread groups, if
	 * any. They don't get access to memory reserves, though, to avoid
	 * depletion of all memory. This prevents mm->mmap_sem livelock when an
	 * oom killed thread cannot exit because it requires the semaphore and
	 * its contended by another thread trying to allocate memory itself.
	 * That thread will now get access to memory reserves since it has a
	 * pending fatal signal.
	 */
	/*
	 * 遍历系统中的所有进程,寻找在其它线程组中,跟被选中进程(victim)共享mm结构
	 * 的进程(内核线程除外),共享mm结构即共享进程地址空间,比如fork后exec之前,
	 * 父子进程是共享mm的,回收内存必须要将共享mm的所有进程都kill掉。
	 */
	for_each_process(p)
		if (p->mm == mm && !same_thread_group(p, victim) &&
		 !(p->flags & PF_KTHREAD)) {
			if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
				continue;

			// 进行task_struct相关操作时,通常需要获取该锁。
			task_lock(p);    /* Protect ->comm from prctl() */
			pr_err("Kill process %d (%s) sharing same memory\n",
				task_pid_nr(p), p->comm);
			task_unlock(p);
			// 通过向被选中的进程发送kill信号,来kill进程。
			do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
		}
	rcu_read_unlock();

	// 进程设置TIF_MEMDIE标记,表示进程正在被oom killer终止中。
	set_tsk_thread_flag(victim, TIF_MEMDIE);
	/*
	 * 最终通过向被选中的进程发送kill信号,来kill进程,被kill的进程在从内核态
	 * 返回用户态时,进行信号处理。
	 * 被选中的进程可以是自己(current),则current进程会在oom流程执行完成后,返回
	 * 用户态时,处理信号。
	 */
	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
	put_task_struct(victim);
}