/* drivers/misc/lowmemorykiller.c
 *
 * The lowmemorykiller driver lets user-space specify a set of memory thresholds
 * where processes with a range of oom_score_adj values will get killed. Specify
 * the minimum oom_score_adj values in
 * /sys/module/lowmemorykiller/parameters/adj and the number of free pages in
 * /sys/module/lowmemorykiller/parameters/minfree. Both files take a comma
 * separated list of numbers in ascending order.
 *
 * For example, write "0,8" to /sys/module/lowmemorykiller/parameters/adj and
 * "1024,4096" to /sys/module/lowmemorykiller/parameters/minfree to kill
 * processes with an oom_score_adj value of 8 or higher when the free memory
 * drops below 4096 pages and kill processes with an oom_score_adj value of 0
 * or higher when the free memory drops below 1024 pages.
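 *
 * Since minfree is measured in pages, the byte thresholds scale with
 * PAGE_SIZE: with the common 4 KiB page size, the example above kills at
 * roughly 16 MiB (4096 pages) and 4 MiB (1024 pages) of free memory.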
 *
 * The driver considers memory used for caches to be free, but if a large
 * percentage of the cached memory is locked this can be very inaccurate
 * and processes may not get killed until the normal oom killer is triggered.
 *
 * Copyright (C) 2007-2008 Google, Inc.
 *
 * This software is licensed under the terms of the GNU General Public
 * License version 2, as published by the Free Software Foundation, and
 * may be copied, distributed, and modified under those terms.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/init.h>
#include <linux/moduleparam.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/oom.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/swap.h>
#include <linux/rcupdate.h>
#include <linux/profile.h>
#include <linux/notifier.h>
#include <linux/circ_buf.h>
#include <linux/proc_fs.h>
#include <linux/slab.h>
#include <linux/poll.h>

#define CREATE_TRACE_POINTS
#include "trace/lowmemorykiller.h"

static uint32_t lowmem_debug_level = 1;
static short lowmem_adj[6] = {
	0,
	1,
	6,
	12,
};
static int lowmem_adj_size = 4;
static int lowmem_minfree[6] = {
	3 * 512,	/* 6MB */
	2 * 1024,	/* 8MB */
	4 * 1024,	/* 16MB */
	16 * 1024,	/* 64MB */
};
static int lowmem_minfree_size = 4;

static unsigned long lowmem_deathpending_timeout;

#define lowmem_print(level, x...)			\
	do {						\
		if (lowmem_debug_level >= (level))	\
			pr_info(x);			\
	} while (0)

static DECLARE_WAIT_QUEUE_HEAD(event_wait);
static DEFINE_SPINLOCK(lmk_event_lock);
static struct circ_buf event_buffer;
#define MAX_BUFFERED_EVENTS 8
#define MAX_TASKNAME 128

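/*
 * One record per lowmemorykiller kill. Records are queued in a small
 * circular buffer and drained by user space through the
 * /proc/lowmemorykiller seq_file interface created in lmk_event_init().
 */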
struct lmk_event {
	char taskname[MAX_TASKNAME];
	pid_t pid;
	uid_t uid;
	pid_t group_leader_pid;
	unsigned long min_flt;
	unsigned long maj_flt;
	unsigned long rss_in_pages;
	short oom_score_adj;
	short min_score_adj;
	unsigned long long start_time;
	struct list_head list;
};

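/*
 * Record the details of a kill in the event ring buffer and wake any
 * reader polling /proc/lowmemorykiller. If the buffer is full, the event
 * is silently dropped rather than overwriting unread records.
 */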
void handle_lmk_event(struct task_struct *selected, int selected_tasksize,
		      short min_score_adj)
{
	int head;
	int tail;
	struct lmk_event *events;
	struct lmk_event *event;

	spin_lock(&lmk_event_lock);

	head = event_buffer.head;
	tail = READ_ONCE(event_buffer.tail);

	/* Do not continue to log if no space remains in the buffer. */
	if (CIRC_SPACE(head, tail, MAX_BUFFERED_EVENTS) < 1) {
		spin_unlock(&lmk_event_lock);
		return;
	}

	events = (struct lmk_event *) event_buffer.buf;
	event = &events[head];

	strncpy(event->taskname, selected->comm, MAX_TASKNAME);

	event->pid = selected->pid;
	event->uid = from_kuid_munged(current_user_ns(), task_uid(selected));
	if (selected->group_leader)
		event->group_leader_pid = selected->group_leader->pid;
	else
		event->group_leader_pid = -1;
	event->min_flt = selected->min_flt;
	event->maj_flt = selected->maj_flt;
	event->oom_score_adj = selected->signal->oom_score_adj;
	event->start_time = nsec_to_clock_t(selected->real_start_time);
	event->rss_in_pages = selected_tasksize;
	event->min_score_adj = min_score_adj;

	event_buffer.head = (head + 1) & (MAX_BUFFERED_EVENTS - 1);

	spin_unlock(&lmk_event_lock);

	wake_up_interruptible(&event_wait);
}

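/*
 * Emit the oldest buffered event and advance the tail. Returning -EAGAIN
 * when the buffer is empty makes read(2) fail until poll(2) signals that
 * a new record has arrived.
 */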
static int lmk_event_show(struct seq_file *s, void *unused)
{
	struct lmk_event *events = (struct lmk_event *) event_buffer.buf;
	int head;
	int tail;
	struct lmk_event *event;

	spin_lock(&lmk_event_lock);

	head = event_buffer.head;
	tail = event_buffer.tail;

	if (head == tail) {
		spin_unlock(&lmk_event_lock);
		return -EAGAIN;
	}

	event = &events[tail];

	seq_printf(s, "%lu %lu %lu %lu %lu %lu %hd %hd %llu\n%s\n",
		   (unsigned long) event->pid, (unsigned long) event->uid,
		   (unsigned long) event->group_leader_pid, event->min_flt,
		   event->maj_flt, event->rss_in_pages, event->oom_score_adj,
		   event->min_score_adj, event->start_time, event->taskname);

	event_buffer.tail = (tail + 1) & (MAX_BUFFERED_EVENTS - 1);

	spin_unlock(&lmk_event_lock);
	return 0;
}

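/*
 * Report POLLIN while at least one unread event is buffered. A minimal
 * user-space consumer might look like the following (illustrative sketch
 * only; error handling omitted):
 *
 *	int fd = open("/proc/lowmemorykiller", O_RDONLY);
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	while (poll(&pfd, 1, -1) > 0) {
 *		lseek(fd, 0, SEEK_SET);
 *		read(fd, buf, sizeof(buf));
 *	}
 */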
static unsigned int lmk_event_poll(struct file *file, poll_table *wait)
{
	int ret = 0;

	poll_wait(file, &event_wait, wait);
	spin_lock(&lmk_event_lock);
	if (event_buffer.head != event_buffer.tail)
		ret = POLLIN;
	spin_unlock(&lmk_event_lock);
	return ret;
}

static int lmk_event_open(struct inode *inode, struct file *file)
{
	return single_open(file, lmk_event_show, inode->i_private);
}

static const struct file_operations event_file_ops = {
	.open = lmk_event_open,
	.poll = lmk_event_poll,
	.read = seq_read
};

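/*
 * Allocate the event ring buffer and create /proc/lowmemorykiller.
 * MAX_BUFFERED_EVENTS must stay a power of two because head and tail
 * wrap with "& (MAX_BUFFERED_EVENTS - 1)".
 */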
static void lmk_event_init(void)
{
	struct proc_dir_entry *entry;

	event_buffer.head = 0;
	event_buffer.tail = 0;
	event_buffer.buf = kmalloc(
		sizeof(struct lmk_event) * MAX_BUFFERED_EVENTS, GFP_KERNEL);
	if (!event_buffer.buf)
		return;
	entry = proc_create("lowmemorykiller", 0, NULL, &event_file_ops);
	if (!entry)
		pr_err("error creating kernel lmk event file\n");
}

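/*
 * Shrinker count callback: report the total number of anon and file LRU
 * pages. Returning a non-zero count is what prompts the shrinker core to
 * call lowmem_scan(); the value itself is not used as a kill target.
 */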
static unsigned long lowmem_count(struct shrinker *s,
				  struct shrink_control *sc)
{
	return global_page_state(NR_ACTIVE_ANON) +
		global_page_state(NR_ACTIVE_FILE) +
		global_page_state(NR_INACTIVE_ANON) +
		global_page_state(NR_INACTIVE_FILE);
}

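/*
 * Core of the driver. Computes the free and file-backed page counts,
 * maps them through the minfree/adj tables to a minimum oom_score_adj,
 * then scans all processes and kills the one with the highest
 * oom_score_adj at or above that minimum, breaking ties by largest RSS.
 */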
static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
{
	struct task_struct *tsk;
	struct task_struct *selected = NULL;
	unsigned long rem = 0;
	int tasksize;
	int i;
	short min_score_adj = OOM_SCORE_ADJ_MAX + 1;
	int minfree = 0;
	int selected_tasksize = 0;
	short selected_oom_score_adj;
	int array_size = ARRAY_SIZE(lowmem_adj);
	int other_free = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
	int other_file = global_page_state(NR_FILE_PAGES) -
				global_page_state(NR_SHMEM) -
				global_page_state(NR_UNEVICTABLE) -
				total_swapcache_pages();

	/* Find the first threshold that both free and file counts fall below. */
	if (lowmem_adj_size < array_size)
		array_size = lowmem_adj_size;
	if (lowmem_minfree_size < array_size)
		array_size = lowmem_minfree_size;
	for (i = 0; i < array_size; i++) {
		minfree = lowmem_minfree[i];
		if (other_free < minfree && other_file < minfree) {
			min_score_adj = lowmem_adj[i];
			break;
		}
	}

	lowmem_print(3, "lowmem_scan %lu, %x, ofree %d %d, ma %hd\n",
		     sc->nr_to_scan, sc->gfp_mask, other_free,
		     other_file, min_score_adj);

	if (min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
		lowmem_print(5, "lowmem_scan %lu, %x, return 0\n",
			     sc->nr_to_scan, sc->gfp_mask);
		return 0;
	}

	selected_oom_score_adj = min_score_adj;

	rcu_read_lock();
	for_each_process(tsk) {
		struct task_struct *p;
		short oom_score_adj;

		if (tsk->flags & PF_KTHREAD)
			continue;

		p = find_lock_task_mm(tsk);
		if (!p)
			continue;

		/* Back off while a previous victim is still exiting. */
		if (test_tsk_thread_flag(p, TIF_MEMDIE) &&
		    time_before_eq(jiffies, lowmem_deathpending_timeout)) {
			task_unlock(p);
			rcu_read_unlock();
			return 0;
		}
		oom_score_adj = p->signal->oom_score_adj;
		if (oom_score_adj < min_score_adj) {
			task_unlock(p);
			continue;
		}
		tasksize = get_mm_rss(p->mm);
		task_unlock(p);
		if (tasksize <= 0)
			continue;
		if (selected) {
			if (oom_score_adj < selected_oom_score_adj)
				continue;
			if (oom_score_adj == selected_oom_score_adj &&
			    tasksize <= selected_tasksize)
				continue;
		}
		selected = p;
		selected_tasksize = tasksize;
		selected_oom_score_adj = oom_score_adj;
		lowmem_print(2, "select '%s' (%d), adj %hd, size %d, to kill\n",
			     p->comm, p->pid, oom_score_adj, tasksize);
	}
	if (selected) {
		long cache_size = other_file * (long)(PAGE_SIZE / 1024);
		long cache_limit = minfree * (long)(PAGE_SIZE / 1024);
		long free = other_free * (long)(PAGE_SIZE / 1024);

		task_lock(selected);
		send_sig(SIGKILL, selected, 0);
		/*
		 * FIXME: lowmemorykiller shouldn't abuse global OOM killer
		 * infrastructure. There is no real reason why the selected
		 * task should have access to the memory reserves.
		 */
		if (selected->mm)
			mark_oom_victim(selected);
		task_unlock(selected);
		trace_lowmemory_kill(selected, cache_size, cache_limit, free);
		lowmem_print(1, "Killing '%s' (%d) (tgid %d), adj %hd,\n"
				" to free %ldkB on behalf of '%s' (%d) because\n"
				" cache %ldkB is below limit %ldkB for oom_score_adj %hd\n"
				" Free memory is %ldkB above reserved\n",
			     selected->comm, selected->pid, selected->tgid,
			     selected_oom_score_adj,
			     selected_tasksize * (long)(PAGE_SIZE / 1024),
			     current->comm, current->pid,
			     cache_size, cache_limit,
			     min_score_adj,
			     free);
		lowmem_deathpending_timeout = jiffies + HZ;
		rem += selected_tasksize;
		get_task_struct(selected);
	}

	lowmem_print(4, "lowmem_scan %lu, %x, return %lu\n",
		     sc->nr_to_scan, sc->gfp_mask, rem);
	rcu_read_unlock();

	/* Log the event after dropping the RCU lock; the reference taken
	 * above keeps the task valid until we are done with it. */
	if (selected) {
		handle_lmk_event(selected, selected_tasksize, min_score_adj);
		put_task_struct(selected);
	}
	return rem;
}

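/*
 * A high .seeks value makes the shrinker core ask this shrinker to do
 * proportionally less work per invocation; it is exposed below as the
 * "cost" module parameter.
 */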
static struct shrinker lowmem_shrinker = {
	.scan_objects = lowmem_scan,
	.count_objects = lowmem_count,
	.seeks = DEFAULT_SEEKS * 16
};

static int __init lowmem_init(void)
{
	register_shrinker(&lowmem_shrinker);
	lmk_event_init();
	return 0;
}
device_initcall(lowmem_init);

#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_AUTODETECT_OOM_ADJ_VALUES
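/*
 * Map a legacy oom_adj value (-17..15) onto the oom_score_adj scale
 * (-1000..1000). With OOM_DISABLE == -17 and OOM_SCORE_ADJ_MAX == 1000,
 * e.g. oom_adj 8 becomes 8 * 1000 / 17 == 470.
 */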
static short lowmem_oom_adj_to_oom_score_adj(short oom_adj)
{
	if (oom_adj == OOM_ADJUST_MAX)
		return OOM_SCORE_ADJ_MAX;
	else
		return (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
}

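/*
 * Heuristic: if the largest value written to the adj parameter still fits
 * in the legacy oom_adj range (<= OOM_ADJUST_MAX) and would map to a value
 * outside that range, assume the whole array uses old-style oom_adj units
 * and convert it in place. Arrays already in oom_score_adj units are left
 * untouched.
 */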
static void lowmem_autodetect_oom_adj_values(void)
{
	int i;
	short oom_adj;
	short oom_score_adj;
	int array_size = ARRAY_SIZE(lowmem_adj);

	if (lowmem_adj_size < array_size)
		array_size = lowmem_adj_size;

	if (array_size <= 0)
		return;

	oom_adj = lowmem_adj[array_size - 1];
	if (oom_adj > OOM_ADJUST_MAX)
		return;

	oom_score_adj = lowmem_oom_adj_to_oom_score_adj(oom_adj);
	if (oom_score_adj <= OOM_ADJUST_MAX)
		return;

	lowmem_print(1, "lowmem_shrink: convert oom_adj to oom_score_adj:\n");
	for (i = 0; i < array_size; i++) {
		oom_adj = lowmem_adj[i];
		oom_score_adj = lowmem_oom_adj_to_oom_score_adj(oom_adj);
		lowmem_adj[i] = oom_score_adj;
		lowmem_print(1, "oom_adj %d => oom_score_adj %d\n",
			     oom_adj, oom_score_adj);
	}
}

static int lowmem_adj_array_set(const char *val, const struct kernel_param *kp)
{
	int ret;

	ret = param_array_ops.set(val, kp);

	/* HACK: Autodetect oom_adj values in lowmem_adj array */
	lowmem_autodetect_oom_adj_values();

	return ret;
}

static int lowmem_adj_array_get(char *buffer, const struct kernel_param *kp)
{
	return param_array_ops.get(buffer, kp);
}

static void lowmem_adj_array_free(void *arg)
{
	param_array_ops.free(arg);
}

static struct kernel_param_ops lowmem_adj_array_ops = {
	.set = lowmem_adj_array_set,
	.get = lowmem_adj_array_get,
	.free = lowmem_adj_array_free,
};

static const struct kparam_array __param_arr_adj = {
	.max = ARRAY_SIZE(lowmem_adj),
	.num = &lowmem_adj_size,
	.ops = &param_ops_short,
	.elemsize = sizeof(lowmem_adj[0]),
	.elem = lowmem_adj,
};
#endif

/*
 * not really modular, but the easiest way to keep compat with existing
 * bootargs behaviour is to continue using module_param here.
 */
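/*
 * The parameters below appear under /sys/module/lowmemorykiller/parameters/
 * and, because this is built in, can also be set on the kernel command line
 * with the module-name prefix, for example (illustrative values only):
 *
 *	lowmemorykiller.adj=0,100,200 lowmemorykiller.minfree=1024,2048,4096
 */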
module_param_named(cost, lowmem_shrinker.seeks, int, S_IRUGO | S_IWUSR);
#ifdef CONFIG_ANDROID_LOW_MEMORY_KILLER_AUTODETECT_OOM_ADJ_VALUES
module_param_cb(adj, &lowmem_adj_array_ops,
		.arr = &__param_arr_adj,
		S_IRUGO | S_IWUSR);
__MODULE_PARM_TYPE(adj, "array of short");
#else
module_param_array_named(adj, lowmem_adj, short, &lowmem_adj_size,
			 S_IRUGO | S_IWUSR);
#endif
module_param_array_named(minfree, lowmem_minfree, uint, &lowmem_minfree_size,
			 S_IRUGO | S_IWUSR);
module_param_named(debug_level, lowmem_debug_level, uint, S_IRUGO | S_IWUSR);