1 /*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #define LOG_TAG "lowmemorykiller"
18
19 #include <errno.h>
20 #include <inttypes.h>
21 #include <pwd.h>
22 #include <sched.h>
23 #include <stdbool.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <sys/cdefs.h>
27 #include <sys/epoll.h>
28 #include <sys/eventfd.h>
29 #include <sys/mman.h>
30 #include <sys/pidfd.h>
31 #include <sys/socket.h>
32 #include <sys/syscall.h>
33 #include <sys/sysinfo.h>
34 #include <time.h>
35 #include <unistd.h>
36
37 #include <algorithm>
38 #include <array>
39 #include <memory>
40 #include <shared_mutex>
41 #include <vector>
42
43 #include <BpfSyscallWrappers.h>
44 #include <android-base/unique_fd.h>
45 #include <bpf/WaitForProgsLoaded.h>
46 #include <cutils/properties.h>
47 #include <cutils/sockets.h>
48 #include <liblmkd_utils.h>
49 #include <lmkd.h>
50 #include <lmkd_hooks.h>
51 #include <log/log.h>
52 #include <log/log_event_list.h>
53 #include <log/log_time.h>
54 #include <memevents/memevents.h>
55 #include <private/android_filesystem_config.h>
56 #include <processgroup/processgroup.h>
57 #include <psi/psi.h>
58
59 #include "reaper.h"
60 #include "statslog.h"
61 #include "watchdog.h"
62
63 /*
64 * Define LMKD_TRACE_KILLS to record lmkd kills in kernel traces
65 * to profile and correlate with OOM kills
66 */
67 #ifdef LMKD_TRACE_KILLS
68
69 #define ATRACE_TAG ATRACE_TAG_ALWAYS
70 #include <cutils/trace.h>
71
72 static inline void trace_kill_start(const char *desc) {
73 ATRACE_BEGIN(desc);
74 }
75
76 static inline void trace_kill_end() {
77 ATRACE_END();
78 }
79
80 #else /* LMKD_TRACE_KILLS */
81
82 static inline void trace_kill_start(const char *) {}
83 static inline void trace_kill_end() {}
84
85 #endif /* LMKD_TRACE_KILLS */
86
87 #ifndef __unused
88 #define __unused __attribute__((__unused__))
89 #endif
90
91 #define ZONEINFO_PATH "/proc/zoneinfo"
92 #define MEMINFO_PATH "/proc/meminfo"
93 #define VMSTAT_PATH "/proc/vmstat"
94 #define PROC_STATUS_TGID_FIELD "Tgid:"
95 #define PROC_STATUS_RSS_FIELD "VmRSS:"
96 #define PROC_STATUS_SWAP_FIELD "VmSwap:"
97 #define NODE_STATS_MARKER " per-node stats"
98
99 #define PERCEPTIBLE_APP_ADJ 200
100 #define PREVIOUS_APP_ADJ 700
101
102 /* Android Logger event logtags (see event.logtags) */
103 #define KILLINFO_LOG_TAG 10195355
104
105 /* gid containing AID_SYSTEM required */
106 #define INKERNEL_MINFREE_PATH "/sys/module/lowmemorykiller/parameters/minfree"
107 #define INKERNEL_ADJ_PATH "/sys/module/lowmemorykiller/parameters/adj"
108
109 #define EIGHT_MEGA (1 << 23)
110
111 #define TARGET_UPDATE_MIN_INTERVAL_MS 1000
112 #define THRASHING_RESET_INTERVAL_MS 1000
113
114 #define NS_PER_MS (NS_PER_SEC / MS_PER_SEC)
115 #define US_PER_MS (US_PER_SEC / MS_PER_SEC)
116
117 /* Defined as ProcessList.SYSTEM_ADJ in ProcessList.java */
118 #define SYSTEM_ADJ (-900)
119
120 #define STRINGIFY(x) STRINGIFY_INTERNAL(x)
121 #define STRINGIFY_INTERNAL(x) #x
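/*
 * Two-level expansion lets the macro argument expand before stringizing. For example
 * (illustrative value), with LINE_MAX defined as 2048, STRINGIFY(LINE_MAX) yields "2048",
 * which is how the "%" STRINGIFY(LINE_MAX) "s" scanf format in zoneinfo_parse() is built.
 */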
122
123 #define PROCFS_PATH_MAX 64
124
125 /*
126 * Read lmk property with persist.device_config.lmkd_native.<name> overriding ro.lmk.<name>
127 * persist.device_config.lmkd_native.* properties are being set by experiments. If a new property
128 * can be controlled by an experiment then use GET_LMK_PROPERTY instead of property_get_xxx and
129 * add "on property" triggers in lmkd.rc to react to the experiment flag changes.
130 */
131 #define GET_LMK_PROPERTY(type, name, def) \
132 property_get_##type("persist.device_config.lmkd_native." name, \
133 property_get_##type("ro.lmk." name, def))
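/*
 * Illustrative expansion: GET_LMK_PROPERTY(int32, "swap_free_low_percentage", DEF_LOW_SWAP)
 * becomes
 *   property_get_int32("persist.device_config.lmkd_native.swap_free_low_percentage",
 *                      property_get_int32("ro.lmk.swap_free_low_percentage", DEF_LOW_SWAP))
 * so the experiment override wins when set and the ro.lmk value (or the default) applies otherwise.
 */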
134
135 /*
136 * PSI monitor tracking window size.
137 * PSI monitor generates events at most once per window,
138 * therefore we poll memory state for the duration of
139 * PSI_WINDOW_SIZE_MS after the event happens.
140 */
141 #define PSI_WINDOW_SIZE_MS 1000
142 /* Polling period after PSI signal when pressure is high */
143 #define PSI_POLL_PERIOD_SHORT_MS 10
144 /* Polling period after PSI signal when pressure is low */
145 #define PSI_POLL_PERIOD_LONG_MS 100
146
147 #define FAIL_REPORT_RLIMIT_MS 1000
148
149 /*
150 * System property defaults
151 */
152 /* ro.lmk.swap_free_low_percentage property defaults */
153 #define DEF_LOW_SWAP 10
154 /* ro.lmk.thrashing_limit property defaults */
155 #define DEF_THRASHING_LOWRAM 30
156 #define DEF_THRASHING 100
157 /* ro.lmk.thrashing_limit_decay property defaults */
158 #define DEF_THRASHING_DECAY_LOWRAM 50
159 #define DEF_THRASHING_DECAY 10
160 /* ro.lmk.psi_partial_stall_ms property defaults */
161 #define DEF_PARTIAL_STALL_LOWRAM 200
162 #define DEF_PARTIAL_STALL 70
163 /* ro.lmk.psi_complete_stall_ms property defaults */
164 #define DEF_COMPLETE_STALL 700
165 /* ro.lmk.direct_reclaim_threshold_ms property defaults */
166 #define DEF_DIRECT_RECL_THRESH_MS 0
167 /* ro.lmk.swap_compression_ratio property defaults */
168 #define DEF_SWAP_COMP_RATIO 1
169 /* ro.lmk.lowmem_min_oom_score defaults */
170 #define DEF_LOWMEM_MIN_SCORE (PREVIOUS_APP_ADJ + 1)
171
172 #define LMKD_REINIT_PROP "lmkd.reinit"
173
174 #define WATCHDOG_TIMEOUT_SEC 2
175
176 /* default to old in-kernel interface if no memory pressure events */
177 static bool use_inkernel_interface = true;
178 static bool has_inkernel_module;
179
180 /* memory pressure levels */
181 enum vmpressure_level {
182 VMPRESS_LEVEL_LOW = 0,
183 VMPRESS_LEVEL_MEDIUM,
184 VMPRESS_LEVEL_CRITICAL,
185 VMPRESS_LEVEL_COUNT
186 };
187
188 static const char *level_name[] = {
189 "low",
190 "medium",
191 "critical"
192 };
193
194 struct {
195 int64_t min_nr_free_pages; /* recorded but not used yet */
196 int64_t max_nr_free_pages;
197 } low_pressure_mem = { -1, -1 };
198
199 struct psi_threshold {
200 enum psi_stall_type stall_type;
201 int threshold_ms;
202 };
203
204 /* Listener for direct reclaim and kswapd state changes */
205 static std::unique_ptr<android::bpf::memevents::MemEventListener> memevent_listener(nullptr);
206 static struct timespec direct_reclaim_start_tm;
207 static struct timespec kswapd_start_tm;
208
209 static int level_oomadj[VMPRESS_LEVEL_COUNT];
210 static int mpevfd[VMPRESS_LEVEL_COUNT] = { -1, -1, -1 };
211 static bool pidfd_supported;
212 static int last_kill_pid_or_fd = -1;
213 static struct timespec last_kill_tm;
214 enum vmpressure_level prev_level = VMPRESS_LEVEL_LOW;
215 static bool monitors_initialized;
216 static bool boot_completed_handled = false;
217 static bool mem_event_update_zoneinfo_supported;
218
219 /* lmkd configurable parameters */
220 static bool debug_process_killing;
221 static bool enable_pressure_upgrade;
222 static int64_t upgrade_pressure;
223 static int64_t downgrade_pressure;
224 static bool low_ram_device;
225 static bool kill_heaviest_task;
226 static unsigned long kill_timeout_ms;
227 static int pressure_after_kill_min_score;
228 static bool use_minfree_levels;
229 static bool per_app_memcg;
230 static int swap_free_low_percentage;
231 static int psi_partial_stall_ms;
232 static int psi_complete_stall_ms;
233 static int thrashing_limit_pct;
234 static int thrashing_limit_decay_pct;
235 static int thrashing_critical_pct;
236 static int swap_util_max;
237 static int64_t filecache_min_kb;
238 static int64_t stall_limit_critical;
239 static bool use_psi_monitors = false;
240 static int kpoll_fd;
241 static bool delay_monitors_until_boot;
242 static int direct_reclaim_threshold_ms;
243 static int swap_compression_ratio;
244 static int lowmem_min_oom_score;
245 static struct psi_threshold psi_thresholds[VMPRESS_LEVEL_COUNT] = {
246 { PSI_SOME, 70 }, /* 70ms out of 1sec for partial stall */
247 { PSI_SOME, 100 }, /* 100ms out of 1sec for partial stall */
248 { PSI_FULL, 70 }, /* 70ms out of 1sec for complete stall */
249 };
250
251 static uint64_t mp_event_count;
252
253 static android_log_context ctx;
254 static Reaper reaper;
255 static int reaper_comm_fd[2];
256
257 enum polling_update {
258 POLLING_DO_NOT_CHANGE,
259 POLLING_START,
260 POLLING_PAUSE,
261 POLLING_RESUME,
262 };
263
264 /*
265 * Data used for periodic polling for the memory state of the device.
266 * Note that when system is not polling poll_handler is set to NULL,
267 * when polling starts poll_handler gets set and is reset back to
268 * NULL when polling stops.
269 */
270 struct polling_params {
271 struct event_handler_info* poll_handler;
272 struct event_handler_info* paused_handler;
273 struct timespec poll_start_tm;
274 struct timespec last_poll_tm;
275 int polling_interval_ms;
276 enum polling_update update;
277 };
278
279 /* data required to handle events */
280 struct event_handler_info {
281 int data;
282 void (*handler)(int data, uint32_t events, struct polling_params *poll_params);
283 };
284
285 /* data required to handle socket events */
286 struct sock_event_handler_info {
287 int sock;
288 pid_t pid;
289 uint32_t async_event_mask;
290 struct event_handler_info handler_info;
291 };
292
293 /* max supported number of data connections (AMS, init, tests) */
294 #define MAX_DATA_CONN 3
295
296 /* socket event handler data */
297 static struct sock_event_handler_info ctrl_sock;
298 static struct sock_event_handler_info data_sock[MAX_DATA_CONN];
299
300 /* vmpressure event handler data */
301 static struct event_handler_info vmpressure_hinfo[VMPRESS_LEVEL_COUNT];
302
303 /*
304 * 1 ctrl listen socket, 3 ctrl data sockets, 3 memory pressure levels,
305 * 1 lmk events + 1 fd to wait for process death + 1 fd to receive kill failure notifications
306 * + 1 fd to receive memevent_listener notifications
307 */
308 #define MAX_EPOLL_EVENTS (1 + MAX_DATA_CONN + VMPRESS_LEVEL_COUNT + 1 + 1 + 1 + 1)
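/* With MAX_DATA_CONN == 3 and VMPRESS_LEVEL_COUNT == 3 this evaluates to 11 epoll events. */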
309 static int epollfd;
310 static int maxevents;
311
312 /* OOM score values used by both kernel and framework */
313 #define OOM_SCORE_ADJ_MIN (-1000)
314 #define OOM_SCORE_ADJ_MAX 1000
315
316 static std::array<int, MAX_TARGETS> lowmem_adj;
317 static std::array<int, MAX_TARGETS> lowmem_minfree;
318 static int lowmem_targets_size;
319
320 /* Fields to parse in /proc/zoneinfo */
321 /* zoneinfo per-zone fields */
322 enum zoneinfo_zone_field {
323 ZI_ZONE_NR_FREE_PAGES = 0,
324 ZI_ZONE_MIN,
325 ZI_ZONE_LOW,
326 ZI_ZONE_HIGH,
327 ZI_ZONE_PRESENT,
328 ZI_ZONE_NR_FREE_CMA,
329 ZI_ZONE_FIELD_COUNT
330 };
331
332 static const char* const zoneinfo_zone_field_names[ZI_ZONE_FIELD_COUNT] = {
333 "nr_free_pages",
334 "min",
335 "low",
336 "high",
337 "present",
338 "nr_free_cma",
339 };
340
341 /* zoneinfo per-zone special fields */
342 enum zoneinfo_zone_spec_field {
343 ZI_ZONE_SPEC_PROTECTION = 0,
344 ZI_ZONE_SPEC_PAGESETS,
345 ZI_ZONE_SPEC_FIELD_COUNT,
346 };
347
348 static const char* const zoneinfo_zone_spec_field_names[ZI_ZONE_SPEC_FIELD_COUNT] = {
349 "protection:",
350 "pagesets",
351 };
352
353 /* see __MAX_NR_ZONES definition in kernel mmzone.h */
354 #define MAX_NR_ZONES 6
355
356 union zoneinfo_zone_fields {
357 struct {
358 int64_t nr_free_pages;
359 int64_t min;
360 int64_t low;
361 int64_t high;
362 int64_t present;
363 int64_t nr_free_cma;
364 } field;
365 int64_t arr[ZI_ZONE_FIELD_COUNT];
366 };
367
368 struct zoneinfo_zone {
369 union zoneinfo_zone_fields fields;
370 int64_t protection[MAX_NR_ZONES];
371 int64_t max_protection;
372 };
373
374 /* zoneinfo per-node fields */
375 enum zoneinfo_node_field {
376 ZI_NODE_NR_INACTIVE_FILE = 0,
377 ZI_NODE_NR_ACTIVE_FILE,
378 ZI_NODE_FIELD_COUNT
379 };
380
381 static const char* const zoneinfo_node_field_names[ZI_NODE_FIELD_COUNT] = {
382 "nr_inactive_file",
383 "nr_active_file",
384 };
385
386 union zoneinfo_node_fields {
387 struct {
388 int64_t nr_inactive_file;
389 int64_t nr_active_file;
390 } field;
391 int64_t arr[ZI_NODE_FIELD_COUNT];
392 };
393
394 struct zoneinfo_node {
395 int id;
396 int zone_count;
397 struct zoneinfo_zone zones[MAX_NR_ZONES];
398 union zoneinfo_node_fields fields;
399 };
400
401 /* for now two memory nodes is more than enough */
402 #define MAX_NR_NODES 2
403
404 struct zoneinfo {
405 int node_count;
406 struct zoneinfo_node nodes[MAX_NR_NODES];
407 int64_t totalreserve_pages;
408 int64_t total_inactive_file;
409 int64_t total_active_file;
410 };
411
412 /* Fields to parse in /proc/meminfo */
413 enum meminfo_field {
414 MI_NR_FREE_PAGES = 0,
415 MI_CACHED,
416 MI_SWAP_CACHED,
417 MI_BUFFERS,
418 MI_SHMEM,
419 MI_UNEVICTABLE,
420 MI_TOTAL_SWAP,
421 MI_FREE_SWAP,
422 MI_ACTIVE_ANON,
423 MI_INACTIVE_ANON,
424 MI_ACTIVE_FILE,
425 MI_INACTIVE_FILE,
426 MI_SRECLAIMABLE,
427 MI_SUNRECLAIM,
428 MI_KERNEL_STACK,
429 MI_PAGE_TABLES,
430 MI_ION_HELP,
431 MI_ION_HELP_POOL,
432 MI_CMA_FREE,
433 MI_FIELD_COUNT
434 };
435
436 static const char* const meminfo_field_names[MI_FIELD_COUNT] = {
437 "MemFree:",
438 "Cached:",
439 "SwapCached:",
440 "Buffers:",
441 "Shmem:",
442 "Unevictable:",
443 "SwapTotal:",
444 "SwapFree:",
445 "Active(anon):",
446 "Inactive(anon):",
447 "Active(file):",
448 "Inactive(file):",
449 "SReclaimable:",
450 "SUnreclaim:",
451 "KernelStack:",
452 "PageTables:",
453 "ION_heap:",
454 "ION_heap_pool:",
455 "CmaFree:",
456 };
457
458 union meminfo {
459 struct {
460 int64_t nr_free_pages;
461 int64_t cached;
462 int64_t swap_cached;
463 int64_t buffers;
464 int64_t shmem;
465 int64_t unevictable;
466 int64_t total_swap;
467 int64_t free_swap;
468 int64_t active_anon;
469 int64_t inactive_anon;
470 int64_t active_file;
471 int64_t inactive_file;
472 int64_t sreclaimable;
473 int64_t sunreclaimable;
474 int64_t kernel_stack;
475 int64_t page_tables;
476 int64_t ion_heap;
477 int64_t ion_heap_pool;
478 int64_t cma_free;
479 /* fields below are calculated rather than read from the file */
480 int64_t nr_file_pages;
481 int64_t total_gpu_kb;
482 int64_t easy_available;
483 } field;
484 int64_t arr[MI_FIELD_COUNT];
485 };
486
487 /* Fields to parse in /proc/vmstat */
488 enum vmstat_field {
489 VS_FREE_PAGES,
490 VS_INACTIVE_FILE,
491 VS_ACTIVE_FILE,
492 VS_WORKINGSET_REFAULT,
493 VS_WORKINGSET_REFAULT_FILE,
494 VS_PGSCAN_KSWAPD,
495 VS_PGSCAN_DIRECT,
496 VS_PGSCAN_DIRECT_THROTTLE,
497 VS_PGREFILL,
498 VS_FIELD_COUNT
499 };
500
501 static const char* const vmstat_field_names[VS_FIELD_COUNT] = {
502 "nr_free_pages",
503 "nr_inactive_file",
504 "nr_active_file",
505 "workingset_refault",
506 "workingset_refault_file",
507 "pgscan_kswapd",
508 "pgscan_direct",
509 "pgscan_direct_throttle",
510 "pgrefill",
511 };
512
513 union vmstat {
514 struct {
515 int64_t nr_free_pages;
516 int64_t nr_inactive_file;
517 int64_t nr_active_file;
518 int64_t workingset_refault;
519 int64_t workingset_refault_file;
520 int64_t pgscan_kswapd;
521 int64_t pgscan_direct;
522 int64_t pgscan_direct_throttle;
523 int64_t pgrefill;
524 } field;
525 int64_t arr[VS_FIELD_COUNT];
526 };
527
528 enum field_match_result {
529 NO_MATCH,
530 PARSE_FAIL,
531 PARSE_SUCCESS
532 };
533
534 struct adjslot_list {
535 struct adjslot_list *next;
536 struct adjslot_list *prev;
537 };
538
539 struct proc {
540 struct adjslot_list asl;
541 int pid;
542 int pidfd;
543 uid_t uid;
544 int oomadj;
545 pid_t reg_pid; /* PID of the process that registered this record */
546 bool valid;
547 struct proc *pidhash_next;
548 };
549
550 struct reread_data {
551 const char* const filename;
552 int fd;
553 };
554
555 #define PIDHASH_SZ 1024
556 static struct proc *pidhash[PIDHASH_SZ];
557 #define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))
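/* Example (illustrative pid): pid_hashfn(1234) == ((1234 >> 8) ^ 1234) & 1023 == 214 */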
558
559 #define ADJTOSLOT(adj) ((adj) + -OOM_SCORE_ADJ_MIN)
560 #define ADJTOSLOT_COUNT (ADJTOSLOT(OOM_SCORE_ADJ_MAX) + 1)
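/*
 * Example: with OOM_SCORE_ADJ_MIN == -1000, ADJTOSLOT(-900) == 100 and ADJTOSLOT(1000) == 2000,
 * so ADJTOSLOT_COUNT == 2001 slots cover the whole oom_score_adj range.
 */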
561
562 // protects procadjslot_list from concurrent access
563 static std::shared_mutex adjslot_list_lock;
564 // procadjslot_list should be modified only from the main thread while exclusively holding
565 // adjslot_list_lock. Readers from non-main threads should hold adjslot_list_lock shared lock.
566 static struct adjslot_list procadjslot_list[ADJTOSLOT_COUNT];
567
568 #define MAX_DISTINCT_OOM_ADJ 32
569 #define KILLCNT_INVALID_IDX 0xFF
570 /*
571 * Because killcnt array is sparse a two-level indirection is used
572 * to keep the size small. killcnt_idx stores index of the element in
573 * killcnt array. Index KILLCNT_INVALID_IDX indicates an unused slot.
574 */
575 static uint8_t killcnt_idx[ADJTOSLOT_COUNT];
576 static uint16_t killcnt[MAX_DISTINCT_OOM_ADJ];
577 static int killcnt_free_idx = 0;
578 static uint32_t killcnt_total = 0;
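/*
 * Illustrative read path (mirrors get_killcnt() below): the number of kills at a given oomadj is
 *   idx = killcnt_idx[ADJTOSLOT(oomadj)];
 *   kills = (idx == KILLCNT_INVALID_IDX) ? 0 : killcnt[idx];
 */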
579
580 static int pagesize;
581 static long page_k; /* page size in kB */
582
583 static bool update_props();
584 static bool init_monitors();
585 static void destroy_monitors();
586 static bool init_memevent_listener_monitoring();
587
588 static int clamp(int low, int high, int value) {
589 return std::max(std::min(value, high), low);
590 }
591
592 static bool parse_int64(const char* str, int64_t* ret) {
593 char* endptr;
594 long long val = strtoll(str, &endptr, 10);
595 if (str == endptr || val > INT64_MAX) {
596 return false;
597 }
598 *ret = (int64_t)val;
599 return true;
600 }
601
602 static int find_field(const char* name, const char* const field_names[], int field_count) {
603 for (int i = 0; i < field_count; i++) {
604 if (!strcmp(name, field_names[i])) {
605 return i;
606 }
607 }
608 return -1;
609 }
610
611 static enum field_match_result match_field(const char* cp, const char* ap,
612 const char* const field_names[],
613 int field_count, int64_t* field,
614 int *field_idx) {
615 int i = find_field(cp, field_names, field_count);
616 if (i < 0) {
617 return NO_MATCH;
618 }
619 *field_idx = i;
620 return parse_int64(ap, field) ? PARSE_SUCCESS : PARSE_FAIL;
621 }
622
623 /*
624 * Read file content from the beginning up to max_len bytes or EOF
625 * whichever happens first.
626 */
627 static ssize_t read_all(int fd, char *buf, size_t max_len)
628 {
629 ssize_t ret = 0;
630 off_t offset = 0;
631
632 while (max_len > 0) {
633 ssize_t r = TEMP_FAILURE_RETRY(pread(fd, buf, max_len, offset));
634 if (r == 0) {
635 break;
636 }
637 if (r == -1) {
638 return -1;
639 }
640 ret += r;
641 buf += r;
642 offset += r;
643 max_len -= r;
644 }
645
646 return ret;
647 }
648
649 /*
650 * Read a new or already opened file from the beginning.
651 * If the file has not been opened yet data->fd should be set to -1.
652 * To be used with files which are read often and possibly during high
653 * memory pressure to minimize file opening which by itself requires kernel
654 * memory allocation and might result in a stall on memory stressed system.
655 */
656 static char *reread_file(struct reread_data *data) {
657 /* start with page-size buffer and increase if needed */
658 static ssize_t buf_size = pagesize;
659 static char *new_buf, *buf = NULL;
660 ssize_t size;
661
662 if (data->fd == -1) {
663 /* First-time buffer initialization */
664 if (!buf && (buf = static_cast<char*>(malloc(buf_size))) == nullptr) {
665 return NULL;
666 }
667
668 data->fd = TEMP_FAILURE_RETRY(open(data->filename, O_RDONLY | O_CLOEXEC));
669 if (data->fd < 0) {
670 ALOGE("%s open: %s", data->filename, strerror(errno));
671 return NULL;
672 }
673 }
674
675 while (true) {
676 size = read_all(data->fd, buf, buf_size - 1);
677 if (size < 0) {
678 ALOGE("%s read: %s", data->filename, strerror(errno));
679 close(data->fd);
680 data->fd = -1;
681 return NULL;
682 }
683 if (size < buf_size - 1) {
684 break;
685 }
686 /*
687 * Since we are reading /proc files we can't use fstat to find out
688 * the real size of the file. Double the buffer size and keep retrying.
689 */
690 if ((new_buf = static_cast<char*>(realloc(buf, buf_size * 2))) == nullptr) {
691 errno = ENOMEM;
692 return NULL;
693 }
694 buf = new_buf;
695 buf_size *= 2;
696 }
697 buf[size] = 0;
698
699 return buf;
700 }
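/*
 * Typical reread_file() usage (same pattern as the meminfo/zoneinfo parsers below): keep the
 * reread_data object static so the fd stays open between reads, e.g.
 *   static struct reread_data file_data = { .filename = MEMINFO_PATH, .fd = -1 };
 *   char *buf = reread_file(&file_data);  // NULL on failure, buffer is reused across calls
 */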
701
702 static bool claim_record(struct proc* procp, pid_t pid) {
703 if (procp->reg_pid == pid) {
704 /* Record already belongs to the registrant */
705 return true;
706 }
707 if (procp->reg_pid == 0) {
708 /* Old registrant is gone, claim the record */
709 procp->reg_pid = pid;
710 return true;
711 }
712 /* The record is owned by another registrant */
713 return false;
714 }
715
716 static void remove_claims(pid_t pid) {
717 int i;
718
719 for (i = 0; i < PIDHASH_SZ; i++) {
720 struct proc* procp = pidhash[i];
721 while (procp) {
722 if (procp->reg_pid == pid) {
723 procp->reg_pid = 0;
724 }
725 procp = procp->pidhash_next;
726 }
727 }
728 }
729
730 static void ctrl_data_close(int dsock_idx) {
731 struct epoll_event epev;
732
733 ALOGI("closing lmkd data connection");
734 if (epoll_ctl(epollfd, EPOLL_CTL_DEL, data_sock[dsock_idx].sock, &epev) == -1) {
735 // Log a warning and keep going
736 ALOGW("epoll_ctl for data connection socket failed; errno=%d", errno);
737 }
738 maxevents--;
739
740 close(data_sock[dsock_idx].sock);
741 data_sock[dsock_idx].sock = -1;
742
743 /* Mark all records of the old registrant as unclaimed */
744 remove_claims(data_sock[dsock_idx].pid);
745 }
746
747 static ssize_t ctrl_data_read(int dsock_idx, char* buf, size_t bufsz, struct ucred* sender_cred) {
748 struct iovec iov = {buf, bufsz};
749 char control[CMSG_SPACE(sizeof(struct ucred))];
750 struct msghdr hdr = {
751 NULL, 0, &iov, 1, control, sizeof(control), 0,
752 };
753 ssize_t ret;
754 ret = TEMP_FAILURE_RETRY(recvmsg(data_sock[dsock_idx].sock, &hdr, 0));
755 if (ret == -1) {
756 ALOGE("control data socket read failed; %s", strerror(errno));
757 return -1;
758 }
759 if (ret == 0) {
760 ALOGE("Got EOF on control data socket");
761 return -1;
762 }
763
764 struct ucred* cred = NULL;
765 struct cmsghdr* cmsg = CMSG_FIRSTHDR(&hdr);
766 while (cmsg != NULL) {
767 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_CREDENTIALS) {
768 cred = (struct ucred*)CMSG_DATA(cmsg);
769 break;
770 }
771 cmsg = CMSG_NXTHDR(&hdr, cmsg);
772 }
773
774 if (cred == NULL) {
775 ALOGE("Failed to retrieve sender credentials");
776 /* Close the connection */
777 ctrl_data_close(dsock_idx);
778 return -1;
779 }
780
781 memcpy(sender_cred, cred, sizeof(struct ucred));
782
783 /* Store PID of the peer */
784 data_sock[dsock_idx].pid = cred->pid;
785
786 return ret;
787 }
788
789 static int ctrl_data_write(int dsock_idx, char* buf, size_t bufsz) {
790 int ret = 0;
791
792 ret = TEMP_FAILURE_RETRY(write(data_sock[dsock_idx].sock, buf, bufsz));
793
794 if (ret == -1) {
795 ALOGE("control data socket write failed; errno=%d", errno);
796 } else if (ret == 0) {
797 ALOGE("Got EOF on control data socket");
798 ret = -1;
799 }
800
801 return ret;
802 }
803
804 /*
805 * Write the pid/uid pair over the data socket, note: all active clients
806 * will receive this unsolicited notification.
807 */
808 static void ctrl_data_write_lmk_kill_occurred(pid_t pid, uid_t uid, int64_t rss_kb) {
809 LMKD_CTRL_PACKET packet;
810 size_t len = lmkd_pack_set_prockills(packet, pid, uid, static_cast<int>(rss_kb));
811
812 for (int i = 0; i < MAX_DATA_CONN; i++) {
813 if (data_sock[i].sock >= 0 && data_sock[i].async_event_mask & 1 << LMK_ASYNC_EVENT_KILL) {
814 ctrl_data_write(i, (char*)packet, len);
815 }
816 }
817 }
818
819 /*
820 * Write the kill_stat/memory_stat over the data socket to be propagated via AMS to statsd
821 */
822 static void stats_write_lmk_kill_occurred(struct kill_stat *kill_st,
823 struct memory_stat *mem_st) {
824 LMK_KILL_OCCURRED_PACKET packet;
825 const size_t len = lmkd_pack_set_kill_occurred(packet, kill_st, mem_st);
826 if (len == 0) {
827 return;
828 }
829
830 for (int i = 0; i < MAX_DATA_CONN; i++) {
831 if (data_sock[i].sock >= 0 && data_sock[i].async_event_mask & 1 << LMK_ASYNC_EVENT_STAT) {
832 ctrl_data_write(i, packet, len);
833 }
834 }
835
836 }
837
838 static void stats_write_lmk_kill_occurred_pid(int pid, struct kill_stat *kill_st,
839 struct memory_stat *mem_st) {
840 kill_st->taskname = stats_get_task_name(pid);
841 if (kill_st->taskname != NULL) {
842 stats_write_lmk_kill_occurred(kill_st, mem_st);
843 }
844 }
845
846 static void poll_kernel(int poll_fd) {
847 if (poll_fd == -1) {
848 // not waiting
849 return;
850 }
851
852 while (1) {
853 char rd_buf[256];
854 int bytes_read = TEMP_FAILURE_RETRY(pread(poll_fd, (void*)rd_buf, sizeof(rd_buf) - 1, 0));
855 if (bytes_read <= 0) break;
856 rd_buf[bytes_read] = '\0';
857
858 int64_t pid;
859 int64_t uid;
860 int64_t group_leader_pid;
861 int64_t rss_in_pages;
862 struct memory_stat mem_st = {};
863 int16_t oom_score_adj;
864 int16_t min_score_adj;
865 int64_t starttime;
866 char* taskname = 0;
867 int64_t rss_kb;
868
869 int fields_read =
870 sscanf(rd_buf,
871 "%" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64
872 " %" SCNd16 " %" SCNd16 " %" SCNd64 "\n%m[^\n]",
873 &pid, &uid, &group_leader_pid, &mem_st.pgfault, &mem_st.pgmajfault,
874 &rss_in_pages, &oom_score_adj, &min_score_adj, &starttime, &taskname);
875
876 /* only the death of the group leader process is logged */
877 if (fields_read == 10 && group_leader_pid == pid) {
878 mem_st.rss_in_bytes = rss_in_pages * pagesize;
879 rss_kb = mem_st.rss_in_bytes >> 10;
880 ctrl_data_write_lmk_kill_occurred((pid_t)pid, (uid_t)uid, rss_kb);
881 mem_st.process_start_time_ns = starttime * (NS_PER_SEC / sysconf(_SC_CLK_TCK));
882
883 struct kill_stat kill_st = {
884 .uid = static_cast<int32_t>(uid),
885 .kill_reason = NONE,
886 .oom_score = oom_score_adj,
887 .min_oom_score = min_score_adj,
888 .free_mem_kb = 0,
889 .free_swap_kb = 0,
890 };
891 stats_write_lmk_kill_occurred_pid(pid, &kill_st, &mem_st);
892 }
893
894 free(taskname);
895 }
896 }
897
898 static bool init_poll_kernel() {
899 kpoll_fd = TEMP_FAILURE_RETRY(open("/proc/lowmemorykiller", O_RDONLY | O_NONBLOCK | O_CLOEXEC));
900
901 if (kpoll_fd < 0) {
902 ALOGE("kernel lmk event file could not be opened; errno=%d", errno);
903 return false;
904 }
905
906 return true;
907 }
908
909 static struct proc *pid_lookup(int pid) {
910 struct proc *procp;
911
912 for (procp = pidhash[pid_hashfn(pid)]; procp && procp->pid != pid;
913 procp = procp->pidhash_next)
914 ;
915
916 return procp;
917 }
918
919 static void adjslot_insert(struct adjslot_list *head, struct adjslot_list *new_element)
920 {
921 struct adjslot_list *next = head->next;
922 new_element->prev = head;
923 new_element->next = next;
924 next->prev = new_element;
925 head->next = new_element;
926 }
927
928 static void adjslot_remove(struct adjslot_list *old)
929 {
930 struct adjslot_list *prev = old->prev;
931 struct adjslot_list *next = old->next;
932 next->prev = prev;
933 prev->next = next;
934 }
935
936 static struct adjslot_list *adjslot_tail(struct adjslot_list *head) {
937 struct adjslot_list *asl = head->prev;
938
939 return asl == head ? NULL : asl;
940 }
941
942 // Should be modified only from the main thread.
943 static void proc_slot(struct proc *procp) {
944 int adjslot = ADJTOSLOT(procp->oomadj);
945 std::scoped_lock lock(adjslot_list_lock);
946
947 adjslot_insert(&procadjslot_list[adjslot], &procp->asl);
948 }
949
950 // Should be modified only from the main thread.
951 static void proc_unslot(struct proc *procp) {
952 std::scoped_lock lock(adjslot_list_lock);
953
954 adjslot_remove(&procp->asl);
955 }
956
957 static void proc_insert(struct proc *procp) {
958 int hval = pid_hashfn(procp->pid);
959
960 procp->pidhash_next = pidhash[hval];
961 pidhash[hval] = procp;
962 proc_slot(procp);
963 }
964
965 // Can be called only from the main thread.
966 static int pid_remove(int pid) {
967 int hval = pid_hashfn(pid);
968 struct proc *procp;
969 struct proc *prevp;
970
971 for (procp = pidhash[hval], prevp = NULL; procp && procp->pid != pid;
972 procp = procp->pidhash_next)
973 prevp = procp;
974
975 if (!procp)
976 return -1;
977
978 if (!prevp)
979 pidhash[hval] = procp->pidhash_next;
980 else
981 prevp->pidhash_next = procp->pidhash_next;
982
983 proc_unslot(procp);
984 /*
985 * Close pidfd here if we are not waiting for corresponding process to die,
986 * in which case stop_wait_for_proc_kill() will close the pidfd later
987 */
988 if (procp->pidfd >= 0 && procp->pidfd != last_kill_pid_or_fd) {
989 close(procp->pidfd);
990 }
991 free(procp);
992 return 0;
993 }
994
995 static void pid_invalidate(int pid) {
996 std::shared_lock lock(adjslot_list_lock);
997 struct proc *procp = pid_lookup(pid);
998
999 if (procp) {
1000 procp->valid = false;
1001 }
1002 }
1003
1004 /*
1005 * Write a string to a file.
1006 * Returns false if the file does not exist.
1007 */
1008 static bool writefilestring(const char *path, const char *s,
1009 bool err_if_missing) {
1010 int fd = open(path, O_WRONLY | O_CLOEXEC);
1011 ssize_t len = strlen(s);
1012 ssize_t ret;
1013
1014 if (fd < 0) {
1015 if (err_if_missing) {
1016 ALOGE("Error opening %s; errno=%d", path, errno);
1017 }
1018 return false;
1019 }
1020
1021 ret = TEMP_FAILURE_RETRY(write(fd, s, len));
1022 if (ret < 0) {
1023 ALOGE("Error writing %s; errno=%d", path, errno);
1024 } else if (ret < len) {
1025 ALOGE("Short write on %s; length=%zd", path, ret);
1026 }
1027
1028 close(fd);
1029 return true;
1030 }
1031
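/*
 * Returns the elapsed time in milliseconds, e.g. from = {1 s, 900 ms} and to = {3 s, 100 ms}
 * gives 2000 + (-800) = 1200 ms (illustrative values).
 */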
1032 static inline long get_time_diff_ms(struct timespec *from,
1033 struct timespec *to) {
1034 return (to->tv_sec - from->tv_sec) * (long)MS_PER_SEC +
1035 (to->tv_nsec - from->tv_nsec) / (long)NS_PER_MS;
1036 }
1037
1038 /* Reads /proc/pid/status into buf. */
1039 static bool read_proc_status(int pid, char *buf, size_t buf_sz) {
1040 char path[PROCFS_PATH_MAX];
1041 int fd;
1042 ssize_t size;
1043
1044 snprintf(path, PROCFS_PATH_MAX, "/proc/%d/status", pid);
1045 fd = open(path, O_RDONLY | O_CLOEXEC);
1046 if (fd < 0) {
1047 return false;
1048 }
1049
1050 size = read_all(fd, buf, buf_sz - 1);
1051 close(fd);
1052 if (size <= 0) {
1053 return false;
1054 }
1055 buf[size] = 0;
1056 return true;
1057 }
1058
1059 /* Looks for tag in buf and parses the first integer */
1060 static bool parse_status_tag(char *buf, const char *tag, int64_t *out) {
1061 char *pos = buf;
1062 while (true) {
1063 pos = strstr(pos, tag);
1064 /* Stop if tag not found or found at the line beginning */
1065 if (pos == NULL || pos == buf || pos[-1] == '\n') {
1066 break;
1067 }
1068 pos++;
1069 }
1070
1071 if (pos == NULL) {
1072 return false;
1073 }
1074
1075 pos += strlen(tag);
1076 while (*pos == ' ') ++pos;
1077 return parse_int64(pos, out);
1078 }
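/*
 * Illustrative use of the two helpers above (mirrors apply_proc_prio() below, values hypothetical):
 *   char buf[4096];
 *   int64_t rss_kb;
 *   if (read_proc_status(pid, buf, sizeof(buf)) &&
 *       parse_status_tag(buf, PROC_STATUS_RSS_FIELD, &rss_kb)) {
 *       // rss_kb now holds the VmRSS value in kB
 *   }
 */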
1079
1080 static int proc_get_size(int pid) {
1081 char path[PROCFS_PATH_MAX];
1082 char line[LINE_MAX];
1083 int fd;
1084 int rss = 0;
1085 int total;
1086 ssize_t ret;
1087
1088 /* gid containing AID_READPROC required */
1089 snprintf(path, PROCFS_PATH_MAX, "/proc/%d/statm", pid);
1090 fd = open(path, O_RDONLY | O_CLOEXEC);
1091 if (fd == -1)
1092 return -1;
1093
1094 ret = read_all(fd, line, sizeof(line) - 1);
1095 if (ret < 0) {
1096 close(fd);
1097 return -1;
1098 }
1099 line[ret] = '\0';
1100
1101 sscanf(line, "%d %d ", &total, &rss);
1102 close(fd);
1103 return rss;
1104 }
1105
1106 static char *proc_get_name(int pid, char *buf, size_t buf_size) {
1107 char path[PROCFS_PATH_MAX];
1108 int fd;
1109 char *cp;
1110 ssize_t ret;
1111
1112 /* gid containing AID_READPROC required */
1113 snprintf(path, PROCFS_PATH_MAX, "/proc/%d/cmdline", pid);
1114 fd = open(path, O_RDONLY | O_CLOEXEC);
1115 if (fd == -1) {
1116 return NULL;
1117 }
1118 ret = read_all(fd, buf, buf_size - 1);
1119 close(fd);
1120 if (ret <= 0) {
1121 return NULL;
1122 }
1123 buf[ret] = '\0';
1124
1125 cp = strchr(buf, ' ');
1126 if (cp) {
1127 *cp = '\0';
1128 }
1129
1130 return buf;
1131 }
1132
1133 static void register_oom_adj_proc(const struct lmk_procprio& proc, struct ucred* cred) {
1134 char val[20];
1135 int soft_limit_mult;
1136 bool is_system_server;
1137 struct passwd *pwdrec;
1138 struct proc* procp;
1139 int oom_adj_score = proc.oomadj;
1140
1141 /* lmkd should not change soft limits for services */
1142 if (proc.ptype == PROC_TYPE_APP && per_app_memcg) {
1143 if (proc.oomadj >= 900) {
1144 soft_limit_mult = 0;
1145 } else if (proc.oomadj >= 800) {
1146 soft_limit_mult = 0;
1147 } else if (proc.oomadj >= 700) {
1148 soft_limit_mult = 0;
1149 } else if (proc.oomadj >= 600) {
1150 // Launcher should be perceptible, don't kill it.
1151 oom_adj_score = 200;
1152 soft_limit_mult = 1;
1153 } else if (proc.oomadj >= 500) {
1154 soft_limit_mult = 0;
1155 } else if (proc.oomadj >= 400) {
1156 soft_limit_mult = 0;
1157 } else if (proc.oomadj >= 300) {
1158 soft_limit_mult = 1;
1159 } else if (proc.oomadj >= 200) {
1160 soft_limit_mult = 8;
1161 } else if (proc.oomadj >= 100) {
1162 soft_limit_mult = 10;
1163 } else if (proc.oomadj >= 0) {
1164 soft_limit_mult = 20;
1165 } else {
1166 // Persistent processes will have a large
1167 // soft limit 512MB.
1168 soft_limit_mult = 64;
1169 }
1170
1171 std::string soft_limit_path;
1172 if (!CgroupGetAttributePathForTask("MemSoftLimit", proc.pid, &soft_limit_path)) {
1173 ALOGE("Querying MemSoftLimit path failed");
1174 return;
1175 }
1176
1177 snprintf(val, sizeof(val), "%d", soft_limit_mult * EIGHT_MEGA);
1178
1179 /*
1180 * system_server process has no memcg under /dev/memcg/apps but should be
1181 * registered with lmkd. This is the best way so far to identify it.
1182 */
1183 is_system_server = (oom_adj_score == SYSTEM_ADJ && (pwdrec = getpwnam("system")) != NULL &&
1184 proc.uid == pwdrec->pw_uid);
1185 writefilestring(soft_limit_path.c_str(), val, !is_system_server);
1186 }
1187
1188 procp = pid_lookup(proc.pid);
1189 if (!procp) {
1190 int pidfd = -1;
1191
1192 if (pidfd_supported) {
1193 pidfd = TEMP_FAILURE_RETRY(pidfd_open(proc.pid, 0));
1194 if (pidfd < 0) {
1195 ALOGE("pidfd_open for pid %d failed; errno=%d", proc.pid, errno);
1196 return;
1197 }
1198 }
1199
1200 procp = static_cast<struct proc*>(calloc(1, sizeof(struct proc)));
1201 if (!procp) {
1202 // Oh, the irony. May need to rebuild our state.
1203 return;
1204 }
1205
1206 procp->pid = proc.pid;
1207 procp->pidfd = pidfd;
1208 procp->uid = proc.uid;
1209 procp->reg_pid = cred->pid;
1210 procp->oomadj = oom_adj_score;
1211 procp->valid = true;
1212 proc_insert(procp);
1213 } else {
1214 if (!claim_record(procp, cred->pid)) {
1215 char buf[LINE_MAX];
1216 char *taskname = proc_get_name(cred->pid, buf, sizeof(buf));
1217 /* Only registrant of the record can remove it */
1218 ALOGE("%s (%d, %d) attempts to modify a process registered by another client",
1219 taskname ? taskname : "A process ", cred->uid, cred->pid);
1220 return;
1221 }
1222 proc_unslot(procp);
1223 procp->oomadj = oom_adj_score;
1224 proc_slot(procp);
1225 }
1226 }
1227
1228 static void apply_proc_prio(const struct lmk_procprio& params, struct ucred* cred) {
1229 char path[PROCFS_PATH_MAX];
1230 char val[20];
1231 int64_t tgid;
1232 char buf[pagesize];
1233
1234 if (params.oomadj < OOM_SCORE_ADJ_MIN || params.oomadj > OOM_SCORE_ADJ_MAX) {
1235 ALOGE("Invalid PROCPRIO oomadj argument %d", params.oomadj);
1236 return;
1237 }
1238
1239 if (params.ptype < PROC_TYPE_FIRST || params.ptype >= PROC_TYPE_COUNT) {
1240 ALOGE("Invalid PROCPRIO process type argument %d", params.ptype);
1241 return;
1242 }
1243
1244 /* Check if registered process is a thread group leader */
1245 if (read_proc_status(params.pid, buf, sizeof(buf))) {
1246 if (parse_status_tag(buf, PROC_STATUS_TGID_FIELD, &tgid) && tgid != params.pid) {
1247 ALOGE("Attempt to register a task that is not a thread group leader "
1248 "(tid %d, tgid %" PRId64 ")",
1249 params.pid, tgid);
1250 return;
1251 }
1252 }
1253
1254 /* gid containing AID_READPROC required */
1255 /* CAP_SYS_RESOURCE required */
1256 /* CAP_DAC_OVERRIDE required */
1257 snprintf(path, sizeof(path), "/proc/%d/oom_score_adj", params.pid);
1258 snprintf(val, sizeof(val), "%d", params.oomadj);
1259 if (!writefilestring(path, val, false)) {
1260 ALOGW("Failed to open %s; errno=%d: process %d might have been killed", path, errno,
1261 params.pid);
1262 /* If this file does not exist the process is dead. */
1263 return;
1264 }
1265
1266 if (use_inkernel_interface) {
1267 stats_store_taskname(params.pid, proc_get_name(params.pid, path, sizeof(path)));
1268 return;
1269 }
1270
1271 register_oom_adj_proc(params, cred);
1272 }
1273
1274 static void cmd_procprio(LMKD_CTRL_PACKET packet, int field_count, struct ucred* cred) {
1275 struct lmk_procprio proc_prio;
1276
1277 lmkd_pack_get_procprio(packet, field_count, &proc_prio);
1278 apply_proc_prio(proc_prio, cred);
1279 }
1280
1281 static void cmd_procremove(LMKD_CTRL_PACKET packet, struct ucred *cred) {
1282 struct lmk_procremove params;
1283 struct proc *procp;
1284
1285 lmkd_pack_get_procremove(packet, &params);
1286
1287 if (use_inkernel_interface) {
1288 /*
1289 * Perform an extra check before the pid is removed, after which it
1290 * will be impossible for poll_kernel to get the taskname. poll_kernel()
1291 * is potentially a long-running blocking function; however this method
1292 * handles AMS requests but does not block AMS.
1293 */
1294 poll_kernel(kpoll_fd);
1295
1296 stats_remove_taskname(params.pid);
1297 return;
1298 }
1299
1300 procp = pid_lookup(params.pid);
1301 if (!procp) {
1302 return;
1303 }
1304
1305 if (!claim_record(procp, cred->pid)) {
1306 char buf[LINE_MAX];
1307 char *taskname = proc_get_name(cred->pid, buf, sizeof(buf));
1308 /* Only registrant of the record can remove it */
1309 ALOGE("%s (%d, %d) attempts to unregister a process registered by another client",
1310 taskname ? taskname : "A process ", cred->uid, cred->pid);
1311 return;
1312 }
1313
1314 /*
1315 * WARNING: After pid_remove() procp is freed and can't be used!
1316 * Therefore placed at the end of the function.
1317 */
1318 pid_remove(params.pid);
1319 }
1320
1321 static void cmd_procpurge(struct ucred *cred) {
1322 int i;
1323 struct proc *procp;
1324 struct proc *next;
1325
1326 if (use_inkernel_interface) {
1327 stats_purge_tasknames();
1328 return;
1329 }
1330
1331 for (i = 0; i < PIDHASH_SZ; i++) {
1332 procp = pidhash[i];
1333 while (procp) {
1334 next = procp->pidhash_next;
1335 /* Purge only records created by the requestor */
1336 if (claim_record(procp, cred->pid)) {
1337 pid_remove(procp->pid);
1338 }
1339 procp = next;
1340 }
1341 }
1342 }
1343
1344 static void cmd_subscribe(int dsock_idx, LMKD_CTRL_PACKET packet) {
1345 struct lmk_subscribe params;
1346
1347 lmkd_pack_get_subscribe(packet, &params);
1348 data_sock[dsock_idx].async_event_mask |= 1 << params.evt_type;
1349 }
1350
1351 static void inc_killcnt(int oomadj) {
1352 int slot = ADJTOSLOT(oomadj);
1353 uint8_t idx = killcnt_idx[slot];
1354
1355 if (idx == KILLCNT_INVALID_IDX) {
1356 /* index is not assigned for this oomadj */
1357 if (killcnt_free_idx < MAX_DISTINCT_OOM_ADJ) {
1358 killcnt_idx[slot] = killcnt_free_idx;
1359 killcnt[killcnt_free_idx] = 1;
1360 killcnt_free_idx++;
1361 } else {
1362 ALOGW("Number of distinct oomadj levels exceeds %d",
1363 MAX_DISTINCT_OOM_ADJ);
1364 }
1365 } else {
1366 /*
1367 * wraparound is highly unlikely and is detectable using total
1368 * counter because it has to be equal to the sum of all counters
1369 */
1370 killcnt[idx]++;
1371 }
1372 /* increment total kill counter */
1373 killcnt_total++;
1374 }
1375
1376 static int get_killcnt(int min_oomadj, int max_oomadj) {
1377 int slot;
1378 int count = 0;
1379
1380 if (min_oomadj > max_oomadj)
1381 return 0;
1382
1383 /* special case to get total kill count */
1384 if (min_oomadj > OOM_SCORE_ADJ_MAX)
1385 return killcnt_total;
1386
1387 while (min_oomadj <= max_oomadj &&
1388 (slot = ADJTOSLOT(min_oomadj)) < ADJTOSLOT_COUNT) {
1389 uint8_t idx = killcnt_idx[slot];
1390 if (idx != KILLCNT_INVALID_IDX) {
1391 count += killcnt[idx];
1392 }
1393 min_oomadj++;
1394 }
1395
1396 return count;
1397 }
1398
1399 static int cmd_getkillcnt(LMKD_CTRL_PACKET packet) {
1400 struct lmk_getkillcnt params;
1401
1402 if (use_inkernel_interface) {
1403 /* kernel driver does not expose this information */
1404 return 0;
1405 }
1406
1407 lmkd_pack_get_getkillcnt(packet, &params);
1408
1409 return get_killcnt(params.min_oomadj, params.max_oomadj);
1410 }
1411
1412 static void cmd_target(int ntargets, LMKD_CTRL_PACKET packet) {
1413 int i;
1414 struct lmk_target target;
1415 char minfree_str[PROPERTY_VALUE_MAX];
1416 char *pstr = minfree_str;
1417 char *pend = minfree_str + sizeof(minfree_str);
1418 static struct timespec last_req_tm;
1419 struct timespec curr_tm;
1420
1421 if (ntargets < 1 || ntargets > (int)lowmem_adj.size()) {
1422 return;
1423 }
1424
1425 /*
1426 * Ratelimit minfree updates to once per TARGET_UPDATE_MIN_INTERVAL_MS
1427 * to prevent DoS attacks
1428 */
1429 if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
1430 ALOGE("Failed to get current time");
1431 return;
1432 }
1433
1434 if (get_time_diff_ms(&last_req_tm, &curr_tm) <
1435 TARGET_UPDATE_MIN_INTERVAL_MS) {
1436 ALOGE("Ignoring frequent updated to lmkd limits");
1437 return;
1438 }
1439
1440 last_req_tm = curr_tm;
1441
1442 for (i = 0; i < ntargets; i++) {
1443 lmkd_pack_get_target(packet, i, &target);
1444 lowmem_minfree[i] = target.minfree;
1445 lowmem_adj[i] = target.oom_adj_score;
1446
1447 pstr += snprintf(pstr, pend - pstr, "%d:%d,", target.minfree,
1448 target.oom_adj_score);
1449 if (pstr >= pend) {
1450 /* if no more space in the buffer then terminate the loop */
1451 pstr = pend;
1452 break;
1453 }
1454 }
1455
1456 lowmem_targets_size = ntargets;
1457
1458 /* Override the last extra comma */
1459 pstr[-1] = '\0';
1460 property_set("sys.lmk.minfree_levels", minfree_str);
1461
1462 if (has_inkernel_module) {
1463 char minfreestr[128];
1464 char killpriostr[128];
1465
1466 minfreestr[0] = '\0';
1467 killpriostr[0] = '\0';
1468
1469 for (i = 0; i < lowmem_targets_size; i++) {
1470 char val[40];
1471
1472 if (i) {
1473 strlcat(minfreestr, ",", sizeof(minfreestr));
1474 strlcat(killpriostr, ",", sizeof(killpriostr));
1475 }
1476
1477 snprintf(val, sizeof(val), "%d", use_inkernel_interface ? lowmem_minfree[i] : 0);
1478 strlcat(minfreestr, val, sizeof(minfreestr));
1479 snprintf(val, sizeof(val), "%d", use_inkernel_interface ? lowmem_adj[i] : 0);
1480 strlcat(killpriostr, val, sizeof(killpriostr));
1481 }
1482
1483 writefilestring(INKERNEL_MINFREE_PATH, minfreestr, true);
1484 writefilestring(INKERNEL_ADJ_PATH, killpriostr, true);
1485 }
1486 }
1487
1488 static void cmd_procs_prio(LMKD_CTRL_PACKET packet, const int field_count, struct ucred* cred) {
1489 struct lmk_procs_prio params;
1490
1491 const int procs_count = lmkd_pack_get_procs_prio(packet, &params, field_count);
1492 if (procs_count < 0) {
1493 ALOGE("LMK_PROCS_PRIO received invalid packet format");
1494 return;
1495 }
1496
1497 for (int i = 0; i < procs_count; i++) {
1498 apply_proc_prio(params.procs[i], cred);
1499 }
1500 }
1501
1502 static void ctrl_command_handler(int dsock_idx) {
1503 LMKD_CTRL_PACKET packet;
1504 struct ucred cred;
1505 int len;
1506 enum lmk_cmd cmd;
1507 int nargs;
1508 int targets;
1509 int kill_cnt;
1510 int result;
1511
1512 len = ctrl_data_read(dsock_idx, (char *)packet, CTRL_PACKET_MAX_SIZE, &cred);
1513 if (len <= 0)
1514 return;
1515
1516 if (len < (int)sizeof(int)) {
1517 ALOGE("Wrong control socket read length len=%d", len);
1518 return;
1519 }
1520
1521 cmd = lmkd_pack_get_cmd(packet);
1522 nargs = len / sizeof(int) - 1;
1523 if (nargs < 0)
1524 goto wronglen;
1525
1526 switch(cmd) {
1527 case LMK_TARGET:
1528 targets = nargs / 2;
1529 if (nargs & 0x1 || targets > (int)lowmem_adj.size()) {
1530 goto wronglen;
1531 }
1532 cmd_target(targets, packet);
1533 break;
1534 case LMK_PROCPRIO:
1535 /* process type field is optional for backward compatibility */
1536 if (nargs < 3 || nargs > 4)
1537 goto wronglen;
1538 cmd_procprio(packet, nargs, &cred);
1539 break;
1540 case LMK_PROCREMOVE:
1541 if (nargs != 1)
1542 goto wronglen;
1543 cmd_procremove(packet, &cred);
1544 break;
1545 case LMK_PROCPURGE:
1546 if (nargs != 0)
1547 goto wronglen;
1548 cmd_procpurge(&cred);
1549 break;
1550 case LMK_GETKILLCNT:
1551 if (nargs != 2)
1552 goto wronglen;
1553 kill_cnt = cmd_getkillcnt(packet);
1554 len = lmkd_pack_set_getkillcnt_repl(packet, kill_cnt);
1555 if (ctrl_data_write(dsock_idx, (char *)packet, len) != len)
1556 return;
1557 break;
1558 case LMK_SUBSCRIBE:
1559 if (nargs != 1)
1560 goto wronglen;
1561 cmd_subscribe(dsock_idx, packet);
1562 break;
1563 case LMK_PROCKILL:
1564 /* This command code is NOT expected at all */
1565 ALOGE("Received unexpected command code %d", cmd);
1566 break;
1567 case LMK_UPDATE_PROPS:
1568 if (nargs != 0)
1569 goto wronglen;
1570 result = -1;
1571 if (update_props()) {
1572 if (!use_inkernel_interface && monitors_initialized) {
1573 /* Reinitialize monitors to apply new settings */
1574 destroy_monitors();
1575 if (init_monitors()) {
1576 result = 0;
1577 }
1578 } else {
1579 result = 0;
1580 }
1581
1582 if (direct_reclaim_threshold_ms > 0 && !memevent_listener) {
1583 ALOGW("Kernel support for direct_reclaim_threshold_ms is not found");
1584 direct_reclaim_threshold_ms = 0;
1585 }
1586 }
1587
1588 len = lmkd_pack_set_update_props_repl(packet, result);
1589 if (ctrl_data_write(dsock_idx, (char *)packet, len) != len) {
1590 ALOGE("Failed to report operation results");
1591 }
1592 if (!result) {
1593 ALOGI("Properties reinitilized");
1594 } else {
1595 /* New settings can't be supported, crash to be restarted */
1596 ALOGE("New configuration is not supported. Exiting...");
1597 exit(1);
1598 }
1599 break;
1600 case LMK_START_MONITORING:
1601 if (nargs != 0)
1602 goto wronglen;
1603 // Registration is needed only if it was skipped earlier.
1604 if (monitors_initialized)
1605 return;
1606 if (!property_get_bool("sys.boot_completed", false)) {
1607 ALOGE("LMK_START_MONITORING cannot be handled before boot completed");
1608 return;
1609 }
1610
1611 if (!init_monitors()) {
1612 /* Failure to start psi monitoring, crash to be restarted */
1613 ALOGE("Failure to initialize monitoring. Exiting...");
1614 exit(1);
1615 }
1616 ALOGI("Initialized monitors after boot completed.");
1617 break;
1618 case LMK_BOOT_COMPLETED:
1619 if (nargs != 0) goto wronglen;
1620
1621 if (boot_completed_handled) {
1622 /* Notify we have already handled post boot-up operations */
1623 result = 1;
1624 } else if (!property_get_bool("sys.boot_completed", false)) {
1625 ALOGE("LMK_BOOT_COMPLETED cannot be handled before boot completed");
1626 result = -1;
1627 } else {
1628 /*
1629 * Initialize the memevent listener after boot is completed to prevent
1630 * waiting, during boot-up, for BPF programs to be loaded.
1631 */
1632 if (init_memevent_listener_monitoring()) {
1633 ALOGI("Using memevents for direct reclaim and kswapd detection");
1634 } else {
1635 ALOGI("Using vmstats for direct reclaim and kswapd detection");
1636 if (direct_reclaim_threshold_ms > 0) {
1637 ALOGW("Kernel support for direct_reclaim_threshold_ms is not found");
1638 direct_reclaim_threshold_ms = 0;
1639 }
1640 }
1641 result = 0;
1642 boot_completed_handled = true;
1643 }
1644
1645 len = lmkd_pack_set_boot_completed_notif_repl(packet, result);
1646 if (ctrl_data_write(dsock_idx, (char*)packet, len) != len) {
1647 ALOGE("Failed to report boot-completed operation results");
1648 }
1649 break;
1650 case LMK_PROCS_PRIO:
1651 cmd_procs_prio(packet, nargs, &cred);
1652 break;
1653 default:
1654 ALOGE("Received unknown command code %d", cmd);
1655 return;
1656 }
1657
1658 return;
1659
1660 wronglen:
1661 ALOGE("Wrong control socket read length cmd=%d len=%d", cmd, len);
1662 }
1663
1664 static void ctrl_data_handler(int data, uint32_t events,
1665 struct polling_params *poll_params __unused) {
1666 if (events & EPOLLIN) {
1667 ctrl_command_handler(data);
1668 }
1669 }
1670
1671 static int get_free_dsock() {
1672 for (int i = 0; i < MAX_DATA_CONN; i++) {
1673 if (data_sock[i].sock < 0) {
1674 return i;
1675 }
1676 }
1677 return -1;
1678 }
1679
1680 static void ctrl_connect_handler(int data __unused, uint32_t events __unused,
1681 struct polling_params *poll_params __unused) {
1682 struct epoll_event epev;
1683 int free_dscock_idx = get_free_dsock();
1684
1685 if (free_dscock_idx < 0) {
1686 /*
1687 * Number of data connections exceeded max supported. This should not
1688 * happen but if it does we drop all existing connections and accept
1689 * the new one. This prevents inactive connections from monopolizing
1690 * data socket and if we drop ActivityManager connection it will
1691 * immediately reconnect.
1692 */
1693 for (int i = 0; i < MAX_DATA_CONN; i++) {
1694 ctrl_data_close(i);
1695 }
1696 free_dscock_idx = 0;
1697 }
1698
1699 data_sock[free_dscock_idx].sock = accept(ctrl_sock.sock, NULL, NULL);
1700 if (data_sock[free_dscock_idx].sock < 0) {
1701 ALOGE("lmkd control socket accept failed; errno=%d", errno);
1702 return;
1703 }
1704
1705 ALOGI("lmkd data connection established");
1706 /* use data to store data connection idx */
1707 data_sock[free_dscock_idx].handler_info.data = free_dscock_idx;
1708 data_sock[free_dscock_idx].handler_info.handler = ctrl_data_handler;
1709 data_sock[free_dscock_idx].async_event_mask = 0;
1710 epev.events = EPOLLIN;
1711 epev.data.ptr = (void *)&(data_sock[free_dscock_idx].handler_info);
1712 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, data_sock[free_dscock_idx].sock, &epev) == -1) {
1713 ALOGE("epoll_ctl for data connection socket failed; errno=%d", errno);
1714 ctrl_data_close(free_dscock_idx);
1715 return;
1716 }
1717 maxevents++;
1718 }
1719
1720 /*
1721 * /proc/zoneinfo parsing routines
1722 * Expected file format is:
1723 *
1724 * Node <node_id>, zone <zone_name>
1725 * (
1726 * per-node stats
1727 * (<per-node field name> <value>)+
1728 * )?
1729 * (pages free <value>
1730 * (<per-zone field name> <value>)+
1731 * pagesets
1732 * (<unused fields>)*
1733 * )+
1734 * ...
1735 */
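/*
 * Abridged illustrative input (field values hypothetical):
 *   Node 0, zone   Normal
 *     per-node stats
 *         nr_inactive_file 51200
 *         nr_active_file   40960
 *     pages free     38629
 *           min      3626
 *           low      4532
 *           high     5438
 *           present  999424
 *           protection: (0, 0, 0, 0)
 *     pagesets
 *       ...
 */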
1736 static void zoneinfo_parse_protection(char *buf, struct zoneinfo_zone *zone) {
1737 int zone_idx;
1738 int64_t max = 0;
1739 char *save_ptr;
1740
1741 for (buf = strtok_r(buf, "(), ", &save_ptr), zone_idx = 0;
1742 buf && zone_idx < MAX_NR_ZONES;
1743 buf = strtok_r(NULL, "), ", &save_ptr), zone_idx++) {
1744 long long zoneval = strtoll(buf, &buf, 0);
1745 if (zoneval > max) {
1746 max = (zoneval > INT64_MAX) ? INT64_MAX : zoneval;
1747 }
1748 zone->protection[zone_idx] = zoneval;
1749 }
1750 zone->max_protection = max;
1751 }
1752
1753 static int zoneinfo_parse_zone(char **buf, struct zoneinfo_zone *zone) {
1754 for (char *line = strtok_r(NULL, "\n", buf); line;
1755 line = strtok_r(NULL, "\n", buf)) {
1756 char *cp;
1757 char *ap;
1758 char *save_ptr;
1759 int64_t val;
1760 int field_idx;
1761 enum field_match_result match_res;
1762
1763 cp = strtok_r(line, " ", &save_ptr);
1764 if (!cp) {
1765 return false;
1766 }
1767
1768 field_idx = find_field(cp, zoneinfo_zone_spec_field_names, ZI_ZONE_SPEC_FIELD_COUNT);
1769 if (field_idx >= 0) {
1770 /* special field */
1771 if (field_idx == ZI_ZONE_SPEC_PAGESETS) {
1772 /* no more fields we are interested in */
1773 return true;
1774 }
1775
1776 /* protection field */
1777 ap = strtok_r(NULL, ")", &save_ptr);
1778 if (ap) {
1779 zoneinfo_parse_protection(ap, zone);
1780 }
1781 continue;
1782 }
1783
1784 ap = strtok_r(NULL, " ", &save_ptr);
1785 if (!ap) {
1786 continue;
1787 }
1788
1789 match_res = match_field(cp, ap, zoneinfo_zone_field_names, ZI_ZONE_FIELD_COUNT,
1790 &val, &field_idx);
1791 if (match_res == PARSE_FAIL) {
1792 return false;
1793 }
1794 if (match_res == PARSE_SUCCESS) {
1795 zone->fields.arr[field_idx] = val;
1796 }
1797 if (field_idx == ZI_ZONE_PRESENT && val == 0) {
1798 /* zone is not populated, stop parsing it */
1799 return true;
1800 }
1801 }
1802 return false;
1803 }
1804
1805 static int zoneinfo_parse_node(char **buf, struct zoneinfo_node *node) {
1806 int fields_to_match = ZI_NODE_FIELD_COUNT;
1807
1808 for (char *line = strtok_r(NULL, "\n", buf); line;
1809 line = strtok_r(NULL, "\n", buf)) {
1810 char *cp;
1811 char *ap;
1812 char *save_ptr;
1813 int64_t val;
1814 int field_idx;
1815 enum field_match_result match_res;
1816
1817 cp = strtok_r(line, " ", &save_ptr);
1818 if (!cp) {
1819 return false;
1820 }
1821
1822 ap = strtok_r(NULL, " ", &save_ptr);
1823 if (!ap) {
1824 return false;
1825 }
1826
1827 match_res = match_field(cp, ap, zoneinfo_node_field_names, ZI_NODE_FIELD_COUNT,
1828 &val, &field_idx);
1829 if (match_res == PARSE_FAIL) {
1830 return false;
1831 }
1832 if (match_res == PARSE_SUCCESS) {
1833 node->fields.arr[field_idx] = val;
1834 fields_to_match--;
1835 if (!fields_to_match) {
1836 return true;
1837 }
1838 }
1839 }
1840 return false;
1841 }
1842
1843 static int zoneinfo_parse(struct zoneinfo *zi) {
1844 static struct reread_data file_data = {
1845 .filename = ZONEINFO_PATH,
1846 .fd = -1,
1847 };
1848 char *buf;
1849 char *save_ptr;
1850 char *line;
1851 char zone_name[LINE_MAX + 1];
1852 struct zoneinfo_node *node = NULL;
1853 int node_idx = 0;
1854 int zone_idx = 0;
1855
1856 memset(zi, 0, sizeof(struct zoneinfo));
1857
1858 if ((buf = reread_file(&file_data)) == NULL) {
1859 return -1;
1860 }
1861
1862 for (line = strtok_r(buf, "\n", &save_ptr); line;
1863 line = strtok_r(NULL, "\n", &save_ptr)) {
1864 int node_id;
1865 if (sscanf(line, "Node %d, zone %" STRINGIFY(LINE_MAX) "s", &node_id, zone_name) == 2) {
1866 if (!node || node->id != node_id) {
1867 line = strtok_r(NULL, "\n", &save_ptr);
1868 if (strncmp(line, NODE_STATS_MARKER, strlen(NODE_STATS_MARKER)) != 0) {
1869 /*
1870 * per-node stats are only present in the first non-empty zone of
1871 * the node.
1872 */
1873 continue;
1874 }
1875
1876 /* new node is found */
1877 if (node) {
1878 node->zone_count = zone_idx + 1;
1879 node_idx++;
1880 if (node_idx == MAX_NR_NODES) {
1881 /* max node count exceeded */
1882 ALOGE("%s parse error", file_data.filename);
1883 return -1;
1884 }
1885 }
1886 node = &zi->nodes[node_idx];
1887 node->id = node_id;
1888 zone_idx = 0;
1889 if (!zoneinfo_parse_node(&save_ptr, node)) {
1890 ALOGE("%s parse error", file_data.filename);
1891 return -1;
1892 }
1893 } else {
1894 /* new zone is found */
1895 zone_idx++;
1896 }
1897 if (!zoneinfo_parse_zone(&save_ptr, &node->zones[zone_idx])) {
1898 ALOGE("%s parse error", file_data.filename);
1899 return -1;
1900 }
1901 }
1902 }
1903 if (!node) {
1904 ALOGE("%s parse error", file_data.filename);
1905 return -1;
1906 }
1907 node->zone_count = zone_idx + 1;
1908 zi->node_count = node_idx + 1;
1909
1910 /* calculate totals fields */
1911 for (node_idx = 0; node_idx < zi->node_count; node_idx++) {
1912 node = &zi->nodes[node_idx];
1913 for (zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
1914 struct zoneinfo_zone *zone = &zi->nodes[node_idx].zones[zone_idx];
1915 zi->totalreserve_pages += zone->max_protection + zone->fields.field.high;
1916 }
1917 zi->total_inactive_file += node->fields.field.nr_inactive_file;
1918 zi->total_active_file += node->fields.field.nr_active_file;
1919 }
1920 return 0;
1921 }
1922
1923 /* /proc/meminfo parsing routines */
1924 static bool meminfo_parse_line(char *line, union meminfo *mi) {
1925 char *cp = line;
1926 char *ap;
1927 char *save_ptr;
1928 int64_t val;
1929 int field_idx;
1930 enum field_match_result match_res;
1931
1932 cp = strtok_r(line, " ", &save_ptr);
1933 if (!cp) {
1934 return false;
1935 }
1936
1937 ap = strtok_r(NULL, " ", &save_ptr);
1938 if (!ap) {
1939 return false;
1940 }
1941
1942 match_res = match_field(cp, ap, meminfo_field_names, MI_FIELD_COUNT,
1943 &val, &field_idx);
1944 if (match_res == PARSE_SUCCESS) {
1945 mi->arr[field_idx] = val / page_k;
1946 }
1947 return (match_res != PARSE_FAIL);
1948 }
1949
1950 static int64_t read_gpu_total_kb() {
1951 static android::base::unique_fd fd(
1952 android::bpf::mapRetrieveRO("/sys/fs/bpf/map_gpuMem_gpu_mem_total_map"));
1953 static constexpr uint64_t kBpfKeyGpuTotalUsage = 0;
1954 uint64_t value;
1955
1956 if (!fd.ok()) {
1957 return 0;
1958 }
1959
1960 return android::bpf::findMapEntry(fd, &kBpfKeyGpuTotalUsage, &value)
1961 ? 0
1962 : (int32_t)(value / 1024);
1963 }
1964
1965 static int meminfo_parse(union meminfo *mi) {
1966 static struct reread_data file_data = {
1967 .filename = MEMINFO_PATH,
1968 .fd = -1,
1969 };
1970 char *buf;
1971 char *save_ptr;
1972 char *line;
1973
1974 memset(mi, 0, sizeof(union meminfo));
1975
1976 if ((buf = reread_file(&file_data)) == NULL) {
1977 return -1;
1978 }
1979
1980 for (line = strtok_r(buf, "\n", &save_ptr); line;
1981 line = strtok_r(NULL, "\n", &save_ptr)) {
1982 if (!meminfo_parse_line(line, mi)) {
1983 ALOGE("%s parse error", file_data.filename);
1984 return -1;
1985 }
1986 }
1987 mi->field.nr_file_pages = mi->field.cached + mi->field.swap_cached +
1988 mi->field.buffers;
1989 mi->field.total_gpu_kb = read_gpu_total_kb();
1990 mi->field.easy_available = mi->field.nr_free_pages + mi->field.inactive_file;
1991
1992 return 0;
1993 }
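
/*
 * Illustrative note (editor's example; the numbers are made up): /proc/meminfo reports values in
 * kB, but meminfo_parse_line() divides each value by page_k, so union meminfo fields are kept in
 * pages. Assuming 4kB pages (page_k == 4):
 *     MemFree:  262144 kB  ->  nr_free_pages = 65536 pages
 *     Cached:   524288 kB  ->  cached        = 131072 pages
 * The derived fields above use the same unit, e.g. nr_file_pages and easy_available are page
 * counts; total_gpu_kb, as its name says, remains in kB.
 */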
1994
1995 // In the case of ZRAM, mi->field.free_swap can't be used directly because swap space is taken
1996 // from free memory or from reclaim. Use the lower of free_swap and easily available memory to
1997 // measure free swap, because these two values bound how much swap space the system will
1998 // consider using and how much it can actually use.
1999 // Swap compression ratio in the calculation can be adjusted using swap_compression_ratio tunable.
2000 // By setting swap_compression_ratio to 0, available memory can be ignored.
2001 static inline int64_t get_free_swap(union meminfo *mi) {
2002 if (swap_compression_ratio)
2003 return std::min(mi->field.free_swap, mi->field.easy_available * swap_compression_ratio);
2004 return mi->field.free_swap;
2005 }
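
/*
 * Illustrative note (editor's example; the numbers are made up): with free_swap = 100000 pages,
 * easy_available = 30000 pages and swap_compression_ratio = 2, the function reports
 * min(100000, 30000 * 2) = 60000 pages, i.e. ZRAM swap is capped by the memory that can
 * realistically back it. With swap_compression_ratio = 0 that cap is ignored and free_swap is
 * returned as-is.
 */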
2006
2007 /* /proc/vmstat parsing routines */
2008 static bool vmstat_parse_line(char *line, union vmstat *vs) {
2009 char *cp;
2010 char *ap;
2011 char *save_ptr;
2012 int64_t val;
2013 int field_idx;
2014 enum field_match_result match_res;
2015
2016 cp = strtok_r(line, " ", &save_ptr);
2017 if (!cp) {
2018 return false;
2019 }
2020
2021 ap = strtok_r(NULL, " ", &save_ptr);
2022 if (!ap) {
2023 return false;
2024 }
2025
2026 match_res = match_field(cp, ap, vmstat_field_names, VS_FIELD_COUNT,
2027 &val, &field_idx);
2028 if (match_res == PARSE_SUCCESS) {
2029 vs->arr[field_idx] = val;
2030 }
2031 return (match_res != PARSE_FAIL);
2032 }
2033
2034 static int vmstat_parse(union vmstat *vs) {
2035 static struct reread_data file_data = {
2036 .filename = VMSTAT_PATH,
2037 .fd = -1,
2038 };
2039 char *buf;
2040 char *save_ptr;
2041 char *line;
2042
2043 memset(vs, 0, sizeof(union vmstat));
2044
2045 if ((buf = reread_file(&file_data)) == NULL) {
2046 return -1;
2047 }
2048
2049 for (line = strtok_r(buf, "\n", &save_ptr); line;
2050 line = strtok_r(NULL, "\n", &save_ptr)) {
2051 if (!vmstat_parse_line(line, vs)) {
2052 ALOGE("%s parse error", file_data.filename);
2053 return -1;
2054 }
2055 }
2056
2057 return 0;
2058 }
2059
2060 static int psi_parse(struct reread_data *file_data, struct psi_stats stats[], bool full) {
2061 char *buf;
2062 char *save_ptr;
2063 char *line;
2064
2065 if ((buf = reread_file(file_data)) == NULL) {
2066 return -1;
2067 }
2068
2069 line = strtok_r(buf, "\n", &save_ptr);
2070 if (parse_psi_line(line, PSI_SOME, stats)) {
2071 return -1;
2072 }
2073 if (full) {
2074 line = strtok_r(NULL, "\n", &save_ptr);
2075 if (parse_psi_line(line, PSI_FULL, stats)) {
2076 return -1;
2077 }
2078 }
2079
2080 return 0;
2081 }
2082
2083 static int psi_parse_mem(struct psi_data *psi_data) {
2084 static struct reread_data file_data = {
2085 .filename = psi_resource_file[PSI_MEMORY],
2086 .fd = -1,
2087 };
2088 return psi_parse(&file_data, psi_data->mem_stats, true);
2089 }
2090
2091 static int psi_parse_io(struct psi_data *psi_data) {
2092 static struct reread_data file_data = {
2093 .filename = psi_resource_file[PSI_IO],
2094 .fd = -1,
2095 };
2096 return psi_parse(&file_data, psi_data->io_stats, true);
2097 }
2098
2099 static int psi_parse_cpu(struct psi_data *psi_data) {
2100 static struct reread_data file_data = {
2101 .filename = psi_resource_file[PSI_CPU],
2102 .fd = -1,
2103 };
2104 return psi_parse(&file_data, psi_data->cpu_stats, false);
2105 }
2106
2107 enum wakeup_reason {
2108 Event,
2109 Polling
2110 };
2111
2112 struct wakeup_info {
2113 struct timespec wakeup_tm;
2114 struct timespec prev_wakeup_tm;
2115 struct timespec last_event_tm;
2116 int wakeups_since_event;
2117 int skipped_wakeups;
2118 };
2119
2120 /*
2121 * After the initial memory pressure event is received lmkd schedules periodic wakeups to check
2122 * the memory conditions and kill if needed (polling). This is done because pressure events are
2123 * rate-limited and memory conditions can change in between events. Therefore after the initial
2124 * event there might be multiple wakeups. This function records the wakeup information such as the
2125 * timestamps of the last event and the last wakeup, the number of wakeups since the last event
2126  * and how many of those wakeups were skipped (some wakeups are skipped if a previously killed
2127  * process is still freeing its memory).
2128 */
2129 static void record_wakeup_time(struct timespec *tm, enum wakeup_reason reason,
2130 struct wakeup_info *wi) {
2131 wi->prev_wakeup_tm = wi->wakeup_tm;
2132 wi->wakeup_tm = *tm;
2133 if (reason == Event) {
2134 wi->last_event_tm = *tm;
2135 wi->wakeups_since_event = 0;
2136 wi->skipped_wakeups = 0;
2137 } else {
2138 wi->wakeups_since_event++;
2139 }
2140 }
2141
2142 struct kill_info {
2143 enum kill_reasons kill_reason;
2144 const char *kill_desc;
2145 int thrashing;
2146 int max_thrashing;
2147 };
2148
2149 static void killinfo_log(struct proc* procp, int min_oom_score, int rss_kb,
2150 int swap_kb, struct kill_info *ki, union meminfo *mi,
2151 struct wakeup_info *wi, struct timespec *tm, struct psi_data *pd) {
2152 /* log process information */
2153 android_log_write_int32(ctx, procp->pid);
2154 android_log_write_int32(ctx, procp->uid);
2155 android_log_write_int32(ctx, procp->oomadj);
2156 android_log_write_int32(ctx, min_oom_score);
2157 android_log_write_int32(ctx, std::min(rss_kb, (int)INT32_MAX));
2158 android_log_write_int32(ctx, ki ? ki->kill_reason : NONE);
2159
2160 /* log meminfo fields */
2161 for (int field_idx = 0; field_idx < MI_FIELD_COUNT; field_idx++) {
2162 android_log_write_int32(ctx,
2163 mi ? std::min(mi->arr[field_idx] * page_k, (int64_t)INT32_MAX) : 0);
2164 }
2165
2166 /* log lmkd wakeup information */
2167 if (wi) {
2168 android_log_write_int32(ctx, (int32_t)get_time_diff_ms(&wi->last_event_tm, tm));
2169 android_log_write_int32(ctx, (int32_t)get_time_diff_ms(&wi->prev_wakeup_tm, tm));
2170 android_log_write_int32(ctx, wi->wakeups_since_event);
2171 android_log_write_int32(ctx, wi->skipped_wakeups);
2172 } else {
2173 android_log_write_int32(ctx, 0);
2174 android_log_write_int32(ctx, 0);
2175 android_log_write_int32(ctx, 0);
2176 android_log_write_int32(ctx, 0);
2177 }
2178
2179 android_log_write_int32(ctx, std::min(swap_kb, (int)INT32_MAX));
2180 android_log_write_int32(ctx, mi ? (int32_t)mi->field.total_gpu_kb : 0);
2181 if (ki) {
2182 android_log_write_int32(ctx, ki->thrashing);
2183 android_log_write_int32(ctx, ki->max_thrashing);
2184 } else {
2185 android_log_write_int32(ctx, 0);
2186 android_log_write_int32(ctx, 0);
2187 }
2188
2189 if (pd) {
2190 android_log_write_float32(ctx, pd->mem_stats[PSI_SOME].avg10);
2191 android_log_write_float32(ctx, pd->mem_stats[PSI_FULL].avg10);
2192 android_log_write_float32(ctx, pd->io_stats[PSI_SOME].avg10);
2193 android_log_write_float32(ctx, pd->io_stats[PSI_FULL].avg10);
2194 android_log_write_float32(ctx, pd->cpu_stats[PSI_SOME].avg10);
2195 } else {
2196 for (int i = 0; i < 5; i++) {
2197 android_log_write_float32(ctx, 0);
2198 }
2199 }
2200
2201 android_log_write_list(ctx, LOG_ID_EVENTS);
2202 android_log_reset(ctx);
2203 }
2204
2205 // Note: the returned entry is only an anchor and does not hold valid process info.
2206 // When called from a non-main thread, adjslot_list_lock read lock should be taken.
2207 static struct proc *proc_adj_head(int oomadj) {
2208 return (struct proc *)&procadjslot_list[ADJTOSLOT(oomadj)];
2209 }
2210
2211 // When called from a non-main thread, adjslot_list_lock read lock should be taken.
2212 static struct proc *proc_adj_tail(int oomadj) {
2213 return (struct proc *)adjslot_tail(&procadjslot_list[ADJTOSLOT(oomadj)]);
2214 }
2215
2216 // When called from a non-main thread, adjslot_list_lock read lock should be taken.
2217 static struct proc *proc_adj_prev(int oomadj, int pid) {
2218 struct adjslot_list *head = &procadjslot_list[ADJTOSLOT(oomadj)];
2219 struct adjslot_list *curr = adjslot_tail(&procadjslot_list[ADJTOSLOT(oomadj)]);
2220
2221 while (curr != head) {
2222 if (((struct proc *)curr)->pid == pid) {
2223 return (struct proc *)curr->prev;
2224 }
2225 curr = curr->prev;
2226 }
2227
2228 return NULL;
2229 }
2230
2231 // Can be called only from the main thread.
2232 static struct proc *proc_get_heaviest(int oomadj) {
2233 struct adjslot_list *head = &procadjslot_list[ADJTOSLOT(oomadj)];
2234 struct adjslot_list *curr = head->next;
2235 struct proc *maxprocp = NULL;
2236 int maxsize = 0;
2237 if ((curr != head) && (curr->next == head)) {
2238 // Our list only has one process. No need to access procfs for its size.
2239 return (struct proc *)curr;
2240 }
2241 while (curr != head) {
2242 int pid = ((struct proc *)curr)->pid;
2243 int tasksize = proc_get_size(pid);
2244 if (tasksize < 0) {
2245 struct adjslot_list *next = curr->next;
2246 pid_remove(pid);
2247 curr = next;
2248 } else {
2249 if (tasksize > maxsize) {
2250 maxsize = tasksize;
2251 maxprocp = (struct proc *)curr;
2252 }
2253 curr = curr->next;
2254 }
2255 }
2256 return maxprocp;
2257 }
2258
2259 static bool find_victim(int oom_score, int prev_pid, struct proc &target_proc) {
2260 struct proc *procp;
2261 std::shared_lock lock(adjslot_list_lock);
2262
2263 if (!prev_pid) {
2264 procp = proc_adj_tail(oom_score);
2265 } else {
2266 procp = proc_adj_prev(oom_score, prev_pid);
2267 if (!procp) {
2268 // pid was removed, restart at the tail
2269 procp = proc_adj_tail(oom_score);
2270 }
2271 }
2272
2273 // the list is empty at this oom_score or we looped through it
2274 if (!procp || procp == proc_adj_head(oom_score)) {
2275 return false;
2276 }
2277
2278 // make a copy because original might be destroyed after adjslot_list_lock is released
2279 target_proc = *procp;
2280
2281 return true;
2282 }
2283
2284 static void watchdog_callback() {
2285 int prev_pid = 0;
2286
2287 ALOGW("lmkd watchdog timed out!");
2288 for (int oom_score = OOM_SCORE_ADJ_MAX; oom_score >= 0;) {
2289 struct proc target;
2290
2291 if (!find_victim(oom_score, prev_pid, target)) {
2292 oom_score--;
2293 prev_pid = 0;
2294 continue;
2295 }
2296
2297 if (target.valid && reaper.kill({ target.pidfd, target.pid, target.uid }, true) == 0) {
2298 ALOGW("lmkd watchdog killed process %d, oom_score_adj %d", target.pid, oom_score);
2299 killinfo_log(&target, 0, 0, 0, NULL, NULL, NULL, NULL, NULL);
2300 // Can't call pid_remove() from non-main thread, therefore just invalidate the record
2301 pid_invalidate(target.pid);
2302 break;
2303 }
2304 prev_pid = target.pid;
2305 }
2306 }
2307
2308 static Watchdog watchdog(WATCHDOG_TIMEOUT_SEC, watchdog_callback);
2309
2310 static bool is_kill_pending(void) {
2311 char buf[24];
2312
2313 if (last_kill_pid_or_fd < 0) {
2314 return false;
2315 }
2316
2317 if (pidfd_supported) {
2318 return true;
2319 }
2320
2321 /* when pidfd is not supported base the decision on /proc/<pid> existence */
2322 snprintf(buf, sizeof(buf), "/proc/%d/", last_kill_pid_or_fd);
2323 if (access(buf, F_OK) == 0) {
2324 return true;
2325 }
2326
2327 return false;
2328 }
2329
2330 static bool is_waiting_for_kill(void) {
2331 return pidfd_supported && last_kill_pid_or_fd >= 0;
2332 }
2333
2334 static void stop_wait_for_proc_kill(bool finished) {
2335 struct epoll_event epev;
2336
2337 if (last_kill_pid_or_fd < 0) {
2338 return;
2339 }
2340
2341 if (debug_process_killing) {
2342 struct timespec curr_tm;
2343
2344 if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
2345 /*
2346 * curr_tm is used here merely to report kill duration, so this failure is not fatal.
2347 * Log an error and continue.
2348 */
2349 ALOGE("Failed to get current time");
2350 }
2351
2352 if (finished) {
2353 ALOGI("Process got killed in %ldms",
2354 get_time_diff_ms(&last_kill_tm, &curr_tm));
2355 } else {
2356 ALOGI("Stop waiting for process kill after %ldms",
2357 get_time_diff_ms(&last_kill_tm, &curr_tm));
2358 }
2359 }
2360
2361 if (pidfd_supported) {
2362 /* unregister fd */
2363 if (epoll_ctl(epollfd, EPOLL_CTL_DEL, last_kill_pid_or_fd, &epev)) {
2364 // Log an error and keep going
2365 ALOGE("epoll_ctl for last killed process failed; errno=%d", errno);
2366 }
2367 maxevents--;
2368 close(last_kill_pid_or_fd);
2369 }
2370
2371 last_kill_pid_or_fd = -1;
2372 }
2373
2374 static void kill_done_handler(int data __unused, uint32_t events __unused,
2375 struct polling_params *poll_params) {
2376 stop_wait_for_proc_kill(true);
2377 poll_params->update = POLLING_RESUME;
2378 }
2379
2380 static void kill_fail_handler(int data __unused, uint32_t events __unused,
2381 struct polling_params *poll_params) {
2382 int pid;
2383
2384 // Extract pid from the communication pipe. Clearing the pipe this way allows further
2385 // epoll_wait calls to sleep until the next event.
2386 if (TEMP_FAILURE_RETRY(read(reaper_comm_fd[0], &pid, sizeof(pid))) != sizeof(pid)) {
2387 ALOGE("thread communication read failed: %s", strerror(errno));
2388 }
2389 stop_wait_for_proc_kill(false);
2390 poll_params->update = POLLING_RESUME;
2391 }
2392
2393 static void start_wait_for_proc_kill(int pid_or_fd) {
2394 static struct event_handler_info kill_done_hinfo = { 0, kill_done_handler };
2395 struct epoll_event epev;
2396
2397 if (last_kill_pid_or_fd >= 0) {
2398 /* Should not happen but if it does we should stop previous wait */
2399 ALOGE("Attempt to wait for a kill while another wait is in progress");
2400 stop_wait_for_proc_kill(false);
2401 }
2402
2403 last_kill_pid_or_fd = pid_or_fd;
2404
2405 if (!pidfd_supported) {
2406 /* If pidfd is not supported just store PID and exit */
2407 return;
2408 }
2409
2410 epev.events = EPOLLIN;
2411 epev.data.ptr = (void *)&kill_done_hinfo;
2412 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, last_kill_pid_or_fd, &epev) != 0) {
2413 ALOGE("epoll_ctl for last kill failed; errno=%d", errno);
2414 close(last_kill_pid_or_fd);
2415 last_kill_pid_or_fd = -1;
2416 return;
2417 }
2418 maxevents++;
2419 }
2420
2421 /* Kill one process specified by procp. Returns the size (in pages) of the process killed */
2422 static int kill_one_process(struct proc* procp, int min_oom_score, struct kill_info *ki,
2423 union meminfo *mi, struct wakeup_info *wi, struct timespec *tm,
2424 struct psi_data *pd) {
2425 int pid = procp->pid;
2426 int pidfd = procp->pidfd;
2427 uid_t uid = procp->uid;
2428 char *taskname;
2429 int kill_result;
2430 int result = -1;
2431 struct memory_stat *mem_st;
2432 struct kill_stat kill_st;
2433 int64_t tgid;
2434 int64_t rss_kb;
2435 int64_t swap_kb;
2436 char buf[pagesize];
2437 char desc[LINE_MAX];
2438
2439 if (!procp->valid || !read_proc_status(pid, buf, sizeof(buf))) {
2440 goto out;
2441 }
2442 if (!parse_status_tag(buf, PROC_STATUS_TGID_FIELD, &tgid)) {
2443 ALOGE("Unable to parse tgid from /proc/%d/status", pid);
2444 goto out;
2445 }
2446 if (tgid != pid) {
2447 ALOGE("Possible pid reuse detected (pid %d, tgid %" PRId64 ")!", pid, tgid);
2448 goto out;
2449 }
2450 // Zombie processes will not have RSS / Swap fields.
2451 if (!parse_status_tag(buf, PROC_STATUS_RSS_FIELD, &rss_kb)) {
2452 goto out;
2453 }
2454 if (!parse_status_tag(buf, PROC_STATUS_SWAP_FIELD, &swap_kb)) {
2455 goto out;
2456 }
2457
2458 taskname = proc_get_name(pid, buf, sizeof(buf));
2459 // taskname will point inside buf, do not reuse buf onwards.
2460 if (!taskname) {
2461 goto out;
2462 }
2463
2464 mem_st = stats_read_memory_stat(per_app_memcg, pid, uid, rss_kb * 1024, swap_kb * 1024);
2465
2466 snprintf(desc, sizeof(desc), "lmk,%d,%d,%d,%d,%d", pid, ki ? (int)ki->kill_reason : -1,
2467 procp->oomadj, min_oom_score, ki ? ki->max_thrashing : -1);
2468
2469 result = lmkd_free_memory_before_kill_hook(procp, rss_kb / page_k, procp->oomadj,
2470 ki ? (int)ki->kill_reason : -1);
2471 if (result > 0) {
2472 /*
2473 * Memory was freed elsewhere; no need to kill. Note: intentionally do not
2474 * pid_remove(pid) since it was not killed.
2475 */
2476 ALOGI("Skipping kill; %ld kB freed elsewhere.", result * page_k);
2477 return result;
2478 }
2479
2480 trace_kill_start(desc);
2481
2482 start_wait_for_proc_kill(pidfd < 0 ? pid : pidfd);
2483 kill_result = reaper.kill({ pidfd, pid, uid }, false);
2484
2485 trace_kill_end();
2486
2487 if (kill_result) {
2488 stop_wait_for_proc_kill(false);
2489 ALOGE("kill(%d): errno=%d", pid, errno);
2490 /* Delete process record even when we fail to kill so that we don't get stuck on it */
2491 goto out;
2492 }
2493
2494 last_kill_tm = *tm;
2495
2496 inc_killcnt(procp->oomadj);
2497
2498 if (ki) {
2499 kill_st.kill_reason = ki->kill_reason;
2500 kill_st.thrashing = ki->thrashing;
2501 kill_st.max_thrashing = ki->max_thrashing;
2502 ALOGI("Kill '%s' (%d), uid %d, oom_score_adj %d to free %" PRId64 "kB rss, %" PRId64
2503 "kB swap; reason: %s", taskname, pid, uid, procp->oomadj, rss_kb, swap_kb,
2504 ki->kill_desc);
2505 } else {
2506 kill_st.kill_reason = NONE;
2507 kill_st.thrashing = 0;
2508 kill_st.max_thrashing = 0;
2509 ALOGI("Kill '%s' (%d), uid %d, oom_score_adj %d to free %" PRId64 "kB rss, %" PRId64
2510               "kB swap", taskname, pid, uid, procp->oomadj, rss_kb, swap_kb);
2511 }
2512 killinfo_log(procp, min_oom_score, rss_kb, swap_kb, ki, mi, wi, tm, pd);
2513
2514 kill_st.uid = static_cast<int32_t>(uid);
2515 kill_st.taskname = taskname;
2516 kill_st.oom_score = procp->oomadj;
2517 kill_st.min_oom_score = min_oom_score;
2518 kill_st.free_mem_kb = mi->field.nr_free_pages * page_k;
2519 kill_st.free_swap_kb = get_free_swap(mi) * page_k;
2520 stats_write_lmk_kill_occurred(&kill_st, mem_st);
2521
2522 ctrl_data_write_lmk_kill_occurred((pid_t)pid, uid, rss_kb);
2523
2524 result = rss_kb / page_k;
2525
2526 out:
2527 /*
2528 * WARNING: After pid_remove() procp is freed and can't be used!
2529 * Therefore placed at the end of the function.
2530 */
2531 pid_remove(pid);
2532 return result;
2533 }
2534
2535 /*
2536 * Find one process to kill at or above the given oom_score_adj level.
2537 * Returns size of the killed process.
2538 */
2539 static int find_and_kill_process(int min_score_adj, struct kill_info *ki, union meminfo *mi,
2540 struct wakeup_info *wi, struct timespec *tm,
2541 struct psi_data *pd) {
2542 int i;
2543 int killed_size = 0;
2544 bool choose_heaviest_task = kill_heaviest_task;
2545
2546 for (i = OOM_SCORE_ADJ_MAX; i >= min_score_adj; i--) {
2547 struct proc *procp;
2548
2549 if (!choose_heaviest_task && i <= PERCEPTIBLE_APP_ADJ) {
2550 /*
2551 * If we have to choose a perceptible process, choose the heaviest one to
2552 * hopefully minimize the number of victims.
2553 */
2554 choose_heaviest_task = true;
2555 }
2556
2557 while (true) {
2558 procp = choose_heaviest_task ?
2559 proc_get_heaviest(i) : proc_adj_tail(i);
2560
2561 if (!procp)
2562 break;
2563
2564 killed_size = kill_one_process(procp, min_score_adj, ki, mi, wi, tm, pd);
2565 if (killed_size >= 0) {
2566 break;
2567 }
2568 }
2569 if (killed_size) {
2570 break;
2571 }
2572 }
2573
2574 return killed_size;
2575 }
2576
2577 static int64_t get_memory_usage(struct reread_data *file_data) {
2578 int64_t mem_usage;
2579 char *buf;
2580
2581 if ((buf = reread_file(file_data)) == NULL) {
2582 return -1;
2583 }
2584
2585 if (!parse_int64(buf, &mem_usage)) {
2586 ALOGE("%s parse error", file_data->filename);
2587 return -1;
2588 }
2589 if (mem_usage == 0) {
2590 ALOGE("No memory!");
2591 return -1;
2592 }
2593 return mem_usage;
2594 }
2595
2596 void record_low_pressure_levels(union meminfo *mi) {
2597 if (low_pressure_mem.min_nr_free_pages == -1 ||
2598 low_pressure_mem.min_nr_free_pages > mi->field.nr_free_pages) {
2599 if (debug_process_killing) {
2600 ALOGI("Low pressure min memory update from %" PRId64 " to %" PRId64,
2601 low_pressure_mem.min_nr_free_pages, mi->field.nr_free_pages);
2602 }
2603 low_pressure_mem.min_nr_free_pages = mi->field.nr_free_pages;
2604 }
2605 /*
2606      * Free memory reported at low vmpressure events occasionally spikes, possibly because
2607      * a stale low vmpressure event arrives after the memory has already been freed
2608      * (at which point no memory pressure should have been reported).
2609      * Ignore large jumps in max_nr_free_pages that would skew our stats.
2610 */
2611 if (low_pressure_mem.max_nr_free_pages == -1 ||
2612 (low_pressure_mem.max_nr_free_pages < mi->field.nr_free_pages &&
2613 mi->field.nr_free_pages - low_pressure_mem.max_nr_free_pages <
2614 low_pressure_mem.max_nr_free_pages * 0.1)) {
2615 if (debug_process_killing) {
2616 ALOGI("Low pressure max memory update from %" PRId64 " to %" PRId64,
2617 low_pressure_mem.max_nr_free_pages, mi->field.nr_free_pages);
2618 }
2619 low_pressure_mem.max_nr_free_pages = mi->field.nr_free_pages;
2620 }
2621 }
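
/*
 * Illustrative note (editor's example; the numbers are made up): if max_nr_free_pages is
 * currently 50000 and a low vmpressure event reports nr_free_pages = 80000, the 30000-page jump
 * exceeds 10% of the current maximum (5000 pages) and is treated as a stale-event spike, so the
 * maximum is not updated; a report of 53000 pages would raise it, since the 3000-page increase
 * stays within the 10% bound.
 */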
2622
2623 enum vmpressure_level upgrade_level(enum vmpressure_level level) {
2624 return (enum vmpressure_level)((level < VMPRESS_LEVEL_CRITICAL) ?
2625 level + 1 : level);
2626 }
2627
2628 enum vmpressure_level downgrade_level(enum vmpressure_level level) {
2629 return (enum vmpressure_level)((level > VMPRESS_LEVEL_LOW) ?
2630 level - 1 : level);
2631 }
2632
2633 enum zone_watermark {
2634 WMARK_MIN = 0,
2635 WMARK_LOW,
2636 WMARK_HIGH,
2637 WMARK_NONE
2638 };
2639
2640 struct zone_watermarks {
2641 long high_wmark;
2642 long low_wmark;
2643 long min_wmark;
2644 };
2645
2646 static struct zone_watermarks watermarks;
2647
2648 /*
2649 * Returns lowest breached watermark or WMARK_NONE.
2650 */
2651 static enum zone_watermark get_lowest_watermark(union meminfo *mi,
2652 struct zone_watermarks *watermarks)
2653 {
2654 int64_t nr_free_pages = mi->field.nr_free_pages - mi->field.cma_free;
2655
2656 if (nr_free_pages < watermarks->min_wmark) {
2657 return WMARK_MIN;
2658 }
2659 if (nr_free_pages < watermarks->low_wmark) {
2660 return WMARK_LOW;
2661 }
2662 if (nr_free_pages < watermarks->high_wmark) {
2663 return WMARK_HIGH;
2664 }
2665 return WMARK_NONE;
2666 }
2667
2668 void calc_zone_watermarks(struct zoneinfo *zi, struct zone_watermarks *watermarks) {
2669 memset(watermarks, 0, sizeof(struct zone_watermarks));
2670
2671 for (int node_idx = 0; node_idx < zi->node_count; node_idx++) {
2672 struct zoneinfo_node *node = &zi->nodes[node_idx];
2673 for (int zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
2674 struct zoneinfo_zone *zone = &node->zones[zone_idx];
2675
2676 if (!zone->fields.field.present) {
2677 continue;
2678 }
2679
2680 watermarks->high_wmark += zone->max_protection + zone->fields.field.high;
2681 watermarks->low_wmark += zone->max_protection + zone->fields.field.low;
2682 watermarks->min_wmark += zone->max_protection + zone->fields.field.min;
2683 }
2684 }
2685 }
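
/*
 * Illustrative note (editor's example; the numbers are made up): for a single node with two
 * populated zones,
 *     DMA32:  min=1000, low=2000, high=3000, max_protection=500
 *     Normal: min=4000, low=5000, high=6000, max_protection=0
 * the aggregated watermarks become min_wmark = 5500, low_wmark = 7500 and high_wmark = 9500
 * pages; get_lowest_watermark() later compares (nr_free_pages - cma_free) against these sums.
 */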
2686
2687 static int update_zoneinfo_watermarks(struct zoneinfo *zi) {
2688 if (zoneinfo_parse(zi) < 0) {
2689 ALOGE("Failed to parse zoneinfo!");
2690 return -1;
2691 }
2692 calc_zone_watermarks(zi, &watermarks);
2693 return 0;
2694 }
2695
2696 static int calc_swap_utilization(union meminfo *mi) {
2697 int64_t swap_used = mi->field.total_swap - get_free_swap(mi);
2698 int64_t total_swappable = mi->field.active_anon + mi->field.inactive_anon +
2699 mi->field.shmem + swap_used;
2700 return total_swappable > 0 ? (swap_used * 100) / total_swappable : 0;
2701 }
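
/*
 * Illustrative note (editor's example; the numbers are made up): with total_swap = 100000 pages,
 * 40000 pages of free swap, active_anon + inactive_anon = 20000 pages and shmem = 10000 pages,
 * swap_used is 60000 and total_swappable is 90000, so the function reports
 * (60000 * 100) / 90000 = 66% swap utilization.
 */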
2702
2703 enum event_source {
2704 PSI,
2705 VENDOR,
2706 };
2707
2708 union psi_event_data {
2709 enum vmpressure_level level;
2710 mem_event_t vendor_event;
2711 };
2712
2713 static void __mp_event_psi(enum event_source source, union psi_event_data data,
2714 uint32_t events, struct polling_params *poll_params) {
2715 enum reclaim_state {
2716 NO_RECLAIM = 0,
2717 KSWAPD_RECLAIM,
2718 DIRECT_RECLAIM,
2719 };
2720 static int64_t init_ws_refault;
2721 static int64_t prev_workingset_refault;
2722 static int64_t base_file_lru;
2723 static int64_t init_pgscan_kswapd;
2724 static int64_t init_pgscan_direct;
2725 static int64_t init_pgrefill;
2726 static bool killing;
2727 static int thrashing_limit = thrashing_limit_pct;
2728 static struct timespec wmark_update_tm;
2729 static struct wakeup_info wi;
2730 static struct timespec thrashing_reset_tm;
2731 static int64_t prev_thrash_growth = 0;
2732 static bool check_filecache = false;
2733 static int max_thrashing = 0;
2734
2735 union meminfo mi;
2736 union vmstat vs;
2737 struct psi_data psi_data;
2738 struct timespec curr_tm;
2739 int64_t thrashing = 0;
2740 bool swap_is_low = false;
2741 enum vmpressure_level level = (source == PSI) ? data.level: (enum vmpressure_level)0;
2742 enum kill_reasons kill_reason = NONE;
2743 bool cycle_after_kill = false;
2744 enum reclaim_state reclaim = NO_RECLAIM;
2745 enum zone_watermark wmark = WMARK_NONE;
2746 char kill_desc[LINE_MAX];
2747 bool cut_thrashing_limit = false;
2748 int min_score_adj = 0;
2749 int swap_util = 0;
2750 int64_t swap_low_threshold;
2751 long since_thrashing_reset_ms;
2752 int64_t workingset_refault_file;
2753 bool critical_stall = false;
2754 bool in_direct_reclaim;
2755 long direct_reclaim_duration_ms;
2756 bool in_kswapd_reclaim;
2757
2758 mp_event_count++;
2759 if (debug_process_killing) {
2760 if (source == PSI)
2761 ALOGI("%s memory pressure event #%" PRIu64 " is triggered",
2762 level_name[level], mp_event_count);
2763 else
2764 ALOGI("vendor kill event #%" PRIu64 " is triggered", mp_event_count);
2765 }
2766
2767 if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
2768 ALOGE("Failed to get current time");
2769 return;
2770 }
2771
2772 if (source == PSI) {
2773         if (events > 0) {
2774 /* Ignore a lower event within the first polling window. */
2775 if (level < prev_level) {
2776 if (debug_process_killing)
2777 ALOGI("Ignoring %s pressure event; occurred too soon.",
2778 level_name[level]);
2779 return;
2780 }
2781 prev_level = level;
2782 } else {
2783 /* Reset event level after the first polling window. */
2784 prev_level = VMPRESS_LEVEL_LOW;
2785 }
2786
2787 record_wakeup_time(&curr_tm, events ? Event : Polling, &wi);
2788 }
2789
2790 bool kill_pending = is_kill_pending();
2791 if (kill_pending && (kill_timeout_ms == 0 ||
2792 get_time_diff_ms(&last_kill_tm, &curr_tm) < static_cast<long>(kill_timeout_ms))) {
2793 /* Skip while still killing a process */
2794 wi.skipped_wakeups++;
2795 goto no_kill;
2796 }
2797 /*
2798 * Process is dead or kill timeout is over, stop waiting. This has no effect if pidfds are
2799 * supported and death notification already caused waiting to stop.
2800 */
2801 stop_wait_for_proc_kill(!kill_pending);
2802
2803 if (vmstat_parse(&vs) < 0) {
2804 ALOGE("Failed to parse vmstat!");
2805 return;
2806 }
2807     /* Starting with kernel 5.9 the workingset_refault vmstat field was renamed to workingset_refault_file */
2808 workingset_refault_file = vs.field.workingset_refault ? : vs.field.workingset_refault_file;
2809
2810 if (meminfo_parse(&mi) < 0) {
2811 ALOGE("Failed to parse meminfo!");
2812 return;
2813 }
2814
2815 /* Reset states after process got killed */
2816 if (killing) {
2817 killing = false;
2818 cycle_after_kill = true;
2819 /* Reset file-backed pagecache size and refault amounts after a kill */
2820 base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
2821 init_ws_refault = workingset_refault_file;
2822 thrashing_reset_tm = curr_tm;
2823 prev_thrash_growth = 0;
2824 }
2825
2826 /* Check free swap levels */
2827 if (swap_free_low_percentage) {
2828 swap_low_threshold = mi.field.total_swap * swap_free_low_percentage / 100;
2829 swap_is_low = get_free_swap(&mi) < swap_low_threshold;
2830 } else {
2831 swap_low_threshold = 0;
2832 }
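
    /*
     * Illustrative note (editor's example; the numbers are made up): with total_swap = 500000
     * pages and swap_free_low_percentage = 10, swap_low_threshold is 50000 pages and swap_is_low
     * becomes true once get_free_swap() drops below that.
     */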
2833
2834 if (memevent_listener) {
2835 in_direct_reclaim =
2836 direct_reclaim_start_tm.tv_sec != 0 || direct_reclaim_start_tm.tv_nsec != 0;
2837 in_kswapd_reclaim = kswapd_start_tm.tv_sec != 0 || kswapd_start_tm.tv_nsec != 0;
2838 } else {
2839 in_direct_reclaim = vs.field.pgscan_direct != init_pgscan_direct;
2840 in_kswapd_reclaim = (vs.field.pgscan_kswapd != init_pgscan_kswapd) ||
2841 (vs.field.pgrefill != init_pgrefill);
2842 }
2843
2844 /* Identify reclaim state */
2845 if (in_direct_reclaim) {
2846 init_pgscan_direct = vs.field.pgscan_direct;
2847 init_pgscan_kswapd = vs.field.pgscan_kswapd;
2848 init_pgrefill = vs.field.pgrefill;
2849 direct_reclaim_duration_ms = get_time_diff_ms(&direct_reclaim_start_tm, &curr_tm);
2850 reclaim = DIRECT_RECLAIM;
2851 } else if (in_kswapd_reclaim) {
2852 init_pgscan_kswapd = vs.field.pgscan_kswapd;
2853 init_pgrefill = vs.field.pgrefill;
2854 reclaim = KSWAPD_RECLAIM;
2855 } else if ((workingset_refault_file == prev_workingset_refault) &&
2856 (source == PSI)) {
2857 /*
2858 * Device is not thrashing and not reclaiming, bail out early until we see these stats
2859 * changing
2860 */
2861 goto no_kill;
2862 }
2863
2864 prev_workingset_refault = workingset_refault_file;
2865
2866 /*
2867      * It's possible we fail to find an eligible process to kill (e.g. no process is at or
2868      * above min_score_adj). When this happens, we should retry finding a new process
2869 * for a kill whenever a new eligible process is available. This is especially
2870 * important for a slow growing refault case. While retrying, we should keep
2871 * monitoring new thrashing counter as someone could release the memory to mitigate
2872 * the thrashing. Thus, when thrashing reset window comes, we decay the prev thrashing
2873 * counter by window counts. If the counter is still greater than thrashing limit,
2874 * we preserve the current prev_thrash counter so we will retry kill again. Otherwise,
2875 * we reset the prev_thrash counter so we will stop retrying.
2876 */
2877 since_thrashing_reset_ms = get_time_diff_ms(&thrashing_reset_tm, &curr_tm);
2878 if (since_thrashing_reset_ms > THRASHING_RESET_INTERVAL_MS) {
2879 long windows_passed;
2880 /* Calculate prev_thrash_growth if we crossed THRASHING_RESET_INTERVAL_MS */
2881 prev_thrash_growth = (workingset_refault_file - init_ws_refault) * 100
2882 / (base_file_lru + 1);
2883 windows_passed = (since_thrashing_reset_ms / THRASHING_RESET_INTERVAL_MS);
2884 /*
2885 * Decay prev_thrashing unless over-the-limit thrashing was registered in the window we
2886 * just crossed, which means there were no eligible processes to kill. We preserve the
2887 * counter in that case to ensure a kill if a new eligible process appears.
2888 */
2889 if (windows_passed > 1 || prev_thrash_growth < thrashing_limit) {
2890 prev_thrash_growth >>= windows_passed;
2891 }
2892
2893 /* Record file-backed pagecache size when crossing THRASHING_RESET_INTERVAL_MS */
2894 base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
2895 init_ws_refault = workingset_refault_file;
2896 thrashing_reset_tm = curr_tm;
2897 thrashing_limit = thrashing_limit_pct;
2898 } else {
2899 /* Calculate what % of the file-backed pagecache refaulted so far */
2900 thrashing = (workingset_refault_file - init_ws_refault) * 100 / (base_file_lru + 1);
2901 }
2902 /* Add previous cycle's decayed thrashing amount */
2903 thrashing += prev_thrash_growth;
2904 if (max_thrashing < thrashing) {
2905 max_thrashing = thrashing;
2906 }
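
    /*
     * Illustrative note (editor's example; the numbers are made up): if base_file_lru was 200000
     * pages at the start of the window, init_ws_refault was 1000000 and workingset_refault_file
     * is now 1050000, then thrashing = (1050000 - 1000000) * 100 / (200000 + 1) = 24, i.e. about
     * a quarter of the file-backed pagecache refaulted in this window; with a decayed
     * prev_thrash_growth of 10 the value compared against the thrashing limits becomes 34.
     */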
2907
2908 update_watermarks:
2909 /*
2910 * Refresh watermarks:
2911 * 1. watermarks haven't been initialized (high_wmark == 0)
2912      * 2. once per minute, in case the user updated one of the margins, when the mem_event
2913      *    update_zoneinfo notification is NOT supported.
2914 */
2915 if (watermarks.high_wmark == 0 || (!mem_event_update_zoneinfo_supported &&
2916 get_time_diff_ms(&wmark_update_tm, &curr_tm) > 60000)) {
2917 struct zoneinfo zi;
2918
2919 if (update_zoneinfo_watermarks(&zi) < 0) {
2920 return;
2921 }
2922 wmark_update_tm = curr_tm;
2923 }
2924
2925 /* Find out which watermark is breached if any */
2926 wmark = get_lowest_watermark(&mi, &watermarks);
2927
2928 if (!psi_parse_mem(&psi_data)) {
2929 critical_stall = psi_data.mem_stats[PSI_FULL].avg10 > (float)stall_limit_critical;
2930 }
2931 /*
2932 * TODO: move this logic into a separate function
2933 * Decide if killing a process is necessary and record the reason
2934 */
2935 if (source == VENDOR) {
2936 int vendor_kill_reason = data.vendor_event.event_data.vendor_kill.reason;
2937 short vendor_kill_min_oom_score_adj =
2938 data.vendor_event.event_data.vendor_kill.min_oom_score_adj;
2939 if (vendor_kill_reason < 0 ||
2940 vendor_kill_reason > VENDOR_KILL_REASON_END ||
2941 vendor_kill_min_oom_score_adj < 0) {
2942 ALOGE("Invalid vendor kill reason %d, min_oom_score_adj %d",
2943 vendor_kill_reason, vendor_kill_min_oom_score_adj);
2944 return;
2945 }
2946
2947 kill_reason = (enum kill_reasons)(vendor_kill_reason + VENDOR_KILL_REASON_BASE);
2948 min_score_adj = vendor_kill_min_oom_score_adj;
2949 snprintf(kill_desc, sizeof(kill_desc),
2950 "vendor kill with the reason %d, min_score_adj %d", kill_reason, min_score_adj);
2951 } else if (cycle_after_kill && wmark < WMARK_LOW) {
2952 /*
2953          * Prevent a kill that does not free enough memory, which might otherwise lead to an
2954          * OOM kill. This can happen when a process consumes memory faster than reclaim can
2955          * free it, even after a kill. It mostly happens when running memory stress tests.
2956 */
2957 min_score_adj = pressure_after_kill_min_score;
2958 kill_reason = PRESSURE_AFTER_KILL;
2959 strncpy(kill_desc, "min watermark is breached even after kill", sizeof(kill_desc));
2960 kill_desc[sizeof(kill_desc) - 1] = '\0';
2961 } else if (level == VMPRESS_LEVEL_CRITICAL && events != 0) {
2962 /*
2963 * Device is too busy reclaiming memory which might lead to ANR.
2964 * Critical level is triggered when PSI complete stall (all tasks are blocked because
2965 * of the memory congestion) breaches the configured threshold.
2966 */
2967 kill_reason = NOT_RESPONDING;
2968 strncpy(kill_desc, "device is not responding", sizeof(kill_desc));
2969 kill_desc[sizeof(kill_desc) - 1] = '\0';
2970 } else if (swap_is_low && thrashing > thrashing_limit_pct) {
2971 /* Page cache is thrashing while swap is low */
2972 kill_reason = LOW_SWAP_AND_THRASHING;
2973 snprintf(kill_desc, sizeof(kill_desc), "device is low on swap (%" PRId64
2974 "kB < %" PRId64 "kB) and thrashing (%" PRId64 "%%)",
2975 get_free_swap(&mi) * page_k, swap_low_threshold * page_k, thrashing);
2976 /* Do not kill perceptible apps unless below min watermark or heavily thrashing */
2977 if (wmark > WMARK_MIN && thrashing < thrashing_critical_pct) {
2978 min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2979 }
2980 check_filecache = true;
2981 } else if (swap_is_low && wmark < WMARK_HIGH) {
2982 /* Both free memory and swap are low */
2983 kill_reason = LOW_MEM_AND_SWAP;
2984 snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and swap is low (%"
2985 PRId64 "kB < %" PRId64 "kB)", wmark < WMARK_LOW ? "min" : "low",
2986 get_free_swap(&mi) * page_k, swap_low_threshold * page_k);
2987 /* Do not kill perceptible apps unless below min watermark or heavily thrashing */
2988 if (wmark > WMARK_MIN && thrashing < thrashing_critical_pct) {
2989 min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2990 }
2991 } else if (wmark < WMARK_HIGH && swap_util_max < 100 &&
2992 (swap_util = calc_swap_utilization(&mi)) > swap_util_max) {
2993 /*
2994 * Too much anon memory is swapped out but swap is not low.
2995 * Non-swappable allocations created memory pressure.
2996 */
2997 kill_reason = LOW_MEM_AND_SWAP_UTIL;
2998 snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and swap utilization"
2999 " is high (%d%% > %d%%)", wmark < WMARK_LOW ? "min" : "low",
3000 swap_util, swap_util_max);
3001 } else if (wmark < WMARK_HIGH && thrashing > thrashing_limit) {
3002 /* Page cache is thrashing while memory is low */
3003 kill_reason = LOW_MEM_AND_THRASHING;
3004 snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and thrashing (%"
3005 PRId64 "%%)", wmark < WMARK_LOW ? "min" : "low", thrashing);
3006 cut_thrashing_limit = true;
3007 /* Do not kill perceptible apps unless thrashing at critical levels */
3008 if (thrashing < thrashing_critical_pct) {
3009 min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
3010 }
3011 check_filecache = true;
3012 } else if (reclaim == DIRECT_RECLAIM && thrashing > thrashing_limit) {
3013 /* Page cache is thrashing while in direct reclaim (mostly happens on lowram devices) */
3014 kill_reason = DIRECT_RECL_AND_THRASHING;
3015 snprintf(kill_desc, sizeof(kill_desc), "device is in direct reclaim and thrashing (%"
3016 PRId64 "%%)", thrashing);
3017 cut_thrashing_limit = true;
3018 /* Do not kill perceptible apps unless thrashing at critical levels */
3019 if (thrashing < thrashing_critical_pct) {
3020 min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
3021 }
3022 check_filecache = true;
3023 } else if (reclaim == DIRECT_RECLAIM && direct_reclaim_threshold_ms > 0 &&
3024 direct_reclaim_duration_ms > direct_reclaim_threshold_ms) {
3025 kill_reason = DIRECT_RECL_STUCK;
3026 snprintf(kill_desc, sizeof(kill_desc), "device is stuck in direct reclaim (%ldms > %dms)",
3027 direct_reclaim_duration_ms, direct_reclaim_threshold_ms);
3028 } else if (check_filecache) {
3029 int64_t file_lru_kb = (vs.field.nr_inactive_file + vs.field.nr_active_file) * page_k;
3030
3031 if (file_lru_kb < filecache_min_kb) {
3032 /* File cache is too low after thrashing, keep killing background processes */
3033 kill_reason = LOW_FILECACHE_AFTER_THRASHING;
3034 snprintf(kill_desc, sizeof(kill_desc),
3035 "filecache is low (%" PRId64 "kB < %" PRId64 "kB) after thrashing",
3036 file_lru_kb, filecache_min_kb);
3037 min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
3038 } else {
3039 /* File cache is big enough, stop checking */
3040 check_filecache = false;
3041 }
3042 }
3043
3044 /* Check if a cached app should be killed */
3045 if (kill_reason == NONE && wmark < WMARK_HIGH) {
3046 kill_reason = LOW_MEM;
3047 snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached",
3048 wmark < WMARK_LOW ? "min" : "low");
3049 min_score_adj = lowmem_min_oom_score;
3050 }
3051
3052 /* Kill a process if necessary */
3053 if (kill_reason != NONE) {
3054 struct kill_info ki = {
3055 .kill_reason = kill_reason,
3056 .kill_desc = kill_desc,
3057 .thrashing = (int)thrashing,
3058 .max_thrashing = max_thrashing,
3059 };
3060 static bool first_kill = true;
3061
3062 /* Make sure watermarks are correct before the first kill */
3063 if (first_kill) {
3064 first_kill = false;
3065 watermarks.high_wmark = 0; // force recomputation
3066 goto update_watermarks;
3067 }
3068
3069 /* Allow killing perceptible apps if the system is stalled */
3070 if (critical_stall) {
3071 min_score_adj = 0;
3072 }
3073 psi_parse_io(&psi_data);
3074 psi_parse_cpu(&psi_data);
3075 int pages_freed = find_and_kill_process(min_score_adj, &ki, &mi, &wi, &curr_tm, &psi_data);
3076 if (pages_freed > 0) {
3077 killing = true;
3078 max_thrashing = 0;
3079 if (cut_thrashing_limit) {
3080 /*
3081                  * Cut the thrashing limit by thrashing_limit_decay_pct percent of the current
3082 * thrashing limit until the system stops thrashing.
3083 */
3084 thrashing_limit = (thrashing_limit * (100 - thrashing_limit_decay_pct)) / 100;
3085 }
3086 }
3087 }
3088
3089 no_kill:
3090 /* Do not poll if kernel supports pidfd waiting */
3091 if (is_waiting_for_kill()) {
3092 /* Pause polling if we are waiting for process death notification */
3093 poll_params->update = POLLING_PAUSE;
3094 return;
3095 }
3096
3097 /*
3098 * Start polling after initial PSI event;
3099 * extend polling while device is in direct reclaim or process is being killed;
3100 * do not extend when kswapd reclaims because that might go on for a long time
3101 * without causing memory pressure
3102 */
3103 if (events || killing || reclaim == DIRECT_RECLAIM) {
3104 poll_params->update = POLLING_START;
3105 }
3106
3107 /* Decide the polling interval */
3108 if (swap_is_low || killing) {
3109 /* Fast polling during and after a kill or when swap is low */
3110 poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
3111 } else {
3112 /* By default use long intervals */
3113 poll_params->polling_interval_ms = PSI_POLL_PERIOD_LONG_MS;
3114 }
3115 }
3116
3117 static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_params) {
3118 union psi_event_data event_data = {.level = (enum vmpressure_level)data};
3119 __mp_event_psi(PSI, event_data, events, poll_params);
3120 }
3121
3122 static std::string GetCgroupAttributePath(const char* attr) {
3123 std::string path;
3124 if (!CgroupGetAttributePath(attr, &path)) {
3125 ALOGE("Unknown cgroup attribute %s", attr);
3126 }
3127 return path;
3128 }
3129
3130 // The implementation of this function relies on memcg statistics that are only available in the
3131 // v1 cgroup hierarchy.
3132 static void mp_event_common(int data, uint32_t events, struct polling_params *poll_params) {
3133 unsigned long long evcount;
3134 int64_t mem_usage, memsw_usage;
3135 int64_t mem_pressure;
3136 union meminfo mi;
3137 struct zoneinfo zi;
3138 struct timespec curr_tm;
3139 static unsigned long kill_skip_count = 0;
3140 enum vmpressure_level level = (enum vmpressure_level)data;
3141 long other_free = 0, other_file = 0;
3142 int min_score_adj;
3143 int minfree = 0;
3144 static const std::string mem_usage_path = GetCgroupAttributePath("MemUsage");
3145 static struct reread_data mem_usage_file_data = {
3146 .filename = mem_usage_path.c_str(),
3147 .fd = -1,
3148 };
3149 static const std::string memsw_usage_path = GetCgroupAttributePath("MemAndSwapUsage");
3150 static struct reread_data memsw_usage_file_data = {
3151 .filename = memsw_usage_path.c_str(),
3152 .fd = -1,
3153 };
3154 static struct wakeup_info wi;
3155
3156 mp_event_count++;
3157 if (debug_process_killing) {
3158 ALOGI("%s memory pressure event #%" PRIu64 " is triggered",
3159 level_name[level], mp_event_count);
3160 }
3161
3162 if (!use_psi_monitors) {
3163 /*
3164 * Check all event counters from low to critical
3165 * and upgrade to the highest priority one. By reading
3166 * eventfd we also reset the event counters.
3167 */
3168 for (int lvl = VMPRESS_LEVEL_LOW; lvl < VMPRESS_LEVEL_COUNT; lvl++) {
3169 if (mpevfd[lvl] != -1 &&
3170 TEMP_FAILURE_RETRY(read(mpevfd[lvl],
3171 &evcount, sizeof(evcount))) > 0 &&
3172 evcount > 0 && lvl > level) {
3173 level = static_cast<vmpressure_level>(lvl);
3174 }
3175 }
3176 }
3177
3178 /* Start polling after initial PSI event */
3179 if (use_psi_monitors && events) {
3180 /* Override polling params only if current event is more critical */
3181 if (!poll_params->poll_handler || data > poll_params->poll_handler->data) {
3182 poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
3183 poll_params->update = POLLING_START;
3184 }
3185 }
3186
3187 if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
3188 ALOGE("Failed to get current time");
3189 return;
3190 }
3191
3192 record_wakeup_time(&curr_tm, events ? Event : Polling, &wi);
3193
3194 if (kill_timeout_ms &&
3195 get_time_diff_ms(&last_kill_tm, &curr_tm) < static_cast<long>(kill_timeout_ms)) {
3196 /*
3197 * If we're within the no-kill timeout, see if there's pending reclaim work
3198 * from the last killed process. If so, skip killing for now.
3199 */
3200 if (is_kill_pending()) {
3201 kill_skip_count++;
3202 wi.skipped_wakeups++;
3203 return;
3204 }
3205 /*
3206 * Process is dead, stop waiting. This has no effect if pidfds are supported and
3207 * death notification already caused waiting to stop.
3208 */
3209 stop_wait_for_proc_kill(true);
3210 } else {
3211 /*
3212 * Killing took longer than no-kill timeout. Stop waiting for the last process
3213 * to die because we are ready to kill again.
3214 */
3215 stop_wait_for_proc_kill(false);
3216 }
3217
3218 if (kill_skip_count > 0) {
3219 ALOGI("%lu memory pressure events were skipped after a kill!",
3220 kill_skip_count);
3221 kill_skip_count = 0;
3222 }
3223
3224 if (meminfo_parse(&mi) < 0 || zoneinfo_parse(&zi) < 0) {
3225 ALOGE("Failed to get free memory!");
3226 return;
3227 }
3228
3229 if (use_minfree_levels) {
3230 int i;
3231
3232 other_free = mi.field.nr_free_pages - zi.totalreserve_pages;
3233 if (mi.field.nr_file_pages > (mi.field.shmem + mi.field.unevictable + mi.field.swap_cached)) {
3234 other_file = (mi.field.nr_file_pages - mi.field.shmem -
3235 mi.field.unevictable - mi.field.swap_cached);
3236 } else {
3237 other_file = 0;
3238 }
3239
3240 min_score_adj = OOM_SCORE_ADJ_MAX + 1;
3241 for (i = 0; i < lowmem_targets_size; i++) {
3242 minfree = lowmem_minfree[i];
3243 if (other_free < minfree && other_file < minfree) {
3244 min_score_adj = lowmem_adj[i];
3245 break;
3246 }
3247 }
3248
3249 if (min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
3250 if (debug_process_killing && lowmem_targets_size) {
3251 ALOGI("Ignore %s memory pressure event "
3252 "(free memory=%ldkB, cache=%ldkB, limit=%ldkB)",
3253 level_name[level], other_free * page_k, other_file * page_k,
3254 (long)lowmem_minfree[lowmem_targets_size - 1] * page_k);
3255 }
3256 return;
3257 }
3258
3259 goto do_kill;
3260 }
3261
3262 if (level == VMPRESS_LEVEL_LOW) {
3263 record_low_pressure_levels(&mi);
3264 }
3265
3266 if (level_oomadj[level] > OOM_SCORE_ADJ_MAX) {
3267 /* Do not monitor this pressure level */
3268 return;
3269 }
3270
3271 if ((mem_usage = get_memory_usage(&mem_usage_file_data)) < 0) {
3272 goto do_kill;
3273 }
3274 if ((memsw_usage = get_memory_usage(&memsw_usage_file_data)) < 0) {
3275 goto do_kill;
3276 }
3277
3278     // Calculate memory usage as a percentage of combined memory+swap usage; low values mean heavy swapping.
3279 mem_pressure = (mem_usage * 100) / memsw_usage;
3280
3281 if (enable_pressure_upgrade && level != VMPRESS_LEVEL_CRITICAL) {
3282 // We are swapping too much.
3283 if (mem_pressure < upgrade_pressure) {
3284 level = upgrade_level(level);
3285 if (debug_process_killing) {
3286 ALOGI("Event upgraded to %s", level_name[level]);
3287 }
3288 }
3289 }
3290
3291 // If we still have enough swap space available, check if we want to
3292 // ignore/downgrade pressure events.
3293 if (get_free_swap(&mi) >=
3294 mi.field.total_swap * swap_free_low_percentage / 100) {
3295 // If the pressure is larger than downgrade_pressure lmk will not
3296 // kill any process, since enough memory is available.
3297 if (mem_pressure > downgrade_pressure) {
3298 if (debug_process_killing) {
3299 ALOGI("Ignore %s memory pressure", level_name[level]);
3300 }
3301 return;
3302 } else if (level == VMPRESS_LEVEL_CRITICAL && mem_pressure > upgrade_pressure) {
3303 if (debug_process_killing) {
3304 ALOGI("Downgrade critical memory pressure");
3305 }
3306 // Downgrade event, since enough memory available.
3307 level = downgrade_level(level);
3308 }
3309 }
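
    /*
     * Illustrative note (editor's example; the numbers and thresholds are made up): with
     * mem_usage = 300MB and memsw_usage = 400MB, mem_pressure = (300 * 100) / 400 = 75, i.e. a
     * quarter of the tracked usage sits in swap. Assuming upgrade_pressure = 60 and
     * downgrade_pressure = 80, the event is not upgraded (75 is not below 60) and, if enough
     * free swap remains, it is not ignored either (75 is not above 80); a critical event would
     * still be downgraded because 75 exceeds upgrade_pressure.
     */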
3310
3311 do_kill:
3312 if (low_ram_device) {
3313 /* For Go devices kill only one task */
3314 if (find_and_kill_process(use_minfree_levels ? min_score_adj : level_oomadj[level],
3315 NULL, &mi, &wi, &curr_tm, NULL) == 0) {
3316 if (debug_process_killing) {
3317 ALOGI("Nothing to kill");
3318 }
3319 }
3320 } else {
3321 int pages_freed;
3322 static struct timespec last_report_tm;
3323 static unsigned long report_skip_count = 0;
3324
3325 if (!use_minfree_levels) {
3326             /* Free up enough memory to downgrade the memory pressure to the low level */
3327 if (mi.field.nr_free_pages >= low_pressure_mem.max_nr_free_pages) {
3328 if (debug_process_killing) {
3329 ALOGI("Ignoring pressure since more memory is "
3330 "available (%" PRId64 ") than watermark (%" PRId64 ")",
3331 mi.field.nr_free_pages, low_pressure_mem.max_nr_free_pages);
3332 }
3333 return;
3334 }
3335 min_score_adj = level_oomadj[level];
3336 }
3337
3338 pages_freed = find_and_kill_process(min_score_adj, NULL, &mi, &wi, &curr_tm, NULL);
3339
3340 if (pages_freed == 0 && min_score_adj == 0) {
3341 lmkd_no_kill_candidates_hook();
3342 }
3343
3344 if (pages_freed == 0) {
3345 /* Rate limit kill reports when nothing was reclaimed */
3346 if (get_time_diff_ms(&last_report_tm, &curr_tm) < FAIL_REPORT_RLIMIT_MS) {
3347 report_skip_count++;
3348 return;
3349 }
3350 }
3351
3352 /* Log whenever we kill or when report rate limit allows */
3353 if (use_minfree_levels) {
3354 ALOGI("Reclaimed %ldkB, cache(%ldkB) and free(%" PRId64 "kB)-reserved(%" PRId64 "kB) "
3355 "below min(%ldkB) for oom_score_adj %d",
3356 pages_freed * page_k,
3357 other_file * page_k, mi.field.nr_free_pages * page_k,
3358 zi.totalreserve_pages * page_k,
3359 minfree * page_k, min_score_adj);
3360 } else {
3361 ALOGI("Reclaimed %ldkB at oom_score_adj %d", pages_freed * page_k, min_score_adj);
3362 }
3363
3364 if (report_skip_count > 0) {
3365 ALOGI("Suppressed %lu failed kill reports", report_skip_count);
3366 report_skip_count = 0;
3367 }
3368
3369 last_report_tm = curr_tm;
3370 }
3371 if (is_waiting_for_kill()) {
3372 /* pause polling if we are waiting for process death notification */
3373 poll_params->update = POLLING_PAUSE;
3374 }
3375 }
3376
3377 static bool init_mp_psi(enum vmpressure_level level, bool use_new_strategy) {
3378 int fd;
3379
3380 /* Do not register a handler if threshold_ms is not set */
3381 if (!psi_thresholds[level].threshold_ms) {
3382 return true;
3383 }
3384
3385 fd = init_psi_monitor(psi_thresholds[level].stall_type,
3386 psi_thresholds[level].threshold_ms * US_PER_MS,
3387 PSI_WINDOW_SIZE_MS * US_PER_MS);
3388
3389 if (fd < 0) {
3390 return false;
3391 }
3392
3393 vmpressure_hinfo[level].handler = use_new_strategy ? mp_event_psi : mp_event_common;
3394 vmpressure_hinfo[level].data = level;
3395 if (register_psi_monitor(epollfd, fd, &vmpressure_hinfo[level]) < 0) {
3396 destroy_psi_monitor(fd);
3397 return false;
3398 }
3399 maxevents++;
3400 mpevfd[level] = fd;
3401
3402 return true;
3403 }
3404
3405 static void destroy_mp_psi(enum vmpressure_level level) {
3406 int fd = mpevfd[level];
3407
3408 if (fd < 0) {
3409 return;
3410 }
3411
3412 if (unregister_psi_monitor(epollfd, fd) < 0) {
3413 ALOGE("Failed to unregister psi monitor for %s memory pressure; errno=%d",
3414 level_name[level], errno);
3415 }
3416 maxevents--;
3417 destroy_psi_monitor(fd);
3418 mpevfd[level] = -1;
3419 }
3420
3421 enum class MemcgVersion {
3422 kNotFound,
3423 kV1,
3424 kV2,
3425 };
3426
3427 static MemcgVersion __memcg_version() {
3428 std::string cgroupv2_path, memcg_path;
3429
3430 if (!CgroupGetControllerPath("memory", &memcg_path)) {
3431 return MemcgVersion::kNotFound;
3432 }
3433 return CgroupGetControllerPath(CGROUPV2_HIERARCHY_NAME, &cgroupv2_path) &&
3434 cgroupv2_path == memcg_path
3435 ? MemcgVersion::kV2
3436 : MemcgVersion::kV1;
3437 }
3438
3439 static MemcgVersion memcg_version() {
3440 static MemcgVersion version = __memcg_version();
3441
3442 return version;
3443 }
3444
3445 static void memevent_listener_notification(int data __unused, uint32_t events __unused,
3446 struct polling_params* poll_params) {
3447 struct timespec curr_tm;
3448 std::vector<mem_event_t> mem_events;
3449
3450 if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
3451 direct_reclaim_start_tm.tv_sec = 0;
3452 direct_reclaim_start_tm.tv_nsec = 0;
3453 ALOGE("Failed to get current time for memevent listener notification.");
3454 return;
3455 }
3456
3457 if (!memevent_listener->getMemEvents(mem_events)) {
3458 direct_reclaim_start_tm.tv_sec = 0;
3459 direct_reclaim_start_tm.tv_nsec = 0;
3460 ALOGE("Failed fetching memory listener events.");
3461 return;
3462 }
3463
3464 for (const mem_event_t& mem_event : mem_events) {
3465 switch (mem_event.type) {
3466 /* Direct Reclaim */
3467 case MEM_EVENT_DIRECT_RECLAIM_BEGIN:
3468 direct_reclaim_start_tm = curr_tm;
3469 break;
3470 case MEM_EVENT_DIRECT_RECLAIM_END:
3471 direct_reclaim_start_tm.tv_sec = 0;
3472 direct_reclaim_start_tm.tv_nsec = 0;
3473 break;
3474
3475 /* kswapd */
3476 case MEM_EVENT_KSWAPD_WAKE:
3477 kswapd_start_tm = curr_tm;
3478 break;
3479 case MEM_EVENT_KSWAPD_SLEEP:
3480 kswapd_start_tm.tv_sec = 0;
3481 kswapd_start_tm.tv_nsec = 0;
3482 break;
3483 case MEM_EVENT_VENDOR_LMK_KILL: {
3484 union psi_event_data event_data = {.vendor_event = mem_event};
3485 __mp_event_psi(VENDOR, event_data, 0, poll_params);
3486 break;
3487 }
3488 case MEM_EVENT_UPDATE_ZONEINFO: {
3489 struct zoneinfo zi;
3490 update_zoneinfo_watermarks(&zi);
3491 break;
3492 }
3493 }
3494 }
3495 }
3496
3497 static bool init_memevent_listener_monitoring() {
3498 static struct event_handler_info direct_reclaim_poll_hinfo = {0,
3499 memevent_listener_notification};
3500
3501 if (memevent_listener) return true;
3502
3503     // Make sure bpf programs are loaded; if not, wait until they are.
3504 android::bpf::waitForProgsLoaded();
3505 memevent_listener = std::make_unique<android::bpf::memevents::MemEventListener>(
3506 android::bpf::memevents::MemEventClient::LMKD);
3507
3508 if (!memevent_listener->ok()) {
3509 ALOGE("Failed to initialize memevents listener");
3510 memevent_listener.reset();
3511 return false;
3512 }
3513
3514 if (!memevent_listener->registerEvent(MEM_EVENT_DIRECT_RECLAIM_BEGIN) ||
3515 !memevent_listener->registerEvent(MEM_EVENT_DIRECT_RECLAIM_END)) {
3516 ALOGE("Failed to register direct reclaim memevents");
3517 memevent_listener.reset();
3518 return false;
3519 }
3520 if (!memevent_listener->registerEvent(MEM_EVENT_KSWAPD_WAKE) ||
3521 !memevent_listener->registerEvent(MEM_EVENT_KSWAPD_SLEEP)) {
3522 ALOGE("Failed to register kswapd memevents");
3523 memevent_listener.reset();
3524 return false;
3525 }
3526
3527 if (!memevent_listener->registerEvent(MEM_EVENT_VENDOR_LMK_KILL)) {
3528 ALOGI("Failed to register android_vendor_kill memevents");
3529 }
3530
3531 if (!memevent_listener->registerEvent(MEM_EVENT_UPDATE_ZONEINFO)) {
3532 mem_event_update_zoneinfo_supported = false;
3533 ALOGI("update_zoneinfo memevents are not supported");
3534 } else {
3535 mem_event_update_zoneinfo_supported = true;
3536 }
3537
3538 int memevent_listener_fd = memevent_listener->getRingBufferFd();
3539 if (memevent_listener_fd < 0) {
3540 memevent_listener.reset();
3541 ALOGE("Invalid memevent_listener fd: %d", memevent_listener_fd);
3542 return false;
3543 }
3544
3545 struct epoll_event epev;
3546 epev.events = EPOLLIN;
3547 epev.data.ptr = (void*)&direct_reclaim_poll_hinfo;
3548 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, memevent_listener_fd, &epev) < 0) {
3549 ALOGE("Failed registering memevent_listener fd: %d; errno=%d", memevent_listener_fd, errno);
3550 memevent_listener.reset();
3551 return false;
3552 }
3553
3554 direct_reclaim_start_tm.tv_sec = 0;
3555 direct_reclaim_start_tm.tv_nsec = 0;
3556
3557 maxevents++;
3558 return true;
3559 }
3560
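/*
 * Create PSI monitors for the low, medium and critical pressure levels, choosing between
 * the new (watermark/swap/thrashing based) and the old (memcg v1 based) kill strategies.
 * Partially created monitors are destroyed if a later level fails to initialize.
 */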
3561 static bool init_psi_monitors() {
3562 /*
3563      * When PSI is used on low-ram devices or on high-end devices without minfree levels,
3564      * use the new kill strategy based on zone watermarks, free swap and thrashing stats.
3565      * Also use the new strategy if memcg has not been mounted in the v1 cgroups hierarchy,
3566      * since the old strategy relies on memcg attributes that are available only in the
3567      * v1 cgroups hierarchy.
3568 */
3569 bool use_new_strategy =
3570 GET_LMK_PROPERTY(bool, "use_new_strategy", low_ram_device || !use_minfree_levels);
3571 if (!use_new_strategy && memcg_version() != MemcgVersion::kV1) {
3572 ALOGE("Old kill strategy can only be used with v1 cgroup hierarchy");
3573 return false;
3574 }
3575 /* In default PSI mode override stall amounts using system properties */
3576 if (use_new_strategy) {
3577 /* Do not use low pressure level */
3578 psi_thresholds[VMPRESS_LEVEL_LOW].threshold_ms = 0;
3579 psi_thresholds[VMPRESS_LEVEL_MEDIUM].threshold_ms = psi_partial_stall_ms;
3580 psi_thresholds[VMPRESS_LEVEL_CRITICAL].threshold_ms = psi_complete_stall_ms;
3581 }
3582
3583 if (!init_mp_psi(VMPRESS_LEVEL_LOW, use_new_strategy)) {
3584 return false;
3585 }
3586 if (!init_mp_psi(VMPRESS_LEVEL_MEDIUM, use_new_strategy)) {
3587 destroy_mp_psi(VMPRESS_LEVEL_LOW);
3588 return false;
3589 }
3590 if (!init_mp_psi(VMPRESS_LEVEL_CRITICAL, use_new_strategy)) {
3591 destroy_mp_psi(VMPRESS_LEVEL_MEDIUM);
3592 destroy_mp_psi(VMPRESS_LEVEL_LOW);
3593 return false;
3594 }
3595 return true;
3596 }
3597
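/*
 * Legacy vmpressure setup for one pressure level: open memory.pressure_level and
 * cgroup.event_control, create an eventfd and arm it by writing an
 * "<event_fd> <pressure_level_fd> <level>" line, then register the eventfd with epoll
 * using mp_event_common as the handler.
 */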
3598 static bool init_mp_common(enum vmpressure_level level) {
3599 // The implementation of this function relies on memcg statistics that are only available in the
3600 // v1 cgroup hierarchy.
3601 if (memcg_version() != MemcgVersion::kV1) {
3602 ALOGE("%s: global monitoring is only available for the v1 cgroup hierarchy", __func__);
3603 return false;
3604 }
3605
3606 int mpfd;
3607 int evfd;
3608 int evctlfd;
3609 char buf[256];
3610 struct epoll_event epev;
3611 int ret;
3612 int level_idx = (int)level;
3613 const char *levelstr = level_name[level_idx];
3614
3615 /* gid containing AID_SYSTEM required */
3616 mpfd = open(GetCgroupAttributePath("MemPressureLevel").c_str(), O_RDONLY | O_CLOEXEC);
3617 if (mpfd < 0) {
3618 ALOGI("No kernel memory.pressure_level support (errno=%d)", errno);
3619 goto err_open_mpfd;
3620 }
3621
3622 evctlfd = open(GetCgroupAttributePath("MemCgroupEventControl").c_str(), O_WRONLY | O_CLOEXEC);
3623 if (evctlfd < 0) {
3624 ALOGI("No kernel memory cgroup event control (errno=%d)", errno);
3625 goto err_open_evctlfd;
3626 }
3627
3628 evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
3629 if (evfd < 0) {
3630 ALOGE("eventfd failed for level %s; errno=%d", levelstr, errno);
3631 goto err_eventfd;
3632 }
3633
3634 ret = snprintf(buf, sizeof(buf), "%d %d %s", evfd, mpfd, levelstr);
3635 if (ret >= (ssize_t)sizeof(buf)) {
3636 ALOGE("cgroup.event_control line overflow for level %s", levelstr);
3637 goto err;
3638 }
3639
3640 ret = TEMP_FAILURE_RETRY(write(evctlfd, buf, strlen(buf) + 1));
3641 if (ret == -1) {
3642 ALOGE("cgroup.event_control write failed for level %s; errno=%d",
3643 levelstr, errno);
3644 goto err;
3645 }
3646
3647 epev.events = EPOLLIN;
3648 /* use data to store event level */
3649 vmpressure_hinfo[level_idx].data = level_idx;
3650 vmpressure_hinfo[level_idx].handler = mp_event_common;
3651 epev.data.ptr = (void *)&vmpressure_hinfo[level_idx];
3652 ret = epoll_ctl(epollfd, EPOLL_CTL_ADD, evfd, &epev);
3653 if (ret == -1) {
3654 ALOGE("epoll_ctl for level %s failed; errno=%d", levelstr, errno);
3655 goto err;
3656 }
3657 maxevents++;
3658 mpevfd[level] = evfd;
3659 close(evctlfd);
3660 return true;
3661
3662 err:
3663 close(evfd);
3664 err_eventfd:
3665 close(evctlfd);
3666 err_open_evctlfd:
3667 close(mpfd);
3668 err_open_mpfd:
3669 return false;
3670 }
3671
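/*
 * Tear down a legacy vmpressure monitor: remove its eventfd from the epoll set and close it.
 */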
3672 static void destroy_mp_common(enum vmpressure_level level) {
3673 struct epoll_event epev;
3674 int fd = mpevfd[level];
3675
3676 if (fd < 0) {
3677 return;
3678 }
3679
3680 if (epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &epev)) {
3681 // Log an error and keep going
3682 ALOGE("epoll_ctl for level %s failed; errno=%d", level_name[level], errno);
3683 }
3684 maxevents--;
3685 close(fd);
3686 mpevfd[level] = -1;
3687 }
3688
3689 static void kernel_event_handler(int data __unused, uint32_t events __unused,
3690 struct polling_params *poll_params __unused) {
3691 poll_kernel(kpoll_fd);
3692 }
3693
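/*
 * Initialize memory pressure monitoring, preferring PSI monitors and falling back to the
 * legacy vmpressure eventfd interface when PSI is unavailable or disabled.
 */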
3694 static bool init_monitors() {
3695 ALOGI("Wakeup counter is reset from %" PRIu64 " to 0", mp_event_count);
3696 mp_event_count = 0;
3697 /* Try to use psi monitor first if kernel has it */
3698 use_psi_monitors = GET_LMK_PROPERTY(bool, "use_psi", true) &&
3699 init_psi_monitors();
3700 /* Fall back to vmpressure */
3701 if (!use_psi_monitors &&
3702 (!init_mp_common(VMPRESS_LEVEL_LOW) ||
3703 !init_mp_common(VMPRESS_LEVEL_MEDIUM) ||
3704 !init_mp_common(VMPRESS_LEVEL_CRITICAL))) {
3705 ALOGE("Kernel does not support memory pressure events or in-kernel low memory killer");
3706 return false;
3707 }
3708 if (use_psi_monitors) {
3709 ALOGI("Using psi monitors for memory pressure detection");
3710 } else {
3711 ALOGI("Using vmpressure for memory pressure detection");
3712 }
3713
3714 monitors_initialized = true;
3715 return true;
3716 }
3717
3718 static void destroy_monitors() {
3719 if (use_psi_monitors) {
3720 destroy_mp_psi(VMPRESS_LEVEL_CRITICAL);
3721 destroy_mp_psi(VMPRESS_LEVEL_MEDIUM);
3722 destroy_mp_psi(VMPRESS_LEVEL_LOW);
3723 } else {
3724 destroy_mp_common(VMPRESS_LEVEL_CRITICAL);
3725 destroy_mp_common(VMPRESS_LEVEL_MEDIUM);
3726 destroy_mp_common(VMPRESS_LEVEL_LOW);
3727 }
3728 }
3729
3730 static void drop_reaper_comm() {
3731 close(reaper_comm_fd[0]);
3732 close(reaper_comm_fd[1]);
3733 }
3734
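/*
 * Create the pipe used by reaper threads to report failed kills back to the main thread.
 * The read end is made non-blocking so the main thread never stalls on it.
 */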
3735 static bool setup_reaper_comm() {
3736 if (pipe(reaper_comm_fd)) {
3737 ALOGE("pipe failed: %s", strerror(errno));
3738 return false;
3739 }
3740
3741 // Ensure main thread never blocks on read
3742 int flags = fcntl(reaper_comm_fd[0], F_GETFL);
3743 if (fcntl(reaper_comm_fd[0], F_SETFL, flags | O_NONBLOCK)) {
3744 ALOGE("fcntl failed: %s", strerror(errno));
3745 drop_reaper_comm();
3746 return false;
3747 }
3748
3749 return true;
3750 }
3751
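/*
 * Initialize process reaping: verify kernel support, create the communication pipe,
 * register kill_fail_handler for its read end in the epoll set and hand the write end
 * to the reaper object.
 */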
3752 static bool init_reaper() {
3753 if (!reaper.is_reaping_supported()) {
3754 ALOGI("Process reaping is not supported");
3755 return false;
3756 }
3757
3758 if (!setup_reaper_comm()) {
3759 ALOGE("Failed to create thread communication channel");
3760 return false;
3761 }
3762
3763 // Setup epoll handler
3764 struct epoll_event epev;
3765 static struct event_handler_info kill_failed_hinfo = { 0, kill_fail_handler };
3766 epev.events = EPOLLIN;
3767 epev.data.ptr = (void *)&kill_failed_hinfo;
3768 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, reaper_comm_fd[0], &epev)) {
3769 ALOGE("epoll_ctl failed: %s", strerror(errno));
3770 drop_reaper_comm();
3771 return false;
3772 }
3773
3774 if (!reaper.init(reaper_comm_fd[1])) {
3775 ALOGE("Failed to initialize reaper object");
3776 if (epoll_ctl(epollfd, EPOLL_CTL_DEL, reaper_comm_fd[0], &epev)) {
3777 ALOGE("epoll_ctl failed: %s", strerror(errno));
3778 }
3779 drop_reaper_comm();
3780 return false;
3781 }
3782 maxevents++;
3783
3784 return true;
3785 }
3786
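/*
 * One-time daemon initialization: create the epoll set, start listening on the lmkd
 * control socket, detect whether the in-kernel low memory killer is in use, otherwise
 * register userspace pressure monitors (possibly deferred until boot completes), and
 * probe for pidfd_open support.
 */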
3787 static int init(void) {
3788 static struct event_handler_info kernel_poll_hinfo = { 0, kernel_event_handler };
3789 struct reread_data file_data = {
3790 .filename = ZONEINFO_PATH,
3791 .fd = -1,
3792 };
3793 struct epoll_event epev;
3794 int pidfd;
3795 int i;
3796 int ret;
3797
3798 // Initialize page size
3799 pagesize = getpagesize();
3800 page_k = pagesize / 1024;
3801
3802 epollfd = epoll_create(MAX_EPOLL_EVENTS);
3803 if (epollfd == -1) {
3804 ALOGE("epoll_create failed (errno=%d)", errno);
3805 return -1;
3806 }
3807
3808 // mark data connections as not connected
3809 for (int i = 0; i < MAX_DATA_CONN; i++) {
3810 data_sock[i].sock = -1;
3811 }
3812
3813 ctrl_sock.sock = android_get_control_socket("lmkd");
3814 if (ctrl_sock.sock < 0) {
3815 ALOGE("get lmkd control socket failed");
3816 return -1;
3817 }
3818
3819 ret = listen(ctrl_sock.sock, MAX_DATA_CONN);
3820 if (ret < 0) {
3821 ALOGE("lmkd control socket listen failed (errno=%d)", errno);
3822 return -1;
3823 }
3824
3825 epev.events = EPOLLIN;
3826 ctrl_sock.handler_info.handler = ctrl_connect_handler;
3827 epev.data.ptr = (void *)&(ctrl_sock.handler_info);
3828 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ctrl_sock.sock, &epev) == -1) {
3829 ALOGE("epoll_ctl for lmkd control socket failed (errno=%d)", errno);
3830 return -1;
3831 }
3832 maxevents++;
3833
3834 has_inkernel_module = !access(INKERNEL_MINFREE_PATH, W_OK);
3835 use_inkernel_interface = has_inkernel_module;
3836
3837 if (use_inkernel_interface) {
3838 ALOGI("Using in-kernel low memory killer interface");
3839 if (init_poll_kernel()) {
3840 epev.events = EPOLLIN;
3841 epev.data.ptr = (void*)&kernel_poll_hinfo;
3842 if (epoll_ctl(epollfd, EPOLL_CTL_ADD, kpoll_fd, &epev) != 0) {
3843 ALOGE("epoll_ctl for lmk events failed (errno=%d)", errno);
3844 close(kpoll_fd);
3845 kpoll_fd = -1;
3846 } else {
3847 maxevents++;
3848                 /* let other components know that kill reporting is supported */
3849 property_set("sys.lmk.reportkills", "1");
3850 }
3851 }
3852 } else {
3853 // Do not register monitors until boot completed for devices configured
3854 // for delaying monitors. This is done to save CPU cycles for low
3855 // resource devices during boot up.
3856 if (!delay_monitors_until_boot || property_get_bool("sys.boot_completed", false)) {
3857 if (!init_monitors()) {
3858 return -1;
3859 }
3860 }
3861         /* let other components know that lmkd supports reporting kills */
3862 property_set("sys.lmk.reportkills", "1");
3863 }
3864
3865 for (i = 0; i <= ADJTOSLOT(OOM_SCORE_ADJ_MAX); i++) {
3866 procadjslot_list[i].next = &procadjslot_list[i];
3867 procadjslot_list[i].prev = &procadjslot_list[i];
3868 }
3869
3870 memset(killcnt_idx, KILLCNT_INVALID_IDX, sizeof(killcnt_idx));
3871
3872 /*
3873 * Read zoneinfo as the biggest file we read to create and size the initial
3874 * read buffer and avoid memory re-allocations during memory pressure
3875 */
3876 if (reread_file(&file_data) == NULL) {
3877 ALOGE("Failed to read %s: %s", file_data.filename, strerror(errno));
3878 }
3879
3880 /* check if kernel supports pidfd_open syscall */
3881 pidfd = TEMP_FAILURE_RETRY(pidfd_open(getpid(), 0));
3882 if (pidfd < 0) {
3883 pidfd_supported = (errno != ENOSYS);
3884 } else {
3885 pidfd_supported = true;
3886 close(pidfd);
3887 }
3888 ALOGI("Process polling is %s", pidfd_supported ? "supported" : "not supported" );
3889
3890 if (!lmkd_init_hook()) {
3891 ALOGE("Failed to initialize LMKD hooks.");
3892 return -1;
3893 }
3894
3895 return 0;
3896 }
3897
3898 static bool polling_paused(struct polling_params *poll_params) {
3899 return poll_params->paused_handler != NULL;
3900 }
3901
3902 static void resume_polling(struct polling_params *poll_params, struct timespec curr_tm) {
3903 poll_params->poll_start_tm = curr_tm;
3904 poll_params->poll_handler = poll_params->paused_handler;
3905 poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
3906 poll_params->paused_handler = NULL;
3907 }
3908
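/*
 * Invoke an event handler under watchdog supervision and apply the polling state change
 * it requested: start, pause or resume periodic polling, or stop it once a full PSI
 * window has elapsed without further changes.
 */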
3909 static void call_handler(struct event_handler_info* handler_info,
3910 struct polling_params *poll_params, uint32_t events) {
3911 struct timespec curr_tm;
3912
3913 watchdog.start();
3914 poll_params->update = POLLING_DO_NOT_CHANGE;
3915 handler_info->handler(handler_info->data, events, poll_params);
3916 clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3917 if (poll_params->poll_handler == handler_info) {
3918 poll_params->last_poll_tm = curr_tm;
3919 }
3920
3921 switch (poll_params->update) {
3922 case POLLING_START:
3923 /*
3924 * Poll for the duration of PSI_WINDOW_SIZE_MS after the
3925 * initial PSI event because psi events are rate-limited
3926 * at one per sec.
3927 */
3928 poll_params->poll_start_tm = curr_tm;
3929 poll_params->poll_handler = handler_info;
3930 poll_params->last_poll_tm = curr_tm;
3931 break;
3932 case POLLING_PAUSE:
3933 poll_params->paused_handler = handler_info;
3934 poll_params->poll_handler = NULL;
3935 break;
3936 case POLLING_RESUME:
3937 resume_polling(poll_params, curr_tm);
3938 break;
3939 case POLLING_DO_NOT_CHANGE:
3940 if (poll_params->poll_handler &&
3941 get_time_diff_ms(&poll_params->poll_start_tm, &curr_tm) > PSI_WINDOW_SIZE_MS) {
3942 /* Polled for the duration of PSI window, time to stop */
3943 poll_params->poll_handler = NULL;
3944 }
3945 break;
3946 }
3947 watchdog.stop();
3948 }
3949
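/*
 * Main event loop. While a poll handler is armed, wake up every polling_interval_ms to
 * call it; otherwise block in epoll_wait, bounded by the kill timeout when a kill is in
 * flight. Events are processed in two passes so that dropped data connections (EPOLLHUP)
 * are handled before any other events from the same cycle.
 */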
3950 static void mainloop(void) {
3951 struct event_handler_info* handler_info;
3952 struct polling_params poll_params;
3953 struct timespec curr_tm;
3954 struct epoll_event *evt;
3955 long delay = -1;
3956
3957 poll_params.poll_handler = NULL;
3958 poll_params.paused_handler = NULL;
3959
3960 while (1) {
3961 struct epoll_event events[MAX_EPOLL_EVENTS];
3962 int nevents;
3963 int i;
3964
3965 if (poll_params.poll_handler) {
3966 bool poll_now;
3967
3968 clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3969 if (poll_params.update == POLLING_RESUME) {
3970 /* Just transitioned into POLLING_RESUME, poll immediately. */
3971 poll_now = true;
3972 nevents = 0;
3973 } else {
3974 /* Calculate next timeout */
3975 delay = get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm);
3976 delay = (delay < poll_params.polling_interval_ms) ?
3977 poll_params.polling_interval_ms - delay : poll_params.polling_interval_ms;
3978
3979 /* Wait for events until the next polling timeout */
3980 nevents = epoll_wait(epollfd, events, maxevents, delay);
3981
3982 /* Update current time after wait */
3983 clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3984 poll_now = (get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm) >=
3985 poll_params.polling_interval_ms);
3986 }
3987 if (poll_now) {
3988 call_handler(poll_params.poll_handler, &poll_params, 0);
3989 }
3990 } else {
3991 if (kill_timeout_ms && is_waiting_for_kill()) {
3992 clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3993 delay = kill_timeout_ms - get_time_diff_ms(&last_kill_tm, &curr_tm);
3994 /* Wait for pidfds notification or kill timeout to expire */
3995 nevents = (delay > 0) ? epoll_wait(epollfd, events, maxevents, delay) : 0;
3996 if (nevents == 0) {
3997 /* Kill notification timed out */
3998 stop_wait_for_proc_kill(false);
3999 if (polling_paused(&poll_params)) {
4000 clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
4001 poll_params.update = POLLING_RESUME;
4002 resume_polling(&poll_params, curr_tm);
4003 }
4004 }
4005 } else {
4006 /* Wait for events with no timeout */
4007 nevents = epoll_wait(epollfd, events, maxevents, -1);
4008 }
4009 }
4010
4011 if (nevents == -1) {
4012 if (errno == EINTR)
4013 continue;
4014 ALOGE("epoll_wait failed (errno=%d)", errno);
4015 continue;
4016 }
4017
4018 /*
4019 * First pass to see if any data socket connections were dropped.
4020 * Dropped connection should be handled before any other events
4021 * to deallocate data connection and correctly handle cases when
4022 * connection gets dropped and reestablished in the same epoll cycle.
4023 * In such cases it's essential to handle connection closures first.
4024 */
4025 for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) {
4026 if ((evt->events & EPOLLHUP) && evt->data.ptr) {
4027 handler_info = (struct event_handler_info*)evt->data.ptr;
4028 if (handler_info->handler == kill_done_handler) {
4029 call_handler(handler_info, &poll_params, evt->events);
4030 } else {
4031 ALOGI("lmkd data connection dropped");
4032 watchdog.start();
4033 ctrl_data_close(handler_info->data);
4034 watchdog.stop();
4035 }
4036 }
4037 }
4038
4039 /* Second pass to handle all other events */
4040 for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) {
4041 if (evt->events & EPOLLERR) {
4042 ALOGD("EPOLLERR on event #%d", i);
4043 }
4044 if (evt->events & EPOLLHUP) {
4045 /* This case was handled in the first pass */
4046 continue;
4047 }
4048 if (evt->data.ptr) {
4049 handler_info = (struct event_handler_info*)evt->data.ptr;
4050 call_handler(handler_info, &poll_params, evt->events);
4051 }
4052 }
4053 }
4054 }
4055
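/*
 * Handle the "lmkd --reinit" invocation: connect to the running daemon over its control
 * socket and ask it to re-read its properties, reporting the outcome to the caller.
 */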
4056 int issue_reinit() {
4057 int sock;
4058
4059 sock = lmkd_connect();
4060 if (sock < 0) {
4061 ALOGE("failed to connect to lmkd: %s", strerror(errno));
4062 return -1;
4063 }
4064
4065 enum update_props_result res = lmkd_update_props(sock);
4066 switch (res) {
4067 case UPDATE_PROPS_SUCCESS:
4068 ALOGI("lmkd updated properties successfully");
4069 break;
4070 case UPDATE_PROPS_SEND_ERR:
4071 ALOGE("failed to send lmkd request: %s", strerror(errno));
4072 break;
4073 case UPDATE_PROPS_RECV_ERR:
4074 ALOGE("failed to receive lmkd reply: %s", strerror(errno));
4075 break;
4076 case UPDATE_PROPS_FORMAT_ERR:
4077 ALOGE("lmkd reply is invalid");
4078 break;
4079 case UPDATE_PROPS_FAIL:
4080 ALOGE("lmkd failed to update its properties");
4081 break;
4082 }
4083
4084 close(sock);
4085 return res == UPDATE_PROPS_SUCCESS ? 0 : -1;
4086 }
4087
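/*
 * Handle the "lmkd --boot_completed" invocation: notify the running daemon that boot has
 * finished so it can perform work deferred until boot completes (e.g. registering monitors
 * when delay_monitors_until_boot is set).
 */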
4088 static int on_boot_completed() {
4089 int sock;
4090
4091 sock = lmkd_connect();
4092 if (sock < 0) {
4093 ALOGE("failed to connect to lmkd: %s", strerror(errno));
4094 return -1;
4095 }
4096
4097 enum boot_completed_notification_result res = lmkd_notify_boot_completed(sock);
4098
4099 switch (res) {
4100 case BOOT_COMPLETED_NOTIF_SUCCESS:
4101 break;
4102 case BOOT_COMPLETED_NOTIF_ALREADY_HANDLED:
4103 ALOGW("lmkd already handled boot-completed operations");
4104 break;
4105 case BOOT_COMPLETED_NOTIF_SEND_ERR:
4106 ALOGE("failed to send lmkd request: %m");
4107 break;
4108 case BOOT_COMPLETED_NOTIF_RECV_ERR:
4109 ALOGE("failed to receive request: %m");
4110 break;
4111 case BOOT_COMPLETED_NOTIF_FORMAT_ERR:
4112 ALOGE("lmkd reply is invalid");
4113 break;
4114 case BOOT_COMPLETED_NOTIF_FAILS:
4115 ALOGE("lmkd failed to receive boot-completed notification");
4116 break;
4117 }
4118
4119 close(sock);
4120 return res == BOOT_COMPLETED_NOTIF_SUCCESS ? 0 : -1;
4121 }
4122
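/*
 * Load all tunables from system properties (via GET_LMK_PROPERTY and property_get_bool)
 * and run the LMKD update-props hook. Returns false if the hook fails.
 */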
4123 static bool update_props() {
4124 /* By default disable low level vmpressure events */
4125 level_oomadj[VMPRESS_LEVEL_LOW] =
4126 GET_LMK_PROPERTY(int32, "low", OOM_SCORE_ADJ_MAX + 1);
4127 level_oomadj[VMPRESS_LEVEL_MEDIUM] =
4128 GET_LMK_PROPERTY(int32, "medium", 800);
4129 level_oomadj[VMPRESS_LEVEL_CRITICAL] =
4130 GET_LMK_PROPERTY(int32, "critical", 0);
4131 debug_process_killing = GET_LMK_PROPERTY(bool, "debug", false);
4132
4133 /* By default disable upgrade/downgrade logic */
4134 enable_pressure_upgrade =
4135 GET_LMK_PROPERTY(bool, "critical_upgrade", false);
4136 upgrade_pressure =
4137 (int64_t)GET_LMK_PROPERTY(int32, "upgrade_pressure", 100);
4138 downgrade_pressure =
4139 (int64_t)GET_LMK_PROPERTY(int32, "downgrade_pressure", 100);
4140 kill_heaviest_task =
4141 GET_LMK_PROPERTY(bool, "kill_heaviest_task", false);
4142 low_ram_device = property_get_bool("ro.config.low_ram", false);
4143 kill_timeout_ms =
4144 (unsigned long)GET_LMK_PROPERTY(int32, "kill_timeout_ms", 100);
4145 pressure_after_kill_min_score =
4146 (unsigned long)GET_LMK_PROPERTY(int32, "pressure_after_kill_min_score", 0);
4147 use_minfree_levels =
4148 GET_LMK_PROPERTY(bool, "use_minfree_levels", false);
4149 per_app_memcg =
4150 property_get_bool("ro.config.per_app_memcg", low_ram_device);
4151 swap_free_low_percentage = clamp(0, 100, GET_LMK_PROPERTY(int32, "swap_free_low_percentage",
4152 DEF_LOW_SWAP));
4153 psi_partial_stall_ms = GET_LMK_PROPERTY(int32, "psi_partial_stall_ms",
4154 low_ram_device ? DEF_PARTIAL_STALL_LOWRAM : DEF_PARTIAL_STALL);
4155 psi_complete_stall_ms = GET_LMK_PROPERTY(int32, "psi_complete_stall_ms",
4156 DEF_COMPLETE_STALL);
4157 thrashing_limit_pct =
4158 std::max(0, GET_LMK_PROPERTY(int32, "thrashing_limit",
4159 low_ram_device ? DEF_THRASHING_LOWRAM : DEF_THRASHING));
4160 thrashing_limit_decay_pct = clamp(0, 100, GET_LMK_PROPERTY(int32, "thrashing_limit_decay",
4161 low_ram_device ? DEF_THRASHING_DECAY_LOWRAM : DEF_THRASHING_DECAY));
4162 thrashing_critical_pct = std::max(
4163 0, GET_LMK_PROPERTY(int32, "thrashing_limit_critical", thrashing_limit_pct * 3));
4164 swap_util_max = clamp(0, 100, GET_LMK_PROPERTY(int32, "swap_util_max", 100));
4165 filecache_min_kb = GET_LMK_PROPERTY(int64, "filecache_min_kb", 0);
4166 stall_limit_critical = GET_LMK_PROPERTY(int64, "stall_limit_critical", 100);
4167 delay_monitors_until_boot = GET_LMK_PROPERTY(bool, "delay_monitors_until_boot", false);
4168 direct_reclaim_threshold_ms =
4169 GET_LMK_PROPERTY(int64, "direct_reclaim_threshold_ms", DEF_DIRECT_RECL_THRESH_MS);
4170 swap_compression_ratio =
4171 GET_LMK_PROPERTY(int64, "swap_compression_ratio", DEF_SWAP_COMP_RATIO);
4172 lowmem_min_oom_score =
4173 std::max(PERCEPTIBLE_APP_ADJ + 1,
4174 GET_LMK_PROPERTY(int32, "lowmem_min_oom_score", DEF_LOWMEM_MIN_SCORE));
4175
4176 reaper.enable_debug(debug_process_killing);
4177
4178 /* Call the update props hook */
4179 if (!lmkd_update_props_hook()) {
4180 ALOGE("Failed to update LMKD hook props.");
4181 return false;
4182 }
4183
4184 return true;
4185 }
4186
4187 int main(int argc, char **argv) {
4188 if ((argc > 1) && argv[1]) {
4189 if (!strcmp(argv[1], "--reinit")) {
4190 if (property_set(LMKD_REINIT_PROP, "")) {
4191 ALOGE("Failed to reset " LMKD_REINIT_PROP " property");
4192 }
4193 return issue_reinit();
4194 } else if (!strcmp(argv[1], "--boot_completed")) {
4195 return on_boot_completed();
4196 }
4197 }
4198
4199 if (!update_props()) {
4200 ALOGE("Failed to initialize props, exiting.");
4201 return -1;
4202 }
4203
4204 ctx = create_android_logger(KILLINFO_LOG_TAG);
4205
4206 if (!init()) {
4207 if (!use_inkernel_interface) {
4208 /*
4209 * MCL_ONFAULT pins pages as they fault instead of loading
4210 * everything immediately all at once. (Which would be bad,
4211 * because as of this writing, we have a lot of mapped pages we
4212 * never use.) Old kernels will see MCL_ONFAULT and fail with
4213 * EINVAL; we ignore this failure.
4214 *
4215 * N.B. read the man page for mlockall. MCL_CURRENT | MCL_ONFAULT
4216 * pins ⊆ MCL_CURRENT, converging to just MCL_CURRENT as we fault
4217 * in pages.
4218 */
4219 /* CAP_IPC_LOCK required */
4220 if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT) && (errno != EINVAL)) {
4221 ALOGW("mlockall failed %s", strerror(errno));
4222 }
4223
4224 /* CAP_NICE required */
4225 struct sched_param param = {
4226 .sched_priority = 1,
4227 };
4228         if (sched_setscheduler(0, SCHED_FIFO, &param)) {
4229 ALOGW("set SCHED_FIFO failed %s", strerror(errno));
4230 }
4231 }
4232
4233 if (init_reaper()) {
4234 ALOGI("Process reaper initialized with %d threads in the pool",
4235 reaper.thread_cnt());
4236 }
4237
4238 if (!watchdog.init()) {
4239 ALOGE("Failed to initialize the watchdog");
4240 }
4241
4242 mainloop();
4243 }
4244
4245 android_log_destroy(&ctx);
4246
4247 ALOGI("exiting");
4248 return 0;
4249 }
4250