1 /*
2  * Copyright (C) 2013 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "lowmemorykiller"
18 
19 #include <errno.h>
20 #include <inttypes.h>
21 #include <pwd.h>
22 #include <sched.h>
23 #include <stdbool.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <sys/cdefs.h>
27 #include <sys/epoll.h>
28 #include <sys/eventfd.h>
29 #include <sys/mman.h>
30 #include <sys/pidfd.h>
31 #include <sys/socket.h>
32 #include <sys/syscall.h>
33 #include <sys/sysinfo.h>
34 #include <time.h>
35 #include <unistd.h>
36 
37 #include <algorithm>
38 #include <array>
39 #include <memory>
40 #include <shared_mutex>
41 #include <vector>
42 
43 #include <BpfSyscallWrappers.h>
44 #include <android-base/unique_fd.h>
45 #include <bpf/WaitForProgsLoaded.h>
46 #include <cutils/properties.h>
47 #include <cutils/sockets.h>
48 #include <liblmkd_utils.h>
49 #include <lmkd.h>
50 #include <lmkd_hooks.h>
51 #include <log/log.h>
52 #include <log/log_event_list.h>
53 #include <log/log_time.h>
54 #include <memevents/memevents.h>
55 #include <private/android_filesystem_config.h>
56 #include <processgroup/processgroup.h>
57 #include <psi/psi.h>
58 
59 #include "reaper.h"
60 #include "statslog.h"
61 #include "watchdog.h"
62 
63 /*
64  * Define LMKD_TRACE_KILLS to record lmkd kills in kernel traces
65  * to profile and correlate with OOM kills
66  */
67 #ifdef LMKD_TRACE_KILLS
68 
69 #define ATRACE_TAG ATRACE_TAG_ALWAYS
70 #include <cutils/trace.h>
71 
72 static inline void trace_kill_start(const char *desc) {
73     ATRACE_BEGIN(desc);
74 }
75 
76 static inline void trace_kill_end() {
77     ATRACE_END();
78 }
79 
80 #else /* LMKD_TRACE_KILLS */
81 
82 static inline void trace_kill_start(const char *) {}
83 static inline void trace_kill_end() {}
84 
85 #endif /* LMKD_TRACE_KILLS */
86 
87 #ifndef __unused
88 #define __unused __attribute__((__unused__))
89 #endif
90 
91 #define ZONEINFO_PATH "/proc/zoneinfo"
92 #define MEMINFO_PATH "/proc/meminfo"
93 #define VMSTAT_PATH "/proc/vmstat"
94 #define PROC_STATUS_TGID_FIELD "Tgid:"
95 #define PROC_STATUS_RSS_FIELD "VmRSS:"
96 #define PROC_STATUS_SWAP_FIELD "VmSwap:"
97 #define NODE_STATS_MARKER "  per-node stats"
98 
99 #define PERCEPTIBLE_APP_ADJ 200
100 #define PREVIOUS_APP_ADJ 700
101 
102 /* Android Logger event logtags (see event.logtags) */
103 #define KILLINFO_LOG_TAG 10195355
104 
105 /* gid containing AID_SYSTEM required */
106 #define INKERNEL_MINFREE_PATH "/sys/module/lowmemorykiller/parameters/minfree"
107 #define INKERNEL_ADJ_PATH "/sys/module/lowmemorykiller/parameters/adj"
108 
109 #define EIGHT_MEGA (1 << 23)
110 
111 #define TARGET_UPDATE_MIN_INTERVAL_MS 1000
112 #define THRASHING_RESET_INTERVAL_MS 1000
113 
114 #define NS_PER_MS (NS_PER_SEC / MS_PER_SEC)
115 #define US_PER_MS (US_PER_SEC / MS_PER_SEC)
116 
117 /* Defined as ProcessList.SYSTEM_ADJ in ProcessList.java */
118 #define SYSTEM_ADJ (-900)
119 
120 #define STRINGIFY(x) STRINGIFY_INTERNAL(x)
121 #define STRINGIFY_INTERNAL(x) #x
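/*
 * For example, STRINGIFY(PREVIOUS_APP_ADJ) expands its argument first and then
 * stringizes it, yielding "700", whereas STRINGIFY_INTERNAL(PREVIOUS_APP_ADJ)
 * alone would yield "PREVIOUS_APP_ADJ".
 */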
122 
123 #define PROCFS_PATH_MAX 64
124 
125 /*
126  * Read lmk property with persist.device_config.lmkd_native.<name> overriding ro.lmk.<name>
127  * persist.device_config.lmkd_native.* properties are being set by experiments. If a new property
128  * can be controlled by an experiment then use GET_LMK_PROPERTY instead of property_get_xxx and
129  * add "on property" triggers in lmkd.rc to react to the experiment flag changes.
130  */
131 #define GET_LMK_PROPERTY(type, name, def) \
132     property_get_##type("persist.device_config.lmkd_native." name, \
133         property_get_##type("ro.lmk." name, def))
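/*
 * As an illustration (the "debug" name below is only an example),
 * GET_LMK_PROPERTY(bool, "debug", false) expands to
 *   property_get_bool("persist.device_config.lmkd_native.debug",
 *       property_get_bool("ro.lmk.debug", false))
 * so a set experiment property overrides the ro.lmk.* default.
 */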
134 
135 /*
136  * PSI monitor tracking window size.
137  * PSI monitor generates events at most once per window,
138  * therefore we poll memory state for the duration of
139  * PSI_WINDOW_SIZE_MS after the event happens.
140  */
141 #define PSI_WINDOW_SIZE_MS 1000
142 /* Polling period after PSI signal when pressure is high */
143 #define PSI_POLL_PERIOD_SHORT_MS 10
144 /* Polling period after PSI signal when pressure is low */
145 #define PSI_POLL_PERIOD_LONG_MS 100
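/*
 * With these defaults a single 1-second window is covered by roughly
 * PSI_WINDOW_SIZE_MS / PSI_POLL_PERIOD_SHORT_MS = 100 polls under high pressure
 * and PSI_WINDOW_SIZE_MS / PSI_POLL_PERIOD_LONG_MS = 10 polls under low pressure.
 */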
146 
147 #define FAIL_REPORT_RLIMIT_MS 1000
148 
149 /*
150  * System property defaults
151  */
152 /* ro.lmk.swap_free_low_percentage property defaults */
153 #define DEF_LOW_SWAP 10
154 /* ro.lmk.thrashing_limit property defaults */
155 #define DEF_THRASHING_LOWRAM 30
156 #define DEF_THRASHING 100
157 /* ro.lmk.thrashing_limit_decay property defaults */
158 #define DEF_THRASHING_DECAY_LOWRAM 50
159 #define DEF_THRASHING_DECAY 10
160 /* ro.lmk.psi_partial_stall_ms property defaults */
161 #define DEF_PARTIAL_STALL_LOWRAM 200
162 #define DEF_PARTIAL_STALL 70
163 /* ro.lmk.psi_complete_stall_ms property defaults */
164 #define DEF_COMPLETE_STALL 700
165 /* ro.lmk.direct_reclaim_threshold_ms property defaults */
166 #define DEF_DIRECT_RECL_THRESH_MS 0
167 /* ro.lmk.swap_compression_ratio property defaults */
168 #define DEF_SWAP_COMP_RATIO 1
169 /* ro.lmk.lowmem_min_oom_score defaults */
170 #define DEF_LOWMEM_MIN_SCORE (PREVIOUS_APP_ADJ + 1)
171 
172 #define LMKD_REINIT_PROP "lmkd.reinit"
173 
174 #define WATCHDOG_TIMEOUT_SEC 2
175 
176 /* default to old in-kernel interface if no memory pressure events */
177 static bool use_inkernel_interface = true;
178 static bool has_inkernel_module;
179 
180 /* memory pressure levels */
181 enum vmpressure_level {
182     VMPRESS_LEVEL_LOW = 0,
183     VMPRESS_LEVEL_MEDIUM,
184     VMPRESS_LEVEL_CRITICAL,
185     VMPRESS_LEVEL_COUNT
186 };
187 
188 static const char *level_name[] = {
189     "low",
190     "medium",
191     "critical"
192 };
193 
194 struct {
195     int64_t min_nr_free_pages; /* recorded but not used yet */
196     int64_t max_nr_free_pages;
197 } low_pressure_mem = { -1, -1 };
198 
199 struct psi_threshold {
200     enum psi_stall_type stall_type;
201     int threshold_ms;
202 };
203 
204 /* Listener for direct reclaim and kswapd state changes */
205 static std::unique_ptr<android::bpf::memevents::MemEventListener> memevent_listener(nullptr);
206 static struct timespec direct_reclaim_start_tm;
207 static struct timespec kswapd_start_tm;
208 
209 static int level_oomadj[VMPRESS_LEVEL_COUNT];
210 static int mpevfd[VMPRESS_LEVEL_COUNT] = { -1, -1, -1 };
211 static bool pidfd_supported;
212 static int last_kill_pid_or_fd = -1;
213 static struct timespec last_kill_tm;
214 enum vmpressure_level prev_level = VMPRESS_LEVEL_LOW;
215 static bool monitors_initialized;
216 static bool boot_completed_handled = false;
217 static bool mem_event_update_zoneinfo_supported;
218 
219 /* lmkd configurable parameters */
220 static bool debug_process_killing;
221 static bool enable_pressure_upgrade;
222 static int64_t upgrade_pressure;
223 static int64_t downgrade_pressure;
224 static bool low_ram_device;
225 static bool kill_heaviest_task;
226 static unsigned long kill_timeout_ms;
227 static int pressure_after_kill_min_score;
228 static bool use_minfree_levels;
229 static bool per_app_memcg;
230 static int swap_free_low_percentage;
231 static int psi_partial_stall_ms;
232 static int psi_complete_stall_ms;
233 static int thrashing_limit_pct;
234 static int thrashing_limit_decay_pct;
235 static int thrashing_critical_pct;
236 static int swap_util_max;
237 static int64_t filecache_min_kb;
238 static int64_t stall_limit_critical;
239 static bool use_psi_monitors = false;
240 static int kpoll_fd;
241 static bool delay_monitors_until_boot;
242 static int direct_reclaim_threshold_ms;
243 static int swap_compression_ratio;
244 static int lowmem_min_oom_score;
245 static struct psi_threshold psi_thresholds[VMPRESS_LEVEL_COUNT] = {
246     { PSI_SOME, 70 },    /* 70ms out of 1sec for partial stall */
247     { PSI_SOME, 100 },   /* 100ms out of 1sec for partial stall */
248     { PSI_FULL, 70 },    /* 70ms out of 1sec for complete stall */
249 };
250 
251 static uint64_t mp_event_count;
252 
253 static android_log_context ctx;
254 static Reaper reaper;
255 static int reaper_comm_fd[2];
256 
257 enum polling_update {
258     POLLING_DO_NOT_CHANGE,
259     POLLING_START,
260     POLLING_PAUSE,
261     POLLING_RESUME,
262 };
263 
264 /*
265  * Data used for periodic polling for the memory state of the device.
266  * Note that when the system is not polling, poll_handler is set to NULL;
267  * when polling starts, poll_handler gets set, and it is reset back to
268  * NULL when polling stops.
269  */
270 struct polling_params {
271     struct event_handler_info* poll_handler;
272     struct event_handler_info* paused_handler;
273     struct timespec poll_start_tm;
274     struct timespec last_poll_tm;
275     int polling_interval_ms;
276     enum polling_update update;
277 };
278 
279 /* data required to handle events */
280 struct event_handler_info {
281     int data;
282     void (*handler)(int data, uint32_t events, struct polling_params *poll_params);
283 };
284 
285 /* data required to handle socket events */
286 struct sock_event_handler_info {
287     int sock;
288     pid_t pid;
289     uint32_t async_event_mask;
290     struct event_handler_info handler_info;
291 };
292 
293 /* max supported number of data connections (AMS, init, tests) */
294 #define MAX_DATA_CONN 3
295 
296 /* socket event handler data */
297 static struct sock_event_handler_info ctrl_sock;
298 static struct sock_event_handler_info data_sock[MAX_DATA_CONN];
299 
300 /* vmpressure event handler data */
301 static struct event_handler_info vmpressure_hinfo[VMPRESS_LEVEL_COUNT];
302 
303 /*
304  * 1 ctrl listen socket, 3 ctrl data sockets, 3 memory pressure levels,
305  * 1 lmk event fd + 1 fd to wait for process death + 1 fd to receive kill failure notifications
306  * + 1 fd to receive memevent_listener notifications
307  */
308 #define MAX_EPOLL_EVENTS (1 + MAX_DATA_CONN + VMPRESS_LEVEL_COUNT + 1 + 1 + 1 + 1)
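/* With MAX_DATA_CONN = 3 and VMPRESS_LEVEL_COUNT = 3 the sum above evaluates to 11 events. */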
309 static int epollfd;
310 static int maxevents;
311 
312 /* OOM score values used by both kernel and framework */
313 #define OOM_SCORE_ADJ_MIN       (-1000)
314 #define OOM_SCORE_ADJ_MAX       1000
315 
316 static std::array<int, MAX_TARGETS> lowmem_adj;
317 static std::array<int, MAX_TARGETS> lowmem_minfree;
318 static int lowmem_targets_size;
319 
320 /* Fields to parse in /proc/zoneinfo */
321 /* zoneinfo per-zone fields */
322 enum zoneinfo_zone_field {
323     ZI_ZONE_NR_FREE_PAGES = 0,
324     ZI_ZONE_MIN,
325     ZI_ZONE_LOW,
326     ZI_ZONE_HIGH,
327     ZI_ZONE_PRESENT,
328     ZI_ZONE_NR_FREE_CMA,
329     ZI_ZONE_FIELD_COUNT
330 };
331 
332 static const char* const zoneinfo_zone_field_names[ZI_ZONE_FIELD_COUNT] = {
333     "nr_free_pages",
334     "min",
335     "low",
336     "high",
337     "present",
338     "nr_free_cma",
339 };
340 
341 /* zoneinfo per-zone special fields */
342 enum zoneinfo_zone_spec_field {
343     ZI_ZONE_SPEC_PROTECTION = 0,
344     ZI_ZONE_SPEC_PAGESETS,
345     ZI_ZONE_SPEC_FIELD_COUNT,
346 };
347 
348 static const char* const zoneinfo_zone_spec_field_names[ZI_ZONE_SPEC_FIELD_COUNT] = {
349     "protection:",
350     "pagesets",
351 };
352 
353 /* see __MAX_NR_ZONES definition in kernel mmzone.h */
354 #define MAX_NR_ZONES 6
355 
356 union zoneinfo_zone_fields {
357     struct {
358         int64_t nr_free_pages;
359         int64_t min;
360         int64_t low;
361         int64_t high;
362         int64_t present;
363         int64_t nr_free_cma;
364     } field;
365     int64_t arr[ZI_ZONE_FIELD_COUNT];
366 };
367 
368 struct zoneinfo_zone {
369     union zoneinfo_zone_fields fields;
370     int64_t protection[MAX_NR_ZONES];
371     int64_t max_protection;
372 };
373 
374 /* zoneinfo per-node fields */
375 enum zoneinfo_node_field {
376     ZI_NODE_NR_INACTIVE_FILE = 0,
377     ZI_NODE_NR_ACTIVE_FILE,
378     ZI_NODE_FIELD_COUNT
379 };
380 
381 static const char* const zoneinfo_node_field_names[ZI_NODE_FIELD_COUNT] = {
382     "nr_inactive_file",
383     "nr_active_file",
384 };
385 
386 union zoneinfo_node_fields {
387     struct {
388         int64_t nr_inactive_file;
389         int64_t nr_active_file;
390     } field;
391     int64_t arr[ZI_NODE_FIELD_COUNT];
392 };
393 
394 struct zoneinfo_node {
395     int id;
396     int zone_count;
397     struct zoneinfo_zone zones[MAX_NR_ZONES];
398     union zoneinfo_node_fields fields;
399 };
400 
401 /* for now two memory nodes is more than enough */
402 #define MAX_NR_NODES 2
403 
404 struct zoneinfo {
405     int node_count;
406     struct zoneinfo_node nodes[MAX_NR_NODES];
407     int64_t totalreserve_pages;
408     int64_t total_inactive_file;
409     int64_t total_active_file;
410 };
411 
412 /* Fields to parse in /proc/meminfo */
413 enum meminfo_field {
414     MI_NR_FREE_PAGES = 0,
415     MI_CACHED,
416     MI_SWAP_CACHED,
417     MI_BUFFERS,
418     MI_SHMEM,
419     MI_UNEVICTABLE,
420     MI_TOTAL_SWAP,
421     MI_FREE_SWAP,
422     MI_ACTIVE_ANON,
423     MI_INACTIVE_ANON,
424     MI_ACTIVE_FILE,
425     MI_INACTIVE_FILE,
426     MI_SRECLAIMABLE,
427     MI_SUNRECLAIM,
428     MI_KERNEL_STACK,
429     MI_PAGE_TABLES,
430     MI_ION_HELP,
431     MI_ION_HELP_POOL,
432     MI_CMA_FREE,
433     MI_FIELD_COUNT
434 };
435 
436 static const char* const meminfo_field_names[MI_FIELD_COUNT] = {
437     "MemFree:",
438     "Cached:",
439     "SwapCached:",
440     "Buffers:",
441     "Shmem:",
442     "Unevictable:",
443     "SwapTotal:",
444     "SwapFree:",
445     "Active(anon):",
446     "Inactive(anon):",
447     "Active(file):",
448     "Inactive(file):",
449     "SReclaimable:",
450     "SUnreclaim:",
451     "KernelStack:",
452     "PageTables:",
453     "ION_heap:",
454     "ION_heap_pool:",
455     "CmaFree:",
456 };
457 
458 union meminfo {
459     struct {
460         int64_t nr_free_pages;
461         int64_t cached;
462         int64_t swap_cached;
463         int64_t buffers;
464         int64_t shmem;
465         int64_t unevictable;
466         int64_t total_swap;
467         int64_t free_swap;
468         int64_t active_anon;
469         int64_t inactive_anon;
470         int64_t active_file;
471         int64_t inactive_file;
472         int64_t sreclaimable;
473         int64_t sunreclaimable;
474         int64_t kernel_stack;
475         int64_t page_tables;
476         int64_t ion_heap;
477         int64_t ion_heap_pool;
478         int64_t cma_free;
479         /* fields below are calculated rather than read from the file */
480         int64_t nr_file_pages;
481         int64_t total_gpu_kb;
482         int64_t easy_available;
483     } field;
484     int64_t arr[MI_FIELD_COUNT];
485 };
486 
487 /* Fields to parse in /proc/vmstat */
488 enum vmstat_field {
489     VS_FREE_PAGES,
490     VS_INACTIVE_FILE,
491     VS_ACTIVE_FILE,
492     VS_WORKINGSET_REFAULT,
493     VS_WORKINGSET_REFAULT_FILE,
494     VS_PGSCAN_KSWAPD,
495     VS_PGSCAN_DIRECT,
496     VS_PGSCAN_DIRECT_THROTTLE,
497     VS_PGREFILL,
498     VS_FIELD_COUNT
499 };
500 
501 static const char* const vmstat_field_names[VS_FIELD_COUNT] = {
502     "nr_free_pages",
503     "nr_inactive_file",
504     "nr_active_file",
505     "workingset_refault",
506     "workingset_refault_file",
507     "pgscan_kswapd",
508     "pgscan_direct",
509     "pgscan_direct_throttle",
510     "pgrefill",
511 };
512 
513 union vmstat {
514     struct {
515         int64_t nr_free_pages;
516         int64_t nr_inactive_file;
517         int64_t nr_active_file;
518         int64_t workingset_refault;
519         int64_t workingset_refault_file;
520         int64_t pgscan_kswapd;
521         int64_t pgscan_direct;
522         int64_t pgscan_direct_throttle;
523         int64_t pgrefill;
524     } field;
525     int64_t arr[VS_FIELD_COUNT];
526 };
527 
528 enum field_match_result {
529     NO_MATCH,
530     PARSE_FAIL,
531     PARSE_SUCCESS
532 };
533 
534 struct adjslot_list {
535     struct adjslot_list *next;
536     struct adjslot_list *prev;
537 };
538 
539 struct proc {
540     struct adjslot_list asl;
541     int pid;
542     int pidfd;
543     uid_t uid;
544     int oomadj;
545     pid_t reg_pid; /* PID of the process that registered this record */
546     bool valid;
547     struct proc *pidhash_next;
548 };
549 
550 struct reread_data {
551     const char* const filename;
552     int fd;
553 };
554 
555 #define PIDHASH_SZ 1024
556 static struct proc *pidhash[PIDHASH_SZ];
557 #define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))
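/*
 * Worked example: for pid 0x1234, ((0x1234 >> 8) ^ 0x1234) = 0x1226 and
 * 0x1226 & (PIDHASH_SZ - 1) = 0x226, so the record lands in bucket 550.
 */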
558 
559 #define ADJTOSLOT(adj) ((adj) + -OOM_SCORE_ADJ_MIN)
560 #define ADJTOSLOT_COUNT (ADJTOSLOT(OOM_SCORE_ADJ_MAX) + 1)
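/*
 * ADJTOSLOT maps oom_score_adj values [-1000..1000] onto slots [0..2000],
 * so ADJTOSLOT_COUNT evaluates to 2001.
 */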
561 
562 // protects procadjslot_list from concurrent access
563 static std::shared_mutex adjslot_list_lock;
564 // procadjslot_list should be modified only from the main thread while exclusively holding
565 // adjslot_list_lock. Readers from non-main threads should hold adjslot_list_lock shared lock.
566 static struct adjslot_list procadjslot_list[ADJTOSLOT_COUNT];
567 
568 #define MAX_DISTINCT_OOM_ADJ 32
569 #define KILLCNT_INVALID_IDX 0xFF
570 /*
571  * Because the killcnt array is sparse, a two-level indirection is used
572  * to keep the size small. killcnt_idx stores index of the element in
573  * killcnt array. Index KILLCNT_INVALID_IDX indicates an unused slot.
574  */
575 static uint8_t killcnt_idx[ADJTOSLOT_COUNT];
576 static uint16_t killcnt[MAX_DISTINCT_OOM_ADJ];
577 static int killcnt_free_idx = 0;
578 static uint32_t killcnt_total = 0;
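/*
 * Example flow (assuming killcnt_idx entries start out as KILLCNT_INVALID_IDX):
 * the first kill at oomadj 900 sets killcnt_idx[ADJTOSLOT(900)] = 0 and killcnt[0] = 1;
 * later kills at that level only increment killcnt[0], while killcnt_total counts them all.
 */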
579 
580 static int pagesize;
581 static long page_k; /* page size in kB */
582 
583 static bool update_props();
584 static bool init_monitors();
585 static void destroy_monitors();
586 static bool init_memevent_listener_monitoring();
587 
588 static int clamp(int low, int high, int value) {
589     return std::max(std::min(value, high), low);
590 }
591 
592 static bool parse_int64(const char* str, int64_t* ret) {
593     char* endptr;
594     long long val = strtoll(str, &endptr, 10);
595     if (str == endptr || val > INT64_MAX) {
596         return false;
597     }
598     *ret = (int64_t)val;
599     return true;
600 }
601 
602 static int find_field(const char* name, const char* const field_names[], int field_count) {
603     for (int i = 0; i < field_count; i++) {
604         if (!strcmp(name, field_names[i])) {
605             return i;
606         }
607     }
608     return -1;
609 }
610 
611 static enum field_match_result match_field(const char* cp, const char* ap,
612                                    const char* const field_names[],
613                                    int field_count, int64_t* field,
614                                    int *field_idx) {
615     int i = find_field(cp, field_names, field_count);
616     if (i < 0) {
617         return NO_MATCH;
618     }
619     *field_idx = i;
620     return parse_int64(ap, field) ? PARSE_SUCCESS : PARSE_FAIL;
621 }
622 
623 /*
624  * Read file content from the beginning up to max_len bytes or EOF
625  * whichever happens first.
626  */
627 static ssize_t read_all(int fd, char *buf, size_t max_len)
628 {
629     ssize_t ret = 0;
630     off_t offset = 0;
631 
632     while (max_len > 0) {
633         ssize_t r = TEMP_FAILURE_RETRY(pread(fd, buf, max_len, offset));
634         if (r == 0) {
635             break;
636         }
637         if (r == -1) {
638             return -1;
639         }
640         ret += r;
641         buf += r;
642         offset += r;
643         max_len -= r;
644     }
645 
646     return ret;
647 }
648 
649 /*
650  * Read a new or already opened file from the beginning.
651  * If the file has not been opened yet data->fd should be set to -1.
652  * To be used with files which are read often, possibly during high
653  * memory pressure, to minimize file opening, which by itself requires kernel
654  * memory allocation and might result in a stall on a memory-stressed system.
655  */
656 static char *reread_file(struct reread_data *data) {
657     /* start with page-size buffer and increase if needed */
658     static ssize_t buf_size = pagesize;
659     static char *new_buf, *buf = NULL;
660     ssize_t size;
661 
662     if (data->fd == -1) {
663         /* First-time buffer initialization */
664         if (!buf && (buf = static_cast<char*>(malloc(buf_size))) == nullptr) {
665             return NULL;
666         }
667 
668         data->fd = TEMP_FAILURE_RETRY(open(data->filename, O_RDONLY | O_CLOEXEC));
669         if (data->fd < 0) {
670             ALOGE("%s open: %s", data->filename, strerror(errno));
671             return NULL;
672         }
673     }
674 
675     while (true) {
676         size = read_all(data->fd, buf, buf_size - 1);
677         if (size < 0) {
678             ALOGE("%s read: %s", data->filename, strerror(errno));
679             close(data->fd);
680             data->fd = -1;
681             return NULL;
682         }
683         if (size < buf_size - 1) {
684             break;
685         }
686         /*
687          * Since we are reading /proc files we can't use fstat to find out
688          * the real size of the file. Double the buffer size and keep retrying.
689          */
690         if ((new_buf = static_cast<char*>(realloc(buf, buf_size * 2))) == nullptr) {
691             errno = ENOMEM;
692             return NULL;
693         }
694         buf = new_buf;
695         buf_size *= 2;
696     }
697     buf[size] = 0;
698 
699     return buf;
700 }
701 
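/*
 * Illustrative usage sketch (not a real call site): a caller keeps the struct
 * static so the cached fd survives between calls, e.g.
 *   static struct reread_data file_data = { .filename = MEMINFO_PATH, .fd = -1 };
 *   char *content = reread_file(&file_data);
 */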
702 static bool claim_record(struct proc* procp, pid_t pid) {
703     if (procp->reg_pid == pid) {
704         /* Record already belongs to the registrant */
705         return true;
706     }
707     if (procp->reg_pid == 0) {
708         /* Old registrant is gone, claim the record */
709         procp->reg_pid = pid;
710         return true;
711     }
712     /* The record is owned by another registrant */
713     return false;
714 }
715 
716 static void remove_claims(pid_t pid) {
717     int i;
718 
719     for (i = 0; i < PIDHASH_SZ; i++) {
720         struct proc* procp = pidhash[i];
721         while (procp) {
722             if (procp->reg_pid == pid) {
723                 procp->reg_pid = 0;
724             }
725             procp = procp->pidhash_next;
726         }
727     }
728 }
729 
730 static void ctrl_data_close(int dsock_idx) {
731     struct epoll_event epev;
732 
733     ALOGI("closing lmkd data connection");
734     if (epoll_ctl(epollfd, EPOLL_CTL_DEL, data_sock[dsock_idx].sock, &epev) == -1) {
735         // Log a warning and keep going
736         ALOGW("epoll_ctl for data connection socket failed; errno=%d", errno);
737     }
738     maxevents--;
739 
740     close(data_sock[dsock_idx].sock);
741     data_sock[dsock_idx].sock = -1;
742 
743     /* Mark all records of the old registrant as unclaimed */
744     remove_claims(data_sock[dsock_idx].pid);
745 }
746 
747 static ssize_t ctrl_data_read(int dsock_idx, char* buf, size_t bufsz, struct ucred* sender_cred) {
748     struct iovec iov = {buf, bufsz};
749     char control[CMSG_SPACE(sizeof(struct ucred))];
750     struct msghdr hdr = {
751             NULL, 0, &iov, 1, control, sizeof(control), 0,
752     };
753     ssize_t ret;
754     ret = TEMP_FAILURE_RETRY(recvmsg(data_sock[dsock_idx].sock, &hdr, 0));
755     if (ret == -1) {
756         ALOGE("control data socket read failed; %s", strerror(errno));
757         return -1;
758     }
759     if (ret == 0) {
760         ALOGE("Got EOF on control data socket");
761         return -1;
762     }
763 
764     struct ucred* cred = NULL;
765     struct cmsghdr* cmsg = CMSG_FIRSTHDR(&hdr);
766     while (cmsg != NULL) {
767         if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_CREDENTIALS) {
768             cred = (struct ucred*)CMSG_DATA(cmsg);
769             break;
770         }
771         cmsg = CMSG_NXTHDR(&hdr, cmsg);
772     }
773 
774     if (cred == NULL) {
775         ALOGE("Failed to retrieve sender credentials");
776         /* Close the connection */
777         ctrl_data_close(dsock_idx);
778         return -1;
779     }
780 
781     memcpy(sender_cred, cred, sizeof(struct ucred));
782 
783     /* Store PID of the peer */
784     data_sock[dsock_idx].pid = cred->pid;
785 
786     return ret;
787 }
788 
789 static int ctrl_data_write(int dsock_idx, char* buf, size_t bufsz) {
790     int ret = 0;
791 
792     ret = TEMP_FAILURE_RETRY(write(data_sock[dsock_idx].sock, buf, bufsz));
793 
794     if (ret == -1) {
795         ALOGE("control data socket write failed; errno=%d", errno);
796     } else if (ret == 0) {
797         ALOGE("Got EOF on control data socket");
798         ret = -1;
799     }
800 
801     return ret;
802 }
803 
804 /*
805  * Write the pid/uid pair over the data socket. Note: all active clients
806  * will receive this unsolicited notification.
807  */
808 static void ctrl_data_write_lmk_kill_occurred(pid_t pid, uid_t uid, int64_t rss_kb) {
809     LMKD_CTRL_PACKET packet;
810     size_t len = lmkd_pack_set_prockills(packet, pid, uid, static_cast<int>(rss_kb));
811 
812     for (int i = 0; i < MAX_DATA_CONN; i++) {
813         if (data_sock[i].sock >= 0 && data_sock[i].async_event_mask & 1 << LMK_ASYNC_EVENT_KILL) {
814             ctrl_data_write(i, (char*)packet, len);
815         }
816     }
817 }
818 
819 /*
820  * Write the kill_stat/memory_stat over the data socket to be propagated via AMS to statsd
821  */
822 static void stats_write_lmk_kill_occurred(struct kill_stat *kill_st,
823                                           struct memory_stat *mem_st) {
824     LMK_KILL_OCCURRED_PACKET packet;
825     const size_t len = lmkd_pack_set_kill_occurred(packet, kill_st, mem_st);
826     if (len == 0) {
827         return;
828     }
829 
830     for (int i = 0; i < MAX_DATA_CONN; i++) {
831         if (data_sock[i].sock >= 0 && data_sock[i].async_event_mask & 1 << LMK_ASYNC_EVENT_STAT) {
832             ctrl_data_write(i, packet, len);
833         }
834     }
835 
836 }
837 
838 static void stats_write_lmk_kill_occurred_pid(int pid, struct kill_stat *kill_st,
839                                               struct memory_stat *mem_st) {
840     kill_st->taskname = stats_get_task_name(pid);
841     if (kill_st->taskname != NULL) {
842         stats_write_lmk_kill_occurred(kill_st, mem_st);
843     }
844 }
845 
846 static void poll_kernel(int poll_fd) {
847     if (poll_fd == -1) {
848         // not waiting
849         return;
850     }
851 
852     while (1) {
853         char rd_buf[256];
854         int bytes_read = TEMP_FAILURE_RETRY(pread(poll_fd, (void*)rd_buf, sizeof(rd_buf) - 1, 0));
855         if (bytes_read <= 0) break;
856         rd_buf[bytes_read] = '\0';
857 
858         int64_t pid;
859         int64_t uid;
860         int64_t group_leader_pid;
861         int64_t rss_in_pages;
862         struct memory_stat mem_st = {};
863         int16_t oom_score_adj;
864         int16_t min_score_adj;
865         int64_t starttime;
866         char* taskname = 0;
867         int64_t rss_kb;
868 
869         int fields_read =
870                 sscanf(rd_buf,
871                        "%" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64
872                        " %" SCNd16 " %" SCNd16 " %" SCNd64 "\n%m[^\n]",
873                        &pid, &uid, &group_leader_pid, &mem_st.pgfault, &mem_st.pgmajfault,
874                        &rss_in_pages, &oom_score_adj, &min_score_adj, &starttime, &taskname);
875 
876         /* only the death of the group leader process is logged */
877         if (fields_read == 10 && group_leader_pid == pid) {
878             mem_st.rss_in_bytes = rss_in_pages * pagesize;
879             rss_kb = mem_st.rss_in_bytes >> 10;
880             ctrl_data_write_lmk_kill_occurred((pid_t)pid, (uid_t)uid, rss_kb);
881             mem_st.process_start_time_ns = starttime * (NS_PER_SEC / sysconf(_SC_CLK_TCK));
882 
883             struct kill_stat kill_st = {
884                 .uid = static_cast<int32_t>(uid),
885                 .kill_reason = NONE,
886                 .oom_score = oom_score_adj,
887                 .min_oom_score = min_score_adj,
888                 .free_mem_kb = 0,
889                 .free_swap_kb = 0,
890             };
891             stats_write_lmk_kill_occurred_pid(pid, &kill_st, &mem_st);
892         }
893 
894         free(taskname);
895     }
896 }
897 
898 static bool init_poll_kernel() {
899     kpoll_fd = TEMP_FAILURE_RETRY(open("/proc/lowmemorykiller", O_RDONLY | O_NONBLOCK | O_CLOEXEC));
900 
901     if (kpoll_fd < 0) {
902         ALOGE("kernel lmk event file could not be opened; errno=%d", errno);
903         return false;
904     }
905 
906     return true;
907 }
908 
909 static struct proc *pid_lookup(int pid) {
910     struct proc *procp;
911 
912     for (procp = pidhash[pid_hashfn(pid)]; procp && procp->pid != pid;
913          procp = procp->pidhash_next)
914             ;
915 
916     return procp;
917 }
918 
919 static void adjslot_insert(struct adjslot_list *head, struct adjslot_list *new_element)
920 {
921     struct adjslot_list *next = head->next;
922     new_element->prev = head;
923     new_element->next = next;
924     next->prev = new_element;
925     head->next = new_element;
926 }
927 
928 static void adjslot_remove(struct adjslot_list *old)
929 {
930     struct adjslot_list *prev = old->prev;
931     struct adjslot_list *next = old->next;
932     next->prev = prev;
933     prev->next = next;
934 }
935 
936 static struct adjslot_list *adjslot_tail(struct adjslot_list *head) {
937     struct adjslot_list *asl = head->prev;
938 
939     return asl == head ? NULL : asl;
940 }
941 
942 // Should be modified only from the main thread.
943 static void proc_slot(struct proc *procp) {
944     int adjslot = ADJTOSLOT(procp->oomadj);
945     std::scoped_lock lock(adjslot_list_lock);
946 
947     adjslot_insert(&procadjslot_list[adjslot], &procp->asl);
948 }
949 
950 // Should be modified only from the main thread.
951 static void proc_unslot(struct proc *procp) {
952     std::scoped_lock lock(adjslot_list_lock);
953 
954     adjslot_remove(&procp->asl);
955 }
956 
957 static void proc_insert(struct proc *procp) {
958     int hval = pid_hashfn(procp->pid);
959 
960     procp->pidhash_next = pidhash[hval];
961     pidhash[hval] = procp;
962     proc_slot(procp);
963 }
964 
965 // Can be called only from the main thread.
966 static int pid_remove(int pid) {
967     int hval = pid_hashfn(pid);
968     struct proc *procp;
969     struct proc *prevp;
970 
971     for (procp = pidhash[hval], prevp = NULL; procp && procp->pid != pid;
972          procp = procp->pidhash_next)
973             prevp = procp;
974 
975     if (!procp)
976         return -1;
977 
978     if (!prevp)
979         pidhash[hval] = procp->pidhash_next;
980     else
981         prevp->pidhash_next = procp->pidhash_next;
982 
983     proc_unslot(procp);
984     /*
985      * Close pidfd here if we are not waiting for corresponding process to die,
986      * in which case stop_wait_for_proc_kill() will close the pidfd later
987      */
988     if (procp->pidfd >= 0 && procp->pidfd != last_kill_pid_or_fd) {
989         close(procp->pidfd);
990     }
991     free(procp);
992     return 0;
993 }
994 
995 static void pid_invalidate(int pid) {
996     std::shared_lock lock(adjslot_list_lock);
997     struct proc *procp = pid_lookup(pid);
998 
999     if (procp) {
1000         procp->valid = false;
1001     }
1002 }
1003 
1004 /*
1005  * Write a string to a file.
1006  * Returns false if the file does not exist.
1007  */
1008 static bool writefilestring(const char *path, const char *s,
1009                             bool err_if_missing) {
1010     int fd = open(path, O_WRONLY | O_CLOEXEC);
1011     ssize_t len = strlen(s);
1012     ssize_t ret;
1013 
1014     if (fd < 0) {
1015         if (err_if_missing) {
1016             ALOGE("Error opening %s; errno=%d", path, errno);
1017         }
1018         return false;
1019     }
1020 
1021     ret = TEMP_FAILURE_RETRY(write(fd, s, len));
1022     if (ret < 0) {
1023         ALOGE("Error writing %s; errno=%d", path, errno);
1024     } else if (ret < len) {
1025         ALOGE("Short write on %s; length=%zd", path, ret);
1026     }
1027 
1028     close(fd);
1029     return true;
1030 }
1031 
1032 static inline long get_time_diff_ms(struct timespec *from,
1033                                     struct timespec *to) {
1034     return (to->tv_sec - from->tv_sec) * (long)MS_PER_SEC +
1035            (to->tv_nsec - from->tv_nsec) / (long)NS_PER_MS;
1036 }
1037 
1038 /* Reads /proc/pid/status into buf. */
1039 static bool read_proc_status(int pid, char *buf, size_t buf_sz) {
1040     char path[PROCFS_PATH_MAX];
1041     int fd;
1042     ssize_t size;
1043 
1044     snprintf(path, PROCFS_PATH_MAX, "/proc/%d/status", pid);
1045     fd = open(path, O_RDONLY | O_CLOEXEC);
1046     if (fd < 0) {
1047         return false;
1048     }
1049 
1050     size = read_all(fd, buf, buf_sz - 1);
1051     close(fd);
1052     if (size <= 0) {
1053         return false;
1054     }
1055     buf[size] = 0;
1056     return true;
1057 }
1058 
1059 /* Looks for tag in buf and parses the first integer */
1060 static bool parse_status_tag(char *buf, const char *tag, int64_t *out) {
1061     char *pos = buf;
1062     while (true) {
1063         pos = strstr(pos, tag);
1064         /* Stop if tag not found or found at the line beginning */
1065         if (pos == NULL || pos == buf || pos[-1] == '\n') {
1066             break;
1067         }
1068         pos++;
1069     }
1070 
1071     if (pos == NULL) {
1072         return false;
1073     }
1074 
1075     pos += strlen(tag);
1076     while (*pos == ' ') ++pos;
1077     return parse_int64(pos, out);
1078 }
1079 
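/*
 * Sketch of how the two helpers above combine (illustrative, not a real call site):
 *   char buf[4096];
 *   int64_t rss_kb;
 *   if (read_proc_status(pid, buf, sizeof(buf)) &&
 *       parse_status_tag(buf, PROC_STATUS_RSS_FIELD, &rss_kb)) {
 *       // rss_kb now holds the VmRSS value from /proc/<pid>/status, in kB
 *   }
 */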
1080 static int proc_get_size(int pid) {
1081     char path[PROCFS_PATH_MAX];
1082     char line[LINE_MAX];
1083     int fd;
1084     int rss = 0;
1085     int total;
1086     ssize_t ret;
1087 
1088     /* gid containing AID_READPROC required */
1089     snprintf(path, PROCFS_PATH_MAX, "/proc/%d/statm", pid);
1090     fd = open(path, O_RDONLY | O_CLOEXEC);
1091     if (fd == -1)
1092         return -1;
1093 
1094     ret = read_all(fd, line, sizeof(line) - 1);
1095     if (ret < 0) {
1096         close(fd);
1097         return -1;
1098     }
1099     line[ret] = '\0';
1100 
1101     sscanf(line, "%d %d ", &total, &rss);
1102     close(fd);
1103     return rss;
1104 }
1105 
1106 static char *proc_get_name(int pid, char *buf, size_t buf_size) {
1107     char path[PROCFS_PATH_MAX];
1108     int fd;
1109     char *cp;
1110     ssize_t ret;
1111 
1112     /* gid containing AID_READPROC required */
1113     snprintf(path, PROCFS_PATH_MAX, "/proc/%d/cmdline", pid);
1114     fd = open(path, O_RDONLY | O_CLOEXEC);
1115     if (fd == -1) {
1116         return NULL;
1117     }
1118     ret = read_all(fd, buf, buf_size - 1);
1119     close(fd);
1120     if (ret <= 0) {
1121         return NULL;
1122     }
1123     buf[ret] = '\0';
1124 
1125     cp = strchr(buf, ' ');
1126     if (cp) {
1127         *cp = '\0';
1128     }
1129 
1130     return buf;
1131 }
1132 
1133 static void register_oom_adj_proc(const struct lmk_procprio& proc, struct ucred* cred) {
1134     char val[20];
1135     int soft_limit_mult;
1136     bool is_system_server;
1137     struct passwd *pwdrec;
1138     struct proc* procp;
1139     int oom_adj_score = proc.oomadj;
1140 
1141     /* lmkd should not change soft limits for services */
1142     if (proc.ptype == PROC_TYPE_APP && per_app_memcg) {
1143         if (proc.oomadj >= 900) {
1144             soft_limit_mult = 0;
1145         } else if (proc.oomadj >= 800) {
1146             soft_limit_mult = 0;
1147         } else if (proc.oomadj >= 700) {
1148             soft_limit_mult = 0;
1149         } else if (proc.oomadj >= 600) {
1150             // Launcher should be perceptible, don't kill it.
1151             oom_adj_score = 200;
1152             soft_limit_mult = 1;
1153         } else if (proc.oomadj >= 500) {
1154             soft_limit_mult = 0;
1155         } else if (proc.oomadj >= 400) {
1156             soft_limit_mult = 0;
1157         } else if (proc.oomadj >= 300) {
1158             soft_limit_mult = 1;
1159         } else if (proc.oomadj >= 200) {
1160             soft_limit_mult = 8;
1161         } else if (proc.oomadj >= 100) {
1162             soft_limit_mult = 10;
1163         } else if (proc.oomadj >= 0) {
1164             soft_limit_mult = 20;
1165         } else {
1166             // Persistent processes will have a large
1167             // soft limit 512MB.
1168             // soft limit of 512MB.
1169         }
1170 
1171         std::string soft_limit_path;
1172         if (!CgroupGetAttributePathForTask("MemSoftLimit", proc.pid, &soft_limit_path)) {
1173             ALOGE("Querying MemSoftLimit path failed");
1174             return;
1175         }
1176 
1177         snprintf(val, sizeof(val), "%d", soft_limit_mult * EIGHT_MEGA);
1178 
1179         /*
1180          * system_server process has no memcg under /dev/memcg/apps but should be
1181          * registered with lmkd. This is the best way so far to identify it.
1182          */
1183         is_system_server = (oom_adj_score == SYSTEM_ADJ && (pwdrec = getpwnam("system")) != NULL &&
1184                             proc.uid == pwdrec->pw_uid);
1185         writefilestring(soft_limit_path.c_str(), val, !is_system_server);
1186     }
1187 
1188     procp = pid_lookup(proc.pid);
1189     if (!procp) {
1190         int pidfd = -1;
1191 
1192         if (pidfd_supported) {
1193             pidfd = TEMP_FAILURE_RETRY(pidfd_open(proc.pid, 0));
1194             if (pidfd < 0) {
1195                 ALOGE("pidfd_open for pid %d failed; errno=%d", proc.pid, errno);
1196                 return;
1197             }
1198         }
1199 
1200         procp = static_cast<struct proc*>(calloc(1, sizeof(struct proc)));
1201         if (!procp) {
1202             // Oh, the irony.  May need to rebuild our state.
1203             return;
1204         }
1205 
1206         procp->pid = proc.pid;
1207         procp->pidfd = pidfd;
1208         procp->uid = proc.uid;
1209         procp->reg_pid = cred->pid;
1210         procp->oomadj = oom_adj_score;
1211         procp->valid = true;
1212         proc_insert(procp);
1213     } else {
1214         if (!claim_record(procp, cred->pid)) {
1215             char buf[LINE_MAX];
1216             char *taskname = proc_get_name(cred->pid, buf, sizeof(buf));
1217             /* Only the registrant of the record can modify it */
1218             ALOGE("%s (%d, %d) attempts to modify a process registered by another client",
1219                 taskname ? taskname : "A process ", cred->uid, cred->pid);
1220             return;
1221         }
1222         proc_unslot(procp);
1223         procp->oomadj = oom_adj_score;
1224         proc_slot(procp);
1225     }
1226 }
1227 
1228 static void apply_proc_prio(const struct lmk_procprio& params, struct ucred* cred) {
1229     char path[PROCFS_PATH_MAX];
1230     char val[20];
1231     int64_t tgid;
1232     char buf[pagesize];
1233 
1234     if (params.oomadj < OOM_SCORE_ADJ_MIN || params.oomadj > OOM_SCORE_ADJ_MAX) {
1235         ALOGE("Invalid PROCPRIO oomadj argument %d", params.oomadj);
1236         return;
1237     }
1238 
1239     if (params.ptype < PROC_TYPE_FIRST || params.ptype >= PROC_TYPE_COUNT) {
1240         ALOGE("Invalid PROCPRIO process type argument %d", params.ptype);
1241         return;
1242     }
1243 
1244     /* Check if registered process is a thread group leader */
1245     if (read_proc_status(params.pid, buf, sizeof(buf))) {
1246         if (parse_status_tag(buf, PROC_STATUS_TGID_FIELD, &tgid) && tgid != params.pid) {
1247             ALOGE("Attempt to register a task that is not a thread group leader "
1248                   "(tid %d, tgid %" PRId64 ")",
1249                   params.pid, tgid);
1250             return;
1251         }
1252     }
1253 
1254     /* gid containing AID_READPROC required */
1255     /* CAP_SYS_RESOURCE required */
1256     /* CAP_DAC_OVERRIDE required */
1257     snprintf(path, sizeof(path), "/proc/%d/oom_score_adj", params.pid);
1258     snprintf(val, sizeof(val), "%d", params.oomadj);
1259     if (!writefilestring(path, val, false)) {
1260         ALOGW("Failed to open %s; errno=%d: process %d might have been killed", path, errno,
1261               params.pid);
1262         /* If this file does not exist the process is dead. */
1263         return;
1264     }
1265 
1266     if (use_inkernel_interface) {
1267         stats_store_taskname(params.pid, proc_get_name(params.pid, path, sizeof(path)));
1268         return;
1269     }
1270 
1271     register_oom_adj_proc(params, cred);
1272 }
1273 
1274 static void cmd_procprio(LMKD_CTRL_PACKET packet, int field_count, struct ucred* cred) {
1275     struct lmk_procprio proc_prio;
1276 
1277     lmkd_pack_get_procprio(packet, field_count, &proc_prio);
1278     apply_proc_prio(proc_prio, cred);
1279 }
1280 
1281 static void cmd_procremove(LMKD_CTRL_PACKET packet, struct ucred *cred) {
1282     struct lmk_procremove params;
1283     struct proc *procp;
1284 
1285     lmkd_pack_get_procremove(packet, &params);
1286 
1287     if (use_inkernel_interface) {
1288         /*
1289          * Perform an extra check before the pid is removed, after which it
1290          * will be impossible for poll_kernel to get the taskname. poll_kernel()
1291          * is potentially a long-running blocking function; however, this method
1292          * handles AMS requests without blocking AMS.
1293          */
1294         poll_kernel(kpoll_fd);
1295 
1296         stats_remove_taskname(params.pid);
1297         return;
1298     }
1299 
1300     procp = pid_lookup(params.pid);
1301     if (!procp) {
1302         return;
1303     }
1304 
1305     if (!claim_record(procp, cred->pid)) {
1306         char buf[LINE_MAX];
1307         char *taskname = proc_get_name(cred->pid, buf, sizeof(buf));
1308         /* Only registrant of the record can remove it */
1309         ALOGE("%s (%d, %d) attempts to unregister a process registered by another client",
1310             taskname ? taskname : "A process ", cred->uid, cred->pid);
1311         return;
1312     }
1313 
1314     /*
1315      * WARNING: After pid_remove() procp is freed and can't be used!
1316      * Therefore placed at the end of the function.
1317      */
1318     pid_remove(params.pid);
1319 }
1320 
1321 static void cmd_procpurge(struct ucred *cred) {
1322     int i;
1323     struct proc *procp;
1324     struct proc *next;
1325 
1326     if (use_inkernel_interface) {
1327         stats_purge_tasknames();
1328         return;
1329     }
1330 
1331     for (i = 0; i < PIDHASH_SZ; i++) {
1332         procp = pidhash[i];
1333         while (procp) {
1334             next = procp->pidhash_next;
1335             /* Purge only records created by the requestor */
1336             if (claim_record(procp, cred->pid)) {
1337                 pid_remove(procp->pid);
1338             }
1339             procp = next;
1340         }
1341     }
1342 }
1343 
1344 static void cmd_subscribe(int dsock_idx, LMKD_CTRL_PACKET packet) {
1345     struct lmk_subscribe params;
1346 
1347     lmkd_pack_get_subscribe(packet, &params);
1348     data_sock[dsock_idx].async_event_mask |= 1 << params.evt_type;
1349 }
1350 
1351 static void inc_killcnt(int oomadj) {
1352     int slot = ADJTOSLOT(oomadj);
1353     uint8_t idx = killcnt_idx[slot];
1354 
1355     if (idx == KILLCNT_INVALID_IDX) {
1356         /* index is not assigned for this oomadj */
1357         if (killcnt_free_idx < MAX_DISTINCT_OOM_ADJ) {
1358             killcnt_idx[slot] = killcnt_free_idx;
1359             killcnt[killcnt_free_idx] = 1;
1360             killcnt_free_idx++;
1361         } else {
1362             ALOGW("Number of distinct oomadj levels exceeds %d",
1363                 MAX_DISTINCT_OOM_ADJ);
1364         }
1365     } else {
1366         /*
1367          * wraparound is highly unlikely and is detectable using total
1368          * counter because it has to be equal to the sum of all counters
1369          */
1370         killcnt[idx]++;
1371     }
1372     /* increment total kill counter */
1373     killcnt_total++;
1374 }
1375 
1376 static int get_killcnt(int min_oomadj, int max_oomadj) {
1377     int slot;
1378     int count = 0;
1379 
1380     if (min_oomadj > max_oomadj)
1381         return 0;
1382 
1383     /* special case to get total kill count */
1384     if (min_oomadj > OOM_SCORE_ADJ_MAX)
1385         return killcnt_total;
1386 
1387     while (min_oomadj <= max_oomadj &&
1388            (slot = ADJTOSLOT(min_oomadj)) < ADJTOSLOT_COUNT) {
1389         uint8_t idx = killcnt_idx[slot];
1390         if (idx != KILLCNT_INVALID_IDX) {
1391             count += killcnt[idx];
1392         }
1393         min_oomadj++;
1394     }
1395 
1396     return count;
1397 }
1398 
1399 static int cmd_getkillcnt(LMKD_CTRL_PACKET packet) {
1400     struct lmk_getkillcnt params;
1401 
1402     if (use_inkernel_interface) {
1403         /* kernel driver does not expose this information */
1404         return 0;
1405     }
1406 
1407     lmkd_pack_get_getkillcnt(packet, &params);
1408 
1409     return get_killcnt(params.min_oomadj, params.max_oomadj);
1410 }
1411 
1412 static void cmd_target(int ntargets, LMKD_CTRL_PACKET packet) {
1413     int i;
1414     struct lmk_target target;
1415     char minfree_str[PROPERTY_VALUE_MAX];
1416     char *pstr = minfree_str;
1417     char *pend = minfree_str + sizeof(minfree_str);
1418     static struct timespec last_req_tm;
1419     struct timespec curr_tm;
1420 
1421     if (ntargets < 1 || ntargets > (int)lowmem_adj.size()) {
1422         return;
1423     }
1424 
1425     /*
1426      * Ratelimit minfree updates to once per TARGET_UPDATE_MIN_INTERVAL_MS
1427      * to prevent DoS attacks
1428      */
1429     if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
1430         ALOGE("Failed to get current time");
1431         return;
1432     }
1433 
1434     if (get_time_diff_ms(&last_req_tm, &curr_tm) <
1435         TARGET_UPDATE_MIN_INTERVAL_MS) {
1436         ALOGE("Ignoring frequent updates to lmkd limits");
1437         return;
1438     }
1439 
1440     last_req_tm = curr_tm;
1441 
1442     for (i = 0; i < ntargets; i++) {
1443         lmkd_pack_get_target(packet, i, &target);
1444         lowmem_minfree[i] = target.minfree;
1445         lowmem_adj[i] = target.oom_adj_score;
1446 
1447         pstr += snprintf(pstr, pend - pstr, "%d:%d,", target.minfree,
1448             target.oom_adj_score);
1449         if (pstr >= pend) {
1450             /* if no more space in the buffer then terminate the loop */
1451             pstr = pend;
1452             break;
1453         }
1454     }
1455 
1456     lowmem_targets_size = ntargets;
1457 
1458     /* Override the last extra comma */
1459     pstr[-1] = '\0';
1460     property_set("sys.lmk.minfree_levels", minfree_str);
1461 
1462     if (has_inkernel_module) {
1463         char minfreestr[128];
1464         char killpriostr[128];
1465 
1466         minfreestr[0] = '\0';
1467         killpriostr[0] = '\0';
1468 
1469         for (i = 0; i < lowmem_targets_size; i++) {
1470             char val[40];
1471 
1472             if (i) {
1473                 strlcat(minfreestr, ",", sizeof(minfreestr));
1474                 strlcat(killpriostr, ",", sizeof(killpriostr));
1475             }
1476 
1477             snprintf(val, sizeof(val), "%d", use_inkernel_interface ? lowmem_minfree[i] : 0);
1478             strlcat(minfreestr, val, sizeof(minfreestr));
1479             snprintf(val, sizeof(val), "%d", use_inkernel_interface ? lowmem_adj[i] : 0);
1480             strlcat(killpriostr, val, sizeof(killpriostr));
1481         }
1482 
1483         writefilestring(INKERNEL_MINFREE_PATH, minfreestr, true);
1484         writefilestring(INKERNEL_ADJ_PATH, killpriostr, true);
1485     }
1486 }
1487 
1488 static void cmd_procs_prio(LMKD_CTRL_PACKET packet, const int field_count, struct ucred* cred) {
1489     struct lmk_procs_prio params;
1490 
1491     const int procs_count = lmkd_pack_get_procs_prio(packet, &params, field_count);
1492     if (procs_count < 0) {
1493         ALOGE("LMK_PROCS_PRIO received invalid packet format");
1494         return;
1495     }
1496 
1497     for (int i = 0; i < procs_count; i++) {
1498         apply_proc_prio(params.procs[i], cred);
1499     }
1500 }
1501 
1502 static void ctrl_command_handler(int dsock_idx) {
1503     LMKD_CTRL_PACKET packet;
1504     struct ucred cred;
1505     int len;
1506     enum lmk_cmd cmd;
1507     int nargs;
1508     int targets;
1509     int kill_cnt;
1510     int result;
1511 
1512     len = ctrl_data_read(dsock_idx, (char *)packet, CTRL_PACKET_MAX_SIZE, &cred);
1513     if (len <= 0)
1514         return;
1515 
1516     if (len < (int)sizeof(int)) {
1517         ALOGE("Wrong control socket read length len=%d", len);
1518         return;
1519     }
1520 
1521     cmd = lmkd_pack_get_cmd(packet);
1522     nargs = len / sizeof(int) - 1;
1523     if (nargs < 0)
1524         goto wronglen;
1525 
1526     switch(cmd) {
1527     case LMK_TARGET:
1528         targets = nargs / 2;
1529         if (nargs & 0x1 || targets > (int)lowmem_adj.size()) {
1530             goto wronglen;
1531         }
1532         cmd_target(targets, packet);
1533         break;
1534     case LMK_PROCPRIO:
1535         /* process type field is optional for backward compatibility */
1536         if (nargs < 3 || nargs > 4)
1537             goto wronglen;
1538         cmd_procprio(packet, nargs, &cred);
1539         break;
1540     case LMK_PROCREMOVE:
1541         if (nargs != 1)
1542             goto wronglen;
1543         cmd_procremove(packet, &cred);
1544         break;
1545     case LMK_PROCPURGE:
1546         if (nargs != 0)
1547             goto wronglen;
1548         cmd_procpurge(&cred);
1549         break;
1550     case LMK_GETKILLCNT:
1551         if (nargs != 2)
1552             goto wronglen;
1553         kill_cnt = cmd_getkillcnt(packet);
1554         len = lmkd_pack_set_getkillcnt_repl(packet, kill_cnt);
1555         if (ctrl_data_write(dsock_idx, (char *)packet, len) != len)
1556             return;
1557         break;
1558     case LMK_SUBSCRIBE:
1559         if (nargs != 1)
1560             goto wronglen;
1561         cmd_subscribe(dsock_idx, packet);
1562         break;
1563     case LMK_PROCKILL:
1564         /* This command code is NOT expected at all */
1565         ALOGE("Received unexpected command code %d", cmd);
1566         break;
1567     case LMK_UPDATE_PROPS:
1568         if (nargs != 0)
1569             goto wronglen;
1570         result = -1;
1571         if (update_props()) {
1572             if (!use_inkernel_interface && monitors_initialized) {
1573                 /* Reinitialize monitors to apply new settings */
1574                 destroy_monitors();
1575                 if (init_monitors()) {
1576                     result = 0;
1577                 }
1578             } else {
1579                 result = 0;
1580             }
1581 
1582             if (direct_reclaim_threshold_ms > 0 && !memevent_listener) {
1583                 ALOGW("Kernel support for direct_reclaim_threshold_ms is not found");
1584                 direct_reclaim_threshold_ms = 0;
1585             }
1586         }
1587 
1588         len = lmkd_pack_set_update_props_repl(packet, result);
1589         if (ctrl_data_write(dsock_idx, (char *)packet, len) != len) {
1590             ALOGE("Failed to report operation results");
1591         }
1592         if (!result) {
1593             ALOGI("Properties reinitialized");
1594         } else {
1595             /* New settings can't be supported, crash to be restarted */
1596             ALOGE("New configuration is not supported. Exiting...");
1597             exit(1);
1598         }
1599         break;
1600     case LMK_START_MONITORING:
1601         if (nargs != 0)
1602             goto wronglen;
1603         // Registration is needed only if it was skipped earlier.
1604         if (monitors_initialized)
1605             return;
1606         if (!property_get_bool("sys.boot_completed", false)) {
1607             ALOGE("LMK_START_MONITORING cannot be handled before boot completed");
1608             return;
1609         }
1610 
1611         if (!init_monitors()) {
1612             /* Failure to start psi monitoring, crash to be restarted */
1613             ALOGE("Failure to initialize monitoring. Exiting...");
1614             exit(1);
1615         }
1616         ALOGI("Initialized monitors after boot completed.");
1617         break;
1618     case LMK_BOOT_COMPLETED:
1619         if (nargs != 0) goto wronglen;
1620 
1621         if (boot_completed_handled) {
1622             /* Notify we have already handled post boot-up operations */
1623             result = 1;
1624         } else if (!property_get_bool("sys.boot_completed", false)) {
1625             ALOGE("LMK_BOOT_COMPLETED cannot be handled before boot completed");
1626             result = -1;
1627         } else {
1628             /*
1629              * Initialize the memevent listener after boot is completed to prevent
1630              * waiting, during boot-up, for BPF programs to be loaded.
1631              */
1632             if (init_memevent_listener_monitoring()) {
1633                 ALOGI("Using memevents for direct reclaim and kswapd detection");
1634             } else {
1635                 ALOGI("Using vmstats for direct reclaim and kswapd detection");
1636                 if (direct_reclaim_threshold_ms > 0) {
1637                     ALOGW("Kernel support for direct_reclaim_threshold_ms is not found");
1638                     direct_reclaim_threshold_ms = 0;
1639                 }
1640             }
1641             result = 0;
1642             boot_completed_handled = true;
1643         }
1644 
1645         len = lmkd_pack_set_boot_completed_notif_repl(packet, result);
1646         if (ctrl_data_write(dsock_idx, (char*)packet, len) != len) {
1647             ALOGE("Failed to report boot-completed operation results");
1648         }
1649         break;
1650     case LMK_PROCS_PRIO:
1651         cmd_procs_prio(packet, nargs, &cred);
1652         break;
1653     default:
1654         ALOGE("Received unknown command code %d", cmd);
1655         return;
1656     }
1657 
1658     return;
1659 
1660 wronglen:
1661     ALOGE("Wrong control socket read length cmd=%d len=%d", cmd, len);
1662 }
1663 
1664 static void ctrl_data_handler(int data, uint32_t events,
1665                               struct polling_params *poll_params __unused) {
1666     if (events & EPOLLIN) {
1667         ctrl_command_handler(data);
1668     }
1669 }
1670 
1671 static int get_free_dsock() {
1672     for (int i = 0; i < MAX_DATA_CONN; i++) {
1673         if (data_sock[i].sock < 0) {
1674             return i;
1675         }
1676     }
1677     return -1;
1678 }
1679 
1680 static void ctrl_connect_handler(int data __unused, uint32_t events __unused,
1681                                  struct polling_params *poll_params __unused) {
1682     struct epoll_event epev;
1683     int free_dscock_idx = get_free_dsock();
1684 
1685     if (free_dscock_idx < 0) {
1686         /*
1687          * Number of data connections exceeded max supported. This should not
1688          * happen but if it does we drop all existing connections and accept
1689          * the new one. This prevents inactive connections from monopolizing
1690          * data socket and if we drop ActivityManager connection it will
1691          * immediately reconnect.
1692          */
1693         for (int i = 0; i < MAX_DATA_CONN; i++) {
1694             ctrl_data_close(i);
1695         }
1696         free_dscock_idx = 0;
1697     }
1698 
1699     data_sock[free_dscock_idx].sock = accept(ctrl_sock.sock, NULL, NULL);
1700     if (data_sock[free_dscock_idx].sock < 0) {
1701         ALOGE("lmkd control socket accept failed; errno=%d", errno);
1702         return;
1703     }
1704 
1705     ALOGI("lmkd data connection established");
1706     /* use data to store data connection idx */
1707     data_sock[free_dscock_idx].handler_info.data = free_dscock_idx;
1708     data_sock[free_dscock_idx].handler_info.handler = ctrl_data_handler;
1709     data_sock[free_dscock_idx].async_event_mask = 0;
1710     epev.events = EPOLLIN;
1711     epev.data.ptr = (void *)&(data_sock[free_dscock_idx].handler_info);
1712     if (epoll_ctl(epollfd, EPOLL_CTL_ADD, data_sock[free_dscock_idx].sock, &epev) == -1) {
1713         ALOGE("epoll_ctl for data connection socket failed; errno=%d", errno);
1714         ctrl_data_close(free_dscock_idx);
1715         return;
1716     }
1717     maxevents++;
1718 }
1719 
1720 /*
1721  * /proc/zoneinfo parsing routines
1722  * Expected file format is:
1723  *
1724  *   Node <node_id>, zone   <zone_name>
1725  *   (
1726  *    per-node stats
1727  *       (<per-node field name> <value>)+
1728  *   )?
1729  *   (pages free     <value>
1730  *       (<per-zone field name> <value>)+
1731  *    pagesets
1732  *       (<unused fields>)*
1733  *   )+
1734  *   ...
1735  */
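/*
 * Illustrative excerpt (zone names and values are examples only):
 *
 *   Node 0, zone   Normal
 *     per-node stats
 *         nr_inactive_file 11223
 *         nr_active_file   44556
 *     pages free     98765
 *           min      1234
 *           low      2345
 *           high     3456
 *           present  262144
 *           protection: (0, 0, 1720, 1720)
 *     pagesets
 *       ...
 */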
1736 static void zoneinfo_parse_protection(char *buf, struct zoneinfo_zone *zone) {
1737     int zone_idx;
1738     int64_t max = 0;
1739     char *save_ptr;
1740 
1741     for (buf = strtok_r(buf, "(), ", &save_ptr), zone_idx = 0;
1742          buf && zone_idx < MAX_NR_ZONES;
1743          buf = strtok_r(NULL, "), ", &save_ptr), zone_idx++) {
1744         long long zoneval = strtoll(buf, &buf, 0);
1745         if (zoneval > max) {
1746             max = (zoneval > INT64_MAX) ? INT64_MAX : zoneval;
1747         }
1748         zone->protection[zone_idx] = zoneval;
1749     }
1750     zone->max_protection = max;
1751 }
1752 
1753 static int zoneinfo_parse_zone(char **buf, struct zoneinfo_zone *zone) {
1754     for (char *line = strtok_r(NULL, "\n", buf); line;
1755          line = strtok_r(NULL, "\n", buf)) {
1756         char *cp;
1757         char *ap;
1758         char *save_ptr;
1759         int64_t val;
1760         int field_idx;
1761         enum field_match_result match_res;
1762 
1763         cp = strtok_r(line, " ", &save_ptr);
1764         if (!cp) {
1765             return false;
1766         }
1767 
1768         field_idx = find_field(cp, zoneinfo_zone_spec_field_names, ZI_ZONE_SPEC_FIELD_COUNT);
1769         if (field_idx >= 0) {
1770             /* special field */
1771             if (field_idx == ZI_ZONE_SPEC_PAGESETS) {
1772                 /* no more fields we are interested in */
1773                 return true;
1774             }
1775 
1776             /* protection field */
1777             ap = strtok_r(NULL, ")", &save_ptr);
1778             if (ap) {
1779                 zoneinfo_parse_protection(ap, zone);
1780             }
1781             continue;
1782         }
1783 
1784         ap = strtok_r(NULL, " ", &save_ptr);
1785         if (!ap) {
1786             continue;
1787         }
1788 
1789         match_res = match_field(cp, ap, zoneinfo_zone_field_names, ZI_ZONE_FIELD_COUNT,
1790             &val, &field_idx);
1791         if (match_res == PARSE_FAIL) {
1792             return false;
1793         }
1794         if (match_res == PARSE_SUCCESS) {
1795             zone->fields.arr[field_idx] = val;
1796         }
1797         if (field_idx == ZI_ZONE_PRESENT && val == 0) {
1798             /* zone is not populated, stop parsing it */
1799             return true;
1800         }
1801     }
1802     return false;
1803 }
1804 
1805 static int zoneinfo_parse_node(char **buf, struct zoneinfo_node *node) {
1806     int fields_to_match = ZI_NODE_FIELD_COUNT;
1807 
1808     for (char *line = strtok_r(NULL, "\n", buf); line;
1809          line = strtok_r(NULL, "\n", buf)) {
1810         char *cp;
1811         char *ap;
1812         char *save_ptr;
1813         int64_t val;
1814         int field_idx;
1815         enum field_match_result match_res;
1816 
1817         cp = strtok_r(line, " ", &save_ptr);
1818         if (!cp) {
1819             return false;
1820         }
1821 
1822         ap = strtok_r(NULL, " ", &save_ptr);
1823         if (!ap) {
1824             return false;
1825         }
1826 
1827         match_res = match_field(cp, ap, zoneinfo_node_field_names, ZI_NODE_FIELD_COUNT,
1828             &val, &field_idx);
1829         if (match_res == PARSE_FAIL) {
1830             return false;
1831         }
1832         if (match_res == PARSE_SUCCESS) {
1833             node->fields.arr[field_idx] = val;
1834             fields_to_match--;
1835             if (!fields_to_match) {
1836                 return true;
1837             }
1838         }
1839     }
1840     return false;
1841 }
1842 
1843 static int zoneinfo_parse(struct zoneinfo *zi) {
1844     static struct reread_data file_data = {
1845         .filename = ZONEINFO_PATH,
1846         .fd = -1,
1847     };
1848     char *buf;
1849     char *save_ptr;
1850     char *line;
1851     char zone_name[LINE_MAX + 1];
1852     struct zoneinfo_node *node = NULL;
1853     int node_idx = 0;
1854     int zone_idx = 0;
1855 
1856     memset(zi, 0, sizeof(struct zoneinfo));
1857 
1858     if ((buf = reread_file(&file_data)) == NULL) {
1859         return -1;
1860     }
1861 
1862     for (line = strtok_r(buf, "\n", &save_ptr); line;
1863          line = strtok_r(NULL, "\n", &save_ptr)) {
1864         int node_id;
1865         if (sscanf(line, "Node %d, zone %" STRINGIFY(LINE_MAX) "s", &node_id, zone_name) == 2) {
1866             if (!node || node->id != node_id) {
1867                 line = strtok_r(NULL, "\n", &save_ptr);
1868                 if (strncmp(line, NODE_STATS_MARKER, strlen(NODE_STATS_MARKER)) != 0) {
1869                     /*
1870                      * per-node stats are only present in the first non-empty zone of
1871                      * the node.
1872                      */
1873                     continue;
1874                 }
1875 
1876                 /* new node is found */
1877                 if (node) {
1878                     node->zone_count = zone_idx + 1;
1879                     node_idx++;
1880                     if (node_idx == MAX_NR_NODES) {
1881                         /* max node count exceeded */
1882                         ALOGE("%s parse error", file_data.filename);
1883                         return -1;
1884                     }
1885                 }
1886                 node = &zi->nodes[node_idx];
1887                 node->id = node_id;
1888                 zone_idx = 0;
1889                 if (!zoneinfo_parse_node(&save_ptr, node)) {
1890                     ALOGE("%s parse error", file_data.filename);
1891                     return -1;
1892                 }
1893             } else {
1894                 /* new zone is found */
1895                 zone_idx++;
1896             }
1897             if (!zoneinfo_parse_zone(&save_ptr, &node->zones[zone_idx])) {
1898                 ALOGE("%s parse error", file_data.filename);
1899                 return -1;
1900             }
1901         }
1902     }
1903     if (!node) {
1904         ALOGE("%s parse error", file_data.filename);
1905         return -1;
1906     }
1907     node->zone_count = zone_idx + 1;
1908     zi->node_count = node_idx + 1;
1909 
1910     /* calculate totals fields */
1911     for (node_idx = 0; node_idx < zi->node_count; node_idx++) {
1912         node = &zi->nodes[node_idx];
1913         for (zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
1914             struct zoneinfo_zone *zone = &zi->nodes[node_idx].zones[zone_idx];
1915             zi->totalreserve_pages += zone->max_protection + zone->fields.field.high;
1916         }
1917         zi->total_inactive_file += node->fields.field.nr_inactive_file;
1918         zi->total_active_file += node->fields.field.nr_active_file;
1919     }
1920     return 0;
1921 }
1922 
1923 /* /proc/meminfo parsing routines */
1924 static bool meminfo_parse_line(char *line, union meminfo *mi) {
1925     char *cp = line;
1926     char *ap;
1927     char *save_ptr;
1928     int64_t val;
1929     int field_idx;
1930     enum field_match_result match_res;
1931 
1932     cp = strtok_r(line, " ", &save_ptr);
1933     if (!cp) {
1934         return false;
1935     }
1936 
1937     ap = strtok_r(NULL, " ", &save_ptr);
1938     if (!ap) {
1939         return false;
1940     }
1941 
1942     match_res = match_field(cp, ap, meminfo_field_names, MI_FIELD_COUNT,
1943         &val, &field_idx);
1944     if (match_res == PARSE_SUCCESS) {
1945         mi->arr[field_idx] = val / page_k;
1946     }
1947     return (match_res != PARSE_FAIL);
1948 }
1949 
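/*
 * Reads total GPU-allocated memory from the gpuMem BPF map (key 0 holds the total usage in
 * bytes) and returns it in kB; returns 0 if the map is missing or the lookup fails.
 */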
1950 static int64_t read_gpu_total_kb() {
1951     static android::base::unique_fd fd(
1952             android::bpf::mapRetrieveRO("/sys/fs/bpf/map_gpuMem_gpu_mem_total_map"));
1953     static constexpr uint64_t kBpfKeyGpuTotalUsage = 0;
1954     uint64_t value;
1955 
1956     if (!fd.ok()) {
1957         return 0;
1958     }
1959 
1960     return android::bpf::findMapEntry(fd, &kBpfKeyGpuTotalUsage, &value)
1961             ? 0
1962             : (int32_t)(value / 1024);
1963 }
1964 
1965 static int meminfo_parse(union meminfo *mi) {
1966     static struct reread_data file_data = {
1967         .filename = MEMINFO_PATH,
1968         .fd = -1,
1969     };
1970     char *buf;
1971     char *save_ptr;
1972     char *line;
1973 
1974     memset(mi, 0, sizeof(union meminfo));
1975 
1976     if ((buf = reread_file(&file_data)) == NULL) {
1977         return -1;
1978     }
1979 
1980     for (line = strtok_r(buf, "\n", &save_ptr); line;
1981          line = strtok_r(NULL, "\n", &save_ptr)) {
1982         if (!meminfo_parse_line(line, mi)) {
1983             ALOGE("%s parse error", file_data.filename);
1984             return -1;
1985         }
1986     }
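    /*
     * Derived fields: total file-backed pages, GPU usage and memory that can be made
     * available without swapping (free pages + inactive file cache).
     */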
1987     mi->field.nr_file_pages = mi->field.cached + mi->field.swap_cached +
1988         mi->field.buffers;
1989     mi->field.total_gpu_kb = read_gpu_total_kb();
1990     mi->field.easy_available = mi->field.nr_free_pages + mi->field.inactive_file;
1991 
1992     return 0;
1993 }
1994 
1995 // In the case of ZRAM, mi->field.free_swap can't be used directly because swap space is taken
1996 // from the free memory or reclaimed. Use the lowest of free_swap and easily available memory to
1997 // measure free swap because they represent how much swap space the system will consider to use
1998 // and how much it can actually use.
1999 // Swap compression ratio in the calculation can be adjusted using swap_compression_ratio tunable.
2000 // By setting swap_compression_ratio to 0, available memory can be ignored.
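// For illustration only: with swap_compression_ratio = 2, free swap equivalent to 300MB and
// easily available memory equivalent to 100MB, the reported free swap is min(300, 100 * 2) = 200MB,
// since compressed swap can only grow into memory that is actually available.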
2001 static inline int64_t get_free_swap(union meminfo *mi) {
2002     if (swap_compression_ratio)
2003         return std::min(mi->field.free_swap, mi->field.easy_available * swap_compression_ratio);
2004     return mi->field.free_swap;
2005 }
2006 
2007 /* /proc/vmstat parsing routines */
2008 static bool vmstat_parse_line(char *line, union vmstat *vs) {
2009     char *cp;
2010     char *ap;
2011     char *save_ptr;
2012     int64_t val;
2013     int field_idx;
2014     enum field_match_result match_res;
2015 
2016     cp = strtok_r(line, " ", &save_ptr);
2017     if (!cp) {
2018         return false;
2019     }
2020 
2021     ap = strtok_r(NULL, " ", &save_ptr);
2022     if (!ap) {
2023         return false;
2024     }
2025 
2026     match_res = match_field(cp, ap, vmstat_field_names, VS_FIELD_COUNT,
2027         &val, &field_idx);
2028     if (match_res == PARSE_SUCCESS) {
2029         vs->arr[field_idx] = val;
2030     }
2031     return (match_res != PARSE_FAIL);
2032 }
2033 
2034 static int vmstat_parse(union vmstat *vs) {
2035     static struct reread_data file_data = {
2036         .filename = VMSTAT_PATH,
2037         .fd = -1,
2038     };
2039     char *buf;
2040     char *save_ptr;
2041     char *line;
2042 
2043     memset(vs, 0, sizeof(union vmstat));
2044 
2045     if ((buf = reread_file(&file_data)) == NULL) {
2046         return -1;
2047     }
2048 
2049     for (line = strtok_r(buf, "\n", &save_ptr); line;
2050          line = strtok_r(NULL, "\n", &save_ptr)) {
2051         if (!vmstat_parse_line(line, vs)) {
2052             ALOGE("%s parse error", file_data.filename);
2053             return -1;
2054         }
2055     }
2056 
2057     return 0;
2058 }
2059 
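/*
 * Parses one PSI resource file into stats[]: the first line is the "some" record and, when
 * full is true, the second line is the "full" record (memory and io are read with both,
 * cpu with "some" only, as in the callers below).
 */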
2060 static int psi_parse(struct reread_data *file_data, struct psi_stats stats[], bool full) {
2061     char *buf;
2062     char *save_ptr;
2063     char *line;
2064 
2065     if ((buf = reread_file(file_data)) == NULL) {
2066         return -1;
2067     }
2068 
2069     line = strtok_r(buf, "\n", &save_ptr);
2070     if (parse_psi_line(line, PSI_SOME, stats)) {
2071         return -1;
2072     }
2073     if (full) {
2074         line = strtok_r(NULL, "\n", &save_ptr);
2075         if (parse_psi_line(line, PSI_FULL, stats)) {
2076             return -1;
2077         }
2078     }
2079 
2080     return 0;
2081 }
2082 
2083 static int psi_parse_mem(struct psi_data *psi_data) {
2084     static struct reread_data file_data = {
2085             .filename = psi_resource_file[PSI_MEMORY],
2086             .fd = -1,
2087     };
2088     return psi_parse(&file_data, psi_data->mem_stats, true);
2089 }
2090 
2091 static int psi_parse_io(struct psi_data *psi_data) {
2092     static struct reread_data file_data = {
2093             .filename = psi_resource_file[PSI_IO],
2094             .fd = -1,
2095     };
2096     return psi_parse(&file_data, psi_data->io_stats, true);
2097 }
2098 
2099 static int psi_parse_cpu(struct psi_data *psi_data) {
2100     static struct reread_data file_data = {
2101             .filename = psi_resource_file[PSI_CPU],
2102             .fd = -1,
2103     };
2104     return psi_parse(&file_data, psi_data->cpu_stats, false);
2105 }
2106 
2107 enum wakeup_reason {
2108     Event,
2109     Polling
2110 };
2111 
2112 struct wakeup_info {
2113     struct timespec wakeup_tm;
2114     struct timespec prev_wakeup_tm;
2115     struct timespec last_event_tm;
2116     int wakeups_since_event;
2117     int skipped_wakeups;
2118 };
2119 
2120 /*
2121  * After the initial memory pressure event is received lmkd schedules periodic wakeups to check
2122  * the memory conditions and kill if needed (polling). This is done because pressure events are
2123  * rate-limited and memory conditions can change in between events. Therefore after the initial
2124  * event there might be multiple wakeups. This function records the wakeup information such as the
2125  * timestamps of the last event and the last wakeup, the number of wakeups since the last event
2126  * and how many of those wakeups were skipped (some wakeups are skipped if the previously killed
2127  * process is still freeing its memory).
2128  */
2129 static void record_wakeup_time(struct timespec *tm, enum wakeup_reason reason,
2130                                struct wakeup_info *wi) {
2131     wi->prev_wakeup_tm = wi->wakeup_tm;
2132     wi->wakeup_tm = *tm;
2133     if (reason == Event) {
2134         wi->last_event_tm = *tm;
2135         wi->wakeups_since_event = 0;
2136         wi->skipped_wakeups = 0;
2137     } else {
2138         wi->wakeups_since_event++;
2139     }
2140 }
2141 
2142 struct kill_info {
2143     enum kill_reasons kill_reason;
2144     const char *kill_desc;
2145     int thrashing;
2146     int max_thrashing;
2147 };
2148 
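/*
 * Emits a KILLINFO_LOG_TAG event-log record describing a kill: process identity and scores,
 * meminfo snapshot, wakeup statistics, swap and GPU usage, thrashing values and PSI averages.
 * NULL ki/mi/wi/pd arguments are logged as zeros.
 */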
2149 static void killinfo_log(struct proc* procp, int min_oom_score, int rss_kb,
2150                          int swap_kb, struct kill_info *ki, union meminfo *mi,
2151                          struct wakeup_info *wi, struct timespec *tm, struct psi_data *pd) {
2152     /* log process information */
2153     android_log_write_int32(ctx, procp->pid);
2154     android_log_write_int32(ctx, procp->uid);
2155     android_log_write_int32(ctx, procp->oomadj);
2156     android_log_write_int32(ctx, min_oom_score);
2157     android_log_write_int32(ctx, std::min(rss_kb, (int)INT32_MAX));
2158     android_log_write_int32(ctx, ki ? ki->kill_reason : NONE);
2159 
2160     /* log meminfo fields */
2161     for (int field_idx = 0; field_idx < MI_FIELD_COUNT; field_idx++) {
2162         android_log_write_int32(ctx,
2163                                 mi ? std::min(mi->arr[field_idx] * page_k, (int64_t)INT32_MAX) : 0);
2164     }
2165 
2166     /* log lmkd wakeup information */
2167     if (wi) {
2168         android_log_write_int32(ctx, (int32_t)get_time_diff_ms(&wi->last_event_tm, tm));
2169         android_log_write_int32(ctx, (int32_t)get_time_diff_ms(&wi->prev_wakeup_tm, tm));
2170         android_log_write_int32(ctx, wi->wakeups_since_event);
2171         android_log_write_int32(ctx, wi->skipped_wakeups);
2172     } else {
2173         android_log_write_int32(ctx, 0);
2174         android_log_write_int32(ctx, 0);
2175         android_log_write_int32(ctx, 0);
2176         android_log_write_int32(ctx, 0);
2177     }
2178 
2179     android_log_write_int32(ctx, std::min(swap_kb, (int)INT32_MAX));
2180     android_log_write_int32(ctx, mi ? (int32_t)mi->field.total_gpu_kb : 0);
2181     if (ki) {
2182         android_log_write_int32(ctx, ki->thrashing);
2183         android_log_write_int32(ctx, ki->max_thrashing);
2184     } else {
2185         android_log_write_int32(ctx, 0);
2186         android_log_write_int32(ctx, 0);
2187     }
2188 
2189     if (pd) {
2190         android_log_write_float32(ctx, pd->mem_stats[PSI_SOME].avg10);
2191         android_log_write_float32(ctx, pd->mem_stats[PSI_FULL].avg10);
2192         android_log_write_float32(ctx, pd->io_stats[PSI_SOME].avg10);
2193         android_log_write_float32(ctx, pd->io_stats[PSI_FULL].avg10);
2194         android_log_write_float32(ctx, pd->cpu_stats[PSI_SOME].avg10);
2195     } else {
2196         for (int i = 0; i < 5; i++) {
2197             android_log_write_float32(ctx, 0);
2198         }
2199     }
2200 
2201     android_log_write_list(ctx, LOG_ID_EVENTS);
2202     android_log_reset(ctx);
2203 }
2204 
2205 // Note: returned entry is only an anchor and does not hold valid process info.
2206 // When called from a non-main thread, adjslot_list_lock read lock should be taken.
2207 static struct proc *proc_adj_head(int oomadj) {
2208     return (struct proc *)&procadjslot_list[ADJTOSLOT(oomadj)];
2209 }
2210 
2211 // When called from a non-main thread, adjslot_list_lock read lock should be taken.
2212 static struct proc *proc_adj_tail(int oomadj) {
2213     return (struct proc *)adjslot_tail(&procadjslot_list[ADJTOSLOT(oomadj)]);
2214 }
2215 
2216 // When called from a non-main thread, adjslot_list_lock read lock should be taken.
2217 static struct proc *proc_adj_prev(int oomadj, int pid) {
2218     struct adjslot_list *head = &procadjslot_list[ADJTOSLOT(oomadj)];
2219     struct adjslot_list *curr = adjslot_tail(&procadjslot_list[ADJTOSLOT(oomadj)]);
2220 
2221     while (curr != head) {
2222         if (((struct proc *)curr)->pid == pid) {
2223             return (struct proc *)curr->prev;
2224         }
2225         curr = curr->prev;
2226     }
2227 
2228     return NULL;
2229 }
2230 
2231 // Can be called only from the main thread.
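// Returns the largest process (per proc_get_size()) registered at this oomadj level,
// pruning entries whose size can no longer be read.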
2232 static struct proc *proc_get_heaviest(int oomadj) {
2233     struct adjslot_list *head = &procadjslot_list[ADJTOSLOT(oomadj)];
2234     struct adjslot_list *curr = head->next;
2235     struct proc *maxprocp = NULL;
2236     int maxsize = 0;
2237     if ((curr != head) && (curr->next == head)) {
2238         // Our list only has one process.  No need to access procfs for its size.
2239         return (struct proc *)curr;
2240     }
2241     while (curr != head) {
2242         int pid = ((struct proc *)curr)->pid;
2243         int tasksize = proc_get_size(pid);
2244         if (tasksize < 0) {
2245             struct adjslot_list *next = curr->next;
2246             pid_remove(pid);
2247             curr = next;
2248         } else {
2249             if (tasksize > maxsize) {
2250                 maxsize = tasksize;
2251                 maxprocp = (struct proc *)curr;
2252             }
2253             curr = curr->next;
2254         }
2255     }
2256     return maxprocp;
2257 }
2258 
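/*
 * Picks the next kill candidate at the given oom_score: the list tail when prev_pid is 0,
 * otherwise the entry preceding prev_pid. The candidate is copied into target_proc because
 * the original record may be destroyed once adjslot_list_lock is released; returns false
 * when the level is exhausted.
 */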
2259 static bool find_victim(int oom_score, int prev_pid, struct proc &target_proc) {
2260     struct proc *procp;
2261     std::shared_lock lock(adjslot_list_lock);
2262 
2263     if (!prev_pid) {
2264         procp = proc_adj_tail(oom_score);
2265     } else {
2266         procp = proc_adj_prev(oom_score, prev_pid);
2267         if (!procp) {
2268             // pid was removed, restart at the tail
2269             procp = proc_adj_tail(oom_score);
2270         }
2271     }
2272 
2273     // the list is empty at this oom_score or we looped through it
2274     if (!procp || procp == proc_adj_head(oom_score)) {
2275         return false;
2276     }
2277 
2278     // make a copy because original might be destroyed after adjslot_list_lock is released
2279     target_proc = *procp;
2280 
2281     return true;
2282 }
2283 
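/*
 * Runs on the watchdog thread when the main loop stops responding: walks oom_score levels
 * from the highest down and kills the first victim it can, logging the kill and invalidating
 * the process record (pid_remove() is main-thread only).
 */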
2284 static void watchdog_callback() {
2285     int prev_pid = 0;
2286 
2287     ALOGW("lmkd watchdog timed out!");
2288     for (int oom_score = OOM_SCORE_ADJ_MAX; oom_score >= 0;) {
2289         struct proc target;
2290 
2291         if (!find_victim(oom_score, prev_pid, target)) {
2292             oom_score--;
2293             prev_pid = 0;
2294             continue;
2295         }
2296 
2297         if (target.valid && reaper.kill({ target.pidfd, target.pid, target.uid }, true) == 0) {
2298             ALOGW("lmkd watchdog killed process %d, oom_score_adj %d", target.pid, oom_score);
2299             killinfo_log(&target, 0, 0, 0, NULL, NULL, NULL, NULL, NULL);
2300             // Can't call pid_remove() from non-main thread, therefore just invalidate the record
2301             pid_invalidate(target.pid);
2302             break;
2303         }
2304         prev_pid = target.pid;
2305     }
2306 }
2307 
2308 static Watchdog watchdog(WATCHDOG_TIMEOUT_SEC, watchdog_callback);
2309 
2310 static bool is_kill_pending(void) {
2311     char buf[24];
2312 
2313     if (last_kill_pid_or_fd < 0) {
2314         return false;
2315     }
2316 
2317     if (pidfd_supported) {
2318         return true;
2319     }
2320 
2321     /* when pidfd is not supported base the decision on /proc/<pid> existence */
2322     snprintf(buf, sizeof(buf), "/proc/%d/", last_kill_pid_or_fd);
2323     if (access(buf, F_OK) == 0) {
2324         return true;
2325     }
2326 
2327     return false;
2328 }
2329 
2330 static bool is_waiting_for_kill(void) {
2331     return pidfd_supported && last_kill_pid_or_fd >= 0;
2332 }
2333 
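/*
 * Stops waiting for the last killed process to exit: optionally logs the kill duration,
 * unregisters the pidfd from epoll when pidfds are supported and clears last_kill_pid_or_fd.
 */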
2334 static void stop_wait_for_proc_kill(bool finished) {
2335     struct epoll_event epev;
2336 
2337     if (last_kill_pid_or_fd < 0) {
2338         return;
2339     }
2340 
2341     if (debug_process_killing) {
2342         struct timespec curr_tm;
2343 
2344         if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
2345             /*
2346              * curr_tm is used here merely to report kill duration, so this failure is not fatal.
2347              * Log an error and continue.
2348              */
2349             ALOGE("Failed to get current time");
2350         }
2351 
2352         if (finished) {
2353             ALOGI("Process got killed in %ldms",
2354                 get_time_diff_ms(&last_kill_tm, &curr_tm));
2355         } else {
2356             ALOGI("Stop waiting for process kill after %ldms",
2357                 get_time_diff_ms(&last_kill_tm, &curr_tm));
2358         }
2359     }
2360 
2361     if (pidfd_supported) {
2362         /* unregister fd */
2363         if (epoll_ctl(epollfd, EPOLL_CTL_DEL, last_kill_pid_or_fd, &epev)) {
2364             // Log an error and keep going
2365             ALOGE("epoll_ctl for last killed process failed; errno=%d", errno);
2366         }
2367         maxevents--;
2368         close(last_kill_pid_or_fd);
2369     }
2370 
2371     last_kill_pid_or_fd = -1;
2372 }
2373 
2374 static void kill_done_handler(int data __unused, uint32_t events __unused,
2375                               struct polling_params *poll_params) {
2376     stop_wait_for_proc_kill(true);
2377     poll_params->update = POLLING_RESUME;
2378 }
2379 
2380 static void kill_fail_handler(int data __unused, uint32_t events __unused,
2381                               struct polling_params *poll_params) {
2382     int pid;
2383 
2384     // Extract pid from the communication pipe. Clearing the pipe this way allows further
2385     // epoll_wait calls to sleep until the next event.
2386     if (TEMP_FAILURE_RETRY(read(reaper_comm_fd[0], &pid, sizeof(pid))) != sizeof(pid)) {
2387         ALOGE("thread communication read failed: %s", strerror(errno));
2388     }
2389     stop_wait_for_proc_kill(false);
2390     poll_params->update = POLLING_RESUME;
2391 }
2392 
2393 static void start_wait_for_proc_kill(int pid_or_fd) {
2394     static struct event_handler_info kill_done_hinfo = { 0, kill_done_handler };
2395     struct epoll_event epev;
2396 
2397     if (last_kill_pid_or_fd >= 0) {
2398         /* Should not happen but if it does we should stop previous wait */
2399         ALOGE("Attempt to wait for a kill while another wait is in progress");
2400         stop_wait_for_proc_kill(false);
2401     }
2402 
2403     last_kill_pid_or_fd = pid_or_fd;
2404 
2405     if (!pidfd_supported) {
2406         /* If pidfd is not supported just store PID and exit */
2407         return;
2408     }
2409 
2410     epev.events = EPOLLIN;
2411     epev.data.ptr = (void *)&kill_done_hinfo;
2412     if (epoll_ctl(epollfd, EPOLL_CTL_ADD, last_kill_pid_or_fd, &epev) != 0) {
2413         ALOGE("epoll_ctl for last kill failed; errno=%d", errno);
2414         close(last_kill_pid_or_fd);
2415         last_kill_pid_or_fd = -1;
2416         return;
2417     }
2418     maxevents++;
2419 }
2420 
2421 /* Kill one process specified by procp.  Returns the size (in pages) of the process killed */
2422 static int kill_one_process(struct proc* procp, int min_oom_score, struct kill_info *ki,
2423                             union meminfo *mi, struct wakeup_info *wi, struct timespec *tm,
2424                             struct psi_data *pd) {
2425     int pid = procp->pid;
2426     int pidfd = procp->pidfd;
2427     uid_t uid = procp->uid;
2428     char *taskname;
2429     int kill_result;
2430     int result = -1;
2431     struct memory_stat *mem_st;
2432     struct kill_stat kill_st;
2433     int64_t tgid;
2434     int64_t rss_kb;
2435     int64_t swap_kb;
2436     char buf[pagesize];
2437     char desc[LINE_MAX];
2438 
2439     if (!procp->valid || !read_proc_status(pid, buf, sizeof(buf))) {
2440         goto out;
2441     }
2442     if (!parse_status_tag(buf, PROC_STATUS_TGID_FIELD, &tgid)) {
2443         ALOGE("Unable to parse tgid from /proc/%d/status", pid);
2444         goto out;
2445     }
2446     if (tgid != pid) {
2447         ALOGE("Possible pid reuse detected (pid %d, tgid %" PRId64 ")!", pid, tgid);
2448         goto out;
2449     }
2450     // Zombie processes will not have RSS / Swap fields.
2451     if (!parse_status_tag(buf, PROC_STATUS_RSS_FIELD, &rss_kb)) {
2452         goto out;
2453     }
2454     if (!parse_status_tag(buf, PROC_STATUS_SWAP_FIELD, &swap_kb)) {
2455         goto out;
2456     }
2457 
2458     taskname = proc_get_name(pid, buf, sizeof(buf));
2459     // taskname will point inside buf, do not reuse buf onwards.
2460     if (!taskname) {
2461         goto out;
2462     }
2463 
2464     mem_st = stats_read_memory_stat(per_app_memcg, pid, uid, rss_kb * 1024, swap_kb * 1024);
2465 
2466     snprintf(desc, sizeof(desc), "lmk,%d,%d,%d,%d,%d", pid, ki ? (int)ki->kill_reason : -1,
2467              procp->oomadj, min_oom_score, ki ? ki->max_thrashing : -1);
2468 
2469     result = lmkd_free_memory_before_kill_hook(procp, rss_kb / page_k, procp->oomadj,
2470                                                ki ? (int)ki->kill_reason : -1);
2471     if (result > 0) {
2472       /*
2473        * Memory was freed elsewhere; no need to kill. Note: intentionally do not
2474        * pid_remove(pid) since it was not killed.
2475        */
2476       ALOGI("Skipping kill; %ld kB freed elsewhere.", result * page_k);
2477       return result;
2478     }
2479 
2480     trace_kill_start(desc);
2481 
2482     start_wait_for_proc_kill(pidfd < 0 ? pid : pidfd);
2483     kill_result = reaper.kill({ pidfd, pid, uid }, false);
2484 
2485     trace_kill_end();
2486 
2487     if (kill_result) {
2488         stop_wait_for_proc_kill(false);
2489         ALOGE("kill(%d): errno=%d", pid, errno);
2490         /* Delete process record even when we fail to kill so that we don't get stuck on it */
2491         goto out;
2492     }
2493 
2494     last_kill_tm = *tm;
2495 
2496     inc_killcnt(procp->oomadj);
2497 
2498     if (ki) {
2499         kill_st.kill_reason = ki->kill_reason;
2500         kill_st.thrashing = ki->thrashing;
2501         kill_st.max_thrashing = ki->max_thrashing;
2502         ALOGI("Kill '%s' (%d), uid %d, oom_score_adj %d to free %" PRId64 "kB rss, %" PRId64
2503               "kB swap; reason: %s", taskname, pid, uid, procp->oomadj, rss_kb, swap_kb,
2504               ki->kill_desc);
2505     } else {
2506         kill_st.kill_reason = NONE;
2507         kill_st.thrashing = 0;
2508         kill_st.max_thrashing = 0;
2509         ALOGI("Kill '%s' (%d), uid %d, oom_score_adj %d to free %" PRId64 "kB rss, %" PRId64
2510               "kb swap", taskname, pid, uid, procp->oomadj, rss_kb, swap_kb);
2511     }
2512     killinfo_log(procp, min_oom_score, rss_kb, swap_kb, ki, mi, wi, tm, pd);
2513 
2514     kill_st.uid = static_cast<int32_t>(uid);
2515     kill_st.taskname = taskname;
2516     kill_st.oom_score = procp->oomadj;
2517     kill_st.min_oom_score = min_oom_score;
2518     kill_st.free_mem_kb = mi->field.nr_free_pages * page_k;
2519     kill_st.free_swap_kb = get_free_swap(mi) * page_k;
2520     stats_write_lmk_kill_occurred(&kill_st, mem_st);
2521 
2522     ctrl_data_write_lmk_kill_occurred((pid_t)pid, uid, rss_kb);
2523 
2524     result = rss_kb / page_k;
2525 
2526 out:
2527     /*
2528      * WARNING: After pid_remove() procp is freed and can't be used!
2529      * Therefore placed at the end of the function.
2530      */
2531     pid_remove(pid);
2532     return result;
2533 }
2534 
2535 /*
2536  * Find one process to kill at or above the given oom_score_adj level.
2537  * Returns size of the killed process.
2538  */
2539 static int find_and_kill_process(int min_score_adj, struct kill_info *ki, union meminfo *mi,
2540                                  struct wakeup_info *wi, struct timespec *tm,
2541                                  struct psi_data *pd) {
2542     int i;
2543     int killed_size = 0;
2544     bool choose_heaviest_task = kill_heaviest_task;
2545 
2546     for (i = OOM_SCORE_ADJ_MAX; i >= min_score_adj; i--) {
2547         struct proc *procp;
2548 
2549         if (!choose_heaviest_task && i <= PERCEPTIBLE_APP_ADJ) {
2550             /*
2551              * If we have to choose a perceptible process, choose the heaviest one to
2552              * hopefully minimize the number of victims.
2553              */
2554             choose_heaviest_task = true;
2555         }
2556 
2557         while (true) {
2558             procp = choose_heaviest_task ?
2559                 proc_get_heaviest(i) : proc_adj_tail(i);
2560 
2561             if (!procp)
2562                 break;
2563 
2564             killed_size = kill_one_process(procp, min_score_adj, ki, mi, wi, tm, pd);
2565             if (killed_size >= 0) {
2566                 break;
2567             }
2568         }
2569         if (killed_size) {
2570             break;
2571         }
2572     }
2573 
2574     return killed_size;
2575 }
2576 
2577 static int64_t get_memory_usage(struct reread_data *file_data) {
2578     int64_t mem_usage;
2579     char *buf;
2580 
2581     if ((buf = reread_file(file_data)) == NULL) {
2582         return -1;
2583     }
2584 
2585     if (!parse_int64(buf, &mem_usage)) {
2586         ALOGE("%s parse error", file_data->filename);
2587         return -1;
2588     }
2589     if (mem_usage == 0) {
2590         ALOGE("No memory!");
2591         return -1;
2592     }
2593     return mem_usage;
2594 }
2595 
2596 void record_low_pressure_levels(union meminfo *mi) {
2597     if (low_pressure_mem.min_nr_free_pages == -1 ||
2598         low_pressure_mem.min_nr_free_pages > mi->field.nr_free_pages) {
2599         if (debug_process_killing) {
2600             ALOGI("Low pressure min memory update from %" PRId64 " to %" PRId64,
2601                 low_pressure_mem.min_nr_free_pages, mi->field.nr_free_pages);
2602         }
2603         low_pressure_mem.min_nr_free_pages = mi->field.nr_free_pages;
2604     }
2605     /*
2606      * Free memory at low vmpressure events occasionally spikes, possibly
2607      * because of a stale low vmpressure event arriving after the memory was
2608      * already freed (in which case no pressure should have been reported).
2609      * Ignore large jumps in max_nr_free_pages that would mess up our stats.
2610      */
2611     if (low_pressure_mem.max_nr_free_pages == -1 ||
2612         (low_pressure_mem.max_nr_free_pages < mi->field.nr_free_pages &&
2613          mi->field.nr_free_pages - low_pressure_mem.max_nr_free_pages <
2614          low_pressure_mem.max_nr_free_pages * 0.1)) {
2615         if (debug_process_killing) {
2616             ALOGI("Low pressure max memory update from %" PRId64 " to %" PRId64,
2617                 low_pressure_mem.max_nr_free_pages, mi->field.nr_free_pages);
2618         }
2619         low_pressure_mem.max_nr_free_pages = mi->field.nr_free_pages;
2620     }
2621 }
2622 
2623 enum vmpressure_level upgrade_level(enum vmpressure_level level) {
2624     return (enum vmpressure_level)((level < VMPRESS_LEVEL_CRITICAL) ?
2625         level + 1 : level);
2626 }
2627 
2628 enum vmpressure_level downgrade_level(enum vmpressure_level level) {
2629     return (enum vmpressure_level)((level > VMPRESS_LEVEL_LOW) ?
2630         level - 1 : level);
2631 }
2632 
2633 enum zone_watermark {
2634     WMARK_MIN = 0,
2635     WMARK_LOW,
2636     WMARK_HIGH,
2637     WMARK_NONE
2638 };
2639 
2640 struct zone_watermarks {
2641     long high_wmark;
2642     long low_wmark;
2643     long min_wmark;
2644 };
2645 
2646 static struct zone_watermarks watermarks;
2647 
2648 /*
2649  * Returns lowest breached watermark or WMARK_NONE.
2650  */
2651 static enum zone_watermark get_lowest_watermark(union meminfo *mi,
2652                                                 struct zone_watermarks *watermarks)
2653 {
2654     int64_t nr_free_pages = mi->field.nr_free_pages - mi->field.cma_free;
2655 
2656     if (nr_free_pages < watermarks->min_wmark) {
2657         return WMARK_MIN;
2658     }
2659     if (nr_free_pages < watermarks->low_wmark) {
2660         return WMARK_LOW;
2661     }
2662     if (nr_free_pages < watermarks->high_wmark) {
2663         return WMARK_HIGH;
2664     }
2665     return WMARK_NONE;
2666 }
2667 
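/*
 * Sums min/low/high watermarks (plus each zone's maximum protection value) of every populated
 * zone across all nodes to produce the system-wide thresholds used by get_lowest_watermark().
 */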
2668 void calc_zone_watermarks(struct zoneinfo *zi, struct zone_watermarks *watermarks) {
2669     memset(watermarks, 0, sizeof(struct zone_watermarks));
2670 
2671     for (int node_idx = 0; node_idx < zi->node_count; node_idx++) {
2672         struct zoneinfo_node *node = &zi->nodes[node_idx];
2673         for (int zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
2674             struct zoneinfo_zone *zone = &node->zones[zone_idx];
2675 
2676             if (!zone->fields.field.present) {
2677                 continue;
2678             }
2679 
2680             watermarks->high_wmark += zone->max_protection + zone->fields.field.high;
2681             watermarks->low_wmark += zone->max_protection + zone->fields.field.low;
2682             watermarks->min_wmark += zone->max_protection + zone->fields.field.min;
2683         }
2684     }
2685 }
2686 
2687 static int update_zoneinfo_watermarks(struct zoneinfo *zi) {
2688     if (zoneinfo_parse(zi) < 0) {
2689         ALOGE("Failed to parse zoneinfo!");
2690         return -1;
2691     }
2692     calc_zone_watermarks(zi, &watermarks);
2693     return 0;
2694 }
2695 
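/*
 * Swap utilization = swap used / total swappable memory (resident anon + shmem + swap used),
 * expressed as a percentage. Illustrative numbers only: with 400MB swapped out and 600MB of
 * swappable pages still resident, utilization is 400 / (600 + 400) = 40%.
 */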
2696 static int calc_swap_utilization(union meminfo *mi) {
2697     int64_t swap_used = mi->field.total_swap - get_free_swap(mi);
2698     int64_t total_swappable = mi->field.active_anon + mi->field.inactive_anon +
2699                               mi->field.shmem + swap_used;
2700     return total_swappable > 0 ? (swap_used * 100) / total_swappable : 0;
2701 }
2702 
2703 enum event_source {
2704     PSI,
2705     VENDOR,
2706 };
2707 
2708 union psi_event_data {
2709     enum vmpressure_level level;
2710     mem_event_t vendor_event;
2711 };
2712 
2713 static void __mp_event_psi(enum event_source source, union psi_event_data data,
2714                            uint32_t events, struct polling_params *poll_params) {
2715     enum reclaim_state {
2716         NO_RECLAIM = 0,
2717         KSWAPD_RECLAIM,
2718         DIRECT_RECLAIM,
2719     };
2720     static int64_t init_ws_refault;
2721     static int64_t prev_workingset_refault;
2722     static int64_t base_file_lru;
2723     static int64_t init_pgscan_kswapd;
2724     static int64_t init_pgscan_direct;
2725     static int64_t init_pgrefill;
2726     static bool killing;
2727     static int thrashing_limit = thrashing_limit_pct;
2728     static struct timespec wmark_update_tm;
2729     static struct wakeup_info wi;
2730     static struct timespec thrashing_reset_tm;
2731     static int64_t prev_thrash_growth = 0;
2732     static bool check_filecache = false;
2733     static int max_thrashing = 0;
2734 
2735     union meminfo mi;
2736     union vmstat vs;
2737     struct psi_data psi_data;
2738     struct timespec curr_tm;
2739     int64_t thrashing = 0;
2740     bool swap_is_low = false;
2741     enum vmpressure_level level = (source == PSI) ? data.level : (enum vmpressure_level)0;
2742     enum kill_reasons kill_reason = NONE;
2743     bool cycle_after_kill = false;
2744     enum reclaim_state reclaim = NO_RECLAIM;
2745     enum zone_watermark wmark = WMARK_NONE;
2746     char kill_desc[LINE_MAX];
2747     bool cut_thrashing_limit = false;
2748     int min_score_adj = 0;
2749     int swap_util = 0;
2750     int64_t swap_low_threshold;
2751     long since_thrashing_reset_ms;
2752     int64_t workingset_refault_file;
2753     bool critical_stall = false;
2754     bool in_direct_reclaim;
2755     long direct_reclaim_duration_ms;
2756     bool in_kswapd_reclaim;
2757 
2758     mp_event_count++;
2759     if (debug_process_killing) {
2760         if (source == PSI)
2761             ALOGI("%s memory pressure event #%" PRIu64 " is triggered",
2762                   level_name[level], mp_event_count);
2763         else
2764             ALOGI("vendor kill event #%" PRIu64 " is triggered", mp_event_count);
2765     }
2766 
2767     if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
2768         ALOGE("Failed to get current time");
2769         return;
2770     }
2771 
2772     if (source == PSI) {
2773         if (events > 0) {
2774             /* Ignore a lower event within the first polling window. */
2775             if (level < prev_level) {
2776                 if (debug_process_killing)
2777                     ALOGI("Ignoring %s pressure event; occurred too soon.",
2778                            level_name[level]);
2779                 return;
2780             }
2781             prev_level = level;
2782         } else {
2783             /* Reset event level after the first polling window. */
2784             prev_level = VMPRESS_LEVEL_LOW;
2785         }
2786 
2787         record_wakeup_time(&curr_tm, events ? Event : Polling, &wi);
2788     }
2789 
2790     bool kill_pending = is_kill_pending();
2791     if (kill_pending && (kill_timeout_ms == 0 ||
2792         get_time_diff_ms(&last_kill_tm, &curr_tm) < static_cast<long>(kill_timeout_ms))) {
2793         /* Skip while still killing a process */
2794         wi.skipped_wakeups++;
2795         goto no_kill;
2796     }
2797     /*
2798      * Process is dead or kill timeout is over, stop waiting. This has no effect if pidfds are
2799      * supported and death notification already caused waiting to stop.
2800      */
2801     stop_wait_for_proc_kill(!kill_pending);
2802 
2803     if (vmstat_parse(&vs) < 0) {
2804         ALOGE("Failed to parse vmstat!");
2805         return;
2806     }
2807     /* Starting 5.9 kernel workingset_refault vmstat field was renamed workingset_refault_file */
2808     workingset_refault_file = vs.field.workingset_refault ? : vs.field.workingset_refault_file;
2809 
2810     if (meminfo_parse(&mi) < 0) {
2811         ALOGE("Failed to parse meminfo!");
2812         return;
2813     }
2814 
2815     /* Reset states after process got killed */
2816     if (killing) {
2817         killing = false;
2818         cycle_after_kill = true;
2819         /* Reset file-backed pagecache size and refault amounts after a kill */
2820         base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
2821         init_ws_refault = workingset_refault_file;
2822         thrashing_reset_tm = curr_tm;
2823         prev_thrash_growth = 0;
2824     }
2825 
2826     /* Check free swap levels */
2827     if (swap_free_low_percentage) {
2828         swap_low_threshold = mi.field.total_swap * swap_free_low_percentage / 100;
2829         swap_is_low = get_free_swap(&mi) < swap_low_threshold;
2830     } else {
2831         swap_low_threshold = 0;
2832     }
2833 
2834     if (memevent_listener) {
2835         in_direct_reclaim =
2836                 direct_reclaim_start_tm.tv_sec != 0 || direct_reclaim_start_tm.tv_nsec != 0;
2837         in_kswapd_reclaim = kswapd_start_tm.tv_sec != 0 || kswapd_start_tm.tv_nsec != 0;
2838     } else {
2839         in_direct_reclaim = vs.field.pgscan_direct != init_pgscan_direct;
2840         in_kswapd_reclaim = (vs.field.pgscan_kswapd != init_pgscan_kswapd) ||
2841                             (vs.field.pgrefill != init_pgrefill);
2842     }
2843 
2844     /* Identify reclaim state */
2845     if (in_direct_reclaim) {
2846         init_pgscan_direct = vs.field.pgscan_direct;
2847         init_pgscan_kswapd = vs.field.pgscan_kswapd;
2848         init_pgrefill = vs.field.pgrefill;
2849         direct_reclaim_duration_ms = get_time_diff_ms(&direct_reclaim_start_tm, &curr_tm);
2850         reclaim = DIRECT_RECLAIM;
2851     } else if (in_kswapd_reclaim) {
2852         init_pgscan_kswapd = vs.field.pgscan_kswapd;
2853         init_pgrefill = vs.field.pgrefill;
2854         reclaim = KSWAPD_RECLAIM;
2855     } else if ((workingset_refault_file == prev_workingset_refault) &&
2856                 (source == PSI)) {
2857         /*
2858          * Device is not thrashing and not reclaiming, bail out early until we see these stats
2859          * changing
2860          */
2861         goto no_kill;
2862     }
2863 
2864     prev_workingset_refault = workingset_refault_file;
2865 
2866     /*
2867      * It's possible we fail to find an eligible process to kill (ex. no process is
2868      * above oom_adj_min). When this happens, we should retry to find a new process
2869      * for a kill whenever a new eligible process is available. This is especially
2870      * important for a slow growing refault case. While retrying, we should keep
2871      * monitoring new thrashing counter as someone could release the memory to mitigate
2872      * the thrashing. Thus, when thrashing reset window comes, we decay the prev thrashing
2873      * counter by window counts. If the counter is still greater than thrashing limit,
2874      * we preserve the current prev_thrash counter so we will retry kill again. Otherwise,
2875      * we reset the prev_thrash counter so we will stop retrying.
2876      */
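    /*
     * Illustrative numbers only: if two reset windows passed and the last window's growth was
     * 40%, it decays to 40 >> 2 = 10%; if exactly one window passed and the growth already
     * exceeded thrashing_limit, the value is preserved so the kill is retried.
     */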
2877     since_thrashing_reset_ms = get_time_diff_ms(&thrashing_reset_tm, &curr_tm);
2878     if (since_thrashing_reset_ms > THRASHING_RESET_INTERVAL_MS) {
2879         long windows_passed;
2880         /* Calculate prev_thrash_growth if we crossed THRASHING_RESET_INTERVAL_MS */
2881         prev_thrash_growth = (workingset_refault_file - init_ws_refault) * 100
2882                             / (base_file_lru + 1);
2883         windows_passed = (since_thrashing_reset_ms / THRASHING_RESET_INTERVAL_MS);
2884         /*
2885          * Decay prev_thrashing unless over-the-limit thrashing was registered in the window we
2886          * just crossed, which means there were no eligible processes to kill. We preserve the
2887          * counter in that case to ensure a kill if a new eligible process appears.
2888          */
2889         if (windows_passed > 1 || prev_thrash_growth < thrashing_limit) {
2890             prev_thrash_growth >>= windows_passed;
2891         }
2892 
2893         /* Record file-backed pagecache size when crossing THRASHING_RESET_INTERVAL_MS */
2894         base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
2895         init_ws_refault = workingset_refault_file;
2896         thrashing_reset_tm = curr_tm;
2897         thrashing_limit = thrashing_limit_pct;
2898     } else {
2899         /* Calculate what % of the file-backed pagecache refaulted so far */
2900         thrashing = (workingset_refault_file - init_ws_refault) * 100 / (base_file_lru + 1);
2901     }
2902     /* Add previous cycle's decayed thrashing amount */
2903     thrashing += prev_thrash_growth;
2904     if (max_thrashing < thrashing) {
2905         max_thrashing = thrashing;
2906     }
2907 
2908 update_watermarks:
2909     /*
2910      * Refresh watermarks:
2911      * 1. watermarks haven't been initialized (high_wmark == 0)
2912      * 2. per min in case user updated one of the margins if mem_event update_zoneinfo is NOT
2913      *    supported.
2914      */
2915     if (watermarks.high_wmark == 0 || (!mem_event_update_zoneinfo_supported &&
2916         get_time_diff_ms(&wmark_update_tm, &curr_tm) > 60000)) {
2917         struct zoneinfo zi;
2918 
2919         if (update_zoneinfo_watermarks(&zi) < 0) {
2920             return;
2921         }
2922         wmark_update_tm = curr_tm;
2923     }
2924 
2925     /* Find out which watermark is breached if any */
2926     wmark = get_lowest_watermark(&mi, &watermarks);
2927 
2928     if (!psi_parse_mem(&psi_data)) {
2929         critical_stall = psi_data.mem_stats[PSI_FULL].avg10 > (float)stall_limit_critical;
2930     }
2931     /*
2932      * TODO: move this logic into a separate function
2933      * Decide if killing a process is necessary and record the reason
2934      */
2935     if (source == VENDOR) {
2936         int vendor_kill_reason = data.vendor_event.event_data.vendor_kill.reason;
2937         short vendor_kill_min_oom_score_adj =
2938             data.vendor_event.event_data.vendor_kill.min_oom_score_adj;
2939         if (vendor_kill_reason < 0 ||
2940             vendor_kill_reason > VENDOR_KILL_REASON_END ||
2941             vendor_kill_min_oom_score_adj < 0) {
2942             ALOGE("Invalid vendor kill reason %d, min_oom_score_adj %d",
2943                   vendor_kill_reason, vendor_kill_min_oom_score_adj);
2944             return;
2945         }
2946 
2947         kill_reason = (enum kill_reasons)(vendor_kill_reason + VENDOR_KILL_REASON_BASE);
2948         min_score_adj = vendor_kill_min_oom_score_adj;
2949         snprintf(kill_desc, sizeof(kill_desc),
2950             "vendor kill with the reason %d, min_score_adj %d", kill_reason, min_score_adj);
2951     } else if (cycle_after_kill && wmark < WMARK_LOW) {
2952         /*
2953          * Prevent kills that do not free enough memory, which might lead to an OOM kill.
2954          * This might happen when a process is consuming memory faster than reclaim can
2955          * free even after a kill. Mostly happens when running memory stress tests.
2956          */
2957         min_score_adj = pressure_after_kill_min_score;
2958         kill_reason = PRESSURE_AFTER_KILL;
2959         strncpy(kill_desc, "min watermark is breached even after kill", sizeof(kill_desc));
2960         kill_desc[sizeof(kill_desc) - 1] = '\0';
2961     } else if (level == VMPRESS_LEVEL_CRITICAL && events != 0) {
2962         /*
2963          * Device is too busy reclaiming memory which might lead to ANR.
2964          * Critical level is triggered when PSI complete stall (all tasks are blocked because
2965          * of the memory congestion) breaches the configured threshold.
2966          */
2967         kill_reason = NOT_RESPONDING;
2968         strncpy(kill_desc, "device is not responding", sizeof(kill_desc));
2969         kill_desc[sizeof(kill_desc) - 1] = '\0';
2970     } else if (swap_is_low && thrashing > thrashing_limit_pct) {
2971         /* Page cache is thrashing while swap is low */
2972         kill_reason = LOW_SWAP_AND_THRASHING;
2973         snprintf(kill_desc, sizeof(kill_desc), "device is low on swap (%" PRId64
2974             "kB < %" PRId64 "kB) and thrashing (%" PRId64 "%%)",
2975             get_free_swap(&mi) * page_k, swap_low_threshold * page_k, thrashing);
2976         /* Do not kill perceptible apps unless below min watermark or heavily thrashing */
2977         if (wmark > WMARK_MIN && thrashing < thrashing_critical_pct) {
2978             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2979         }
2980         check_filecache = true;
2981     } else if (swap_is_low && wmark < WMARK_HIGH) {
2982         /* Both free memory and swap are low */
2983         kill_reason = LOW_MEM_AND_SWAP;
2984         snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and swap is low (%"
2985             PRId64 "kB < %" PRId64 "kB)", wmark < WMARK_LOW ? "min" : "low",
2986             get_free_swap(&mi) * page_k, swap_low_threshold * page_k);
2987         /* Do not kill perceptible apps unless below min watermark or heavily thrashing */
2988         if (wmark > WMARK_MIN && thrashing < thrashing_critical_pct) {
2989             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2990         }
2991     } else if (wmark < WMARK_HIGH && swap_util_max < 100 &&
2992                (swap_util = calc_swap_utilization(&mi)) > swap_util_max) {
2993         /*
2994          * Too much anon memory is swapped out but swap is not low.
2995          * Non-swappable allocations created memory pressure.
2996          */
2997         kill_reason = LOW_MEM_AND_SWAP_UTIL;
2998         snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and swap utilization"
2999             " is high (%d%% > %d%%)", wmark < WMARK_LOW ? "min" : "low",
3000             swap_util, swap_util_max);
3001     } else if (wmark < WMARK_HIGH && thrashing > thrashing_limit) {
3002         /* Page cache is thrashing while memory is low */
3003         kill_reason = LOW_MEM_AND_THRASHING;
3004         snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and thrashing (%"
3005             PRId64 "%%)", wmark < WMARK_LOW ? "min" : "low", thrashing);
3006         cut_thrashing_limit = true;
3007         /* Do not kill perceptible apps unless thrashing at critical levels */
3008         if (thrashing < thrashing_critical_pct) {
3009             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
3010         }
3011         check_filecache = true;
3012     } else if (reclaim == DIRECT_RECLAIM && thrashing > thrashing_limit) {
3013         /* Page cache is thrashing while in direct reclaim (mostly happens on lowram devices) */
3014         kill_reason = DIRECT_RECL_AND_THRASHING;
3015         snprintf(kill_desc, sizeof(kill_desc), "device is in direct reclaim and thrashing (%"
3016             PRId64 "%%)", thrashing);
3017         cut_thrashing_limit = true;
3018         /* Do not kill perceptible apps unless thrashing at critical levels */
3019         if (thrashing < thrashing_critical_pct) {
3020             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
3021         }
3022         check_filecache = true;
3023     } else if (reclaim == DIRECT_RECLAIM && direct_reclaim_threshold_ms > 0 &&
3024                direct_reclaim_duration_ms > direct_reclaim_threshold_ms) {
3025         kill_reason = DIRECT_RECL_STUCK;
3026         snprintf(kill_desc, sizeof(kill_desc), "device is stuck in direct reclaim (%ldms > %dms)",
3027                  direct_reclaim_duration_ms, direct_reclaim_threshold_ms);
3028     } else if (check_filecache) {
3029         int64_t file_lru_kb = (vs.field.nr_inactive_file + vs.field.nr_active_file) * page_k;
3030 
3031         if (file_lru_kb < filecache_min_kb) {
3032             /* File cache is too low after thrashing, keep killing background processes */
3033             kill_reason = LOW_FILECACHE_AFTER_THRASHING;
3034             snprintf(kill_desc, sizeof(kill_desc),
3035                 "filecache is low (%" PRId64 "kB < %" PRId64 "kB) after thrashing",
3036                 file_lru_kb, filecache_min_kb);
3037             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
3038         } else {
3039             /* File cache is big enough, stop checking */
3040             check_filecache = false;
3041         }
3042     }
3043 
3044     /* Check if a cached app should be killed */
3045     if (kill_reason == NONE && wmark < WMARK_HIGH) {
3046         kill_reason = LOW_MEM;
3047         snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached",
3048             wmark < WMARK_LOW ? "min" : "low");
3049         min_score_adj = lowmem_min_oom_score;
3050     }
3051 
3052     /* Kill a process if necessary */
3053     if (kill_reason != NONE) {
3054         struct kill_info ki = {
3055             .kill_reason = kill_reason,
3056             .kill_desc = kill_desc,
3057             .thrashing = (int)thrashing,
3058             .max_thrashing = max_thrashing,
3059         };
3060         static bool first_kill = true;
3061 
3062         /* Make sure watermarks are correct before the first kill */
3063         if (first_kill) {
3064             first_kill = false;
3065             watermarks.high_wmark = 0;  // force recomputation
3066             goto update_watermarks;
3067         }
3068 
3069         /* Allow killing perceptible apps if the system is stalled */
3070         if (critical_stall) {
3071             min_score_adj = 0;
3072         }
3073         psi_parse_io(&psi_data);
3074         psi_parse_cpu(&psi_data);
3075         int pages_freed = find_and_kill_process(min_score_adj, &ki, &mi, &wi, &curr_tm, &psi_data);
3076         if (pages_freed > 0) {
3077             killing = true;
3078             max_thrashing = 0;
3079             if (cut_thrashing_limit) {
3080                 /*
3081                  * Cut thrashing limit by thrashing_limit_decay_pct percent of the current
3082                  * thrashing limit until the system stops thrashing.
3083                  */
3084                 thrashing_limit = (thrashing_limit * (100 - thrashing_limit_decay_pct)) / 100;
3085             }
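            /*
             * Editor's illustration (not part of the original source): with a
             * decay of, say, 10%, a limit of 100% shrinks to 90, then 81, 72, ...
             * (integer division) across successive kills, so further thrashing
             * kills become progressively easier to trigger until thrashing
             * subsides.
             */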
3086         }
3087     }
3088 
3089 no_kill:
3090     /* Do not poll if kernel supports pidfd waiting */
3091     if (is_waiting_for_kill()) {
3092         /* Pause polling if we are waiting for process death notification */
3093         poll_params->update = POLLING_PAUSE;
3094         return;
3095     }
3096 
3097     /*
3098      * Start polling after initial PSI event;
3099      * extend polling while device is in direct reclaim or process is being killed;
3100      * do not extend when kswapd reclaims because that might go on for a long time
3101      * without causing memory pressure
3102      */
3103     if (events || killing || reclaim == DIRECT_RECLAIM) {
3104         poll_params->update = POLLING_START;
3105     }
3106 
3107     /* Decide the polling interval */
3108     if (swap_is_low || killing) {
3109         /* Fast polling during and after a kill or when swap is low */
3110         poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
3111     } else {
3112         /* By default use long intervals */
3113         poll_params->polling_interval_ms = PSI_POLL_PERIOD_LONG_MS;
3114     }
3115 }
3116 
3117 static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_params) {
3118     union psi_event_data event_data = {.level = (enum vmpressure_level)data};
3119     __mp_event_psi(PSI, event_data, events, poll_params);
3120 }
3121 
3122 static std::string GetCgroupAttributePath(const char* attr) {
3123     std::string path;
3124     if (!CgroupGetAttributePath(attr, &path)) {
3125         ALOGE("Unknown cgroup attribute %s", attr);
3126     }
3127     return path;
3128 }
3129 
3130 // The implementation of this function relies on memcg statistics that are only available in the
3131 // v1 cgroup hierarchy.
3132 static void mp_event_common(int data, uint32_t events, struct polling_params *poll_params) {
3133     unsigned long long evcount;
3134     int64_t mem_usage, memsw_usage;
3135     int64_t mem_pressure;
3136     union meminfo mi;
3137     struct zoneinfo zi;
3138     struct timespec curr_tm;
3139     static unsigned long kill_skip_count = 0;
3140     enum vmpressure_level level = (enum vmpressure_level)data;
3141     long other_free = 0, other_file = 0;
3142     int min_score_adj;
3143     int minfree = 0;
3144     static const std::string mem_usage_path = GetCgroupAttributePath("MemUsage");
3145     static struct reread_data mem_usage_file_data = {
3146         .filename = mem_usage_path.c_str(),
3147         .fd = -1,
3148     };
3149     static const std::string memsw_usage_path = GetCgroupAttributePath("MemAndSwapUsage");
3150     static struct reread_data memsw_usage_file_data = {
3151         .filename = memsw_usage_path.c_str(),
3152         .fd = -1,
3153     };
3154     static struct wakeup_info wi;
3155 
3156     mp_event_count++;
3157     if (debug_process_killing) {
3158         ALOGI("%s memory pressure event #%" PRIu64 " is triggered",
3159               level_name[level], mp_event_count);
3160     }
3161 
3162     if (!use_psi_monitors) {
3163         /*
3164          * Check all event counters from low to critical
3165          * and upgrade to the highest priority one. By reading
3166          * eventfd we also reset the event counters.
3167          */
3168         for (int lvl = VMPRESS_LEVEL_LOW; lvl < VMPRESS_LEVEL_COUNT; lvl++) {
3169             if (mpevfd[lvl] != -1 &&
3170                 TEMP_FAILURE_RETRY(read(mpevfd[lvl],
3171                                    &evcount, sizeof(evcount))) > 0 &&
3172                 evcount > 0 && lvl > level) {
3173                 level = static_cast<vmpressure_level>(lvl);
3174             }
3175         }
3176     }
3177 
3178     /* Start polling after initial PSI event */
3179     if (use_psi_monitors && events) {
3180         /* Override polling params only if current event is more critical */
3181         if (!poll_params->poll_handler || data > poll_params->poll_handler->data) {
3182             poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
3183             poll_params->update = POLLING_START;
3184         }
3185     }
3186 
3187     if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
3188         ALOGE("Failed to get current time");
3189         return;
3190     }
3191 
3192     record_wakeup_time(&curr_tm, events ? Event : Polling, &wi);
3193 
3194     if (kill_timeout_ms &&
3195         get_time_diff_ms(&last_kill_tm, &curr_tm) < static_cast<long>(kill_timeout_ms)) {
3196         /*
3197          * If we're within the no-kill timeout, see if there's pending reclaim work
3198          * from the last killed process. If so, skip killing for now.
3199          */
3200         if (is_kill_pending()) {
3201             kill_skip_count++;
3202             wi.skipped_wakeups++;
3203             return;
3204         }
3205         /*
3206          * Process is dead, stop waiting. This has no effect if pidfds are supported and
3207          * death notification already caused waiting to stop.
3208          */
3209         stop_wait_for_proc_kill(true);
3210     } else {
3211         /*
3212          * Killing took longer than no-kill timeout. Stop waiting for the last process
3213          * to die because we are ready to kill again.
3214          */
3215         stop_wait_for_proc_kill(false);
3216     }
3217 
3218     if (kill_skip_count > 0) {
3219         ALOGI("%lu memory pressure events were skipped after a kill!",
3220               kill_skip_count);
3221         kill_skip_count = 0;
3222     }
3223 
3224     if (meminfo_parse(&mi) < 0 || zoneinfo_parse(&zi) < 0) {
3225         ALOGE("Failed to get free memory!");
3226         return;
3227     }
3228 
3229     if (use_minfree_levels) {
3230         int i;
3231 
3232         other_free = mi.field.nr_free_pages - zi.totalreserve_pages;
3233         if (mi.field.nr_file_pages > (mi.field.shmem + mi.field.unevictable + mi.field.swap_cached)) {
3234             other_file = (mi.field.nr_file_pages - mi.field.shmem -
3235                           mi.field.unevictable - mi.field.swap_cached);
3236         } else {
3237             other_file = 0;
3238         }
3239 
3240         min_score_adj = OOM_SCORE_ADJ_MAX + 1;
3241         for (i = 0; i < lowmem_targets_size; i++) {
3242             minfree = lowmem_minfree[i];
3243             if (other_free < minfree && other_file < minfree) {
3244                 min_score_adj = lowmem_adj[i];
3245                 break;
3246             }
3247         }
3248 
3249         if (min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
3250             if (debug_process_killing && lowmem_targets_size) {
3251                 ALOGI("Ignore %s memory pressure event "
3252                       "(free memory=%ldkB, cache=%ldkB, limit=%ldkB)",
3253                       level_name[level], other_free * page_k, other_file * page_k,
3254                       (long)lowmem_minfree[lowmem_targets_size - 1] * page_k);
3255             }
3256             return;
3257         }
3258 
3259         goto do_kill;
3260     }
3261 
3262     if (level == VMPRESS_LEVEL_LOW) {
3263         record_low_pressure_levels(&mi);
3264     }
3265 
3266     if (level_oomadj[level] > OOM_SCORE_ADJ_MAX) {
3267         /* Do not monitor this pressure level */
3268         return;
3269     }
3270 
3271     if ((mem_usage = get_memory_usage(&mem_usage_file_data)) < 0) {
3272         goto do_kill;
3273     }
3274     if ((memsw_usage = get_memory_usage(&memsw_usage_file_data)) < 0) {
3275         goto do_kill;
3276     }
3277 
3278     // Express memory usage as a percent of memory+swap usage (lower means more swapping).
3279     mem_pressure = (mem_usage * 100) / memsw_usage;
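    /*
     * Editor's illustration (not part of the original source): if the memcg
     * reports mem_usage = 300MB and memsw_usage = 400MB, mem_pressure is 75,
     * i.e. a quarter of the tracked memory sits in swap; lower values mean
     * heavier swapping and drive the upgrade/downgrade decisions below.
     */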
3280 
3281     if (enable_pressure_upgrade && level != VMPRESS_LEVEL_CRITICAL) {
3282         // We are swapping too much.
3283         if (mem_pressure < upgrade_pressure) {
3284             level = upgrade_level(level);
3285             if (debug_process_killing) {
3286                 ALOGI("Event upgraded to %s", level_name[level]);
3287             }
3288         }
3289     }
3290 
3291     // If we still have enough swap space available, check if we want to
3292     // ignore/downgrade pressure events.
3293     if (get_free_swap(&mi) >=
3294         mi.field.total_swap * swap_free_low_percentage / 100) {
3295         // If the pressure is larger than downgrade_pressure lmk will not
3296         // kill any process, since enough memory is available.
3297         if (mem_pressure > downgrade_pressure) {
3298             if (debug_process_killing) {
3299                 ALOGI("Ignore %s memory pressure", level_name[level]);
3300             }
3301             return;
3302         } else if (level == VMPRESS_LEVEL_CRITICAL && mem_pressure > upgrade_pressure) {
3303             if (debug_process_killing) {
3304                 ALOGI("Downgrade critical memory pressure");
3305             }
3306             // Downgrade event, since enough memory available.
3307             level = downgrade_level(level);
3308         }
3309     }
3310 
3311 do_kill:
3312     if (low_ram_device) {
3313         /* For Go devices kill only one task */
3314         if (find_and_kill_process(use_minfree_levels ? min_score_adj : level_oomadj[level],
3315                                   NULL, &mi, &wi, &curr_tm, NULL) == 0) {
3316             if (debug_process_killing) {
3317                 ALOGI("Nothing to kill");
3318             }
3319         }
3320     } else {
3321         int pages_freed;
3322         static struct timespec last_report_tm;
3323         static unsigned long report_skip_count = 0;
3324 
3325         if (!use_minfree_levels) {
3326             /* Free up enough memory to downgrade the memory pressure to the low level */
3327             if (mi.field.nr_free_pages >= low_pressure_mem.max_nr_free_pages) {
3328                 if (debug_process_killing) {
3329                     ALOGI("Ignoring pressure since more memory is "
3330                         "available (%" PRId64 ") than watermark (%" PRId64 ")",
3331                         mi.field.nr_free_pages, low_pressure_mem.max_nr_free_pages);
3332                 }
3333                 return;
3334             }
3335             min_score_adj = level_oomadj[level];
3336         }
3337 
3338         pages_freed = find_and_kill_process(min_score_adj, NULL, &mi, &wi, &curr_tm, NULL);
3339 
3340         if (pages_freed == 0 && min_score_adj == 0) {
3341             lmkd_no_kill_candidates_hook();
3342         }
3343 
3344         if (pages_freed == 0) {
3345             /* Rate limit kill reports when nothing was reclaimed */
3346             if (get_time_diff_ms(&last_report_tm, &curr_tm) < FAIL_REPORT_RLIMIT_MS) {
3347                 report_skip_count++;
3348                 return;
3349             }
3350         }
3351 
3352         /* Log whenever we kill or when report rate limit allows */
3353         if (use_minfree_levels) {
3354             ALOGI("Reclaimed %ldkB, cache(%ldkB) and free(%" PRId64 "kB)-reserved(%" PRId64 "kB) "
3355                 "below min(%ldkB) for oom_score_adj %d",
3356                 pages_freed * page_k,
3357                 other_file * page_k, mi.field.nr_free_pages * page_k,
3358                 zi.totalreserve_pages * page_k,
3359                 minfree * page_k, min_score_adj);
3360         } else {
3361             ALOGI("Reclaimed %ldkB at oom_score_adj %d", pages_freed * page_k, min_score_adj);
3362         }
3363 
3364         if (report_skip_count > 0) {
3365             ALOGI("Suppressed %lu failed kill reports", report_skip_count);
3366             report_skip_count = 0;
3367         }
3368 
3369         last_report_tm = curr_tm;
3370     }
3371     if (is_waiting_for_kill()) {
3372         /* pause polling if we are waiting for process death notification */
3373         poll_params->update = POLLING_PAUSE;
3374     }
3375 }
3376 
3377 static bool init_mp_psi(enum vmpressure_level level, bool use_new_strategy) {
3378     int fd;
3379 
3380     /* Do not register a handler if threshold_ms is not set */
3381     if (!psi_thresholds[level].threshold_ms) {
3382         return true;
3383     }
3384 
3385     fd = init_psi_monitor(psi_thresholds[level].stall_type,
3386         psi_thresholds[level].threshold_ms * US_PER_MS,
3387         PSI_WINDOW_SIZE_MS * US_PER_MS);
3388 
3389     if (fd < 0) {
3390         return false;
3391     }
3392 
3393     vmpressure_hinfo[level].handler = use_new_strategy ? mp_event_psi : mp_event_common;
3394     vmpressure_hinfo[level].data = level;
3395     if (register_psi_monitor(epollfd, fd, &vmpressure_hinfo[level]) < 0) {
3396         destroy_psi_monitor(fd);
3397         return false;
3398     }
3399     maxevents++;
3400     mpevfd[level] = fd;
3401 
3402     return true;
3403 }
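/*
 * Editor's note (illustrative, not part of the original source): a monitor set
 * up this way corresponds to a kernel PSI trigger, i.e. a line such as
 * "some 70000 1000000" (a 70ms partial-stall threshold within a 1s window,
 * both in microseconds) written to /proc/pressure/memory; the returned fd is
 * then watched for priority events from the epoll loop in mainloop().
 */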
3404 
3405 static void destroy_mp_psi(enum vmpressure_level level) {
3406     int fd = mpevfd[level];
3407 
3408     if (fd < 0) {
3409         return;
3410     }
3411 
3412     if (unregister_psi_monitor(epollfd, fd) < 0) {
3413         ALOGE("Failed to unregister psi monitor for %s memory pressure; errno=%d",
3414             level_name[level], errno);
3415     }
3416     maxevents--;
3417     destroy_psi_monitor(fd);
3418     mpevfd[level] = -1;
3419 }
3420 
3421 enum class MemcgVersion {
3422     kNotFound,
3423     kV1,
3424     kV2,
3425 };
3426 
3427 static MemcgVersion __memcg_version() {
3428     std::string cgroupv2_path, memcg_path;
3429 
3430     if (!CgroupGetControllerPath("memory", &memcg_path)) {
3431         return MemcgVersion::kNotFound;
3432     }
3433     return CgroupGetControllerPath(CGROUPV2_HIERARCHY_NAME, &cgroupv2_path) &&
3434                            cgroupv2_path == memcg_path
3435                    ? MemcgVersion::kV2
3436                    : MemcgVersion::kV1;
3437 }
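/*
 * Editor's illustration (typical Android paths, not guaranteed): on a
 * cgroup-v2-only device both lookups resolve to the unified root, e.g.
 * "/sys/fs/cgroup", so kV2 is returned; on a device using the legacy hierarchy
 * the memory controller has its own mount, e.g. "/dev/memcg", and kV1 is
 * returned.
 */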
3438 
3439 static MemcgVersion memcg_version() {
3440     static MemcgVersion version = __memcg_version();
3441 
3442     return version;
3443 }
3444 
3445 static void memevent_listener_notification(int data __unused, uint32_t events __unused,
3446                                            struct polling_params* poll_params) {
3447     struct timespec curr_tm;
3448     std::vector<mem_event_t> mem_events;
3449 
3450     if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
3451         direct_reclaim_start_tm.tv_sec = 0;
3452         direct_reclaim_start_tm.tv_nsec = 0;
3453         ALOGE("Failed to get current time for memevent listener notification.");
3454         return;
3455     }
3456 
3457     if (!memevent_listener->getMemEvents(mem_events)) {
3458         direct_reclaim_start_tm.tv_sec = 0;
3459         direct_reclaim_start_tm.tv_nsec = 0;
3460         ALOGE("Failed fetching memory listener events.");
3461         return;
3462     }
3463 
3464     for (const mem_event_t& mem_event : mem_events) {
3465         switch (mem_event.type) {
3466             /* Direct Reclaim */
3467             case MEM_EVENT_DIRECT_RECLAIM_BEGIN:
3468                 direct_reclaim_start_tm = curr_tm;
3469                 break;
3470             case MEM_EVENT_DIRECT_RECLAIM_END:
3471                 direct_reclaim_start_tm.tv_sec = 0;
3472                 direct_reclaim_start_tm.tv_nsec = 0;
3473                 break;
3474 
3475             /* kswapd */
3476             case MEM_EVENT_KSWAPD_WAKE:
3477                 kswapd_start_tm = curr_tm;
3478                 break;
3479             case MEM_EVENT_KSWAPD_SLEEP:
3480                 kswapd_start_tm.tv_sec = 0;
3481                 kswapd_start_tm.tv_nsec = 0;
3482                 break;
3483             case MEM_EVENT_VENDOR_LMK_KILL: {
3484                 union psi_event_data event_data = {.vendor_event = mem_event};
3485                  __mp_event_psi(VENDOR, event_data, 0, poll_params);
3486                 break;
3487             }
3488             case MEM_EVENT_UPDATE_ZONEINFO: {
3489                 struct zoneinfo zi;
3490                 update_zoneinfo_watermarks(&zi);
3491                 break;
3492             }
3493         }
3494     }
3495 }
3496 
3497 static bool init_memevent_listener_monitoring() {
3498     static struct event_handler_info direct_reclaim_poll_hinfo = {0,
3499                                                                   memevent_listener_notification};
3500 
3501     if (memevent_listener) return true;
3502 
3503     // Make sure bpf programs are loaded; this blocks until they are.
3504     android::bpf::waitForProgsLoaded();
3505     memevent_listener = std::make_unique<android::bpf::memevents::MemEventListener>(
3506             android::bpf::memevents::MemEventClient::LMKD);
3507 
3508     if (!memevent_listener->ok()) {
3509         ALOGE("Failed to initialize memevents listener");
3510         memevent_listener.reset();
3511         return false;
3512     }
3513 
3514     if (!memevent_listener->registerEvent(MEM_EVENT_DIRECT_RECLAIM_BEGIN) ||
3515         !memevent_listener->registerEvent(MEM_EVENT_DIRECT_RECLAIM_END)) {
3516         ALOGE("Failed to register direct reclaim memevents");
3517         memevent_listener.reset();
3518         return false;
3519     }
3520     if (!memevent_listener->registerEvent(MEM_EVENT_KSWAPD_WAKE) ||
3521         !memevent_listener->registerEvent(MEM_EVENT_KSWAPD_SLEEP)) {
3522         ALOGE("Failed to register kswapd memevents");
3523         memevent_listener.reset();
3524         return false;
3525     }
3526 
3527     if (!memevent_listener->registerEvent(MEM_EVENT_VENDOR_LMK_KILL)) {
3528         ALOGI("Failed to register android_vendor_kill memevents");
3529     }
3530 
3531     if (!memevent_listener->registerEvent(MEM_EVENT_UPDATE_ZONEINFO)) {
3532         mem_event_update_zoneinfo_supported = false;
3533         ALOGI("update_zoneinfo memevents are not supported");
3534     } else {
3535         mem_event_update_zoneinfo_supported = true;
3536     }
3537 
3538     int memevent_listener_fd = memevent_listener->getRingBufferFd();
3539     if (memevent_listener_fd < 0) {
3540         memevent_listener.reset();
3541         ALOGE("Invalid memevent_listener fd: %d", memevent_listener_fd);
3542         return false;
3543     }
3544 
3545     struct epoll_event epev;
3546     epev.events = EPOLLIN;
3547     epev.data.ptr = (void*)&direct_reclaim_poll_hinfo;
3548     if (epoll_ctl(epollfd, EPOLL_CTL_ADD, memevent_listener_fd, &epev) < 0) {
3549         ALOGE("Failed registering memevent_listener fd: %d; errno=%d", memevent_listener_fd, errno);
3550         memevent_listener.reset();
3551         return false;
3552     }
3553 
3554     direct_reclaim_start_tm.tv_sec = 0;
3555     direct_reclaim_start_tm.tv_nsec = 0;
3556 
3557     maxevents++;
3558     return true;
3559 }
3560 
3561 static bool init_psi_monitors() {
3562     /*
3563      * When PSI is used on low-ram devices, or on high-end devices without minfree levels,
3564      * use the new kill strategy based on zone watermarks, free swap and thrashing stats.
3565      * Also use the new strategy if memcg has not been mounted in the v1 cgroups hierarchy,
3566      * since the old strategy relies on memcg attributes that are available only in the v1
3567      * cgroups hierarchy.
3568      */
3569     bool use_new_strategy =
3570         GET_LMK_PROPERTY(bool, "use_new_strategy", low_ram_device || !use_minfree_levels);
3571     if (!use_new_strategy && memcg_version() != MemcgVersion::kV1) {
3572         ALOGE("Old kill strategy can only be used with v1 cgroup hierarchy");
3573         return false;
3574     }
3575     /* In default PSI mode override stall amounts using system properties */
3576     if (use_new_strategy) {
3577         /* Do not use low pressure level */
3578         psi_thresholds[VMPRESS_LEVEL_LOW].threshold_ms = 0;
3579         psi_thresholds[VMPRESS_LEVEL_MEDIUM].threshold_ms = psi_partial_stall_ms;
3580         psi_thresholds[VMPRESS_LEVEL_CRITICAL].threshold_ms = psi_complete_stall_ms;
3581     }
3582 
3583     if (!init_mp_psi(VMPRESS_LEVEL_LOW, use_new_strategy)) {
3584         return false;
3585     }
3586     if (!init_mp_psi(VMPRESS_LEVEL_MEDIUM, use_new_strategy)) {
3587         destroy_mp_psi(VMPRESS_LEVEL_LOW);
3588         return false;
3589     }
3590     if (!init_mp_psi(VMPRESS_LEVEL_CRITICAL, use_new_strategy)) {
3591         destroy_mp_psi(VMPRESS_LEVEL_MEDIUM);
3592         destroy_mp_psi(VMPRESS_LEVEL_LOW);
3593         return false;
3594     }
3595     return true;
3596 }
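/*
 * Editor's note (assumption: GET_LMK_PROPERTY() resolves names in the usual
 * "ro.lmk." namespace): the stall thresholds above are device-tunable, e.g.
 *
 *     PRODUCT_PRODUCT_PROPERTIES += \
 *         ro.lmk.psi_partial_stall_ms=70 \
 *         ro.lmk.psi_complete_stall_ms=700
 *
 * Lower values make the medium/critical PSI monitors fire earlier.
 */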
3597 
3598 static bool init_mp_common(enum vmpressure_level level) {
3599     // The implementation of this function relies on memcg statistics that are only available in the
3600     // v1 cgroup hierarchy.
3601     if (memcg_version() != MemcgVersion::kV1) {
3602         ALOGE("%s: global monitoring is only available for the v1 cgroup hierarchy", __func__);
3603         return false;
3604     }
3605 
3606     int mpfd;
3607     int evfd;
3608     int evctlfd;
3609     char buf[256];
3610     struct epoll_event epev;
3611     int ret;
3612     int level_idx = (int)level;
3613     const char *levelstr = level_name[level_idx];
3614 
3615     /* gid containing AID_SYSTEM required */
3616     mpfd = open(GetCgroupAttributePath("MemPressureLevel").c_str(), O_RDONLY | O_CLOEXEC);
3617     if (mpfd < 0) {
3618         ALOGI("No kernel memory.pressure_level support (errno=%d)", errno);
3619         goto err_open_mpfd;
3620     }
3621 
3622     evctlfd = open(GetCgroupAttributePath("MemCgroupEventControl").c_str(), O_WRONLY | O_CLOEXEC);
3623     if (evctlfd < 0) {
3624         ALOGI("No kernel memory cgroup event control (errno=%d)", errno);
3625         goto err_open_evctlfd;
3626     }
3627 
3628     evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
3629     if (evfd < 0) {
3630         ALOGE("eventfd failed for level %s; errno=%d", levelstr, errno);
3631         goto err_eventfd;
3632     }
3633 
3634     ret = snprintf(buf, sizeof(buf), "%d %d %s", evfd, mpfd, levelstr);
3635     if (ret >= (ssize_t)sizeof(buf)) {
3636         ALOGE("cgroup.event_control line overflow for level %s", levelstr);
3637         goto err;
3638     }
3639 
3640     ret = TEMP_FAILURE_RETRY(write(evctlfd, buf, strlen(buf) + 1));
3641     if (ret == -1) {
3642         ALOGE("cgroup.event_control write failed for level %s; errno=%d",
3643               levelstr, errno);
3644         goto err;
3645     }
3646 
3647     epev.events = EPOLLIN;
3648     /* use data to store event level */
3649     vmpressure_hinfo[level_idx].data = level_idx;
3650     vmpressure_hinfo[level_idx].handler = mp_event_common;
3651     epev.data.ptr = (void *)&vmpressure_hinfo[level_idx];
3652     ret = epoll_ctl(epollfd, EPOLL_CTL_ADD, evfd, &epev);
3653     if (ret == -1) {
3654         ALOGE("epoll_ctl for level %s failed; errno=%d", levelstr, errno);
3655         goto err;
3656     }
3657     maxevents++;
3658     mpevfd[level] = evfd;
3659     close(evctlfd);
3660     return true;
3661 
3662 err:
3663     close(evfd);
3664 err_eventfd:
3665     close(evctlfd);
3666 err_open_evctlfd:
3667     close(mpfd);
3668 err_open_mpfd:
3669     return false;
3670 }
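/*
 * Editor's note (illustrative sketch of the cgroup-v1 notification handshake,
 * not part of the original source): the event_control write above is
 * conceptually equivalent to
 *
 *     echo "<eventfd> <memory.pressure_level fd> <low|medium|critical>" \
 *         > cgroup.event_control
 *
 * after which the kernel signals the eventfd whenever the chosen pressure
 * level is crossed and mp_event_common() drains the counter with read().
 */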
3671 
3672 static void destroy_mp_common(enum vmpressure_level level) {
3673     struct epoll_event epev;
3674     int fd = mpevfd[level];
3675 
3676     if (fd < 0) {
3677         return;
3678     }
3679 
3680     if (epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &epev)) {
3681         // Log an error and keep going
3682         ALOGE("epoll_ctl for level %s failed; errno=%d", level_name[level], errno);
3683     }
3684     maxevents--;
3685     close(fd);
3686     mpevfd[level] = -1;
3687 }
3688 
3689 static void kernel_event_handler(int data __unused, uint32_t events __unused,
3690                                  struct polling_params *poll_params __unused) {
3691     poll_kernel(kpoll_fd);
3692 }
3693 
3694 static bool init_monitors() {
3695     ALOGI("Wakeup counter is reset from %" PRIu64 " to 0", mp_event_count);
3696     mp_event_count = 0;
3697     /* Try to use psi monitor first if kernel has it */
3698     use_psi_monitors = GET_LMK_PROPERTY(bool, "use_psi", true) &&
3699         init_psi_monitors();
3700     /* Fall back to vmpressure */
3701     if (!use_psi_monitors &&
3702         (!init_mp_common(VMPRESS_LEVEL_LOW) ||
3703         !init_mp_common(VMPRESS_LEVEL_MEDIUM) ||
3704         !init_mp_common(VMPRESS_LEVEL_CRITICAL))) {
3705         ALOGE("Kernel does not support memory pressure events or in-kernel low memory killer");
3706         return false;
3707     }
3708     if (use_psi_monitors) {
3709         ALOGI("Using psi monitors for memory pressure detection");
3710     } else {
3711         ALOGI("Using vmpressure for memory pressure detection");
3712     }
3713 
3714     monitors_initialized = true;
3715     return true;
3716 }
3717 
3718 static void destroy_monitors() {
3719     if (use_psi_monitors) {
3720         destroy_mp_psi(VMPRESS_LEVEL_CRITICAL);
3721         destroy_mp_psi(VMPRESS_LEVEL_MEDIUM);
3722         destroy_mp_psi(VMPRESS_LEVEL_LOW);
3723     } else {
3724         destroy_mp_common(VMPRESS_LEVEL_CRITICAL);
3725         destroy_mp_common(VMPRESS_LEVEL_MEDIUM);
3726         destroy_mp_common(VMPRESS_LEVEL_LOW);
3727     }
3728 }
3729 
3730 static void drop_reaper_comm() {
3731     close(reaper_comm_fd[0]);
3732     close(reaper_comm_fd[1]);
3733 }
3734 
3735 static bool setup_reaper_comm() {
3736     if (pipe(reaper_comm_fd)) {
3737         ALOGE("pipe failed: %s", strerror(errno));
3738         return false;
3739     }
3740 
3741     // Ensure main thread never blocks on read
3742     int flags = fcntl(reaper_comm_fd[0], F_GETFL);
3743     if (fcntl(reaper_comm_fd[0], F_SETFL, flags | O_NONBLOCK)) {
3744         ALOGE("fcntl failed: %s", strerror(errno));
3745         drop_reaper_comm();
3746         return false;
3747     }
3748 
3749     return true;
3750 }
3751 
3752 static bool init_reaper() {
3753     if (!reaper.is_reaping_supported()) {
3754         ALOGI("Process reaping is not supported");
3755         return false;
3756     }
3757 
3758     if (!setup_reaper_comm()) {
3759         ALOGE("Failed to create thread communication channel");
3760         return false;
3761     }
3762 
3763     // Setup epoll handler
3764     struct epoll_event epev;
3765     static struct event_handler_info kill_failed_hinfo = { 0, kill_fail_handler };
3766     epev.events = EPOLLIN;
3767     epev.data.ptr = (void *)&kill_failed_hinfo;
3768     if (epoll_ctl(epollfd, EPOLL_CTL_ADD, reaper_comm_fd[0], &epev)) {
3769         ALOGE("epoll_ctl failed: %s", strerror(errno));
3770         drop_reaper_comm();
3771         return false;
3772     }
3773 
3774     if (!reaper.init(reaper_comm_fd[1])) {
3775         ALOGE("Failed to initialize reaper object");
3776         if (epoll_ctl(epollfd, EPOLL_CTL_DEL, reaper_comm_fd[0], &epev)) {
3777             ALOGE("epoll_ctl failed: %s", strerror(errno));
3778         }
3779         drop_reaper_comm();
3780         return false;
3781     }
3782     maxevents++;
3783 
3784     return true;
3785 }
3786 
3787 static int init(void) {
3788     static struct event_handler_info kernel_poll_hinfo = { 0, kernel_event_handler };
3789     struct reread_data file_data = {
3790         .filename = ZONEINFO_PATH,
3791         .fd = -1,
3792     };
3793     struct epoll_event epev;
3794     int pidfd;
3795     int i;
3796     int ret;
3797 
3798     // Initialize page size
3799     pagesize = getpagesize();
3800     page_k = pagesize / 1024;
3801 
3802     epollfd = epoll_create(MAX_EPOLL_EVENTS);
3803     if (epollfd == -1) {
3804         ALOGE("epoll_create failed (errno=%d)", errno);
3805         return -1;
3806     }
3807 
3808     // mark data connections as not connected
3809     for (int i = 0; i < MAX_DATA_CONN; i++) {
3810         data_sock[i].sock = -1;
3811     }
3812 
3813     ctrl_sock.sock = android_get_control_socket("lmkd");
3814     if (ctrl_sock.sock < 0) {
3815         ALOGE("get lmkd control socket failed");
3816         return -1;
3817     }
3818 
3819     ret = listen(ctrl_sock.sock, MAX_DATA_CONN);
3820     if (ret < 0) {
3821         ALOGE("lmkd control socket listen failed (errno=%d)", errno);
3822         return -1;
3823     }
3824 
3825     epev.events = EPOLLIN;
3826     ctrl_sock.handler_info.handler = ctrl_connect_handler;
3827     epev.data.ptr = (void *)&(ctrl_sock.handler_info);
3828     if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ctrl_sock.sock, &epev) == -1) {
3829         ALOGE("epoll_ctl for lmkd control socket failed (errno=%d)", errno);
3830         return -1;
3831     }
3832     maxevents++;
3833 
3834     has_inkernel_module = !access(INKERNEL_MINFREE_PATH, W_OK);
3835     use_inkernel_interface = has_inkernel_module;
3836 
3837     if (use_inkernel_interface) {
3838         ALOGI("Using in-kernel low memory killer interface");
3839         if (init_poll_kernel()) {
3840             epev.events = EPOLLIN;
3841             epev.data.ptr = (void*)&kernel_poll_hinfo;
3842             if (epoll_ctl(epollfd, EPOLL_CTL_ADD, kpoll_fd, &epev) != 0) {
3843                 ALOGE("epoll_ctl for lmk events failed (errno=%d)", errno);
3844                 close(kpoll_fd);
3845                 kpoll_fd = -1;
3846             } else {
3847                 maxevents++;
3848                 /* let others know that kill reporting is supported */
3849                 property_set("sys.lmk.reportkills", "1");
3850             }
3851         }
3852     } else {
3853         // Do not register monitors until boot completed for devices configured
3854         // for delaying monitors. This is done to save CPU cycles for low
3855         // resource devices during boot up.
3856         if (!delay_monitors_until_boot || property_get_bool("sys.boot_completed", false)) {
3857             if (!init_monitors()) {
3858                 return -1;
3859             }
3860         }
3861         /* let others know that kill reporting is supported */
3862         property_set("sys.lmk.reportkills", "1");
3863     }
3864 
3865     for (i = 0; i <= ADJTOSLOT(OOM_SCORE_ADJ_MAX); i++) {
3866         procadjslot_list[i].next = &procadjslot_list[i];
3867         procadjslot_list[i].prev = &procadjslot_list[i];
3868     }
3869 
3870     memset(killcnt_idx, KILLCNT_INVALID_IDX, sizeof(killcnt_idx));
3871 
3872     /*
3873      * Zoneinfo is the biggest file we read; read it now to create and size the
3874      * initial read buffer and avoid memory re-allocations during memory pressure.
3875      */
3876     if (reread_file(&file_data) == NULL) {
3877         ALOGE("Failed to read %s: %s", file_data.filename, strerror(errno));
3878     }
3879 
3880     /* check if kernel supports pidfd_open syscall */
3881     pidfd = TEMP_FAILURE_RETRY(pidfd_open(getpid(), 0));
3882     if (pidfd < 0) {
3883         pidfd_supported = (errno != ENOSYS);
3884     } else {
3885         pidfd_supported = true;
3886         close(pidfd);
3887     }
3888     ALOGI("Process polling is %s", pidfd_supported ? "supported" : "not supported" );
3889 
3890     if (!lmkd_init_hook()) {
3891         ALOGE("Failed to initialize LMKD hooks.");
3892         return -1;
3893     }
3894 
3895     return 0;
3896 }
3897 
3898 static bool polling_paused(struct polling_params *poll_params) {
3899     return poll_params->paused_handler != NULL;
3900 }
3901 
3902 static void resume_polling(struct polling_params *poll_params, struct timespec curr_tm) {
3903     poll_params->poll_start_tm = curr_tm;
3904     poll_params->poll_handler = poll_params->paused_handler;
3905     poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
3906     poll_params->paused_handler = NULL;
3907 }
3908 
3909 static void call_handler(struct event_handler_info* handler_info,
3910                          struct polling_params *poll_params, uint32_t events) {
3911     struct timespec curr_tm;
3912 
3913     watchdog.start();
3914     poll_params->update = POLLING_DO_NOT_CHANGE;
3915     handler_info->handler(handler_info->data, events, poll_params);
3916     clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3917     if (poll_params->poll_handler == handler_info) {
3918         poll_params->last_poll_tm = curr_tm;
3919     }
3920 
3921     switch (poll_params->update) {
3922     case POLLING_START:
3923         /*
3924          * Poll for the duration of PSI_WINDOW_SIZE_MS after the
3925          * initial PSI event because psi events are rate-limited
3926          * at one per sec.
3927          */
3928         poll_params->poll_start_tm = curr_tm;
3929         poll_params->poll_handler = handler_info;
3930         poll_params->last_poll_tm = curr_tm;
3931         break;
3932     case POLLING_PAUSE:
3933         poll_params->paused_handler = handler_info;
3934         poll_params->poll_handler = NULL;
3935         break;
3936     case POLLING_RESUME:
3937         resume_polling(poll_params, curr_tm);
3938         break;
3939     case POLLING_DO_NOT_CHANGE:
3940         if (poll_params->poll_handler &&
3941             get_time_diff_ms(&poll_params->poll_start_tm, &curr_tm) > PSI_WINDOW_SIZE_MS) {
3942             /* Polled for the duration of PSI window, time to stop */
3943             poll_params->poll_handler = NULL;
3944         }
3945         break;
3946     }
3947     watchdog.stop();
3948 }
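/*
 * Editor's summary of the polling state machine driven above and in
 * mainloop(): POLLING_START arms periodic polling for one PSI window,
 * POLLING_PAUSE parks the handler while a kill is in flight, POLLING_RESUME
 * re-arms the parked handler once the kill completes or times out, and
 * POLLING_DO_NOT_CHANGE keeps the current schedule until the window expires.
 */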
3949 
3950 static void mainloop(void) {
3951     struct event_handler_info* handler_info;
3952     struct polling_params poll_params;
3953     struct timespec curr_tm;
3954     struct epoll_event *evt;
3955     long delay = -1;
3956 
3957     poll_params.poll_handler = NULL;
3958     poll_params.paused_handler = NULL;
3959 
3960     while (1) {
3961         struct epoll_event events[MAX_EPOLL_EVENTS];
3962         int nevents;
3963         int i;
3964 
3965         if (poll_params.poll_handler) {
3966             bool poll_now;
3967 
3968             clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3969             if (poll_params.update == POLLING_RESUME) {
3970                 /* Just transitioned into POLLING_RESUME, poll immediately. */
3971                 poll_now = true;
3972                 nevents = 0;
3973             } else {
3974                 /* Calculate next timeout */
3975                 delay = get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm);
3976                 delay = (delay < poll_params.polling_interval_ms) ?
3977                     poll_params.polling_interval_ms - delay : poll_params.polling_interval_ms;
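                /*
                 * Editor's illustration (not part of the original source):
                 * with a 1000ms interval and the last poll 700ms ago, wait
                 * 300ms; if the last poll is already more than one interval
                 * old, wait a full interval instead of spinning.
                 */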
3978 
3979                 /* Wait for events until the next polling timeout */
3980                 nevents = epoll_wait(epollfd, events, maxevents, delay);
3981 
3982                 /* Update current time after wait */
3983                 clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3984                 poll_now = (get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm) >=
3985                     poll_params.polling_interval_ms);
3986             }
3987             if (poll_now) {
3988                 call_handler(poll_params.poll_handler, &poll_params, 0);
3989             }
3990         } else {
3991             if (kill_timeout_ms && is_waiting_for_kill()) {
3992                 clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3993                 delay = kill_timeout_ms - get_time_diff_ms(&last_kill_tm, &curr_tm);
3994                 /* Wait for pidfds notification or kill timeout to expire */
3995                 nevents = (delay > 0) ? epoll_wait(epollfd, events, maxevents, delay) : 0;
3996                 if (nevents == 0) {
3997                     /* Kill notification timed out */
3998                     stop_wait_for_proc_kill(false);
3999                     if (polling_paused(&poll_params)) {
4000                         clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
4001                         poll_params.update = POLLING_RESUME;
4002                         resume_polling(&poll_params, curr_tm);
4003                     }
4004                 }
4005             } else {
4006                 /* Wait for events with no timeout */
4007                 nevents = epoll_wait(epollfd, events, maxevents, -1);
4008             }
4009         }
4010 
4011         if (nevents == -1) {
4012             if (errno == EINTR)
4013                 continue;
4014             ALOGE("epoll_wait failed (errno=%d)", errno);
4015             continue;
4016         }
4017 
4018         /*
4019          * First pass to see if any data socket connections were dropped.
4020          * Dropped connection should be handled before any other events
4021          * to deallocate data connection and correctly handle cases when
4022          * connection gets dropped and reestablished in the same epoll cycle.
4023          * In such cases it's essential to handle connection closures first.
4024          */
4025         for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) {
4026             if ((evt->events & EPOLLHUP) && evt->data.ptr) {
4027                 handler_info = (struct event_handler_info*)evt->data.ptr;
4028                 if (handler_info->handler == kill_done_handler) {
4029                     call_handler(handler_info, &poll_params, evt->events);
4030                 } else {
4031                     ALOGI("lmkd data connection dropped");
4032                     watchdog.start();
4033                     ctrl_data_close(handler_info->data);
4034                     watchdog.stop();
4035                 }
4036             }
4037         }
4038 
4039         /* Second pass to handle all other events */
4040         for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) {
4041             if (evt->events & EPOLLERR) {
4042                 ALOGD("EPOLLERR on event #%d", i);
4043             }
4044             if (evt->events & EPOLLHUP) {
4045                 /* This case was handled in the first pass */
4046                 continue;
4047             }
4048             if (evt->data.ptr) {
4049                 handler_info = (struct event_handler_info*)evt->data.ptr;
4050                 call_handler(handler_info, &poll_params, evt->events);
4051             }
4052         }
4053     }
4054 }
4055 
4056 int issue_reinit() {
4057     int sock;
4058 
4059     sock = lmkd_connect();
4060     if (sock < 0) {
4061         ALOGE("failed to connect to lmkd: %s", strerror(errno));
4062         return -1;
4063     }
4064 
4065     enum update_props_result res = lmkd_update_props(sock);
4066     switch (res) {
4067     case UPDATE_PROPS_SUCCESS:
4068         ALOGI("lmkd updated properties successfully");
4069         break;
4070     case UPDATE_PROPS_SEND_ERR:
4071         ALOGE("failed to send lmkd request: %s", strerror(errno));
4072         break;
4073     case UPDATE_PROPS_RECV_ERR:
4074         ALOGE("failed to receive lmkd reply: %s", strerror(errno));
4075         break;
4076     case UPDATE_PROPS_FORMAT_ERR:
4077         ALOGE("lmkd reply is invalid");
4078         break;
4079     case UPDATE_PROPS_FAIL:
4080         ALOGE("lmkd failed to update its properties");
4081         break;
4082     }
4083 
4084     close(sock);
4085     return res == UPDATE_PROPS_SUCCESS ? 0 : -1;
4086 }
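/*
 * Editor's note (usage sketch inferred from main() below): "lmkd --reinit"
 * runs a short-lived client instance that connects to the daemon's control
 * socket and asks it to reread its properties, so configuration changes take
 * effect without restarting the resident lmkd process.
 */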
4087 
4088 static int on_boot_completed() {
4089     int sock;
4090 
4091     sock = lmkd_connect();
4092     if (sock < 0) {
4093         ALOGE("failed to connect to lmkd: %s", strerror(errno));
4094         return -1;
4095     }
4096 
4097     enum boot_completed_notification_result res = lmkd_notify_boot_completed(sock);
4098 
4099     switch (res) {
4100         case BOOT_COMPLETED_NOTIF_SUCCESS:
4101             break;
4102         case BOOT_COMPLETED_NOTIF_ALREADY_HANDLED:
4103             ALOGW("lmkd already handled boot-completed operations");
4104             break;
4105         case BOOT_COMPLETED_NOTIF_SEND_ERR:
4106             ALOGE("failed to send lmkd request: %m");
4107             break;
4108         case BOOT_COMPLETED_NOTIF_RECV_ERR:
4109             ALOGE("failed to receive request: %m");
4110             break;
4111         case BOOT_COMPLETED_NOTIF_FORMAT_ERR:
4112             ALOGE("lmkd reply is invalid");
4113             break;
4114         case BOOT_COMPLETED_NOTIF_FAILS:
4115             ALOGE("lmkd failed to receive boot-completed notification");
4116             break;
4117     }
4118 
4119     close(sock);
4120     return res == BOOT_COMPLETED_NOTIF_SUCCESS ? 0 : -1;
4121 }
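/*
 * Editor's note (usage sketch inferred from main() and init()): "lmkd
 * --boot_completed" notifies the resident daemon that boot has finished so
 * that, on devices configured with delay_monitors_until_boot, the memory
 * pressure monitors deferred in init() can finally be registered.
 */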
4122 
4123 static bool update_props() {
4124     /* By default disable low level vmpressure events */
4125     level_oomadj[VMPRESS_LEVEL_LOW] =
4126         GET_LMK_PROPERTY(int32, "low", OOM_SCORE_ADJ_MAX + 1);
4127     level_oomadj[VMPRESS_LEVEL_MEDIUM] =
4128         GET_LMK_PROPERTY(int32, "medium", 800);
4129     level_oomadj[VMPRESS_LEVEL_CRITICAL] =
4130         GET_LMK_PROPERTY(int32, "critical", 0);
4131     debug_process_killing = GET_LMK_PROPERTY(bool, "debug", false);
4132 
4133     /* By default disable upgrade/downgrade logic */
4134     enable_pressure_upgrade =
4135         GET_LMK_PROPERTY(bool, "critical_upgrade", false);
4136     upgrade_pressure =
4137         (int64_t)GET_LMK_PROPERTY(int32, "upgrade_pressure", 100);
4138     downgrade_pressure =
4139         (int64_t)GET_LMK_PROPERTY(int32, "downgrade_pressure", 100);
4140     kill_heaviest_task =
4141         GET_LMK_PROPERTY(bool, "kill_heaviest_task", false);
4142     low_ram_device = property_get_bool("ro.config.low_ram", false);
4143     kill_timeout_ms =
4144         (unsigned long)GET_LMK_PROPERTY(int32, "kill_timeout_ms", 100);
4145     pressure_after_kill_min_score =
4146         (unsigned long)GET_LMK_PROPERTY(int32, "pressure_after_kill_min_score", 0);
4147     use_minfree_levels =
4148         GET_LMK_PROPERTY(bool, "use_minfree_levels", false);
4149     per_app_memcg =
4150         property_get_bool("ro.config.per_app_memcg", low_ram_device);
4151     swap_free_low_percentage = clamp(0, 100, GET_LMK_PROPERTY(int32, "swap_free_low_percentage",
4152         DEF_LOW_SWAP));
4153     psi_partial_stall_ms = GET_LMK_PROPERTY(int32, "psi_partial_stall_ms",
4154         low_ram_device ? DEF_PARTIAL_STALL_LOWRAM : DEF_PARTIAL_STALL);
4155     psi_complete_stall_ms = GET_LMK_PROPERTY(int32, "psi_complete_stall_ms",
4156         DEF_COMPLETE_STALL);
4157     thrashing_limit_pct =
4158             std::max(0, GET_LMK_PROPERTY(int32, "thrashing_limit",
4159                                          low_ram_device ? DEF_THRASHING_LOWRAM : DEF_THRASHING));
4160     thrashing_limit_decay_pct = clamp(0, 100, GET_LMK_PROPERTY(int32, "thrashing_limit_decay",
4161         low_ram_device ? DEF_THRASHING_DECAY_LOWRAM : DEF_THRASHING_DECAY));
4162     thrashing_critical_pct = std::max(
4163             0, GET_LMK_PROPERTY(int32, "thrashing_limit_critical", thrashing_limit_pct * 3));
4164     swap_util_max = clamp(0, 100, GET_LMK_PROPERTY(int32, "swap_util_max", 100));
4165     filecache_min_kb = GET_LMK_PROPERTY(int64, "filecache_min_kb", 0);
4166     stall_limit_critical = GET_LMK_PROPERTY(int64, "stall_limit_critical", 100);
4167     delay_monitors_until_boot = GET_LMK_PROPERTY(bool, "delay_monitors_until_boot", false);
4168     direct_reclaim_threshold_ms =
4169             GET_LMK_PROPERTY(int64, "direct_reclaim_threshold_ms", DEF_DIRECT_RECL_THRESH_MS);
4170     swap_compression_ratio =
4171             GET_LMK_PROPERTY(int64, "swap_compression_ratio", DEF_SWAP_COMP_RATIO);
4172     lowmem_min_oom_score =
4173             std::max(PERCEPTIBLE_APP_ADJ + 1,
4174                      GET_LMK_PROPERTY(int32, "lowmem_min_oom_score", DEF_LOWMEM_MIN_SCORE));
4175 
4176     reaper.enable_debug(debug_process_killing);
4177 
4178     /* Call the update props hook */
4179     if (!lmkd_update_props_hook()) {
4180         ALOGE("Failed to update LMKD hook props.");
4181         return false;
4182     }
4183 
4184     return true;
4185 }
4186 
4187 int main(int argc, char **argv) {
4188     if ((argc > 1) && argv[1]) {
4189         if (!strcmp(argv[1], "--reinit")) {
4190             if (property_set(LMKD_REINIT_PROP, "")) {
4191                 ALOGE("Failed to reset " LMKD_REINIT_PROP " property");
4192             }
4193             return issue_reinit();
4194         } else if (!strcmp(argv[1], "--boot_completed")) {
4195             return on_boot_completed();
4196         }
4197     }
4198 
4199     if (!update_props()) {
4200         ALOGE("Failed to initialize props, exiting.");
4201         return -1;
4202     }
4203 
4204     ctx = create_android_logger(KILLINFO_LOG_TAG);
4205 
4206     if (!init()) {
4207         if (!use_inkernel_interface) {
4208             /*
4209              * MCL_ONFAULT pins pages as they fault instead of loading
4210              * everything immediately all at once. (Which would be bad,
4211              * because as of this writing, we have a lot of mapped pages we
4212              * never use.) Old kernels will see MCL_ONFAULT and fail with
4213              * EINVAL; we ignore this failure.
4214              *
4215              * N.B. read the man page for mlockall. MCL_CURRENT | MCL_ONFAULT
4216              * pins ⊆ MCL_CURRENT, converging to just MCL_CURRENT as we fault
4217              * in pages.
4218              */
4219             /* CAP_IPC_LOCK required */
4220             if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT) && (errno != EINVAL)) {
4221                 ALOGW("mlockall failed %s", strerror(errno));
4222             }
4223 
4224             /* CAP_NICE required */
4225             struct sched_param param = {
4226                     .sched_priority = 1,
4227             };
4228             if (sched_setscheduler(0, SCHED_FIFO, &param)) {
4229                 ALOGW("set SCHED_FIFO failed %s", strerror(errno));
4230             }
4231         }
4232 
4233         if (init_reaper()) {
4234             ALOGI("Process reaper initialized with %d threads in the pool",
4235                 reaper.thread_cnt());
4236         }
4237 
4238         if (!watchdog.init()) {
4239             ALOGE("Failed to initialize the watchdog");
4240         }
4241 
4242         mainloop();
4243     }
4244 
4245     android_log_destroy(&ctx);
4246 
4247     ALOGI("exiting");
4248     return 0;
4249 }
4250