1 /*
2  * Copyright (C) 2013 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "lowmemorykiller"
18 
19 #include <dirent.h>
20 #include <errno.h>
21 #include <inttypes.h>
22 #include <pwd.h>
23 #include <sched.h>
24 #include <signal.h>
25 #include <stdbool.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <sys/cdefs.h>
29 #include <sys/epoll.h>
30 #include <sys/eventfd.h>
31 #include <sys/mman.h>
32 #include <sys/pidfd.h>
33 #include <sys/resource.h>
34 #include <sys/socket.h>
35 #include <sys/syscall.h>
36 #include <sys/sysinfo.h>
37 #include <sys/time.h>
38 #include <sys/types.h>
39 #include <time.h>
40 #include <unistd.h>
41 
42 #include <cutils/properties.h>
43 #include <cutils/sched_policy.h>
44 #include <cutils/sockets.h>
45 #include <liblmkd_utils.h>
46 #include <lmkd.h>
47 #include <log/log.h>
48 #include <log/log_event_list.h>
49 #include <log/log_time.h>
50 #include <private/android_filesystem_config.h>
51 #include <psi/psi.h>
52 #include <system/thread_defs.h>
53 
54 #include "statslog.h"
55 
56 #define BPF_FD_JUST_USE_INT
57 #include "BpfSyscallWrappers.h"
58 
59 /*
60  * Define LMKD_TRACE_KILLS to record lmkd kills in kernel traces
61  * to profile and correlate with OOM kills
62  */
63 #ifdef LMKD_TRACE_KILLS
64 
65 #define ATRACE_TAG ATRACE_TAG_ALWAYS
66 #include <cutils/trace.h>
67 
68 #define TRACE_KILL_START(pid) ATRACE_INT(__FUNCTION__, pid);
69 #define TRACE_KILL_END()      ATRACE_INT(__FUNCTION__, 0);
70 
71 #else /* LMKD_TRACE_KILLS */
72 
73 #define TRACE_KILL_START(pid) ((void)(pid))
74 #define TRACE_KILL_END() ((void)0)
75 
76 #endif /* LMKD_TRACE_KILLS */
77 
78 #ifndef __unused
79 #define __unused __attribute__((__unused__))
80 #endif
81 
82 #define MEMCG_SYSFS_PATH "/dev/memcg/"
83 #define MEMCG_MEMORY_USAGE "/dev/memcg/memory.usage_in_bytes"
84 #define MEMCG_MEMORYSW_USAGE "/dev/memcg/memory.memsw.usage_in_bytes"
85 #define ZONEINFO_PATH "/proc/zoneinfo"
86 #define MEMINFO_PATH "/proc/meminfo"
87 #define VMSTAT_PATH "/proc/vmstat"
88 #define PROC_STATUS_TGID_FIELD "Tgid:"
89 #define PROC_STATUS_RSS_FIELD "VmRSS:"
90 #define PROC_STATUS_SWAP_FIELD "VmSwap:"
91 #define LINE_MAX 128
92 
93 #define PERCEPTIBLE_APP_ADJ 200
94 
95 /* Android Logger event logtags (see event.logtags) */
96 #define KILLINFO_LOG_TAG 10195355
97 
98 /* gid containing AID_SYSTEM required */
99 #define INKERNEL_MINFREE_PATH "/sys/module/lowmemorykiller/parameters/minfree"
100 #define INKERNEL_ADJ_PATH "/sys/module/lowmemorykiller/parameters/adj"
101 
102 #define ARRAY_SIZE(x)   (sizeof(x) / sizeof(*(x)))
103 #define EIGHT_MEGA (1 << 23)
104 
105 #define TARGET_UPDATE_MIN_INTERVAL_MS 1000
106 #define THRASHING_RESET_INTERVAL_MS 1000
107 
108 #define NS_PER_MS (NS_PER_SEC / MS_PER_SEC)
109 #define US_PER_MS (US_PER_SEC / MS_PER_SEC)
110 
111 /* Defined as ProcessList.SYSTEM_ADJ in ProcessList.java */
112 #define SYSTEM_ADJ (-900)
113 
114 #define STRINGIFY(x) STRINGIFY_INTERNAL(x)
115 #define STRINGIFY_INTERNAL(x) #x
116 
117 /*
118  * Read lmk property with persist.device_config.lmkd_native.<name> overriding ro.lmk.<name>
119  * persist.device_config.lmkd_native.* properties are being set by experiments. If a new property
120  * can be controlled by an experiment then use GET_LMK_PROPERTY instead of property_get_xxx and
121  * add "on property" triggers in lmkd.rc to react to the experiment flag changes.
122  */
123 #define GET_LMK_PROPERTY(type, name, def) \
124     property_get_##type("persist.device_config.lmkd_native." name, \
125         property_get_##type("ro.lmk." name, def))
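/*
 * For illustration (not a line from lmkd itself): a lookup such as
 *     GET_LMK_PROPERTY(int32, "swap_free_low_percentage", DEF_LOW_SWAP)
 * expands to
 *     property_get_int32("persist.device_config.lmkd_native.swap_free_low_percentage",
 *                        property_get_int32("ro.lmk.swap_free_low_percentage", DEF_LOW_SWAP))
 * so an experiment override, when set, takes precedence over the ro.lmk.*
 * value and its built-in default.
 */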
126 
127 /*
128  * PSI monitor tracking window size.
129  * PSI monitor generates events at most once per window,
130  * therefore we poll memory state for the duration of
131  * PSI_WINDOW_SIZE_MS after the event happens.
132  */
133 #define PSI_WINDOW_SIZE_MS 1000
134 /* Polling period after PSI signal when pressure is high */
135 #define PSI_POLL_PERIOD_SHORT_MS 10
136 /* Polling period after PSI signal when pressure is low */
137 #define PSI_POLL_PERIOD_LONG_MS 100
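/*
 * Rough arithmetic for the values above: after a PSI event lmkd keeps polling
 * for one PSI_WINDOW_SIZE_MS (1000ms) window, so the 10ms period allows up to
 * ~100 memory-state samples per window under high pressure and ~10 samples per
 * window at the 100ms period when pressure is low.
 */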
138 
139 #define min(a, b) (((a) < (b)) ? (a) : (b))
140 #define max(a, b) (((a) > (b)) ? (a) : (b))
141 
142 #define FAIL_REPORT_RLIMIT_MS 1000
143 
144 /*
145  * System property defaults
146  */
147 /* ro.lmk.swap_free_low_percentage property defaults */
148 #define DEF_LOW_SWAP 10
149 /* ro.lmk.thrashing_limit property defaults */
150 #define DEF_THRASHING_LOWRAM 30
151 #define DEF_THRASHING 100
152 /* ro.lmk.thrashing_limit_decay property defaults */
153 #define DEF_THRASHING_DECAY_LOWRAM 50
154 #define DEF_THRASHING_DECAY 10
155 /* ro.lmk.psi_partial_stall_ms property defaults */
156 #define DEF_PARTIAL_STALL_LOWRAM 200
157 #define DEF_PARTIAL_STALL 70
158 /* ro.lmk.psi_complete_stall_ms property defaults */
159 #define DEF_COMPLETE_STALL 700
160 
161 #define LMKD_REINIT_PROP "lmkd.reinit"
162 
163 /* default to old in-kernel interface if no memory pressure events */
164 static bool use_inkernel_interface = true;
165 static bool has_inkernel_module;
166 
167 /* memory pressure levels */
168 enum vmpressure_level {
169     VMPRESS_LEVEL_LOW = 0,
170     VMPRESS_LEVEL_MEDIUM,
171     VMPRESS_LEVEL_CRITICAL,
172     VMPRESS_LEVEL_COUNT
173 };
174 
175 static const char *level_name[] = {
176     "low",
177     "medium",
178     "critical"
179 };
180 
181 struct {
182     int64_t min_nr_free_pages; /* recorded but not used yet */
183     int64_t max_nr_free_pages;
184 } low_pressure_mem = { -1, -1 };
185 
186 struct psi_threshold {
187     enum psi_stall_type stall_type;
188     int threshold_ms;
189 };
190 
191 static int level_oomadj[VMPRESS_LEVEL_COUNT];
192 static int mpevfd[VMPRESS_LEVEL_COUNT] = { -1, -1, -1 };
193 static bool pidfd_supported;
194 static int last_kill_pid_or_fd = -1;
195 static struct timespec last_kill_tm;
196 
197 /* lmkd configurable parameters */
198 static bool debug_process_killing;
199 static bool enable_pressure_upgrade;
200 static int64_t upgrade_pressure;
201 static int64_t downgrade_pressure;
202 static bool low_ram_device;
203 static bool kill_heaviest_task;
204 static unsigned long kill_timeout_ms;
205 static bool use_minfree_levels;
206 static bool per_app_memcg;
207 static int swap_free_low_percentage;
208 static int psi_partial_stall_ms;
209 static int psi_complete_stall_ms;
210 static int thrashing_limit_pct;
211 static int thrashing_limit_decay_pct;
212 static int thrashing_critical_pct;
213 static int swap_util_max;
214 static int64_t filecache_min_kb;
215 static bool use_psi_monitors = false;
216 static int kpoll_fd;
217 static struct psi_threshold psi_thresholds[VMPRESS_LEVEL_COUNT] = {
218     { PSI_SOME, 70 },    /* 70ms out of 1sec for partial stall */
219     { PSI_SOME, 100 },   /* 100ms out of 1sec for partial stall */
220     { PSI_FULL, 70 },    /* 70ms out of 1sec for complete stall */
221 };
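/*
 * Illustrative note: each threshold above is a stall time within the
 * PSI_WINDOW_SIZE_MS window, e.g. { PSI_SOME, 70 } means "partial stall of at
 * least 70ms out of the last 1000ms" before the PSI trigger for that pressure
 * level fires.
 */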
222 
223 static android_log_context ctx;
224 
225 enum polling_update {
226     POLLING_DO_NOT_CHANGE,
227     POLLING_START,
228     POLLING_PAUSE,
229     POLLING_RESUME,
230 };
231 
232 /*
233  * Data used for periodic polling for the memory state of the device.
234  * Note that when system is not polling poll_handler is set to NULL,
235  * when polling starts poll_handler gets set and is reset back to
236  * NULL when polling stops.
237  */
238 struct polling_params {
239     struct event_handler_info* poll_handler;
240     struct event_handler_info* paused_handler;
241     struct timespec poll_start_tm;
242     struct timespec last_poll_tm;
243     int polling_interval_ms;
244     enum polling_update update;
245 };
246 
247 /* data required to handle events */
248 struct event_handler_info {
249     int data;
250     void (*handler)(int data, uint32_t events, struct polling_params *poll_params);
251 };
252 
253 /* data required to handle socket events */
254 struct sock_event_handler_info {
255     int sock;
256     pid_t pid;
257     uint32_t async_event_mask;
258     struct event_handler_info handler_info;
259 };
260 
261 /* max supported number of data connections (AMS, init, tests) */
262 #define MAX_DATA_CONN 3
263 
264 /* socket event handler data */
265 static struct sock_event_handler_info ctrl_sock;
266 static struct sock_event_handler_info data_sock[MAX_DATA_CONN];
267 
268 /* vmpressure event handler data */
269 static struct event_handler_info vmpressure_hinfo[VMPRESS_LEVEL_COUNT];
270 
271 /*
272  * 1 ctrl listen socket, 3 ctrl data sockets, 3 memory pressure levels,
273  * 1 kernel lmk events fd + 1 fd to wait for process death
274  */
275 #define MAX_EPOLL_EVENTS (1 + MAX_DATA_CONN + VMPRESS_LEVEL_COUNT + 1 + 1)
276 static int epollfd;
277 static int maxevents;
278 
279 /* OOM score values used by both kernel and framework */
280 #define OOM_SCORE_ADJ_MIN       (-1000)
281 #define OOM_SCORE_ADJ_MAX       1000
282 
283 static int lowmem_adj[MAX_TARGETS];
284 static int lowmem_minfree[MAX_TARGETS];
285 static int lowmem_targets_size;
286 
287 /* Fields to parse in /proc/zoneinfo */
288 /* zoneinfo per-zone fields */
289 enum zoneinfo_zone_field {
290     ZI_ZONE_NR_FREE_PAGES = 0,
291     ZI_ZONE_MIN,
292     ZI_ZONE_LOW,
293     ZI_ZONE_HIGH,
294     ZI_ZONE_PRESENT,
295     ZI_ZONE_NR_FREE_CMA,
296     ZI_ZONE_FIELD_COUNT
297 };
298 
299 static const char* const zoneinfo_zone_field_names[ZI_ZONE_FIELD_COUNT] = {
300     "nr_free_pages",
301     "min",
302     "low",
303     "high",
304     "present",
305     "nr_free_cma",
306 };
307 
308 /* zoneinfo per-zone special fields */
309 enum zoneinfo_zone_spec_field {
310     ZI_ZONE_SPEC_PROTECTION = 0,
311     ZI_ZONE_SPEC_PAGESETS,
312     ZI_ZONE_SPEC_FIELD_COUNT,
313 };
314 
315 static const char* const zoneinfo_zone_spec_field_names[ZI_ZONE_SPEC_FIELD_COUNT] = {
316     "protection:",
317     "pagesets",
318 };
319 
320 /* see __MAX_NR_ZONES definition in kernel mmzone.h */
321 #define MAX_NR_ZONES 6
322 
323 union zoneinfo_zone_fields {
324     struct {
325         int64_t nr_free_pages;
326         int64_t min;
327         int64_t low;
328         int64_t high;
329         int64_t present;
330         int64_t nr_free_cma;
331     } field;
332     int64_t arr[ZI_ZONE_FIELD_COUNT];
333 };
334 
335 struct zoneinfo_zone {
336     union zoneinfo_zone_fields fields;
337     int64_t protection[MAX_NR_ZONES];
338     int64_t max_protection;
339 };
340 
341 /* zoneinfo per-node fields */
342 enum zoneinfo_node_field {
343     ZI_NODE_NR_INACTIVE_FILE = 0,
344     ZI_NODE_NR_ACTIVE_FILE,
345     ZI_NODE_FIELD_COUNT
346 };
347 
348 static const char* const zoneinfo_node_field_names[ZI_NODE_FIELD_COUNT] = {
349     "nr_inactive_file",
350     "nr_active_file",
351 };
352 
353 union zoneinfo_node_fields {
354     struct {
355         int64_t nr_inactive_file;
356         int64_t nr_active_file;
357     } field;
358     int64_t arr[ZI_NODE_FIELD_COUNT];
359 };
360 
361 struct zoneinfo_node {
362     int id;
363     int zone_count;
364     struct zoneinfo_zone zones[MAX_NR_ZONES];
365     union zoneinfo_node_fields fields;
366 };
367 
368 /* for now two memory nodes are more than enough */
369 #define MAX_NR_NODES 2
370 
371 struct zoneinfo {
372     int node_count;
373     struct zoneinfo_node nodes[MAX_NR_NODES];
374     int64_t totalreserve_pages;
375     int64_t total_inactive_file;
376     int64_t total_active_file;
377 };
378 
379 /* Fields to parse in /proc/meminfo */
380 enum meminfo_field {
381     MI_NR_FREE_PAGES = 0,
382     MI_CACHED,
383     MI_SWAP_CACHED,
384     MI_BUFFERS,
385     MI_SHMEM,
386     MI_UNEVICTABLE,
387     MI_TOTAL_SWAP,
388     MI_FREE_SWAP,
389     MI_ACTIVE_ANON,
390     MI_INACTIVE_ANON,
391     MI_ACTIVE_FILE,
392     MI_INACTIVE_FILE,
393     MI_SRECLAIMABLE,
394     MI_SUNRECLAIM,
395     MI_KERNEL_STACK,
396     MI_PAGE_TABLES,
397     MI_ION_HELP,
398     MI_ION_HELP_POOL,
399     MI_CMA_FREE,
400     MI_FIELD_COUNT
401 };
402 
403 static const char* const meminfo_field_names[MI_FIELD_COUNT] = {
404     "MemFree:",
405     "Cached:",
406     "SwapCached:",
407     "Buffers:",
408     "Shmem:",
409     "Unevictable:",
410     "SwapTotal:",
411     "SwapFree:",
412     "Active(anon):",
413     "Inactive(anon):",
414     "Active(file):",
415     "Inactive(file):",
416     "SReclaimable:",
417     "SUnreclaim:",
418     "KernelStack:",
419     "PageTables:",
420     "ION_heap:",
421     "ION_heap_pool:",
422     "CmaFree:",
423 };
424 
425 union meminfo {
426     struct {
427         int64_t nr_free_pages;
428         int64_t cached;
429         int64_t swap_cached;
430         int64_t buffers;
431         int64_t shmem;
432         int64_t unevictable;
433         int64_t total_swap;
434         int64_t free_swap;
435         int64_t active_anon;
436         int64_t inactive_anon;
437         int64_t active_file;
438         int64_t inactive_file;
439         int64_t sreclaimable;
440         int64_t sunreclaimable;
441         int64_t kernel_stack;
442         int64_t page_tables;
443         int64_t ion_heap;
444         int64_t ion_heap_pool;
445         int64_t cma_free;
446         /* fields below are calculated rather than read from the file */
447         int64_t nr_file_pages;
448         int64_t total_gpu_kb;
449     } field;
450     int64_t arr[MI_FIELD_COUNT];
451 };
452 
453 /* Fields to parse in /proc/vmstat */
454 enum vmstat_field {
455     VS_FREE_PAGES,
456     VS_INACTIVE_FILE,
457     VS_ACTIVE_FILE,
458     VS_WORKINGSET_REFAULT,
459     VS_WORKINGSET_REFAULT_FILE,
460     VS_PGSCAN_KSWAPD,
461     VS_PGSCAN_DIRECT,
462     VS_PGSCAN_DIRECT_THROTTLE,
463     VS_FIELD_COUNT
464 };
465 
466 static const char* const vmstat_field_names[VS_FIELD_COUNT] = {
467     "nr_free_pages",
468     "nr_inactive_file",
469     "nr_active_file",
470     "workingset_refault",
471     "workingset_refault_file",
472     "pgscan_kswapd",
473     "pgscan_direct",
474     "pgscan_direct_throttle",
475 };
476 
477 union vmstat {
478     struct {
479         int64_t nr_free_pages;
480         int64_t nr_inactive_file;
481         int64_t nr_active_file;
482         int64_t workingset_refault;
483         int64_t workingset_refault_file;
484         int64_t pgscan_kswapd;
485         int64_t pgscan_direct;
486         int64_t pgscan_direct_throttle;
487     } field;
488     int64_t arr[VS_FIELD_COUNT];
489 };
490 
491 enum field_match_result {
492     NO_MATCH,
493     PARSE_FAIL,
494     PARSE_SUCCESS
495 };
496 
497 struct adjslot_list {
498     struct adjslot_list *next;
499     struct adjslot_list *prev;
500 };
501 
502 struct proc {
503     struct adjslot_list asl;
504     int pid;
505     int pidfd;
506     uid_t uid;
507     int oomadj;
508     pid_t reg_pid; /* PID of the process that registered this record */
509     struct proc *pidhash_next;
510 };
511 
512 struct reread_data {
513     const char* const filename;
514     int fd;
515 };
516 
517 #define PIDHASH_SZ 1024
518 static struct proc *pidhash[PIDHASH_SZ];
519 #define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))
520 
521 #define ADJTOSLOT(adj) ((adj) + -OOM_SCORE_ADJ_MIN)
522 #define ADJTOSLOT_COUNT (ADJTOSLOT(OOM_SCORE_ADJ_MAX) + 1)
523 static struct adjslot_list procadjslot_list[ADJTOSLOT_COUNT];
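/*
 * Worked example for the mapping above: oom_score_adj spans
 * [OOM_SCORE_ADJ_MIN, OOM_SCORE_ADJ_MAX] = [-1000, 1000], so
 * ADJTOSLOT(-1000) == 0, ADJTOSLOT(0) == 1000 and ADJTOSLOT(1000) == 2000,
 * giving ADJTOSLOT_COUNT == 2001 list heads, one per possible adj value.
 */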
524 
525 #define MAX_DISTINCT_OOM_ADJ 32
526 #define KILLCNT_INVALID_IDX 0xFF
527 /*
528  * Because killcnt array is sparse a two-level indirection is used
529  * to keep the size small. killcnt_idx stores index of the element in
530  * killcnt array. Index KILLCNT_INVALID_IDX indicates an unused slot.
531  */
532 static uint8_t killcnt_idx[ADJTOSLOT_COUNT];
533 static uint16_t killcnt[MAX_DISTINCT_OOM_ADJ];
534 static int killcnt_free_idx = 0;
535 static uint32_t killcnt_total = 0;
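/*
 * Illustrative lookup path for the two-level scheme above: the kill count for
 * a given oomadj is killcnt[killcnt_idx[ADJTOSLOT(oomadj)]], i.e. a dense
 * 2001-byte index table pointing into at most MAX_DISTINCT_OOM_ADJ (32)
 * 16-bit counters, instead of one sparse 2001-entry counter array.
 */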
536 
537 /* PAGE_SIZE / 1024 */
538 static long page_k;
539 
540 static void update_props();
541 static bool init_monitors();
542 static void destroy_monitors();
543 
544 static int clamp(int low, int high, int value) {
545     return max(min(value, high), low);
546 }
547 
548 static bool parse_int64(const char* str, int64_t* ret) {
549     char* endptr;
550     long long val = strtoll(str, &endptr, 10);
551     if (str == endptr || val > INT64_MAX) {
552         return false;
553     }
554     *ret = (int64_t)val;
555     return true;
556 }
557 
558 static int find_field(const char* name, const char* const field_names[], int field_count) {
559     for (int i = 0; i < field_count; i++) {
560         if (!strcmp(name, field_names[i])) {
561             return i;
562         }
563     }
564     return -1;
565 }
566 
567 static enum field_match_result match_field(const char* cp, const char* ap,
568                                    const char* const field_names[],
569                                    int field_count, int64_t* field,
570                                    int *field_idx) {
571     int i = find_field(cp, field_names, field_count);
572     if (i < 0) {
573         return NO_MATCH;
574     }
575     *field_idx = i;
576     return parse_int64(ap, field) ? PARSE_SUCCESS : PARSE_FAIL;
577 }
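/*
 * For illustration: when parsing /proc/meminfo, a line split into
 * cp = "MemFree:" and ap = "123456" matched against meminfo_field_names
 * returns PARSE_SUCCESS with *field_idx == MI_NR_FREE_PAGES and
 * *field == 123456 (the kernel reports these values in kB).
 */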
578 
579 /*
580  * Read file content from the beginning up to max_len bytes or EOF
581  * whichever happens first.
582  */
583 static ssize_t read_all(int fd, char *buf, size_t max_len)
584 {
585     ssize_t ret = 0;
586     off_t offset = 0;
587 
588     while (max_len > 0) {
589         ssize_t r = TEMP_FAILURE_RETRY(pread(fd, buf, max_len, offset));
590         if (r == 0) {
591             break;
592         }
593         if (r == -1) {
594             return -1;
595         }
596         ret += r;
597         buf += r;
598         offset += r;
599         max_len -= r;
600     }
601 
602     return ret;
603 }
604 
605 /*
606  * Read a new or already opened file from the beginning.
607  * If the file has not been opened yet data->fd should be set to -1.
608  * To be used with files which are read often and possibly during high
609  * memory pressure to minimize file opening which by itself requires kernel
610  * memory allocation and might result in a stall on memory stressed system.
611  */
612 static char *reread_file(struct reread_data *data) {
613     /* start with page-size buffer and increase if needed */
614     static ssize_t buf_size = PAGE_SIZE;
615     static char *new_buf, *buf = NULL;
616     ssize_t size;
617 
618     if (data->fd == -1) {
619         /* First-time buffer initialization */
620         if (!buf && (buf = static_cast<char*>(malloc(buf_size))) == nullptr) {
621             return NULL;
622         }
623 
624         data->fd = TEMP_FAILURE_RETRY(open(data->filename, O_RDONLY | O_CLOEXEC));
625         if (data->fd < 0) {
626             ALOGE("%s open: %s", data->filename, strerror(errno));
627             return NULL;
628         }
629     }
630 
631     while (true) {
632         size = read_all(data->fd, buf, buf_size - 1);
633         if (size < 0) {
634             ALOGE("%s read: %s", data->filename, strerror(errno));
635             close(data->fd);
636             data->fd = -1;
637             return NULL;
638         }
639         if (size < buf_size - 1) {
640             break;
641         }
642         /*
643          * Since we are reading /proc files we can't use fstat to find out
644          * the real size of the file. Double the buffer size and keep retrying.
645          */
646         if ((new_buf = static_cast<char*>(realloc(buf, buf_size * 2))) == nullptr) {
647             errno = ENOMEM;
648             return NULL;
649         }
650         buf = new_buf;
651         buf_size *= 2;
652     }
653     buf[size] = 0;
654 
655     return buf;
656 }
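/*
 * Typical usage sketch (mirrors zoneinfo_parse() below): keep a static
 * descriptor so the fd stays open across calls and is reused under pressure.
 *
 *     static struct reread_data vmstat_data = {
 *         .filename = VMSTAT_PATH,
 *         .fd = -1,
 *     };
 *     char *buf = reread_file(&vmstat_data);
 *     if (buf) { ... parse buf ... }
 */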
657 
658 static bool claim_record(struct proc* procp, pid_t pid) {
659     if (procp->reg_pid == pid) {
660         /* Record already belongs to the registrant */
661         return true;
662     }
663     if (procp->reg_pid == 0) {
664         /* Old registrant is gone, claim the record */
665         procp->reg_pid = pid;
666         return true;
667     }
668     /* The record is owned by another registrant */
669     return false;
670 }
671 
672 static void remove_claims(pid_t pid) {
673     int i;
674 
675     for (i = 0; i < PIDHASH_SZ; i++) {
676         struct proc* procp = pidhash[i];
677         while (procp) {
678             if (procp->reg_pid == pid) {
679                 procp->reg_pid = 0;
680             }
681             procp = procp->pidhash_next;
682         }
683     }
684 }
685 
686 static void ctrl_data_close(int dsock_idx) {
687     struct epoll_event epev;
688 
689     ALOGI("closing lmkd data connection");
690     if (epoll_ctl(epollfd, EPOLL_CTL_DEL, data_sock[dsock_idx].sock, &epev) == -1) {
691         // Log a warning and keep going
692         ALOGW("epoll_ctl for data connection socket failed; errno=%d", errno);
693     }
694     maxevents--;
695 
696     close(data_sock[dsock_idx].sock);
697     data_sock[dsock_idx].sock = -1;
698 
699     /* Mark all records of the old registrant as unclaimed */
700     remove_claims(data_sock[dsock_idx].pid);
701 }
702 
703 static ssize_t ctrl_data_read(int dsock_idx, char* buf, size_t bufsz, struct ucred* sender_cred) {
704     struct iovec iov = {buf, bufsz};
705     char control[CMSG_SPACE(sizeof(struct ucred))];
706     struct msghdr hdr = {
707             NULL, 0, &iov, 1, control, sizeof(control), 0,
708     };
709     ssize_t ret;
710     ret = TEMP_FAILURE_RETRY(recvmsg(data_sock[dsock_idx].sock, &hdr, 0));
711     if (ret == -1) {
712         ALOGE("control data socket read failed; %s", strerror(errno));
713         return -1;
714     }
715     if (ret == 0) {
716         ALOGE("Got EOF on control data socket");
717         return -1;
718     }
719 
720     struct ucred* cred = NULL;
721     struct cmsghdr* cmsg = CMSG_FIRSTHDR(&hdr);
722     while (cmsg != NULL) {
723         if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_CREDENTIALS) {
724             cred = (struct ucred*)CMSG_DATA(cmsg);
725             break;
726         }
727         cmsg = CMSG_NXTHDR(&hdr, cmsg);
728     }
729 
730     if (cred == NULL) {
731         ALOGE("Failed to retrieve sender credentials");
732         /* Close the connection */
733         ctrl_data_close(dsock_idx);
734         return -1;
735     }
736 
737     memcpy(sender_cred, cred, sizeof(struct ucred));
738 
739     /* Store PID of the peer */
740     data_sock[dsock_idx].pid = cred->pid;
741 
742     return ret;
743 }
744 
745 static int ctrl_data_write(int dsock_idx, char* buf, size_t bufsz) {
746     int ret = 0;
747 
748     ret = TEMP_FAILURE_RETRY(write(data_sock[dsock_idx].sock, buf, bufsz));
749 
750     if (ret == -1) {
751         ALOGE("control data socket write failed; errno=%d", errno);
752     } else if (ret == 0) {
753         ALOGE("Got EOF on control data socket");
754         ret = -1;
755     }
756 
757     return ret;
758 }
759 
760 /*
761  * Write the pid/uid pair over the data socket, note: all active clients
762  * will receive this unsolicited notification.
763  */
764 static void ctrl_data_write_lmk_kill_occurred(pid_t pid, uid_t uid) {
765     LMKD_CTRL_PACKET packet;
766     size_t len = lmkd_pack_set_prockills(packet, pid, uid);
767 
768     for (int i = 0; i < MAX_DATA_CONN; i++) {
769         if (data_sock[i].sock >= 0 && data_sock[i].async_event_mask & 1 << LMK_ASYNC_EVENT_KILL) {
770             ctrl_data_write(i, (char*)packet, len);
771         }
772     }
773 }
774 
775 /*
776  * Write the kill_stat/memory_stat over the data socket to be propagated via AMS to statsd
777  */
778 static void stats_write_lmk_kill_occurred(struct kill_stat *kill_st,
779                                           struct memory_stat *mem_st) {
780     LMK_KILL_OCCURRED_PACKET packet;
781     const size_t len = lmkd_pack_set_kill_occurred(packet, kill_st, mem_st);
782     if (len == 0) {
783         return;
784     }
785 
786     for (int i = 0; i < MAX_DATA_CONN; i++) {
787         if (data_sock[i].sock >= 0 && data_sock[i].async_event_mask & 1 << LMK_ASYNC_EVENT_STAT) {
788             ctrl_data_write(i, packet, len);
789         }
790     }
791 
792 }
793 
794 static void stats_write_lmk_kill_occurred_pid(int pid, struct kill_stat *kill_st,
795                                               struct memory_stat *mem_st) {
796     kill_st->taskname = stats_get_task_name(pid);
797     if (kill_st->taskname != NULL) {
798         stats_write_lmk_kill_occurred(kill_st, mem_st);
799     }
800 }
801 
802 /*
803  * Write the state_changed over the data socket to be propagated via AMS to statsd
804  */
805 static void stats_write_lmk_state_changed(enum lmk_state state) {
806     LMKD_CTRL_PACKET packet_state_changed;
807     const size_t len = lmkd_pack_set_state_changed(packet_state_changed, state);
808     if (len == 0) {
809         return;
810     }
811     for (int i = 0; i < MAX_DATA_CONN; i++) {
812         if (data_sock[i].sock >= 0 && data_sock[i].async_event_mask & 1 << LMK_ASYNC_EVENT_STAT) {
813             ctrl_data_write(i, (char*)packet_state_changed, len);
814         }
815     }
816 }
817 
818 static void poll_kernel(int poll_fd) {
819     if (poll_fd == -1) {
820         // not waiting
821         return;
822     }
823 
824     while (1) {
825         char rd_buf[256];
826         int bytes_read = TEMP_FAILURE_RETRY(pread(poll_fd, (void*)rd_buf, sizeof(rd_buf), 0));
827         if (bytes_read <= 0) break;
828         rd_buf[bytes_read] = '\0';
829 
830         int64_t pid;
831         int64_t uid;
832         int64_t group_leader_pid;
833         int64_t rss_in_pages;
834         struct memory_stat mem_st = {};
835         int16_t oom_score_adj;
836         int16_t min_score_adj;
837         int64_t starttime;
838         char* taskname = 0;
839 
840         int fields_read =
841                 sscanf(rd_buf,
842                        "%" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64
843                        " %" SCNd16 " %" SCNd16 " %" SCNd64 "\n%m[^\n]",
844                        &pid, &uid, &group_leader_pid, &mem_st.pgfault, &mem_st.pgmajfault,
845                        &rss_in_pages, &oom_score_adj, &min_score_adj, &starttime, &taskname);
846 
847         /* only the death of the group leader process is logged */
848         if (fields_read == 10 && group_leader_pid == pid) {
849             ctrl_data_write_lmk_kill_occurred((pid_t)pid, (uid_t)uid);
850             mem_st.process_start_time_ns = starttime * (NS_PER_SEC / sysconf(_SC_CLK_TCK));
851             mem_st.rss_in_bytes = rss_in_pages * PAGE_SIZE;
852 
853             struct kill_stat kill_st = {
854                 .uid = static_cast<int32_t>(uid),
855                 .kill_reason = NONE,
856                 .oom_score = oom_score_adj,
857                 .min_oom_score = min_score_adj,
858                 .free_mem_kb = 0,
859                 .free_swap_kb = 0,
860             };
861             stats_write_lmk_kill_occurred_pid(pid, &kill_st, &mem_st);
862         }
863 
864         free(taskname);
865     }
866 }
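/*
 * For reference, the sscanf above consumes one record per kill from
 * /proc/lowmemorykiller, roughly:
 *   <pid> <uid> <group_leader_pid> <pgfault> <pgmajfault> <rss_in_pages>
 *   <oom_score_adj> <min_score_adj> <starttime>\n<taskname>
 * (illustrative layout inferred from the format string, not a spec quote).
 */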
867 
868 static bool init_poll_kernel() {
869     kpoll_fd = TEMP_FAILURE_RETRY(open("/proc/lowmemorykiller", O_RDONLY | O_NONBLOCK | O_CLOEXEC));
870 
871     if (kpoll_fd < 0) {
872         ALOGE("kernel lmk event file could not be opened; errno=%d", errno);
873         return false;
874     }
875 
876     return true;
877 }
878 
879 static struct proc *pid_lookup(int pid) {
880     struct proc *procp;
881 
882     for (procp = pidhash[pid_hashfn(pid)]; procp && procp->pid != pid;
883          procp = procp->pidhash_next)
884             ;
885 
886     return procp;
887 }
888 
889 static void adjslot_insert(struct adjslot_list *head, struct adjslot_list *new_element)
890 {
891     struct adjslot_list *next = head->next;
892     new_element->prev = head;
893     new_element->next = next;
894     next->prev = new_element;
895     head->next = new_element;
896 }
897 
898 static void adjslot_remove(struct adjslot_list *old)
899 {
900     struct adjslot_list *prev = old->prev;
901     struct adjslot_list *next = old->next;
902     next->prev = prev;
903     prev->next = next;
904 }
905 
906 static struct adjslot_list *adjslot_tail(struct adjslot_list *head) {
907     struct adjslot_list *asl = head->prev;
908 
909     return asl == head ? NULL : asl;
910 }
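/*
 * Note on list order: adjslot_insert() links new entries right after the head,
 * so adjslot_tail() returns the entry that has been in that adj bucket the
 * longest (or NULL when the bucket is empty).
 */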
911 
912 static void proc_slot(struct proc *procp) {
913     int adjslot = ADJTOSLOT(procp->oomadj);
914 
915     adjslot_insert(&procadjslot_list[adjslot], &procp->asl);
916 }
917 
918 static void proc_unslot(struct proc *procp) {
919     adjslot_remove(&procp->asl);
920 }
921 
922 static void proc_insert(struct proc *procp) {
923     int hval = pid_hashfn(procp->pid);
924 
925     procp->pidhash_next = pidhash[hval];
926     pidhash[hval] = procp;
927     proc_slot(procp);
928 }
929 
930 static int pid_remove(int pid) {
931     int hval = pid_hashfn(pid);
932     struct proc *procp;
933     struct proc *prevp;
934 
935     for (procp = pidhash[hval], prevp = NULL; procp && procp->pid != pid;
936          procp = procp->pidhash_next)
937             prevp = procp;
938 
939     if (!procp)
940         return -1;
941 
942     if (!prevp)
943         pidhash[hval] = procp->pidhash_next;
944     else
945         prevp->pidhash_next = procp->pidhash_next;
946 
947     proc_unslot(procp);
948     /*
949      * Close pidfd here if we are not waiting for corresponding process to die,
950      * in which case stop_wait_for_proc_kill() will close the pidfd later
951      */
952     if (procp->pidfd >= 0 && procp->pidfd != last_kill_pid_or_fd) {
953         close(procp->pidfd);
954     }
955     free(procp);
956     return 0;
957 }
958 
959 /*
960  * Write a string to a file.
961  * Returns false if the file does not exist.
962  */
963 static bool writefilestring(const char *path, const char *s,
964                             bool err_if_missing) {
965     int fd = open(path, O_WRONLY | O_CLOEXEC);
966     ssize_t len = strlen(s);
967     ssize_t ret;
968 
969     if (fd < 0) {
970         if (err_if_missing) {
971             ALOGE("Error opening %s; errno=%d", path, errno);
972         }
973         return false;
974     }
975 
976     ret = TEMP_FAILURE_RETRY(write(fd, s, len));
977     if (ret < 0) {
978         ALOGE("Error writing %s; errno=%d", path, errno);
979     } else if (ret < len) {
980         ALOGE("Short write on %s; length=%zd", path, ret);
981     }
982 
983     close(fd);
984     return true;
985 }
986 
987 static inline long get_time_diff_ms(struct timespec *from,
988                                     struct timespec *to) {
989     return (to->tv_sec - from->tv_sec) * (long)MS_PER_SEC +
990            (to->tv_nsec - from->tv_nsec) / (long)NS_PER_MS;
991 }
992 
993 /* Reads /proc/pid/status into buf. */
994 static bool read_proc_status(int pid, char *buf, size_t buf_sz) {
995     char path[PATH_MAX];
996     int fd;
997     ssize_t size;
998 
999     snprintf(path, PATH_MAX, "/proc/%d/status", pid);
1000     fd = open(path, O_RDONLY | O_CLOEXEC);
1001     if (fd < 0) {
1002         return false;
1003     }
1004 
1005     size = read_all(fd, buf, buf_sz - 1);
1006     close(fd);
1007     if (size < 0) {
1008         return false;
1009     }
1010     buf[size] = 0;
1011     return true;
1012 }
1013 
1014 /* Looks for tag in buf and parses the first integer */
1015 static bool parse_status_tag(char *buf, const char *tag, int64_t *out) {
1016     char *pos = buf;
1017     while (true) {
1018         pos = strstr(pos, tag);
1019         /* Stop if tag not found or found at the line beginning */
1020         if (pos == NULL || pos == buf || pos[-1] == '\n') {
1021             break;
1022         }
1023         pos++;
1024     }
1025 
1026     if (pos == NULL) {
1027         return false;
1028     }
1029 
1030     pos += strlen(tag);
1031     while (*pos == ' ') ++pos;
1032     return parse_int64(pos, out);
1033 }
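/*
 * For illustration: with buf holding /proc/<pid>/status contents, a line like
 * "VmRSS:      123456 kB" parsed via
 *     parse_status_tag(buf, PROC_STATUS_RSS_FIELD, &rss_kb)
 * leaves 123456 in rss_kb; the trailing "kB" stops strtoll() so the value
 * stays in kilobytes. (rss_kb is just an illustrative variable name.)
 */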
1034 
1035 static int proc_get_size(int pid) {
1036     char path[PATH_MAX];
1037     char line[LINE_MAX];
1038     int fd;
1039     int rss = 0;
1040     int total;
1041     ssize_t ret;
1042 
1043     /* gid containing AID_READPROC required */
1044     snprintf(path, PATH_MAX, "/proc/%d/statm", pid);
1045     fd = open(path, O_RDONLY | O_CLOEXEC);
1046     if (fd == -1)
1047         return -1;
1048 
1049     ret = read_all(fd, line, sizeof(line) - 1);
1050     if (ret < 0) {
1051         close(fd);
1052         return -1;
1053     }
1054     line[ret] = '\0';
1055 
1056     sscanf(line, "%d %d ", &total, &rss);
1057     close(fd);
1058     return rss;
1059 }
1060 
1061 static char *proc_get_name(int pid, char *buf, size_t buf_size) {
1062     char path[PATH_MAX];
1063     int fd;
1064     char *cp;
1065     ssize_t ret;
1066 
1067     /* gid containing AID_READPROC required */
1068     snprintf(path, PATH_MAX, "/proc/%d/cmdline", pid);
1069     fd = open(path, O_RDONLY | O_CLOEXEC);
1070     if (fd == -1) {
1071         return NULL;
1072     }
1073     ret = read_all(fd, buf, buf_size - 1);
1074     close(fd);
1075     if (ret < 0) {
1076         return NULL;
1077     }
1078     buf[ret] = '\0';
1079 
1080     cp = strchr(buf, ' ');
1081     if (cp) {
1082         *cp = '\0';
1083     }
1084 
1085     return buf;
1086 }
1087 
1088 static void cmd_procprio(LMKD_CTRL_PACKET packet, int field_count, struct ucred *cred) {
1089     struct proc *procp;
1090     char path[LINE_MAX];
1091     char val[20];
1092     int soft_limit_mult;
1093     struct lmk_procprio params;
1094     bool is_system_server;
1095     struct passwd *pwdrec;
1096     int64_t tgid;
1097     char buf[PAGE_SIZE];
1098 
1099     lmkd_pack_get_procprio(packet, field_count, &params);
1100 
1101     if (params.oomadj < OOM_SCORE_ADJ_MIN ||
1102         params.oomadj > OOM_SCORE_ADJ_MAX) {
1103         ALOGE("Invalid PROCPRIO oomadj argument %d", params.oomadj);
1104         return;
1105     }
1106 
1107     if (params.ptype < PROC_TYPE_FIRST || params.ptype >= PROC_TYPE_COUNT) {
1108         ALOGE("Invalid PROCPRIO process type argument %d", params.ptype);
1109         return;
1110     }
1111 
1112     /* Check if registered process is a thread group leader */
1113     if (read_proc_status(params.pid, buf, sizeof(buf))) {
1114         if (parse_status_tag(buf, PROC_STATUS_TGID_FIELD, &tgid) && tgid != params.pid) {
1115             ALOGE("Attempt to register a task that is not a thread group leader "
1116                   "(tid %d, tgid %" PRId64 ")", params.pid, tgid);
1117             return;
1118         }
1119     }
1120 
1121     /* gid containing AID_READPROC required */
1122     /* CAP_SYS_RESOURCE required */
1123     /* CAP_DAC_OVERRIDE required */
1124     snprintf(path, sizeof(path), "/proc/%d/oom_score_adj", params.pid);
1125     snprintf(val, sizeof(val), "%d", params.oomadj);
1126     if (!writefilestring(path, val, false)) {
1127         ALOGW("Failed to open %s; errno=%d: process %d might have been killed",
1128               path, errno, params.pid);
1129         /* If this file does not exist the process is dead. */
1130         return;
1131     }
1132 
1133     if (use_inkernel_interface) {
1134         stats_store_taskname(params.pid, proc_get_name(params.pid, path, sizeof(path)));
1135         return;
1136     }
1137 
1138     /* lmkd should not change soft limits for services */
1139     if (params.ptype == PROC_TYPE_APP && per_app_memcg) {
1140         if (params.oomadj >= 900) {
1141             soft_limit_mult = 0;
1142         } else if (params.oomadj >= 800) {
1143             soft_limit_mult = 0;
1144         } else if (params.oomadj >= 700) {
1145             soft_limit_mult = 0;
1146         } else if (params.oomadj >= 600) {
1147             // Launcher should be perceptible, don't kill it.
1148             params.oomadj = 200;
1149             soft_limit_mult = 1;
1150         } else if (params.oomadj >= 500) {
1151             soft_limit_mult = 0;
1152         } else if (params.oomadj >= 400) {
1153             soft_limit_mult = 0;
1154         } else if (params.oomadj >= 300) {
1155             soft_limit_mult = 1;
1156         } else if (params.oomadj >= 200) {
1157             soft_limit_mult = 8;
1158         } else if (params.oomadj >= 100) {
1159             soft_limit_mult = 10;
1160         } else if (params.oomadj >=   0) {
1161             soft_limit_mult = 20;
1162         } else {
1163             // Persistent processes will have a large
1164             // soft limit 512MB.
1165             soft_limit_mult = 64;
1166         }
1167 
1168         snprintf(path, sizeof(path), MEMCG_SYSFS_PATH
1169                  "apps/uid_%d/pid_%d/memory.soft_limit_in_bytes",
1170                  params.uid, params.pid);
1171         snprintf(val, sizeof(val), "%d", soft_limit_mult * EIGHT_MEGA);
1172 
1173         /*
1174          * system_server process has no memcg under /dev/memcg/apps but should be
1175          * registered with lmkd. This is the best way so far to identify it.
1176          */
1177         is_system_server = (params.oomadj == SYSTEM_ADJ &&
1178                             (pwdrec = getpwnam("system")) != NULL &&
1179                             params.uid == pwdrec->pw_uid);
1180         writefilestring(path, val, !is_system_server);
1181     }
1182 
1183     procp = pid_lookup(params.pid);
1184     if (!procp) {
1185         int pidfd = -1;
1186 
1187         if (pidfd_supported) {
1188             pidfd = TEMP_FAILURE_RETRY(pidfd_open(params.pid, 0));
1189             if (pidfd < 0) {
1190                 ALOGE("pidfd_open for pid %d failed; errno=%d", params.pid, errno);
1191                 return;
1192             }
1193         }
1194 
1195         procp = static_cast<struct proc*>(calloc(1, sizeof(struct proc)));
1196         if (!procp) {
1197             // Oh, the irony.  May need to rebuild our state.
1198             return;
1199         }
1200 
1201         procp->pid = params.pid;
1202         procp->pidfd = pidfd;
1203         procp->uid = params.uid;
1204         procp->reg_pid = cred->pid;
1205         procp->oomadj = params.oomadj;
1206         proc_insert(procp);
1207     } else {
1208         if (!claim_record(procp, cred->pid)) {
1209             char buf[LINE_MAX];
1210             char *taskname = proc_get_name(cred->pid, buf, sizeof(buf));
1211             /* Only registrant of the record can remove it */
1212             ALOGE("%s (%d, %d) attempts to modify a process registered by another client",
1213                 taskname ? taskname : "A process ", cred->uid, cred->pid);
1214             return;
1215         }
1216         proc_unslot(procp);
1217         procp->oomadj = params.oomadj;
1218         proc_slot(procp);
1219     }
1220 }
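/*
 * Soft-limit arithmetic from the branch above, for reference: the value
 * written is soft_limit_mult * EIGHT_MEGA (8 MiB), e.g. 20 * 8 MiB = 160 MiB
 * for oomadj 0..99 and 64 * 8 MiB = 512 MiB for persistent (negative oomadj)
 * processes, matching the "512MB" note in the code.
 */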
1221 
1222 static void cmd_procremove(LMKD_CTRL_PACKET packet, struct ucred *cred) {
1223     struct lmk_procremove params;
1224     struct proc *procp;
1225 
1226     lmkd_pack_get_procremove(packet, &params);
1227 
1228     if (use_inkernel_interface) {
1229         /*
1230          * Perform an extra check before the pid is removed, after which it
1231          * will be impossible for poll_kernel to get the taskname. poll_kernel()
1232          * is potentially a long-running blocking function; however, this method
1233          * handles AMS requests without blocking AMS.
1234          */
1235         poll_kernel(kpoll_fd);
1236 
1237         stats_remove_taskname(params.pid);
1238         return;
1239     }
1240 
1241     procp = pid_lookup(params.pid);
1242     if (!procp) {
1243         return;
1244     }
1245 
1246     if (!claim_record(procp, cred->pid)) {
1247         char buf[LINE_MAX];
1248         char *taskname = proc_get_name(cred->pid, buf, sizeof(buf));
1249         /* Only registrant of the record can remove it */
1250         ALOGE("%s (%d, %d) attempts to unregister a process registered by another client",
1251             taskname ? taskname : "A process ", cred->uid, cred->pid);
1252         return;
1253     }
1254 
1255     /*
1256      * WARNING: After pid_remove() procp is freed and can't be used!
1257      * Therefore placed at the end of the function.
1258      */
1259     pid_remove(params.pid);
1260 }
1261 
1262 static void cmd_procpurge(struct ucred *cred) {
1263     int i;
1264     struct proc *procp;
1265     struct proc *next;
1266 
1267     if (use_inkernel_interface) {
1268         stats_purge_tasknames();
1269         return;
1270     }
1271 
1272     for (i = 0; i < PIDHASH_SZ; i++) {
1273         procp = pidhash[i];
1274         while (procp) {
1275             next = procp->pidhash_next;
1276             /* Purge only records created by the requestor */
1277             if (claim_record(procp, cred->pid)) {
1278                 pid_remove(procp->pid);
1279             }
1280             procp = next;
1281         }
1282     }
1283 }
1284 
1285 static void cmd_subscribe(int dsock_idx, LMKD_CTRL_PACKET packet) {
1286     struct lmk_subscribe params;
1287 
1288     lmkd_pack_get_subscribe(packet, &params);
1289     data_sock[dsock_idx].async_event_mask |= 1 << params.evt_type;
1290 }
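/*
 * For illustration: a client that sends LMK_SUBSCRIBE with
 * evt_type == LMK_ASYNC_EVENT_KILL sets that bit in async_event_mask, so
 * ctrl_data_write_lmk_kill_occurred() above starts forwarding kill
 * notifications over this data socket.
 */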
1291 
1292 static void inc_killcnt(int oomadj) {
1293     int slot = ADJTOSLOT(oomadj);
1294     uint8_t idx = killcnt_idx[slot];
1295 
1296     if (idx == KILLCNT_INVALID_IDX) {
1297         /* index is not assigned for this oomadj */
1298         if (killcnt_free_idx < MAX_DISTINCT_OOM_ADJ) {
1299             killcnt_idx[slot] = killcnt_free_idx;
1300             killcnt[killcnt_free_idx] = 1;
1301             killcnt_free_idx++;
1302         } else {
1303             ALOGW("Number of distinct oomadj levels exceeds %d",
1304                 MAX_DISTINCT_OOM_ADJ);
1305         }
1306     } else {
1307         /*
1308          * wraparound is highly unlikely and is detectable using total
1309          * counter because it has to be equal to the sum of all counters
1310          */
1311         killcnt[idx]++;
1312     }
1313     /* increment total kill counter */
1314     killcnt_total++;
1315 }
1316 
1317 static int get_killcnt(int min_oomadj, int max_oomadj) {
1318     int slot;
1319     int count = 0;
1320 
1321     if (min_oomadj > max_oomadj)
1322         return 0;
1323 
1324     /* special case to get total kill count */
1325     if (min_oomadj > OOM_SCORE_ADJ_MAX)
1326         return killcnt_total;
1327 
1328     while (min_oomadj <= max_oomadj &&
1329            (slot = ADJTOSLOT(min_oomadj)) < ADJTOSLOT_COUNT) {
1330         uint8_t idx = killcnt_idx[slot];
1331         if (idx != KILLCNT_INVALID_IDX) {
1332             count += killcnt[idx];
1333         }
1334         min_oomadj++;
1335     }
1336 
1337     return count;
1338 }
1339 
1340 static int cmd_getkillcnt(LMKD_CTRL_PACKET packet) {
1341     struct lmk_getkillcnt params;
1342 
1343     if (use_inkernel_interface) {
1344         /* kernel driver does not expose this information */
1345         return 0;
1346     }
1347 
1348     lmkd_pack_get_getkillcnt(packet, &params);
1349 
1350     return get_killcnt(params.min_oomadj, params.max_oomadj);
1351 }
1352 
1353 static void cmd_target(int ntargets, LMKD_CTRL_PACKET packet) {
1354     int i;
1355     struct lmk_target target;
1356     char minfree_str[PROPERTY_VALUE_MAX];
1357     char *pstr = minfree_str;
1358     char *pend = minfree_str + sizeof(minfree_str);
1359     static struct timespec last_req_tm;
1360     struct timespec curr_tm;
1361 
1362     if (ntargets < 1 || ntargets > (int)ARRAY_SIZE(lowmem_adj))
1363         return;
1364 
1365     /*
1366      * Ratelimit minfree updates to once per TARGET_UPDATE_MIN_INTERVAL_MS
1367      * to prevent DoS attacks
1368      */
1369     if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
1370         ALOGE("Failed to get current time");
1371         return;
1372     }
1373 
1374     if (get_time_diff_ms(&last_req_tm, &curr_tm) <
1375         TARGET_UPDATE_MIN_INTERVAL_MS) {
1376         ALOGE("Ignoring frequent updated to lmkd limits");
1377         return;
1378     }
1379 
1380     last_req_tm = curr_tm;
1381 
1382     for (i = 0; i < ntargets; i++) {
1383         lmkd_pack_get_target(packet, i, &target);
1384         lowmem_minfree[i] = target.minfree;
1385         lowmem_adj[i] = target.oom_adj_score;
1386 
1387         pstr += snprintf(pstr, pend - pstr, "%d:%d,", target.minfree,
1388             target.oom_adj_score);
1389         if (pstr >= pend) {
1390             /* if no more space in the buffer then terminate the loop */
1391             pstr = pend;
1392             break;
1393         }
1394     }
1395 
1396     lowmem_targets_size = ntargets;
1397 
1398     /* Override the last extra comma */
1399     pstr[-1] = '\0';
1400     property_set("sys.lmk.minfree_levels", minfree_str);
1401 
1402     if (has_inkernel_module) {
1403         char minfreestr[128];
1404         char killpriostr[128];
1405 
1406         minfreestr[0] = '\0';
1407         killpriostr[0] = '\0';
1408 
1409         for (i = 0; i < lowmem_targets_size; i++) {
1410             char val[40];
1411 
1412             if (i) {
1413                 strlcat(minfreestr, ",", sizeof(minfreestr));
1414                 strlcat(killpriostr, ",", sizeof(killpriostr));
1415             }
1416 
1417             snprintf(val, sizeof(val), "%d", use_inkernel_interface ? lowmem_minfree[i] : 0);
1418             strlcat(minfreestr, val, sizeof(minfreestr));
1419             snprintf(val, sizeof(val), "%d", use_inkernel_interface ? lowmem_adj[i] : 0);
1420             strlcat(killpriostr, val, sizeof(killpriostr));
1421         }
1422 
1423         writefilestring(INKERNEL_MINFREE_PATH, minfreestr, true);
1424         writefilestring(INKERNEL_ADJ_PATH, killpriostr, true);
1425     }
1426 }
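/*
 * Illustrative result of the loop above: with two targets the property ends
 * up looking like sys.lmk.minfree_levels = "18432:0,23040:100"
 * (minfree:oom_adj_score pairs; the numeric values here are made up).
 */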
1427 
1428 static void ctrl_command_handler(int dsock_idx) {
1429     LMKD_CTRL_PACKET packet;
1430     struct ucred cred;
1431     int len;
1432     enum lmk_cmd cmd;
1433     int nargs;
1434     int targets;
1435     int kill_cnt;
1436     int result;
1437 
1438     len = ctrl_data_read(dsock_idx, (char *)packet, CTRL_PACKET_MAX_SIZE, &cred);
1439     if (len <= 0)
1440         return;
1441 
1442     if (len < (int)sizeof(int)) {
1443         ALOGE("Wrong control socket read length len=%d", len);
1444         return;
1445     }
1446 
1447     cmd = lmkd_pack_get_cmd(packet);
1448     nargs = len / sizeof(int) - 1;
1449     if (nargs < 0)
1450         goto wronglen;
1451 
1452     switch(cmd) {
1453     case LMK_TARGET:
1454         targets = nargs / 2;
1455         if (nargs & 0x1 || targets > (int)ARRAY_SIZE(lowmem_adj))
1456             goto wronglen;
1457         cmd_target(targets, packet);
1458         break;
1459     case LMK_PROCPRIO:
1460         /* process type field is optional for backward compatibility */
1461         if (nargs < 3 || nargs > 4)
1462             goto wronglen;
1463         cmd_procprio(packet, nargs, &cred);
1464         break;
1465     case LMK_PROCREMOVE:
1466         if (nargs != 1)
1467             goto wronglen;
1468         cmd_procremove(packet, &cred);
1469         break;
1470     case LMK_PROCPURGE:
1471         if (nargs != 0)
1472             goto wronglen;
1473         cmd_procpurge(&cred);
1474         break;
1475     case LMK_GETKILLCNT:
1476         if (nargs != 2)
1477             goto wronglen;
1478         kill_cnt = cmd_getkillcnt(packet);
1479         len = lmkd_pack_set_getkillcnt_repl(packet, kill_cnt);
1480         if (ctrl_data_write(dsock_idx, (char *)packet, len) != len)
1481             return;
1482         break;
1483     case LMK_SUBSCRIBE:
1484         if (nargs != 1)
1485             goto wronglen;
1486         cmd_subscribe(dsock_idx, packet);
1487         break;
1488     case LMK_PROCKILL:
1489         /* This command code is NOT expected at all */
1490         ALOGE("Received unexpected command code %d", cmd);
1491         break;
1492     case LMK_UPDATE_PROPS:
1493         if (nargs != 0)
1494             goto wronglen;
1495         update_props();
1496         if (!use_inkernel_interface) {
1497             /* Reinitialize monitors to apply new settings */
1498             destroy_monitors();
1499             result = init_monitors() ? 0 : -1;
1500         } else {
1501             result = 0;
1502         }
1503         len = lmkd_pack_set_update_props_repl(packet, result);
1504         if (ctrl_data_write(dsock_idx, (char *)packet, len) != len) {
1505             ALOGE("Failed to report operation results");
1506         }
1507         if (!result) {
1508             ALOGI("Properties reinitilized");
1509         } else {
1510             /* New settings can't be supported, crash to be restarted */
1511             ALOGE("New configuration is not supported. Exiting...");
1512             exit(1);
1513         }
1514         break;
1515     default:
1516         ALOGE("Received unknown command code %d", cmd);
1517         return;
1518     }
1519 
1520     return;
1521 
1522 wronglen:
1523     ALOGE("Wrong control socket read length cmd=%d len=%d", cmd, len);
1524 }
1525 
1526 static void ctrl_data_handler(int data, uint32_t events,
1527                               struct polling_params *poll_params __unused) {
1528     if (events & EPOLLIN) {
1529         ctrl_command_handler(data);
1530     }
1531 }
1532 
1533 static int get_free_dsock() {
1534     for (int i = 0; i < MAX_DATA_CONN; i++) {
1535         if (data_sock[i].sock < 0) {
1536             return i;
1537         }
1538     }
1539     return -1;
1540 }
1541 
1542 static void ctrl_connect_handler(int data __unused, uint32_t events __unused,
1543                                  struct polling_params *poll_params __unused) {
1544     struct epoll_event epev;
1545     int free_dscock_idx = get_free_dsock();
1546 
1547     if (free_dscock_idx < 0) {
1548         /*
1549          * Number of data connections exceeded max supported. This should not
1550          * happen but if it does we drop all existing connections and accept
1551          * the new one. This prevents inactive connections from monopolizing
1552          * data socket and if we drop ActivityManager connection it will
1553          * immediately reconnect.
1554          */
1555         for (int i = 0; i < MAX_DATA_CONN; i++) {
1556             ctrl_data_close(i);
1557         }
1558         free_dscock_idx = 0;
1559     }
1560 
1561     data_sock[free_dscock_idx].sock = accept(ctrl_sock.sock, NULL, NULL);
1562     if (data_sock[free_dscock_idx].sock < 0) {
1563         ALOGE("lmkd control socket accept failed; errno=%d", errno);
1564         return;
1565     }
1566 
1567     ALOGI("lmkd data connection established");
1568     /* use data to store data connection idx */
1569     data_sock[free_dscock_idx].handler_info.data = free_dscock_idx;
1570     data_sock[free_dscock_idx].handler_info.handler = ctrl_data_handler;
1571     data_sock[free_dscock_idx].async_event_mask = 0;
1572     epev.events = EPOLLIN;
1573     epev.data.ptr = (void *)&(data_sock[free_dscock_idx].handler_info);
1574     if (epoll_ctl(epollfd, EPOLL_CTL_ADD, data_sock[free_dscock_idx].sock, &epev) == -1) {
1575         ALOGE("epoll_ctl for data connection socket failed; errno=%d", errno);
1576         ctrl_data_close(free_dscock_idx);
1577         return;
1578     }
1579     maxevents++;
1580 }
1581 
1582 /*
1583  * /proc/zoneinfo parsing routines
1584  * Expected file format is:
1585  *
1586  *   Node <node_id>, zone   <zone_name>
1587  *   (
1588  *    per-node stats
1589  *       (<per-node field name> <value>)+
1590  *   )?
1591  *   (pages free     <value>
1592  *       (<per-zone field name> <value>)+
1593  *    pagesets
1594  *       (<unused fields>)*
1595  *   )+
1596  *   ...
1597  */
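/*
 * Abridged illustrative excerpt of /proc/zoneinfo matching the grammar above
 * (field values are made up):
 *
 *   Node 0, zone   Normal
 *     per-node stats
 *         nr_inactive_file 11234
 *         nr_active_file   22345
 *     pages free     98765
 *           min      12345
 *           low      15432
 *           high     18519
 *           ...
 *           present  1048576
 *           protection: (0, 0, 1024, 1024)
 *     pagesets
 *       ...
 */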
1598 static void zoneinfo_parse_protection(char *buf, struct zoneinfo_zone *zone) {
1599     int zone_idx;
1600     int64_t max = 0;
1601     char *save_ptr;
1602 
1603     for (buf = strtok_r(buf, "(), ", &save_ptr), zone_idx = 0;
1604          buf && zone_idx < MAX_NR_ZONES;
1605          buf = strtok_r(NULL, "), ", &save_ptr), zone_idx++) {
1606         long long zoneval = strtoll(buf, &buf, 0);
1607         if (zoneval > max) {
1608             max = (zoneval > INT64_MAX) ? INT64_MAX : zoneval;
1609         }
1610         zone->protection[zone_idx] = zoneval;
1611     }
1612     zone->max_protection = max;
1613 }
1614 
1615 static int zoneinfo_parse_zone(char **buf, struct zoneinfo_zone *zone) {
1616     for (char *line = strtok_r(NULL, "\n", buf); line;
1617          line = strtok_r(NULL, "\n", buf)) {
1618         char *cp;
1619         char *ap;
1620         char *save_ptr;
1621         int64_t val;
1622         int field_idx;
1623         enum field_match_result match_res;
1624 
1625         cp = strtok_r(line, " ", &save_ptr);
1626         if (!cp) {
1627             return false;
1628         }
1629 
1630         field_idx = find_field(cp, zoneinfo_zone_spec_field_names, ZI_ZONE_SPEC_FIELD_COUNT);
1631         if (field_idx >= 0) {
1632             /* special field */
1633             if (field_idx == ZI_ZONE_SPEC_PAGESETS) {
1634                 /* no more fields we are interested in */
1635                 return true;
1636             }
1637 
1638             /* protection field */
1639             ap = strtok_r(NULL, ")", &save_ptr);
1640             if (ap) {
1641                 zoneinfo_parse_protection(ap, zone);
1642             }
1643             continue;
1644         }
1645 
1646         ap = strtok_r(NULL, " ", &save_ptr);
1647         if (!ap) {
1648             continue;
1649         }
1650 
1651         match_res = match_field(cp, ap, zoneinfo_zone_field_names, ZI_ZONE_FIELD_COUNT,
1652             &val, &field_idx);
1653         if (match_res == PARSE_FAIL) {
1654             return false;
1655         }
1656         if (match_res == PARSE_SUCCESS) {
1657             zone->fields.arr[field_idx] = val;
1658         }
1659         if (field_idx == ZI_ZONE_PRESENT && val == 0) {
1660             /* zone is not populated, stop parsing it */
1661             return true;
1662         }
1663     }
1664     return false;
1665 }
1666 
1667 static int zoneinfo_parse_node(char **buf, struct zoneinfo_node *node) {
1668     int fields_to_match = ZI_NODE_FIELD_COUNT;
1669 
1670     for (char *line = strtok_r(NULL, "\n", buf); line;
1671          line = strtok_r(NULL, "\n", buf)) {
1672         char *cp;
1673         char *ap;
1674         char *save_ptr;
1675         int64_t val;
1676         int field_idx;
1677         enum field_match_result match_res;
1678 
1679         cp = strtok_r(line, " ", &save_ptr);
1680         if (!cp) {
1681             return false;
1682         }
1683 
1684         ap = strtok_r(NULL, " ", &save_ptr);
1685         if (!ap) {
1686             return false;
1687         }
1688 
1689         match_res = match_field(cp, ap, zoneinfo_node_field_names, ZI_NODE_FIELD_COUNT,
1690             &val, &field_idx);
1691         if (match_res == PARSE_FAIL) {
1692             return false;
1693         }
1694         if (match_res == PARSE_SUCCESS) {
1695             node->fields.arr[field_idx] = val;
1696             fields_to_match--;
1697             if (!fields_to_match) {
1698                 return true;
1699             }
1700         }
1701     }
1702     return false;
1703 }
1704 
1705 static int zoneinfo_parse(struct zoneinfo *zi) {
1706     static struct reread_data file_data = {
1707         .filename = ZONEINFO_PATH,
1708         .fd = -1,
1709     };
1710     char *buf;
1711     char *save_ptr;
1712     char *line;
1713     char zone_name[LINE_MAX + 1];
1714     struct zoneinfo_node *node = NULL;
1715     int node_idx = 0;
1716     int zone_idx = 0;
1717 
1718     memset(zi, 0, sizeof(struct zoneinfo));
1719 
1720     if ((buf = reread_file(&file_data)) == NULL) {
1721         return -1;
1722     }
1723 
1724     for (line = strtok_r(buf, "\n", &save_ptr); line;
1725          line = strtok_r(NULL, "\n", &save_ptr)) {
1726         int node_id;
1727         if (sscanf(line, "Node %d, zone %" STRINGIFY(LINE_MAX) "s", &node_id, zone_name) == 2) {
1728             if (!node || node->id != node_id) {
1729                 /* new node is found */
1730                 if (node) {
1731                     node->zone_count = zone_idx + 1;
1732                     node_idx++;
1733                     if (node_idx == MAX_NR_NODES) {
1734                         /* max node count exceeded */
1735                         ALOGE("%s parse error", file_data.filename);
1736                         return -1;
1737                     }
1738                 }
1739                 node = &zi->nodes[node_idx];
1740                 node->id = node_id;
1741                 zone_idx = 0;
1742                 if (!zoneinfo_parse_node(&save_ptr, node)) {
1743                     ALOGE("%s parse error", file_data.filename);
1744                     return -1;
1745                 }
1746             } else {
1747                 /* new zone is found */
1748                 zone_idx++;
1749             }
1750             if (!zoneinfo_parse_zone(&save_ptr, &node->zones[zone_idx])) {
1751                 ALOGE("%s parse error", file_data.filename);
1752                 return -1;
1753             }
1754         }
1755     }
1756     if (!node) {
1757         ALOGE("%s parse error", file_data.filename);
1758         return -1;
1759     }
1760     node->zone_count = zone_idx + 1;
1761     zi->node_count = node_idx + 1;
1762 
1763     /* calculate totals fields */
1764     for (node_idx = 0; node_idx < zi->node_count; node_idx++) {
1765         node = &zi->nodes[node_idx];
1766         for (zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
1767             struct zoneinfo_zone *zone = &zi->nodes[node_idx].zones[zone_idx];
1768             zi->totalreserve_pages += zone->max_protection + zone->fields.field.high;
1769         }
1770         zi->total_inactive_file += node->fields.field.nr_inactive_file;
1771         zi->total_active_file += node->fields.field.nr_active_file;
1772     }
1773     return 0;
1774 }
1775 
1776 /* /proc/meminfo parsing routines */
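/*
 * Illustrative /proc/meminfo excerpt (values are made up): each line is a
 * "<Name>: <value> kB" pair, and matched values are converted from kB to
 * pages by dividing by page_k in meminfo_parse_line() below.
 *
 *   MemTotal:        3809036 kB
 *   MemFree:          220844 kB
 *   Buffers:           76620 kB
 *   Cached:           917092 kB
 */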
1777 static bool meminfo_parse_line(char *line, union meminfo *mi) {
1778     char *cp = line;
1779     char *ap;
1780     char *save_ptr;
1781     int64_t val;
1782     int field_idx;
1783     enum field_match_result match_res;
1784 
1785     cp = strtok_r(line, " ", &save_ptr);
1786     if (!cp) {
1787         return false;
1788     }
1789 
1790     ap = strtok_r(NULL, " ", &save_ptr);
1791     if (!ap) {
1792         return false;
1793     }
1794 
1795     match_res = match_field(cp, ap, meminfo_field_names, MI_FIELD_COUNT,
1796         &val, &field_idx);
1797     if (match_res == PARSE_SUCCESS) {
1798         mi->arr[field_idx] = val / page_k;
1799     }
1800     return (match_res != PARSE_FAIL);
1801 }
1802 
1803 static int64_t read_gpu_total_kb() {
1804     static int fd = android::bpf::bpfFdGet(
1805             "/sys/fs/bpf/map_gpu_mem_gpu_mem_total_map", BPF_F_RDONLY);
1806     static constexpr uint64_t kBpfKeyGpuTotalUsage = 0;
1807     uint64_t value;
1808 
1809     if (fd < 0) {
1810         return 0;
1811     }
1812 
1813     return android::bpf::findMapEntry(fd, &kBpfKeyGpuTotalUsage, &value)
1814             ? 0
1815             : (int32_t)(value / 1024);
1816 }
1817 
1818 static int meminfo_parse(union meminfo *mi) {
1819     static struct reread_data file_data = {
1820         .filename = MEMINFO_PATH,
1821         .fd = -1,
1822     };
1823     char *buf;
1824     char *save_ptr;
1825     char *line;
1826 
1827     memset(mi, 0, sizeof(union meminfo));
1828 
1829     if ((buf = reread_file(&file_data)) == NULL) {
1830         return -1;
1831     }
1832 
1833     for (line = strtok_r(buf, "\n", &save_ptr); line;
1834          line = strtok_r(NULL, "\n", &save_ptr)) {
1835         if (!meminfo_parse_line(line, mi)) {
1836             ALOGE("%s parse error", file_data.filename);
1837             return -1;
1838         }
1839     }
1840     mi->field.nr_file_pages = mi->field.cached + mi->field.swap_cached +
1841         mi->field.buffers;
1842     mi->field.total_gpu_kb = read_gpu_total_kb();
1843 
1844     return 0;
1845 }
1846 
1847 /* /proc/vmstat parsing routines */
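/*
 * Illustrative /proc/vmstat excerpt (values are made up): each line is a
 * "<name> <value>" pair and matched values are stored as-is, with no unit
 * conversion.
 *
 *   nr_free_pages 55211
 *   nr_inactive_file 116077
 *   pgscan_kswapd 312094
 *   workingset_refault_file 40638
 */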
1848 static bool vmstat_parse_line(char *line, union vmstat *vs) {
1849     char *cp;
1850     char *ap;
1851     char *save_ptr;
1852     int64_t val;
1853     int field_idx;
1854     enum field_match_result match_res;
1855 
1856     cp = strtok_r(line, " ", &save_ptr);
1857     if (!cp) {
1858         return false;
1859     }
1860 
1861     ap = strtok_r(NULL, " ", &save_ptr);
1862     if (!ap) {
1863         return false;
1864     }
1865 
1866     match_res = match_field(cp, ap, vmstat_field_names, VS_FIELD_COUNT,
1867         &val, &field_idx);
1868     if (match_res == PARSE_SUCCESS) {
1869         vs->arr[field_idx] = val;
1870     }
1871     return (match_res != PARSE_FAIL);
1872 }
1873 
1874 static int vmstat_parse(union vmstat *vs) {
1875     static struct reread_data file_data = {
1876         .filename = VMSTAT_PATH,
1877         .fd = -1,
1878     };
1879     char *buf;
1880     char *save_ptr;
1881     char *line;
1882 
1883     memset(vs, 0, sizeof(union vmstat));
1884 
1885     if ((buf = reread_file(&file_data)) == NULL) {
1886         return -1;
1887     }
1888 
1889     for (line = strtok_r(buf, "\n", &save_ptr); line;
1890          line = strtok_r(NULL, "\n", &save_ptr)) {
1891         if (!vmstat_parse_line(line, vs)) {
1892             ALOGE("%s parse error", file_data.filename);
1893             return -1;
1894         }
1895     }
1896 
1897     return 0;
1898 }
1899 
1900 enum wakeup_reason {
1901     Event,
1902     Polling
1903 };
1904 
1905 struct wakeup_info {
1906     struct timespec wakeup_tm;
1907     struct timespec prev_wakeup_tm;
1908     struct timespec last_event_tm;
1909     int wakeups_since_event;
1910     int skipped_wakeups;
1911 };
1912 
1913 /*
1914  * After the initial memory pressure event is received, lmkd schedules periodic wakeups to check
1915  * the memory conditions and kill if needed (polling). This is done because pressure events are
1916  * rate-limited and memory conditions can change in between events. Therefore, after the initial
1917  * event there might be multiple wakeups. This function records wakeup information such as the
1918  * timestamps of the last event and the last wakeup, the number of wakeups since the last event,
1919  * and how many of those wakeups were skipped (some wakeups are skipped if a previously killed
1920  * process is still freeing its memory).
1921  */
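/*
 * Illustrative sequence (timings are made up): a PSI event arrives at t=0
 * (reason == Event, so the counters reset), polling wakeups then fire at
 * t=200ms and t=400ms (wakeups_since_event becomes 1 and 2), and when a
 * wakeup is skipped because the previous victim is still releasing memory
 * the caller additionally increments skipped_wakeups.
 */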
1922 static void record_wakeup_time(struct timespec *tm, enum wakeup_reason reason,
1923                                struct wakeup_info *wi) {
1924     wi->prev_wakeup_tm = wi->wakeup_tm;
1925     wi->wakeup_tm = *tm;
1926     if (reason == Event) {
1927         wi->last_event_tm = *tm;
1928         wi->wakeups_since_event = 0;
1929         wi->skipped_wakeups = 0;
1930     } else {
1931         wi->wakeups_since_event++;
1932     }
1933 }
1934 
1935 static void killinfo_log(struct proc* procp, int min_oom_score, int rss_kb,
1936                          int swap_kb, int kill_reason, union meminfo *mi,
1937                          struct wakeup_info *wi, struct timespec *tm) {
1938     /* log process information */
1939     android_log_write_int32(ctx, procp->pid);
1940     android_log_write_int32(ctx, procp->uid);
1941     android_log_write_int32(ctx, procp->oomadj);
1942     android_log_write_int32(ctx, min_oom_score);
1943     android_log_write_int32(ctx, (int32_t)min(rss_kb, INT32_MAX));
1944     android_log_write_int32(ctx, kill_reason);
1945 
1946     /* log meminfo fields */
1947     for (int field_idx = 0; field_idx < MI_FIELD_COUNT; field_idx++) {
1948         android_log_write_int32(ctx, (int32_t)min(mi->arr[field_idx] * page_k, INT32_MAX));
1949     }
1950 
1951     /* log lmkd wakeup information */
1952     android_log_write_int32(ctx, (int32_t)get_time_diff_ms(&wi->last_event_tm, tm));
1953     android_log_write_int32(ctx, (int32_t)get_time_diff_ms(&wi->prev_wakeup_tm, tm));
1954     android_log_write_int32(ctx, wi->wakeups_since_event);
1955     android_log_write_int32(ctx, wi->skipped_wakeups);
1956     android_log_write_int32(ctx, (int32_t)min(swap_kb, INT32_MAX));
1957     android_log_write_int32(ctx, (int32_t)mi->field.total_gpu_kb);
1958 
1959     android_log_write_list(ctx, LOG_ID_EVENTS);
1960     android_log_reset(ctx);
1961 }
1962 
1963 static struct proc *proc_adj_lru(int oomadj) {
1964     return (struct proc *)adjslot_tail(&procadjslot_list[ADJTOSLOT(oomadj)]);
1965 }
1966 
1967 static struct proc *proc_get_heaviest(int oomadj) {
1968     struct adjslot_list *head = &procadjslot_list[ADJTOSLOT(oomadj)];
1969     struct adjslot_list *curr = head->next;
1970     struct proc *maxprocp = NULL;
1971     int maxsize = 0;
1972     while (curr != head) {
1973         int pid = ((struct proc *)curr)->pid;
1974         int tasksize = proc_get_size(pid);
1975         if (tasksize < 0) {
1976             struct adjslot_list *next = curr->next;
1977             pid_remove(pid);
1978             curr = next;
1979         } else {
1980             if (tasksize > maxsize) {
1981                 maxsize = tasksize;
1982                 maxprocp = (struct proc *)curr;
1983             }
1984             curr = curr->next;
1985         }
1986     }
1987     return maxprocp;
1988 }
1989 
1990 static void set_process_group_and_prio(int pid, SchedPolicy sp, int prio) {
1991     DIR* d;
1992     char proc_path[PATH_MAX];
1993     struct dirent* de;
1994 
1995     snprintf(proc_path, sizeof(proc_path), "/proc/%d/task", pid);
1996     if (!(d = opendir(proc_path))) {
1997         ALOGW("Failed to open %s; errno=%d: process pid(%d) might have died", proc_path, errno,
1998               pid);
1999         return;
2000     }
2001 
2002     while ((de = readdir(d))) {
2003         int t_pid;
2004 
2005         if (de->d_name[0] == '.') continue;
2006         t_pid = atoi(de->d_name);
2007 
2008         if (!t_pid) {
2009             ALOGW("Failed to get t_pid for '%s' of pid(%d)", de->d_name, pid);
2010             continue;
2011         }
2012 
2013         if (setpriority(PRIO_PROCESS, t_pid, prio) && errno != ESRCH) {
2014             ALOGW("Unable to raise priority of killing t_pid (%d): errno=%d", t_pid, errno);
2015         }
2016 
2017         if (set_cpuset_policy(t_pid, sp)) {
2018             ALOGW("Failed to set_cpuset_policy on pid(%d) t_pid(%d) to %d", pid, t_pid, (int)sp);
2019             continue;
2020         }
2021     }
2022     closedir(d);
2023 }
2024 
2025 static bool is_kill_pending(void) {
2026     char buf[24];
2027 
2028     if (last_kill_pid_or_fd < 0) {
2029         return false;
2030     }
2031 
2032     if (pidfd_supported) {
2033         return true;
2034     }
2035 
2036     /* when pidfd is not supported base the decision on /proc/<pid> existence */
2037     snprintf(buf, sizeof(buf), "/proc/%d/", last_kill_pid_or_fd);
2038     if (access(buf, F_OK) == 0) {
2039         return true;
2040     }
2041 
2042     return false;
2043 }
2044 
2045 static bool is_waiting_for_kill(void) {
2046     return pidfd_supported && last_kill_pid_or_fd >= 0;
2047 }
2048 
2049 static void stop_wait_for_proc_kill(bool finished) {
2050     struct epoll_event epev;
2051 
2052     if (last_kill_pid_or_fd < 0) {
2053         return;
2054     }
2055 
2056     if (debug_process_killing) {
2057         struct timespec curr_tm;
2058 
2059         if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
2060             /*
2061              * curr_tm is used here merely to report kill duration, so this failure is not fatal.
2062              * Log an error and continue.
2063              */
2064             ALOGE("Failed to get current time");
2065         }
2066 
2067         if (finished) {
2068             ALOGI("Process got killed in %ldms",
2069                 get_time_diff_ms(&last_kill_tm, &curr_tm));
2070         } else {
2071             ALOGI("Stop waiting for process kill after %ldms",
2072                 get_time_diff_ms(&last_kill_tm, &curr_tm));
2073         }
2074     }
2075 
2076     if (pidfd_supported) {
2077         /* unregister fd */
2078         if (epoll_ctl(epollfd, EPOLL_CTL_DEL, last_kill_pid_or_fd, &epev)) {
2079             // Log an error and keep going
2080             ALOGE("epoll_ctl for last killed process failed; errno=%d", errno);
2081         }
2082         maxevents--;
2083         close(last_kill_pid_or_fd);
2084     }
2085 
2086     last_kill_pid_or_fd = -1;
2087 }
2088 
2089 static void kill_done_handler(int data __unused, uint32_t events __unused,
2090                               struct polling_params *poll_params) {
2091     stop_wait_for_proc_kill(true);
2092     poll_params->update = POLLING_RESUME;
2093 }
2094 
2095 static void start_wait_for_proc_kill(int pid_or_fd) {
2096     static struct event_handler_info kill_done_hinfo = { 0, kill_done_handler };
2097     struct epoll_event epev;
2098 
2099     if (last_kill_pid_or_fd >= 0) {
2100         /* Should not happen but if it does we should stop previous wait */
2101         ALOGE("Attempt to wait for a kill while another wait is in progress");
2102         stop_wait_for_proc_kill(false);
2103     }
2104 
2105     last_kill_pid_or_fd = pid_or_fd;
2106 
2107     if (!pidfd_supported) {
2108         /* If pidfd is not supported just store PID and exit */
2109         return;
2110     }
2111 
2112     epev.events = EPOLLIN;
2113     epev.data.ptr = (void *)&kill_done_hinfo;
2114     if (epoll_ctl(epollfd, EPOLL_CTL_ADD, last_kill_pid_or_fd, &epev) != 0) {
2115         ALOGE("epoll_ctl for last kill failed; errno=%d", errno);
2116         close(last_kill_pid_or_fd);
2117         last_kill_pid_or_fd = -1;
2118         return;
2119     }
2120     maxevents++;
2121 }
2122 
2123 struct kill_info {
2124     enum kill_reasons kill_reason;
2125     const char *kill_desc;
2126     int thrashing;
2127     int max_thrashing;
2128 };
2129 
2130 /* Kill one process specified by procp.  Returns the size (in pages) of the process killed */
2131 static int kill_one_process(struct proc* procp, int min_oom_score, struct kill_info *ki,
2132                             union meminfo *mi, struct wakeup_info *wi, struct timespec *tm) {
2133     int pid = procp->pid;
2134     int pidfd = procp->pidfd;
2135     uid_t uid = procp->uid;
2136     char *taskname;
2137     int r;
2138     int result = -1;
2139     struct memory_stat *mem_st;
2140     struct kill_stat kill_st;
2141     int64_t tgid;
2142     int64_t rss_kb;
2143     int64_t swap_kb;
2144     char buf[PAGE_SIZE];
2145 
2146     if (!read_proc_status(pid, buf, sizeof(buf))) {
2147         goto out;
2148     }
2149     if (!parse_status_tag(buf, PROC_STATUS_TGID_FIELD, &tgid)) {
2150         ALOGE("Unable to parse tgid from /proc/%d/status", pid);
2151         goto out;
2152     }
2153     if (tgid != pid) {
2154         ALOGE("Possible pid reuse detected (pid %d, tgid %" PRId64 ")!", pid, tgid);
2155         goto out;
2156     }
2157     // Zombie processes will not have RSS / Swap fields.
2158     if (!parse_status_tag(buf, PROC_STATUS_RSS_FIELD, &rss_kb)) {
2159         goto out;
2160     }
2161     if (!parse_status_tag(buf, PROC_STATUS_SWAP_FIELD, &swap_kb)) {
2162         goto out;
2163     }
2164 
2165     taskname = proc_get_name(pid, buf, sizeof(buf));
2166     // taskname will point inside buf, do not reuse buf onwards.
2167     if (!taskname) {
2168         goto out;
2169     }
2170 
2171     mem_st = stats_read_memory_stat(per_app_memcg, pid, uid, rss_kb * 1024, swap_kb * 1024);
2172 
2173     TRACE_KILL_START(pid);
2174 
2175     /* CAP_KILL required */
2176     if (pidfd < 0) {
2177         start_wait_for_proc_kill(pid);
2178         r = kill(pid, SIGKILL);
2179     } else {
2180         start_wait_for_proc_kill(pidfd);
2181         r = pidfd_send_signal(pidfd, SIGKILL, NULL, 0);
2182     }
2183 
2184     TRACE_KILL_END();
2185 
2186     if (r) {
2187         stop_wait_for_proc_kill(false);
2188         ALOGE("kill(%d): errno=%d", pid, errno);
2189         /* Delete process record even when we fail to kill so that we don't get stuck on it */
2190         goto out;
2191     }
2192 
2193     set_process_group_and_prio(pid, SP_FOREGROUND, ANDROID_PRIORITY_HIGHEST);
2194 
2195     last_kill_tm = *tm;
2196 
2197     inc_killcnt(procp->oomadj);
2198 
2199     if (ki) {
2200         kill_st.kill_reason = ki->kill_reason;
2201         kill_st.thrashing = ki->thrashing;
2202         kill_st.max_thrashing = ki->max_thrashing;
2203         killinfo_log(procp, min_oom_score, rss_kb, swap_kb, ki->kill_reason, mi, wi, tm);
2204         ALOGI("Kill '%s' (%d), uid %d, oom_score_adj %d to free %" PRId64 "kB rss, %" PRId64
2205               "kB swap; reason: %s", taskname, pid, uid, procp->oomadj, rss_kb, swap_kb,
2206               ki->kill_desc);
2207     } else {
2208         kill_st.kill_reason = NONE;
2209         kill_st.thrashing = 0;
2210         kill_st.max_thrashing = 0;
2211         killinfo_log(procp, min_oom_score, rss_kb, swap_kb, NONE, mi, wi, tm);
2212         ALOGI("Kill '%s' (%d), uid %d, oom_score_adj %d to free %" PRId64 "kB rss, %" PRId64
2213               "kB swap", taskname, pid, uid, procp->oomadj, rss_kb, swap_kb);
2214     }
2215 
2216     kill_st.uid = static_cast<int32_t>(uid);
2217     kill_st.taskname = taskname;
2218     kill_st.oom_score = procp->oomadj;
2219     kill_st.min_oom_score = min_oom_score;
2220     kill_st.free_mem_kb = mi->field.nr_free_pages * page_k;
2221     kill_st.free_swap_kb = mi->field.free_swap * page_k;
2222     stats_write_lmk_kill_occurred(&kill_st, mem_st);
2223 
2224     ctrl_data_write_lmk_kill_occurred((pid_t)pid, uid);
2225 
2226     result = rss_kb / page_k;
2227 
2228 out:
2229     /*
2230      * WARNING: After pid_remove() procp is freed and can't be used!
2231      * Therefore placed at the end of the function.
2232      */
2233     pid_remove(pid);
2234     return result;
2235 }
2236 
2237 /*
2238  * Find one process to kill at or above the given oom_score_adj level.
2239  * Returns size of the killed process.
2240  */
2241 static int find_and_kill_process(int min_score_adj, struct kill_info *ki, union meminfo *mi,
2242                                  struct wakeup_info *wi, struct timespec *tm) {
2243     int i;
2244     int killed_size = 0;
2245     bool lmk_state_change_start = false;
2246     bool choose_heaviest_task = kill_heaviest_task;
2247 
2248     for (i = OOM_SCORE_ADJ_MAX; i >= min_score_adj; i--) {
2249         struct proc *procp;
2250 
2251         if (!choose_heaviest_task && i <= PERCEPTIBLE_APP_ADJ) {
2252             /*
2253              * If we have to choose a perceptible process, choose the heaviest one to
2254              * hopefully minimize the number of victims.
2255              */
2256             choose_heaviest_task = true;
2257         }
2258 
2259         while (true) {
2260             procp = choose_heaviest_task ?
2261                 proc_get_heaviest(i) : proc_adj_lru(i);
2262 
2263             if (!procp)
2264                 break;
2265 
2266             killed_size = kill_one_process(procp, min_score_adj, ki, mi, wi, tm);
2267             if (killed_size >= 0) {
2268                 if (!lmk_state_change_start) {
2269                     lmk_state_change_start = true;
2270                     stats_write_lmk_state_changed(STATE_START);
2271                 }
2272                 break;
2273             }
2274         }
2275         if (killed_size) {
2276             break;
2277         }
2278     }
2279 
2280     if (lmk_state_change_start) {
2281         stats_write_lmk_state_changed(STATE_STOP);
2282     }
2283 
2284     return killed_size;
2285 }
2286 
2287 static int64_t get_memory_usage(struct reread_data *file_data) {
2288     int64_t mem_usage;
2289     char *buf;
2290 
2291     if ((buf = reread_file(file_data)) == NULL) {
2292         return -1;
2293     }
2294 
2295     if (!parse_int64(buf, &mem_usage)) {
2296         ALOGE("%s parse error", file_data->filename);
2297         return -1;
2298     }
2299     if (mem_usage == 0) {
2300         ALOGE("No memory!");
2301         return -1;
2302     }
2303     return mem_usage;
2304 }
2305 
2306 void record_low_pressure_levels(union meminfo *mi) {
2307     if (low_pressure_mem.min_nr_free_pages == -1 ||
2308         low_pressure_mem.min_nr_free_pages > mi->field.nr_free_pages) {
2309         if (debug_process_killing) {
2310             ALOGI("Low pressure min memory update from %" PRId64 " to %" PRId64,
2311                 low_pressure_mem.min_nr_free_pages, mi->field.nr_free_pages);
2312         }
2313         low_pressure_mem.min_nr_free_pages = mi->field.nr_free_pages;
2314     }
2315     /*
2316      * Free memory at low vmpressure events occasionally spikes, possibly due
2317      * to a stale low vmpressure event arriving after the memory was already
2318      * freed up (in which case no memory pressure should have been reported).
2319      * Ignore large jumps in max_nr_free_pages that would mess up our stats.
2320      */
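    /*
     * Worked example (illustrative numbers): with max_nr_free_pages at 50000
     * pages, only readings below 55000 pages (a jump of less than 10%) can
     * move the maximum up; anything larger is treated as a spike and ignored.
     */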
2321     if (low_pressure_mem.max_nr_free_pages == -1 ||
2322         (low_pressure_mem.max_nr_free_pages < mi->field.nr_free_pages &&
2323          mi->field.nr_free_pages - low_pressure_mem.max_nr_free_pages <
2324          low_pressure_mem.max_nr_free_pages * 0.1)) {
2325         if (debug_process_killing) {
2326             ALOGI("Low pressure max memory update from %" PRId64 " to %" PRId64,
2327                 low_pressure_mem.max_nr_free_pages, mi->field.nr_free_pages);
2328         }
2329         low_pressure_mem.max_nr_free_pages = mi->field.nr_free_pages;
2330     }
2331 }
2332 
2333 enum vmpressure_level upgrade_level(enum vmpressure_level level) {
2334     return (enum vmpressure_level)((level < VMPRESS_LEVEL_CRITICAL) ?
2335         level + 1 : level);
2336 }
2337 
2338 enum vmpressure_level downgrade_level(enum vmpressure_level level) {
2339     return (enum vmpressure_level)((level > VMPRESS_LEVEL_LOW) ?
2340         level - 1 : level);
2341 }
2342 
2343 enum zone_watermark {
2344     WMARK_MIN = 0,
2345     WMARK_LOW,
2346     WMARK_HIGH,
2347     WMARK_NONE
2348 };
2349 
2350 struct zone_watermarks {
2351     long high_wmark;
2352     long low_wmark;
2353     long min_wmark;
2354 };
2355 
2356 /*
2357  * Returns lowest breached watermark or WMARK_NONE.
2358  */
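/*
 * Illustrative example: with summed min/low/high watermarks of 10k/20k/30k
 * pages and 15k free non-CMA pages, min is not breached but low is, so
 * WMARK_LOW is returned.
 */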
2359 static enum zone_watermark get_lowest_watermark(union meminfo *mi,
2360                                                 struct zone_watermarks *watermarks)
2361 {
2362     int64_t nr_free_pages = mi->field.nr_free_pages - mi->field.cma_free;
2363 
2364     if (nr_free_pages < watermarks->min_wmark) {
2365         return WMARK_MIN;
2366     }
2367     if (nr_free_pages < watermarks->low_wmark) {
2368         return WMARK_LOW;
2369     }
2370     if (nr_free_pages < watermarks->high_wmark) {
2371         return WMARK_HIGH;
2372     }
2373     return WMARK_NONE;
2374 }
2375 
2376 void calc_zone_watermarks(struct zoneinfo *zi, struct zone_watermarks *watermarks) {
2377     memset(watermarks, 0, sizeof(struct zone_watermarks));
2378 
2379     for (int node_idx = 0; node_idx < zi->node_count; node_idx++) {
2380         struct zoneinfo_node *node = &zi->nodes[node_idx];
2381         for (int zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
2382             struct zoneinfo_zone *zone = &node->zones[zone_idx];
2383 
2384             if (!zone->fields.field.present) {
2385                 continue;
2386             }
2387 
2388             watermarks->high_wmark += zone->max_protection + zone->fields.field.high;
2389             watermarks->low_wmark += zone->max_protection + zone->fields.field.low;
2390             watermarks->min_wmark += zone->max_protection + zone->fields.field.min;
2391         }
2392     }
2393 }
2394 
2395 static int calc_swap_utilization(union meminfo *mi) {
2396     int64_t swap_used = mi->field.total_swap - mi->field.free_swap;
2397     int64_t total_swappable = mi->field.active_anon + mi->field.inactive_anon +
2398                               mi->field.shmem + swap_used;
2399     return total_swappable > 0 ? (swap_used * 100) / total_swappable : 0;
2400 }
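/*
 * Worked example (illustrative numbers): with 1.5GB worth of pages in swap
 * and 0.5GB of resident anon + shmem, utilization is 1.5 / (0.5 + 1.5) = 75%,
 * i.e. most of the swappable memory has already been pushed out to swap.
 */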
2401 
2402 static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_params) {
2403     enum reclaim_state {
2404         NO_RECLAIM = 0,
2405         KSWAPD_RECLAIM,
2406         DIRECT_RECLAIM,
2407     };
2408     static int64_t init_ws_refault;
2409     static int64_t prev_workingset_refault;
2410     static int64_t base_file_lru;
2411     static int64_t init_pgscan_kswapd;
2412     static int64_t init_pgscan_direct;
2413     static int64_t swap_low_threshold;
2414     static bool killing;
2415     static int thrashing_limit = thrashing_limit_pct;
2416     static struct zone_watermarks watermarks;
2417     static struct timespec wmark_update_tm;
2418     static struct wakeup_info wi;
2419     static struct timespec thrashing_reset_tm;
2420     static int64_t prev_thrash_growth = 0;
2421     static bool check_filecache = false;
2422     static int max_thrashing = 0;
2423 
2424     union meminfo mi;
2425     union vmstat vs;
2426     struct timespec curr_tm;
2427     int64_t thrashing = 0;
2428     bool swap_is_low = false;
2429     enum vmpressure_level level = (enum vmpressure_level)data;
2430     enum kill_reasons kill_reason = NONE;
2431     bool cycle_after_kill = false;
2432     enum reclaim_state reclaim = NO_RECLAIM;
2433     enum zone_watermark wmark = WMARK_NONE;
2434     char kill_desc[LINE_MAX];
2435     bool cut_thrashing_limit = false;
2436     int min_score_adj = 0;
2437     int swap_util = 0;
2438     long since_thrashing_reset_ms;
2439     int64_t workingset_refault_file;
2440 
2441     if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
2442         ALOGE("Failed to get current time");
2443         return;
2444     }
2445 
2446     record_wakeup_time(&curr_tm, events ? Event : Polling, &wi);
2447 
2448     bool kill_pending = is_kill_pending();
2449     if (kill_pending && (kill_timeout_ms == 0 ||
2450         get_time_diff_ms(&last_kill_tm, &curr_tm) < static_cast<long>(kill_timeout_ms))) {
2451         /* Skip while still killing a process */
2452         wi.skipped_wakeups++;
2453         goto no_kill;
2454     }
2455     /*
2456      * Process is dead or kill timeout is over, stop waiting. This has no effect if pidfds are
2457      * supported and death notification already caused waiting to stop.
2458      */
2459     stop_wait_for_proc_kill(!kill_pending);
2460 
2461     if (vmstat_parse(&vs) < 0) {
2462         ALOGE("Failed to parse vmstat!");
2463         return;
2464     }
2465     /* Starting with kernel 5.9 the workingset_refault vmstat field was renamed to workingset_refault_file */
2466     workingset_refault_file = vs.field.workingset_refault ? : vs.field.workingset_refault_file;
2467 
2468     if (meminfo_parse(&mi) < 0) {
2469         ALOGE("Failed to parse meminfo!");
2470         return;
2471     }
2472 
2473     /* Reset states after process got killed */
2474     if (killing) {
2475         killing = false;
2476         cycle_after_kill = true;
2477         /* Reset file-backed pagecache size and refault amounts after a kill */
2478         base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
2479         init_ws_refault = workingset_refault_file;
2480         thrashing_reset_tm = curr_tm;
2481         prev_thrash_growth = 0;
2482     }
2483 
2484     /* Check free swap levels */
2485     if (swap_free_low_percentage) {
2486         if (!swap_low_threshold) {
2487             swap_low_threshold = mi.field.total_swap * swap_free_low_percentage / 100;
2488         }
2489         swap_is_low = mi.field.free_swap < swap_low_threshold;
2490     }
2491 
2492     /* Identify reclaim state */
2493     if (vs.field.pgscan_direct > init_pgscan_direct) {
2494         init_pgscan_direct = vs.field.pgscan_direct;
2495         init_pgscan_kswapd = vs.field.pgscan_kswapd;
2496         reclaim = DIRECT_RECLAIM;
2497     } else if (vs.field.pgscan_kswapd > init_pgscan_kswapd) {
2498         init_pgscan_kswapd = vs.field.pgscan_kswapd;
2499         reclaim = KSWAPD_RECLAIM;
2500     } else if (workingset_refault_file == prev_workingset_refault) {
2501         /*
2502          * Device is not thrashing and not reclaiming, bail out early until we see these stats
2503          * changing
2504          */
2505         goto no_kill;
2506     }
2507 
2508     prev_workingset_refault = workingset_refault_file;
2509 
2510     /*
2511      * It's possible we fail to find an eligible process to kill (ex. no process is
2512      * above oom_adj_min). When this happens, we should retry to find a new process
2513      * for a kill whenever a new eligible process is available. This is especially
2514      * important for a slow growing refault case. While retrying, we should keep
2515      * monitoring new thrashing counter as someone could release the memory to mitigate
2516      * the thrashing. Thus, when thrashing reset window comes, we decay the prev thrashing
2517      * counter by window counts. If the counter is still greater than thrashing limit,
2518      * we preserve the current prev_thrash counter so we will retry kill again. Otherwise,
2519      * we reset the prev_thrash counter so we will stop retrying.
2520      */
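    /*
     * Worked example (illustrative numbers): ~30000 new refaults against a
     * base_file_lru of ~100000 pages gives prev_thrash_growth of ~30%. If two
     * reset windows have passed it is decayed to ~7% (>> 2); with a single
     * window it is kept undecayed only when it already exceeded
     * thrashing_limit, i.e. no eligible victim was found yet.
     */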
2521     since_thrashing_reset_ms = get_time_diff_ms(&thrashing_reset_tm, &curr_tm);
2522     if (since_thrashing_reset_ms > THRASHING_RESET_INTERVAL_MS) {
2523         long windows_passed;
2524         /* Calculate prev_thrash_growth if we crossed THRASHING_RESET_INTERVAL_MS */
2525         prev_thrash_growth = (workingset_refault_file - init_ws_refault) * 100
2526                             / (base_file_lru + 1);
2527         windows_passed = (since_thrashing_reset_ms / THRASHING_RESET_INTERVAL_MS);
2528         /*
2529          * Decay prev_thrashing unless over-the-limit thrashing was registered in the window we
2530          * just crossed, which means there were no eligible processes to kill. We preserve the
2531          * counter in that case to ensure a kill if a new eligible process appears.
2532          */
2533         if (windows_passed > 1 || prev_thrash_growth < thrashing_limit) {
2534             prev_thrash_growth >>= windows_passed;
2535         }
2536 
2537         /* Record file-backed pagecache size when crossing THRASHING_RESET_INTERVAL_MS */
2538         base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
2539         init_ws_refault = workingset_refault_file;
2540         thrashing_reset_tm = curr_tm;
2541         thrashing_limit = thrashing_limit_pct;
2542     } else {
2543         /* Calculate what % of the file-backed pagecache refaulted so far */
2544         thrashing = (workingset_refault_file - init_ws_refault) * 100 / (base_file_lru + 1);
2545     }
2546     /* Add previous cycle's decayed thrashing amount */
2547     thrashing += prev_thrash_growth;
2548     if (max_thrashing < thrashing) {
2549         max_thrashing = thrashing;
2550     }
2551 
2552     /*
2553      * Refresh watermarks once per minute in case the user updated one of the margins.
2554      * TODO: b/140521024 replace this periodic update with an API for AMS to notify LMKD
2555      * that zone watermarks were changed by the system software.
2556      */
2557     if (watermarks.high_wmark == 0 || get_time_diff_ms(&wmark_update_tm, &curr_tm) > 60000) {
2558         struct zoneinfo zi;
2559 
2560         if (zoneinfo_parse(&zi) < 0) {
2561             ALOGE("Failed to parse zoneinfo!");
2562             return;
2563         }
2564 
2565         calc_zone_watermarks(&zi, &watermarks);
2566         wmark_update_tm = curr_tm;
2567     }
2568 
2569     /* Find out which watermark is breached if any */
2570     wmark = get_lowest_watermark(&mi, &watermarks);
2571 
2572     /*
2573      * TODO: move this logic into a separate function
2574      * Decide if killing a process is necessary and record the reason
2575      */
2576     if (cycle_after_kill && wmark < WMARK_LOW) {
2577         /*
2578          * Prevent kills not freeing enough memory which might lead to OOM kill.
2579          * This might happen when a process is consuming memory faster than reclaim can
2580          * free even after a kill. Mostly happens when running memory stress tests.
2581          */
2582         kill_reason = PRESSURE_AFTER_KILL;
2583         strncpy(kill_desc, "min watermark is breached even after kill", sizeof(kill_desc));
2584     } else if (level == VMPRESS_LEVEL_CRITICAL && events != 0) {
2585         /*
2586          * Device is too busy reclaiming memory which might lead to ANR.
2587          * Critical level is triggered when PSI complete stall (all tasks are blocked because
2588          * of the memory congestion) breaches the configured threshold.
2589          */
2590         kill_reason = NOT_RESPONDING;
2591         strncpy(kill_desc, "device is not responding", sizeof(kill_desc));
2592     } else if (swap_is_low && thrashing > thrashing_limit_pct) {
2593         /* Page cache is thrashing while swap is low */
2594         kill_reason = LOW_SWAP_AND_THRASHING;
2595         snprintf(kill_desc, sizeof(kill_desc), "device is low on swap (%" PRId64
2596             "kB < %" PRId64 "kB) and thrashing (%" PRId64 "%%)",
2597             mi.field.free_swap * page_k, swap_low_threshold * page_k, thrashing);
2598         /* Do not kill perceptible apps unless below min watermark or heavily thrashing */
2599         if (wmark > WMARK_MIN && thrashing < thrashing_critical_pct) {
2600             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2601         }
2602         check_filecache = true;
2603     } else if (swap_is_low && wmark < WMARK_HIGH) {
2604         /* Both free memory and swap are low */
2605         kill_reason = LOW_MEM_AND_SWAP;
2606         snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and swap is low (%"
2607             PRId64 "kB < %" PRId64 "kB)", wmark < WMARK_LOW ? "min" : "low",
2608             mi.field.free_swap * page_k, swap_low_threshold * page_k);
2609         /* Do not kill perceptible apps unless below min watermark or heavily thrashing */
2610         if (wmark > WMARK_MIN && thrashing < thrashing_critical_pct) {
2611             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2612         }
2613     } else if (wmark < WMARK_HIGH && swap_util_max < 100 &&
2614                (swap_util = calc_swap_utilization(&mi)) > swap_util_max) {
2615         /*
2616          * Too much anon memory is swapped out but swap is not low.
2617          * Non-swappable allocations created memory pressure.
2618          */
2619         kill_reason = LOW_MEM_AND_SWAP_UTIL;
2620         snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and swap utilization"
2621             " is high (%d%% > %d%%)", wmark < WMARK_LOW ? "min" : "low",
2622             swap_util, swap_util_max);
2623     } else if (wmark < WMARK_HIGH && thrashing > thrashing_limit) {
2624         /* Page cache is thrashing while memory is low */
2625         kill_reason = LOW_MEM_AND_THRASHING;
2626         snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and thrashing (%"
2627             PRId64 "%%)", wmark < WMARK_LOW ? "min" : "low", thrashing);
2628         cut_thrashing_limit = true;
2629         /* Do not kill perceptible apps unless thrashing at critical levels */
2630         if (thrashing < thrashing_critical_pct) {
2631             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2632         }
2633         check_filecache = true;
2634     } else if (reclaim == DIRECT_RECLAIM && thrashing > thrashing_limit) {
2635         /* Page cache is thrashing while in direct reclaim (mostly happens on lowram devices) */
2636         kill_reason = DIRECT_RECL_AND_THRASHING;
2637         snprintf(kill_desc, sizeof(kill_desc), "device is in direct reclaim and thrashing (%"
2638             PRId64 "%%)", thrashing);
2639         cut_thrashing_limit = true;
2640         /* Do not kill perceptible apps unless thrashing at critical levels */
2641         if (thrashing < thrashing_critical_pct) {
2642             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2643         }
2644         check_filecache = true;
2645     } else if (check_filecache) {
2646         int64_t file_lru_kb = (vs.field.nr_inactive_file + vs.field.nr_active_file) * page_k;
2647 
2648         if (file_lru_kb < filecache_min_kb) {
2649             /* File cache is too low after thrashing, keep killing background processes */
2650             kill_reason = LOW_FILECACHE_AFTER_THRASHING;
2651             snprintf(kill_desc, sizeof(kill_desc),
2652                 "filecache is low (%" PRId64 "kB < %" PRId64 "kB) after thrashing",
2653                 file_lru_kb, filecache_min_kb);
2654             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2655         } else {
2656             /* File cache is big enough, stop checking */
2657             check_filecache = false;
2658         }
2659     }
2660 
2661     /* Kill a process if necessary */
2662     if (kill_reason != NONE) {
2663         struct kill_info ki = {
2664             .kill_reason = kill_reason,
2665             .kill_desc = kill_desc,
2666             .thrashing = (int)thrashing,
2667             .max_thrashing = max_thrashing,
2668         };
2669         int pages_freed = find_and_kill_process(min_score_adj, &ki, &mi, &wi, &curr_tm);
2670         if (pages_freed > 0) {
2671             killing = true;
2672             max_thrashing = 0;
2673             if (cut_thrashing_limit) {
2674                 /*
2675                  * Cut thrashing limit by thrashing_limit_decay_pct percentage of the current
2676                  * thrashing limit until the system stops thrashing.
2677                  */
2678                 thrashing_limit = (thrashing_limit * (100 - thrashing_limit_decay_pct)) / 100;
2679             }
2680         }
2681     }
2682 
2683 no_kill:
2684     /* Do not poll if kernel supports pidfd waiting */
2685     if (is_waiting_for_kill()) {
2686         /* Pause polling if we are waiting for process death notification */
2687         poll_params->update = POLLING_PAUSE;
2688         return;
2689     }
2690 
2691     /*
2692      * Start polling after initial PSI event;
2693      * extend polling while device is in direct reclaim or process is being killed;
2694      * do not extend when kswapd reclaims because that might go on for a long time
2695      * without causing memory pressure
2696      */
2697     if (events || killing || reclaim == DIRECT_RECLAIM) {
2698         poll_params->update = POLLING_START;
2699     }
2700 
2701     /* Decide the polling interval */
2702     if (swap_is_low || killing) {
2703         /* Fast polling during and after a kill or when swap is low */
2704         poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
2705     } else {
2706         /* By default use long intervals */
2707         poll_params->polling_interval_ms = PSI_POLL_PERIOD_LONG_MS;
2708     }
2709 }
2710 
2711 static void mp_event_common(int data, uint32_t events, struct polling_params *poll_params) {
2712     unsigned long long evcount;
2713     int64_t mem_usage, memsw_usage;
2714     int64_t mem_pressure;
2715     union meminfo mi;
2716     struct zoneinfo zi;
2717     struct timespec curr_tm;
2718     static unsigned long kill_skip_count = 0;
2719     enum vmpressure_level level = (enum vmpressure_level)data;
2720     long other_free = 0, other_file = 0;
2721     int min_score_adj;
2722     int minfree = 0;
2723     static struct reread_data mem_usage_file_data = {
2724         .filename = MEMCG_MEMORY_USAGE,
2725         .fd = -1,
2726     };
2727     static struct reread_data memsw_usage_file_data = {
2728         .filename = MEMCG_MEMORYSW_USAGE,
2729         .fd = -1,
2730     };
2731     static struct wakeup_info wi;
2732 
2733     if (debug_process_killing) {
2734         ALOGI("%s memory pressure event is triggered", level_name[level]);
2735     }
2736 
2737     if (!use_psi_monitors) {
2738         /*
2739          * Check all event counters from low to critical
2740          * and upgrade to the highest priority one. By reading
2741          * eventfd we also reset the event counters.
2742          */
2743         for (int lvl = VMPRESS_LEVEL_LOW; lvl < VMPRESS_LEVEL_COUNT; lvl++) {
2744             if (mpevfd[lvl] != -1 &&
2745                 TEMP_FAILURE_RETRY(read(mpevfd[lvl],
2746                                    &evcount, sizeof(evcount))) > 0 &&
2747                 evcount > 0 && lvl > level) {
2748                 level = static_cast<vmpressure_level>(lvl);
2749             }
2750         }
2751     }
2752 
2753     /* Start polling after initial PSI event */
2754     if (use_psi_monitors && events) {
2755         /* Override polling params only if current event is more critical */
2756         if (!poll_params->poll_handler || data > poll_params->poll_handler->data) {
2757             poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
2758             poll_params->update = POLLING_START;
2759         }
2760     }
2761 
2762     if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
2763         ALOGE("Failed to get current time");
2764         return;
2765     }
2766 
2767     record_wakeup_time(&curr_tm, events ? Event : Polling, &wi);
2768 
2769     if (kill_timeout_ms &&
2770         get_time_diff_ms(&last_kill_tm, &curr_tm) < static_cast<long>(kill_timeout_ms)) {
2771         /*
2772          * If we're within the no-kill timeout, see if there's pending reclaim work
2773          * from the last killed process. If so, skip killing for now.
2774          */
2775         if (is_kill_pending()) {
2776             kill_skip_count++;
2777             wi.skipped_wakeups++;
2778             return;
2779         }
2780         /*
2781          * Process is dead, stop waiting. This has no effect if pidfds are supported and
2782          * death notification already caused waiting to stop.
2783          */
2784         stop_wait_for_proc_kill(true);
2785     } else {
2786         /*
2787          * Killing took longer than no-kill timeout. Stop waiting for the last process
2788          * to die because we are ready to kill again.
2789          */
2790         stop_wait_for_proc_kill(false);
2791     }
2792 
2793     if (kill_skip_count > 0) {
2794         ALOGI("%lu memory pressure events were skipped after a kill!",
2795               kill_skip_count);
2796         kill_skip_count = 0;
2797     }
2798 
2799     if (meminfo_parse(&mi) < 0 || zoneinfo_parse(&zi) < 0) {
2800         ALOGE("Failed to get free memory!");
2801         return;
2802     }
2803 
2804     if (use_minfree_levels) {
2805         int i;
2806 
2807         other_free = mi.field.nr_free_pages - zi.totalreserve_pages;
2808         if (mi.field.nr_file_pages > (mi.field.shmem + mi.field.unevictable + mi.field.swap_cached)) {
2809             other_file = (mi.field.nr_file_pages - mi.field.shmem -
2810                           mi.field.unevictable - mi.field.swap_cached);
2811         } else {
2812             other_file = 0;
2813         }
2814 
2815         min_score_adj = OOM_SCORE_ADJ_MAX + 1;
2816         for (i = 0; i < lowmem_targets_size; i++) {
2817             minfree = lowmem_minfree[i];
2818             if (other_free < minfree && other_file < minfree) {
2819                 min_score_adj = lowmem_adj[i];
2820                 break;
2821             }
2822         }
2823 
2824         if (min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
2825             if (debug_process_killing) {
2826                 ALOGI("Ignore %s memory pressure event "
2827                       "(free memory=%ldkB, cache=%ldkB, limit=%ldkB)",
2828                       level_name[level], other_free * page_k, other_file * page_k,
2829                       (long)lowmem_minfree[lowmem_targets_size - 1] * page_k);
2830             }
2831             return;
2832         }
2833 
2834         goto do_kill;
2835     }
2836 
2837     if (level == VMPRESS_LEVEL_LOW) {
2838         record_low_pressure_levels(&mi);
2839     }
2840 
2841     if (level_oomadj[level] > OOM_SCORE_ADJ_MAX) {
2842         /* Do not monitor this pressure level */
2843         return;
2844     }
2845 
2846     if ((mem_usage = get_memory_usage(&mem_usage_file_data)) < 0) {
2847         goto do_kill;
2848     }
2849     if ((memsw_usage = get_memory_usage(&memsw_usage_file_data)) < 0) {
2850         goto do_kill;
2851     }
2852 
2853     // Calculate percent for swappiness.
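    // Worked example (illustrative numbers): 300MB of memcg memory usage
    // against 400MB of memory+swap usage gives mem_pressure = 75, meaning
    // a quarter of the cgroup's footprint currently resides in swap.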
2854     mem_pressure = (mem_usage * 100) / memsw_usage;
2855 
2856     if (enable_pressure_upgrade && level != VMPRESS_LEVEL_CRITICAL) {
2857         // We are swapping too much.
2858         if (mem_pressure < upgrade_pressure) {
2859             level = upgrade_level(level);
2860             if (debug_process_killing) {
2861                 ALOGI("Event upgraded to %s", level_name[level]);
2862             }
2863         }
2864     }
2865 
2866     // If we still have enough swap space available, check if we want to
2867     // ignore/downgrade pressure events.
2868     if (mi.field.free_swap >=
2869         mi.field.total_swap * swap_free_low_percentage / 100) {
2870         // If the pressure is larger than downgrade_pressure lmk will not
2871         // kill any process, since enough memory is available.
2872         if (mem_pressure > downgrade_pressure) {
2873             if (debug_process_killing) {
2874                 ALOGI("Ignore %s memory pressure", level_name[level]);
2875             }
2876             return;
2877         } else if (level == VMPRESS_LEVEL_CRITICAL && mem_pressure > upgrade_pressure) {
2878             if (debug_process_killing) {
2879                 ALOGI("Downgrade critical memory pressure");
2880             }
2881             // Downgrade event, since enough memory available.
2882             level = downgrade_level(level);
2883         }
2884     }
2885 
2886 do_kill:
2887     if (low_ram_device) {
2888         /* For Go devices kill only one task */
2889         if (find_and_kill_process(level_oomadj[level], NULL, &mi, &wi, &curr_tm) == 0) {
2890             if (debug_process_killing) {
2891                 ALOGI("Nothing to kill");
2892             }
2893         }
2894     } else {
2895         int pages_freed;
2896         static struct timespec last_report_tm;
2897         static unsigned long report_skip_count = 0;
2898 
2899         if (!use_minfree_levels) {
2900             /* Free up enough memory to downgrade the memory pressure to low level */
2901             if (mi.field.nr_free_pages >= low_pressure_mem.max_nr_free_pages) {
2902                 if (debug_process_killing) {
2903                     ALOGI("Ignoring pressure since more memory is "
2904                         "available (%" PRId64 ") than watermark (%" PRId64 ")",
2905                         mi.field.nr_free_pages, low_pressure_mem.max_nr_free_pages);
2906                 }
2907                 return;
2908             }
2909             min_score_adj = level_oomadj[level];
2910         }
2911 
2912         pages_freed = find_and_kill_process(min_score_adj, NULL, &mi, &wi, &curr_tm);
2913 
2914         if (pages_freed == 0) {
2915             /* Rate limit kill reports when nothing was reclaimed */
2916             if (get_time_diff_ms(&last_report_tm, &curr_tm) < FAIL_REPORT_RLIMIT_MS) {
2917                 report_skip_count++;
2918                 return;
2919             }
2920         }
2921 
2922         /* Log whenever we kill or when report rate limit allows */
2923         if (use_minfree_levels) {
2924             ALOGI("Reclaimed %ldkB, cache(%ldkB) and free(%" PRId64 "kB)-reserved(%" PRId64 "kB) "
2925                 "below min(%ldkB) for oom_score_adj %d",
2926                 pages_freed * page_k,
2927                 other_file * page_k, mi.field.nr_free_pages * page_k,
2928                 zi.totalreserve_pages * page_k,
2929                 minfree * page_k, min_score_adj);
2930         } else {
2931             ALOGI("Reclaimed %ldkB at oom_score_adj %d", pages_freed * page_k, min_score_adj);
2932         }
2933 
2934         if (report_skip_count > 0) {
2935             ALOGI("Suppressed %lu failed kill reports", report_skip_count);
2936             report_skip_count = 0;
2937         }
2938 
2939         last_report_tm = curr_tm;
2940     }
2941     if (is_waiting_for_kill()) {
2942         /* pause polling if we are waiting for process death notification */
2943         poll_params->update = POLLING_PAUSE;
2944     }
2945 }
2946 
2947 static bool init_mp_psi(enum vmpressure_level level, bool use_new_strategy) {
2948     int fd;
2949 
2950     /* Do not register a handler if threshold_ms is not set */
2951     if (!psi_thresholds[level].threshold_ms) {
2952         return true;
2953     }
2954 
2955     fd = init_psi_monitor(psi_thresholds[level].stall_type,
2956         psi_thresholds[level].threshold_ms * US_PER_MS,
2957         PSI_WINDOW_SIZE_MS * US_PER_MS);
2958 
2959     if (fd < 0) {
2960         return false;
2961     }
2962 
2963     vmpressure_hinfo[level].handler = use_new_strategy ? mp_event_psi : mp_event_common;
2964     vmpressure_hinfo[level].data = level;
2965     if (register_psi_monitor(epollfd, fd, &vmpressure_hinfo[level]) < 0) {
2966         destroy_psi_monitor(fd);
2967         return false;
2968     }
2969     maxevents++;
2970     mpevfd[level] = fd;
2971 
2972     return true;
2973 }
2974 
2975 static void destroy_mp_psi(enum vmpressure_level level) {
2976     int fd = mpevfd[level];
2977 
2978     if (fd < 0) {
2979         return;
2980     }
2981 
2982     if (unregister_psi_monitor(epollfd, fd) < 0) {
2983         ALOGE("Failed to unregister psi monitor for %s memory pressure; errno=%d",
2984             level_name[level], errno);
2985     }
2986     maxevents--;
2987     destroy_psi_monitor(fd);
2988     mpevfd[level] = -1;
2989 }
2990 
2991 static bool init_psi_monitors() {
2992     /*
2993      * When PSI is used on low-ram devices or on high-end devices without memfree levels
2994      * use new kill strategy based on zone watermarks, free swap and thrashing stats
2995      */
2996     bool use_new_strategy =
2997         GET_LMK_PROPERTY(bool, "use_new_strategy", low_ram_device || !use_minfree_levels);
2998 
2999     /* In default PSI mode override stall amounts using system properties */
3000     if (use_new_strategy) {
3001         /* Do not use low pressure level */
3002         psi_thresholds[VMPRESS_LEVEL_LOW].threshold_ms = 0;
3003         psi_thresholds[VMPRESS_LEVEL_MEDIUM].threshold_ms = psi_partial_stall_ms;
3004         psi_thresholds[VMPRESS_LEVEL_CRITICAL].threshold_ms = psi_complete_stall_ms;
3005     }
3006 
3007     if (!init_mp_psi(VMPRESS_LEVEL_LOW, use_new_strategy)) {
3008         return false;
3009     }
3010     if (!init_mp_psi(VMPRESS_LEVEL_MEDIUM, use_new_strategy)) {
3011         destroy_mp_psi(VMPRESS_LEVEL_LOW);
3012         return false;
3013     }
3014     if (!init_mp_psi(VMPRESS_LEVEL_CRITICAL, use_new_strategy)) {
3015         destroy_mp_psi(VMPRESS_LEVEL_MEDIUM);
3016         destroy_mp_psi(VMPRESS_LEVEL_LOW);
3017         return false;
3018     }
3019     return true;
3020 }
3021 
3022 static bool init_mp_common(enum vmpressure_level level) {
3023     int mpfd;
3024     int evfd;
3025     int evctlfd;
3026     char buf[256];
3027     struct epoll_event epev;
3028     int ret;
3029     int level_idx = (int)level;
3030     const char *levelstr = level_name[level_idx];
3031 
3032     /* gid containing AID_SYSTEM required */
3033     mpfd = open(MEMCG_SYSFS_PATH "memory.pressure_level", O_RDONLY | O_CLOEXEC);
3034     if (mpfd < 0) {
3035         ALOGI("No kernel memory.pressure_level support (errno=%d)", errno);
3036         goto err_open_mpfd;
3037     }
3038 
3039     evctlfd = open(MEMCG_SYSFS_PATH "cgroup.event_control", O_WRONLY | O_CLOEXEC);
3040     if (evctlfd < 0) {
3041         ALOGI("No kernel memory cgroup event control (errno=%d)", errno);
3042         goto err_open_evctlfd;
3043     }
3044 
3045     evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
3046     if (evfd < 0) {
3047         ALOGE("eventfd failed for level %s; errno=%d", levelstr, errno);
3048         goto err_eventfd;
3049     }
3050 
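    /*
     * Illustrative example: if evfd is 34, mpfd is 33 and the level is
     * "critical", the line written below is "34 33 critical", asking the
     * kernel to signal evfd on critical memory.pressure_level events.
     */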
3051     ret = snprintf(buf, sizeof(buf), "%d %d %s", evfd, mpfd, levelstr);
3052     if (ret >= (ssize_t)sizeof(buf)) {
3053         ALOGE("cgroup.event_control line overflow for level %s", levelstr);
3054         goto err;
3055     }
3056 
3057     ret = TEMP_FAILURE_RETRY(write(evctlfd, buf, strlen(buf) + 1));
3058     if (ret == -1) {
3059         ALOGE("cgroup.event_control write failed for level %s; errno=%d",
3060               levelstr, errno);
3061         goto err;
3062     }
3063 
3064     epev.events = EPOLLIN;
3065     /* use data to store event level */
3066     vmpressure_hinfo[level_idx].data = level_idx;
3067     vmpressure_hinfo[level_idx].handler = mp_event_common;
3068     epev.data.ptr = (void *)&vmpressure_hinfo[level_idx];
3069     ret = epoll_ctl(epollfd, EPOLL_CTL_ADD, evfd, &epev);
3070     if (ret == -1) {
3071         ALOGE("epoll_ctl for level %s failed; errno=%d", levelstr, errno);
3072         goto err;
3073     }
3074     maxevents++;
3075     mpevfd[level] = evfd;
3076     close(evctlfd);
3077     return true;
3078 
3079 err:
3080     close(evfd);
3081 err_eventfd:
3082     close(evctlfd);
3083 err_open_evctlfd:
3084     close(mpfd);
3085 err_open_mpfd:
3086     return false;
3087 }
3088 
3089 static void destroy_mp_common(enum vmpressure_level level) {
3090     struct epoll_event epev;
3091     int fd = mpevfd[level];
3092 
3093     if (fd < 0) {
3094         return;
3095     }
3096 
3097     if (epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &epev)) {
3098         // Log an error and keep going
3099         ALOGE("epoll_ctl for level %s failed; errno=%d", level_name[level], errno);
3100     }
3101     maxevents--;
3102     close(fd);
3103     mpevfd[level] = -1;
3104 }
3105 
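/* Handler for in-kernel lowmemorykiller events: poll_kernel() drains and logs
 * the kernel's kill reports from kpoll_fd.
 */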
3106 static void kernel_event_handler(int data __unused, uint32_t events __unused,
3107                                  struct polling_params *poll_params __unused) {
3108     poll_kernel(kpoll_fd);
3109 }
3110 
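/*
 * Prefer PSI monitors when the kernel supports them and the "use_psi" lmkd
 * property (default true) allows it; otherwise fall back to memcg vmpressure
 * events. If both fail, lmkd cannot detect memory pressure on its own.
 */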
3111 static bool init_monitors() {
3112     /* Try to use psi monitor first if kernel has it */
3113     use_psi_monitors = GET_LMK_PROPERTY(bool, "use_psi", true) &&
3114         init_psi_monitors();
3115     /* Fall back to vmpressure */
3116     if (!use_psi_monitors &&
3117         (!init_mp_common(VMPRESS_LEVEL_LOW) ||
3118         !init_mp_common(VMPRESS_LEVEL_MEDIUM) ||
3119         !init_mp_common(VMPRESS_LEVEL_CRITICAL))) {
3120         ALOGE("Kernel does not support memory pressure events or in-kernel low memory killer");
3121         return false;
3122     }
3123     if (use_psi_monitors) {
3124         ALOGI("Using psi monitors for memory pressure detection");
3125     } else {
3126         ALOGI("Using vmpressure for memory pressure detection");
3127     }
3128     return true;
3129 }
3130 
3131 static void destroy_monitors() {
3132     if (use_psi_monitors) {
3133         destroy_mp_psi(VMPRESS_LEVEL_CRITICAL);
3134         destroy_mp_psi(VMPRESS_LEVEL_MEDIUM);
3135         destroy_mp_psi(VMPRESS_LEVEL_LOW);
3136     } else {
3137         destroy_mp_common(VMPRESS_LEVEL_CRITICAL);
3138         destroy_mp_common(VMPRESS_LEVEL_MEDIUM);
3139         destroy_mp_common(VMPRESS_LEVEL_LOW);
3140     }
3141 }
3142 
3143 static int init(void) {
3144     static struct event_handler_info kernel_poll_hinfo = { 0, kernel_event_handler };
3145     struct reread_data file_data = {
3146         .filename = ZONEINFO_PATH,
3147         .fd = -1,
3148     };
3149     struct epoll_event epev;
3150     int pidfd;
3151     int i;
3152     int ret;
3153 
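    /* Cache the page size in kilobytes; used later to convert page counts to kB. */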
3154     page_k = sysconf(_SC_PAGESIZE);
3155     if (page_k == -1)
3156         page_k = PAGE_SIZE;
3157     page_k /= 1024;
3158 
3159     epollfd = epoll_create(MAX_EPOLL_EVENTS);
3160     if (epollfd == -1) {
3161         ALOGE("epoll_create failed (errno=%d)", errno);
3162         return -1;
3163     }
3164 
3165     // mark data connections as not connected
3166     for (int i = 0; i < MAX_DATA_CONN; i++) {
3167         data_sock[i].sock = -1;
3168     }
3169 
3170     ctrl_sock.sock = android_get_control_socket("lmkd");
3171     if (ctrl_sock.sock < 0) {
3172         ALOGE("get lmkd control socket failed");
3173         return -1;
3174     }
3175 
3176     ret = listen(ctrl_sock.sock, MAX_DATA_CONN);
3177     if (ret < 0) {
3178         ALOGE("lmkd control socket listen failed (errno=%d)", errno);
3179         return -1;
3180     }
3181 
3182     epev.events = EPOLLIN;
3183     ctrl_sock.handler_info.handler = ctrl_connect_handler;
3184     epev.data.ptr = (void *)&(ctrl_sock.handler_info);
3185     if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ctrl_sock.sock, &epev) == -1) {
3186         ALOGE("epoll_ctl for lmkd control socket failed (errno=%d)", errno);
3187         return -1;
3188     }
3189     maxevents++;
3190 
3191     has_inkernel_module = !access(INKERNEL_MINFREE_PATH, W_OK);
3192     use_inkernel_interface = has_inkernel_module;
3193 
3194     if (use_inkernel_interface) {
3195         ALOGI("Using in-kernel low memory killer interface");
3196         if (init_poll_kernel()) {
3197             epev.events = EPOLLIN;
3198             epev.data.ptr = (void*)&kernel_poll_hinfo;
3199             if (epoll_ctl(epollfd, EPOLL_CTL_ADD, kpoll_fd, &epev) != 0) {
3200                 ALOGE("epoll_ctl for lmk events failed (errno=%d)", errno);
3201                 close(kpoll_fd);
3202                 kpoll_fd = -1;
3203             } else {
3204                 maxevents++;
3205                 /* let others know that lmkd supports kill reporting */
3206                 property_set("sys.lmk.reportkills", "1");
3207             }
3208         }
3209     } else {
3210         if (!init_monitors()) {
3211             return -1;
3212         }
3213         /* let others know that lmkd supports kill reporting */
3214         property_set("sys.lmk.reportkills", "1");
3215     }
3216 
3217     for (i = 0; i <= ADJTOSLOT(OOM_SCORE_ADJ_MAX); i++) {
3218         procadjslot_list[i].next = &procadjslot_list[i];
3219         procadjslot_list[i].prev = &procadjslot_list[i];
3220     }
3221 
3222     memset(killcnt_idx, KILLCNT_INVALID_IDX, sizeof(killcnt_idx));
3223 
3224     /*
3225      * Read zoneinfo, the biggest file we read, to create and size the initial
3226      * read buffer and avoid memory re-allocations during memory pressure.
3227      */
3228     if (reread_file(&file_data) == NULL) {
3229         ALOGE("Failed to read %s: %s", file_data.filename, strerror(errno));
3230     }
3231 
3232     /* check if kernel supports pidfd_open syscall */
3233     pidfd = TEMP_FAILURE_RETRY(pidfd_open(getpid(), 0));
3234     if (pidfd < 0) {
3235         pidfd_supported = (errno != ENOSYS);
3236     } else {
3237         pidfd_supported = true;
3238         close(pidfd);
3239     }
3240     ALOGI("Process polling is %s", pidfd_supported ? "supported" : "not supported");
3241 
3242     return 0;
3243 }
3244 
3245 static bool polling_paused(struct polling_params *poll_params) {
3246     return poll_params->paused_handler != NULL;
3247 }
3248 
3249 static void resume_polling(struct polling_params *poll_params, struct timespec curr_tm) {
3250     poll_params->poll_start_tm = curr_tm;
3251     poll_params->poll_handler = poll_params->paused_handler;
3252     poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
3253     poll_params->paused_handler = NULL;
3254 }
3255 
3256 static void call_handler(struct event_handler_info* handler_info,
3257                          struct polling_params *poll_params, uint32_t events) {
3258     struct timespec curr_tm;
3259 
3260     poll_params->update = POLLING_DO_NOT_CHANGE;
3261     handler_info->handler(handler_info->data, events, poll_params);
3262     clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3263     if (poll_params->poll_handler == handler_info) {
3264         poll_params->last_poll_tm = curr_tm;
3265     }
3266 
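    /*
     * The handler reports the desired polling-state change through
     * poll_params->update: POLLING_START begins periodic polling of this
     * handler, POLLING_PAUSE parks it in paused_handler, POLLING_RESUME
     * restarts a paused handler, and POLLING_DO_NOT_CHANGE keeps the current
     * state but stops polling once the PSI window has elapsed.
     */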
3267     switch (poll_params->update) {
3268     case POLLING_START:
3269         /*
3270          * Poll for the duration of PSI_WINDOW_SIZE_MS after the
3271          * initial PSI event because psi events are rate-limited
3272          * at one per second.
3273          */
3274         poll_params->poll_start_tm = curr_tm;
3275         poll_params->poll_handler = handler_info;
3276         break;
3277     case POLLING_PAUSE:
3278         poll_params->paused_handler = handler_info;
3279         poll_params->poll_handler = NULL;
3280         break;
3281     case POLLING_RESUME:
3282         resume_polling(poll_params, curr_tm);
3283         break;
3284     case POLLING_DO_NOT_CHANGE:
3285         if (get_time_diff_ms(&poll_params->poll_start_tm, &curr_tm) > PSI_WINDOW_SIZE_MS) {
3286             /* Polled for the duration of PSI window, time to stop */
3287             poll_params->poll_handler = NULL;
3288         }
3289         break;
3290     }
3291 }
3292 
3293 static void mainloop(void) {
3294     struct event_handler_info* handler_info;
3295     struct polling_params poll_params;
3296     struct timespec curr_tm;
3297     struct epoll_event *evt;
3298     long delay = -1;
3299 
3300     poll_params.poll_handler = NULL;
3301     poll_params.paused_handler = NULL;
3302 
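    /*
     * Each iteration waits in one of three modes:
     *  - active polling is in effect: wait at most until the next polling
     *    interval expires, then invoke the polling handler even without events;
     *  - a kill is in flight and kill_timeout_ms is set: wait for the pidfd
     *    notification or for the kill timeout to expire;
     *  - otherwise block in epoll_wait() indefinitely.
     */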
3303     while (1) {
3304         struct epoll_event events[MAX_EPOLL_EVENTS];
3305         int nevents;
3306         int i;
3307 
3308         if (poll_params.poll_handler) {
3309             bool poll_now;
3310 
3311             clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3312             if (poll_params.update == POLLING_RESUME) {
3313                 /* Just transitioned into POLLING_RESUME, poll immediately. */
3314                 poll_now = true;
3315                 nevents = 0;
3316             } else {
3317                 /* Calculate next timeout */
3318                 delay = get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm);
3319                 delay = (delay < poll_params.polling_interval_ms) ?
3320                     poll_params.polling_interval_ms - delay : poll_params.polling_interval_ms;
3321 
3322                 /* Wait for events until the next polling timeout */
3323                 nevents = epoll_wait(epollfd, events, maxevents, delay);
3324 
3325                 /* Update current time after wait */
3326                 clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3327                 poll_now = (get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm) >=
3328                     poll_params.polling_interval_ms);
3329             }
3330             if (poll_now) {
3331                 call_handler(poll_params.poll_handler, &poll_params, 0);
3332             }
3333         } else {
3334             if (kill_timeout_ms && is_waiting_for_kill()) {
3335                 clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3336                 delay = kill_timeout_ms - get_time_diff_ms(&last_kill_tm, &curr_tm);
3337                 /* Wait for pidfds notification or kill timeout to expire */
3338                 nevents = (delay > 0) ? epoll_wait(epollfd, events, maxevents, delay) : 0;
3339                 if (nevents == 0) {
3340                     /* Kill notification timed out */
3341                     stop_wait_for_proc_kill(false);
3342                     if (polling_paused(&poll_params)) {
3343                         clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3344                         poll_params.update = POLLING_RESUME;
3345                         resume_polling(&poll_params, curr_tm);
3346                     }
3347                 }
3348             } else {
3349                 /* Wait for events with no timeout */
3350                 nevents = epoll_wait(epollfd, events, maxevents, -1);
3351             }
3352         }
3353 
3354         if (nevents == -1) {
3355             if (errno == EINTR)
3356                 continue;
3357             ALOGE("epoll_wait failed (errno=%d)", errno);
3358             continue;
3359         }
3360 
3361         /*
3362          * First pass to check whether any data socket connections were dropped.
3363          * Dropped connections must be handled before any other events so that
3364          * the data connection is deallocated, and cases where a connection is
3365          * dropped and reestablished within the same epoll cycle are handled
3366          * correctly: the closure has to be processed first.
3367          */
3368         for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) {
3369             if ((evt->events & EPOLLHUP) && evt->data.ptr) {
3370                 ALOGI("lmkd data connection dropped");
3371                 handler_info = (struct event_handler_info*)evt->data.ptr;
3372                 ctrl_data_close(handler_info->data);
3373             }
3374         }
3375 
3376         /* Second pass to handle all other events */
3377         for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) {
3378             if (evt->events & EPOLLERR) {
3379                 ALOGD("EPOLLERR on event #%d", i);
3380             }
3381             if (evt->events & EPOLLHUP) {
3382                 /* This case was handled in the first pass */
3383                 continue;
3384             }
3385             if (evt->data.ptr) {
3386                 handler_info = (struct event_handler_info*)evt->data.ptr;
3387                 call_handler(handler_info, &poll_params, evt->events);
3388             }
3389         }
3390     }
3391 }
3392 
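/*
 * Entry point for "lmkd --reinit": connect to the control socket of the
 * already-running daemon and ask it to re-read its properties.
 */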
3393 int issue_reinit() {
3394     int sock;
3395 
3396     sock = lmkd_connect();
3397     if (sock < 0) {
3398         ALOGE("failed to connect to lmkd: %s", strerror(errno));
3399         return -1;
3400     }
3401 
3402     enum update_props_result res = lmkd_update_props(sock);
3403     switch (res) {
3404     case UPDATE_PROPS_SUCCESS:
3405         ALOGI("lmkd updated properties successfully");
3406         break;
3407     case UPDATE_PROPS_SEND_ERR:
3408         ALOGE("failed to send lmkd request: %s", strerror(errno));
3409         break;
3410     case UPDATE_PROPS_RECV_ERR:
3411         ALOGE("failed to receive lmkd reply: %s", strerror(errno));
3412         break;
3413     case UPDATE_PROPS_FORMAT_ERR:
3414         ALOGE("lmkd reply is invalid");
3415         break;
3416     case UPDATE_PROPS_FAIL:
3417         ALOGE("lmkd failed to update its properties");
3418         break;
3419     }
3420 
3421     close(sock);
3422     return res == UPDATE_PROPS_SUCCESS ? 0 : -1;
3423 }
3424 
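/*
 * Load all tunables from lmkd and system properties; several defaults differ
 * for low-RAM (ro.config.low_ram) devices.
 */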
3425 static void update_props() {
3426     /* By default disable low level vmpressure events */
3427     level_oomadj[VMPRESS_LEVEL_LOW] =
3428         GET_LMK_PROPERTY(int32, "low", OOM_SCORE_ADJ_MAX + 1);
3429     level_oomadj[VMPRESS_LEVEL_MEDIUM] =
3430         GET_LMK_PROPERTY(int32, "medium", 800);
3431     level_oomadj[VMPRESS_LEVEL_CRITICAL] =
3432         GET_LMK_PROPERTY(int32, "critical", 0);
3433     debug_process_killing = GET_LMK_PROPERTY(bool, "debug", false);
3434 
3435     /* By default disable upgrade/downgrade logic */
3436     enable_pressure_upgrade =
3437         GET_LMK_PROPERTY(bool, "critical_upgrade", false);
3438     upgrade_pressure =
3439         (int64_t)GET_LMK_PROPERTY(int32, "upgrade_pressure", 100);
3440     downgrade_pressure =
3441         (int64_t)GET_LMK_PROPERTY(int32, "downgrade_pressure", 100);
3442     kill_heaviest_task =
3443         GET_LMK_PROPERTY(bool, "kill_heaviest_task", false);
3444     low_ram_device = property_get_bool("ro.config.low_ram", false);
3445     kill_timeout_ms =
3446         (unsigned long)GET_LMK_PROPERTY(int32, "kill_timeout_ms", 100);
3447     use_minfree_levels =
3448         GET_LMK_PROPERTY(bool, "use_minfree_levels", false);
3449     per_app_memcg =
3450         property_get_bool("ro.config.per_app_memcg", low_ram_device);
3451     swap_free_low_percentage = clamp(0, 100, GET_LMK_PROPERTY(int32, "swap_free_low_percentage",
3452         DEF_LOW_SWAP));
3453     psi_partial_stall_ms = GET_LMK_PROPERTY(int32, "psi_partial_stall_ms",
3454         low_ram_device ? DEF_PARTIAL_STALL_LOWRAM : DEF_PARTIAL_STALL);
3455     psi_complete_stall_ms = GET_LMK_PROPERTY(int32, "psi_complete_stall_ms",
3456         DEF_COMPLETE_STALL);
3457     thrashing_limit_pct = max(0, GET_LMK_PROPERTY(int32, "thrashing_limit",
3458         low_ram_device ? DEF_THRASHING_LOWRAM : DEF_THRASHING));
3459     thrashing_limit_decay_pct = clamp(0, 100, GET_LMK_PROPERTY(int32, "thrashing_limit_decay",
3460         low_ram_device ? DEF_THRASHING_DECAY_LOWRAM : DEF_THRASHING_DECAY));
3461     thrashing_critical_pct = max(0, GET_LMK_PROPERTY(int32, "thrashing_limit_critical",
3462         thrashing_limit_pct * 2));
3463     swap_util_max = clamp(0, 100, GET_LMK_PROPERTY(int32, "swap_util_max", 100));
3464     filecache_min_kb = GET_LMK_PROPERTY(int64, "filecache_min_kb", 0);
3465 }
3466 
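/*
 * Illustrative invocations (the service is normally started by init):
 *   lmkd            long-running daemon: set up monitors and enter mainloop()
 *   lmkd --reinit   one-shot: ask the running daemon to re-read its properties
 */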
3467 int main(int argc, char **argv) {
3468     if ((argc > 1) && argv[1] && !strcmp(argv[1], "--reinit")) {
3469         if (property_set(LMKD_REINIT_PROP, "")) {
3470             ALOGE("Failed to reset " LMKD_REINIT_PROP " property");
3471         }
3472         return issue_reinit();
3473     }
3474 
3475     update_props();
3476 
3477     ctx = create_android_logger(KILLINFO_LOG_TAG);
3478 
3479     if (!init()) {
3480         if (!use_inkernel_interface) {
3481             /*
3482              * MCL_ONFAULT pins pages as they fault instead of loading
3483              * everything immediately all at once. (Which would be bad,
3484              * because as of this writing, we have a lot of mapped pages we
3485              * never use.) Old kernels will see MCL_ONFAULT and fail with
3486              * EINVAL; we ignore this failure.
3487              *
3488              * N.B. read the man page for mlockall. MCL_CURRENT | MCL_ONFAULT
3489              * pins ⊆ MCL_CURRENT, converging to just MCL_CURRENT as we fault
3490              * in pages.
3491              */
3492             /* CAP_IPC_LOCK required */
3493             if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT) && (errno != EINVAL)) {
3494                 ALOGW("mlockall failed %s", strerror(errno));
3495             }
3496 
3497             /* CAP_NICE required */
3498             struct sched_param param = {
3499                     .sched_priority = 1,
3500             };
3501             if (sched_setscheduler(0, SCHED_FIFO, &param)) {
3502                 ALOGW("set SCHED_FIFO failed %s", strerror(errno));
3503             }
3504         }
3505 
3506         mainloop();
3507     }
3508 
3509     android_log_destroy(&ctx);
3510 
3511     ALOGI("exiting");
3512     return 0;
3513 }
3514