• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2013 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "lowmemorykiller"
18 
19 #include <dirent.h>
20 #include <errno.h>
21 #include <inttypes.h>
22 #include <pwd.h>
23 #include <sched.h>
24 #include <signal.h>
25 #include <statslog_lmkd.h>
26 #include <stdbool.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <sys/cdefs.h>
30 #include <sys/epoll.h>
31 #include <sys/eventfd.h>
32 #include <sys/mman.h>
33 #include <sys/resource.h>
34 #include <sys/socket.h>
35 #include <sys/syscall.h>
36 #include <sys/sysinfo.h>
37 #include <sys/time.h>
38 #include <sys/types.h>
39 #include <time.h>
40 #include <unistd.h>
41 
42 #include <cutils/properties.h>
43 #include <cutils/sched_policy.h>
44 #include <cutils/sockets.h>
45 #include <liblmkd_utils.h>
46 #include <lmkd.h>
47 #include <log/log.h>
48 #include <log/log_event_list.h>
49 #include <log/log_time.h>
50 #include <private/android_filesystem_config.h>
51 #include <psi/psi.h>
52 #include <system/thread_defs.h>
53 
54 #include "statslog.h"
55 
56 /*
57  * Define LMKD_TRACE_KILLS to record lmkd kills in kernel traces
58  * to profile and correlate with OOM kills
59  */
60 #ifdef LMKD_TRACE_KILLS
61 
62 #define ATRACE_TAG ATRACE_TAG_ALWAYS
63 #include <cutils/trace.h>
64 
65 #define TRACE_KILL_START(pid) ATRACE_INT(__FUNCTION__, pid);
66 #define TRACE_KILL_END()      ATRACE_INT(__FUNCTION__, 0);
67 
68 #else /* LMKD_TRACE_KILLS */
69 
70 #define TRACE_KILL_START(pid) ((void)(pid))
71 #define TRACE_KILL_END() ((void)0)
72 
73 #endif /* LMKD_TRACE_KILLS */
74 
75 #ifndef __unused
76 #define __unused __attribute__((__unused__))
77 #endif
78 
79 #define MEMCG_SYSFS_PATH "/dev/memcg/"
80 #define MEMCG_MEMORY_USAGE "/dev/memcg/memory.usage_in_bytes"
81 #define MEMCG_MEMORYSW_USAGE "/dev/memcg/memory.memsw.usage_in_bytes"
82 #define ZONEINFO_PATH "/proc/zoneinfo"
83 #define MEMINFO_PATH "/proc/meminfo"
84 #define VMSTAT_PATH "/proc/vmstat"
85 #define PROC_STATUS_TGID_FIELD "Tgid:"
86 #define LINE_MAX 128
87 
88 #define PERCEPTIBLE_APP_ADJ 200
89 
90 /* Android Logger event logtags (see event.logtags) */
91 #define KILLINFO_LOG_TAG 10195355
92 
93 /* gid containing AID_SYSTEM required */
94 #define INKERNEL_MINFREE_PATH "/sys/module/lowmemorykiller/parameters/minfree"
95 #define INKERNEL_ADJ_PATH "/sys/module/lowmemorykiller/parameters/adj"
96 
97 #define ARRAY_SIZE(x)   (sizeof(x) / sizeof(*(x)))
98 #define EIGHT_MEGA (1 << 23)
99 
100 #define TARGET_UPDATE_MIN_INTERVAL_MS 1000
101 #define THRASHING_RESET_INTERVAL_MS 1000
102 
103 #define NS_PER_MS (NS_PER_SEC / MS_PER_SEC)
104 #define US_PER_MS (US_PER_SEC / MS_PER_SEC)
105 
106 /* Defined as ProcessList.SYSTEM_ADJ in ProcessList.java */
107 #define SYSTEM_ADJ (-900)
108 
109 #define STRINGIFY(x) STRINGIFY_INTERNAL(x)
110 #define STRINGIFY_INTERNAL(x) #x
111 
112 /*
113  * PSI monitor tracking window size.
114  * PSI monitor generates events at most once per window,
115  * therefore we poll memory state for the duration of
116  * PSI_WINDOW_SIZE_MS after the event happens.
117  */
118 #define PSI_WINDOW_SIZE_MS 1000
119 /* Polling period after PSI signal when pressure is high */
120 #define PSI_POLL_PERIOD_SHORT_MS 10
121 /* Polling period after PSI signal when pressure is low */
122 #define PSI_POLL_PERIOD_LONG_MS 100
123 
124 #define min(a, b) (((a) < (b)) ? (a) : (b))
125 #define max(a, b) (((a) > (b)) ? (a) : (b))
126 
127 #define FAIL_REPORT_RLIMIT_MS 1000
128 
129 /*
130  * System property defaults
131  */
132 /* ro.lmk.swap_free_low_percentage property defaults */
133 #define DEF_LOW_SWAP 10
134 /* ro.lmk.thrashing_limit property defaults */
135 #define DEF_THRASHING_LOWRAM 30
136 #define DEF_THRASHING 100
137 /* ro.lmk.thrashing_limit_decay property defaults */
138 #define DEF_THRASHING_DECAY_LOWRAM 50
139 #define DEF_THRASHING_DECAY 10
140 /* ro.lmk.psi_partial_stall_ms property defaults */
141 #define DEF_PARTIAL_STALL_LOWRAM 200
142 #define DEF_PARTIAL_STALL 70
143 /* ro.lmk.psi_complete_stall_ms property defaults */
144 #define DEF_COMPLETE_STALL 700
145 
146 #define LMKD_REINIT_PROP "lmkd.reinit"
147 
sys_pidfd_open(pid_t pid,unsigned int flags)148 static inline int sys_pidfd_open(pid_t pid, unsigned int flags) {
149     return syscall(__NR_pidfd_open, pid, flags);
150 }
151 
sys_pidfd_send_signal(int pidfd,int sig,siginfo_t * info,unsigned int flags)152 static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
153                                         unsigned int flags) {
154     return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags);
155 }
156 
157 /* default to old in-kernel interface if no memory pressure events */
158 static bool use_inkernel_interface = true;
159 static bool has_inkernel_module;
160 
161 /* memory pressure levels */
162 enum vmpressure_level {
163     VMPRESS_LEVEL_LOW = 0,
164     VMPRESS_LEVEL_MEDIUM,
165     VMPRESS_LEVEL_CRITICAL,
166     VMPRESS_LEVEL_COUNT
167 };
168 
169 static const char *level_name[] = {
170     "low",
171     "medium",
172     "critical"
173 };
174 
175 struct {
176     int64_t min_nr_free_pages; /* recorded but not used yet */
177     int64_t max_nr_free_pages;
178 } low_pressure_mem = { -1, -1 };
179 
180 struct psi_threshold {
181     enum psi_stall_type stall_type;
182     int threshold_ms;
183 };
184 
185 static int level_oomadj[VMPRESS_LEVEL_COUNT];
186 static int mpevfd[VMPRESS_LEVEL_COUNT] = { -1, -1, -1 };
187 static bool pidfd_supported;
188 static int last_kill_pid_or_fd = -1;
189 static struct timespec last_kill_tm;
190 
191 /* lmkd configurable parameters */
192 static bool debug_process_killing;
193 static bool enable_pressure_upgrade;
194 static int64_t upgrade_pressure;
195 static int64_t downgrade_pressure;
196 static bool low_ram_device;
197 static bool kill_heaviest_task;
198 static unsigned long kill_timeout_ms;
199 static bool use_minfree_levels;
200 static bool per_app_memcg;
201 static int swap_free_low_percentage;
202 static int psi_partial_stall_ms;
203 static int psi_complete_stall_ms;
204 static int thrashing_limit_pct;
205 static int thrashing_limit_decay_pct;
206 static int thrashing_critical_pct;
207 static bool use_psi_monitors = false;
208 static int kpoll_fd;
209 static struct psi_threshold psi_thresholds[VMPRESS_LEVEL_COUNT] = {
210     { PSI_SOME, 70 },    /* 70ms out of 1sec for partial stall */
211     { PSI_SOME, 100 },   /* 100ms out of 1sec for partial stall */
212     { PSI_FULL, 70 },    /* 70ms out of 1sec for complete stall */
213 };
214 
215 static android_log_context ctx;
216 
217 enum polling_update {
218     POLLING_DO_NOT_CHANGE,
219     POLLING_START,
220     POLLING_PAUSE,
221     POLLING_RESUME,
222 };
223 
224 /*
225  * Data used for periodic polling for the memory state of the device.
226  * Note that when system is not polling poll_handler is set to NULL,
227  * when polling starts poll_handler gets set and is reset back to
228  * NULL when polling stops.
229  */
230 struct polling_params {
231     struct event_handler_info* poll_handler;
232     struct event_handler_info* paused_handler;
233     struct timespec poll_start_tm;
234     struct timespec last_poll_tm;
235     int polling_interval_ms;
236     enum polling_update update;
237 };
238 
239 /* data required to handle events */
240 struct event_handler_info {
241     int data;
242     void (*handler)(int data, uint32_t events, struct polling_params *poll_params);
243 };
244 
245 /* data required to handle socket events */
246 struct sock_event_handler_info {
247     int sock;
248     pid_t pid;
249     uint32_t async_event_mask;
250     struct event_handler_info handler_info;
251 };
252 
253 /* max supported number of data connections (AMS, init, tests) */
254 #define MAX_DATA_CONN 3
255 
256 /* socket event handler data */
257 static struct sock_event_handler_info ctrl_sock;
258 static struct sock_event_handler_info data_sock[MAX_DATA_CONN];
259 
260 /* vmpressure event handler data */
261 static struct event_handler_info vmpressure_hinfo[VMPRESS_LEVEL_COUNT];
262 
263 /*
264  * 1 ctrl listen socket, 3 ctrl data socket, 3 memory pressure levels,
265  * 1 lmk events + 1 fd to wait for process death
266  */
267 #define MAX_EPOLL_EVENTS (1 + MAX_DATA_CONN + VMPRESS_LEVEL_COUNT + 1 + 1)
268 static int epollfd;
269 static int maxevents;
270 
271 /* OOM score values used by both kernel and framework */
272 #define OOM_SCORE_ADJ_MIN       (-1000)
273 #define OOM_SCORE_ADJ_MAX       1000
274 
275 static int lowmem_adj[MAX_TARGETS];
276 static int lowmem_minfree[MAX_TARGETS];
277 static int lowmem_targets_size;
278 
279 /* Fields to parse in /proc/zoneinfo */
280 /* zoneinfo per-zone fields */
281 enum zoneinfo_zone_field {
282     ZI_ZONE_NR_FREE_PAGES = 0,
283     ZI_ZONE_MIN,
284     ZI_ZONE_LOW,
285     ZI_ZONE_HIGH,
286     ZI_ZONE_PRESENT,
287     ZI_ZONE_NR_FREE_CMA,
288     ZI_ZONE_FIELD_COUNT
289 };
290 
291 static const char* const zoneinfo_zone_field_names[ZI_ZONE_FIELD_COUNT] = {
292     "nr_free_pages",
293     "min",
294     "low",
295     "high",
296     "present",
297     "nr_free_cma",
298 };
299 
300 /* zoneinfo per-zone special fields */
301 enum zoneinfo_zone_spec_field {
302     ZI_ZONE_SPEC_PROTECTION = 0,
303     ZI_ZONE_SPEC_PAGESETS,
304     ZI_ZONE_SPEC_FIELD_COUNT,
305 };
306 
307 static const char* const zoneinfo_zone_spec_field_names[ZI_ZONE_SPEC_FIELD_COUNT] = {
308     "protection:",
309     "pagesets",
310 };
311 
312 /* see __MAX_NR_ZONES definition in kernel mmzone.h */
313 #define MAX_NR_ZONES 6
314 
315 union zoneinfo_zone_fields {
316     struct {
317         int64_t nr_free_pages;
318         int64_t min;
319         int64_t low;
320         int64_t high;
321         int64_t present;
322         int64_t nr_free_cma;
323     } field;
324     int64_t arr[ZI_ZONE_FIELD_COUNT];
325 };
326 
327 struct zoneinfo_zone {
328     union zoneinfo_zone_fields fields;
329     int64_t protection[MAX_NR_ZONES];
330     int64_t max_protection;
331 };
332 
333 /* zoneinfo per-node fields */
334 enum zoneinfo_node_field {
335     ZI_NODE_NR_INACTIVE_FILE = 0,
336     ZI_NODE_NR_ACTIVE_FILE,
337     ZI_NODE_WORKINGSET_REFAULT,
338     ZI_NODE_FIELD_COUNT
339 };
340 
341 static const char* const zoneinfo_node_field_names[ZI_NODE_FIELD_COUNT] = {
342     "nr_inactive_file",
343     "nr_active_file",
344     "workingset_refault",
345 };
346 
347 union zoneinfo_node_fields {
348     struct {
349         int64_t nr_inactive_file;
350         int64_t nr_active_file;
351         int64_t workingset_refault;
352     } field;
353     int64_t arr[ZI_NODE_FIELD_COUNT];
354 };
355 
356 struct zoneinfo_node {
357     int id;
358     int zone_count;
359     struct zoneinfo_zone zones[MAX_NR_ZONES];
360     union zoneinfo_node_fields fields;
361 };
362 
363 /* for now two memory nodes is more than enough */
364 #define MAX_NR_NODES 2
365 
366 struct zoneinfo {
367     int node_count;
368     struct zoneinfo_node nodes[MAX_NR_NODES];
369     int64_t totalreserve_pages;
370     int64_t total_inactive_file;
371     int64_t total_active_file;
372     int64_t total_workingset_refault;
373 };
374 
375 /* Fields to parse in /proc/meminfo */
376 enum meminfo_field {
377     MI_NR_FREE_PAGES = 0,
378     MI_CACHED,
379     MI_SWAP_CACHED,
380     MI_BUFFERS,
381     MI_SHMEM,
382     MI_UNEVICTABLE,
383     MI_TOTAL_SWAP,
384     MI_FREE_SWAP,
385     MI_ACTIVE_ANON,
386     MI_INACTIVE_ANON,
387     MI_ACTIVE_FILE,
388     MI_INACTIVE_FILE,
389     MI_SRECLAIMABLE,
390     MI_SUNRECLAIM,
391     MI_KERNEL_STACK,
392     MI_PAGE_TABLES,
393     MI_ION_HELP,
394     MI_ION_HELP_POOL,
395     MI_CMA_FREE,
396     MI_FIELD_COUNT
397 };
398 
399 static const char* const meminfo_field_names[MI_FIELD_COUNT] = {
400     "MemFree:",
401     "Cached:",
402     "SwapCached:",
403     "Buffers:",
404     "Shmem:",
405     "Unevictable:",
406     "SwapTotal:",
407     "SwapFree:",
408     "Active(anon):",
409     "Inactive(anon):",
410     "Active(file):",
411     "Inactive(file):",
412     "SReclaimable:",
413     "SUnreclaim:",
414     "KernelStack:",
415     "PageTables:",
416     "ION_heap:",
417     "ION_heap_pool:",
418     "CmaFree:",
419 };
420 
421 union meminfo {
422     struct {
423         int64_t nr_free_pages;
424         int64_t cached;
425         int64_t swap_cached;
426         int64_t buffers;
427         int64_t shmem;
428         int64_t unevictable;
429         int64_t total_swap;
430         int64_t free_swap;
431         int64_t active_anon;
432         int64_t inactive_anon;
433         int64_t active_file;
434         int64_t inactive_file;
435         int64_t sreclaimable;
436         int64_t sunreclaimable;
437         int64_t kernel_stack;
438         int64_t page_tables;
439         int64_t ion_heap;
440         int64_t ion_heap_pool;
441         int64_t cma_free;
442         /* fields below are calculated rather than read from the file */
443         int64_t nr_file_pages;
444     } field;
445     int64_t arr[MI_FIELD_COUNT];
446 };
447 
448 /* Fields to parse in /proc/vmstat */
449 enum vmstat_field {
450     VS_FREE_PAGES,
451     VS_INACTIVE_FILE,
452     VS_ACTIVE_FILE,
453     VS_WORKINGSET_REFAULT,
454     VS_PGSCAN_KSWAPD,
455     VS_PGSCAN_DIRECT,
456     VS_PGSCAN_DIRECT_THROTTLE,
457     VS_FIELD_COUNT
458 };
459 
460 static const char* const vmstat_field_names[MI_FIELD_COUNT] = {
461     "nr_free_pages",
462     "nr_inactive_file",
463     "nr_active_file",
464     "workingset_refault",
465     "pgscan_kswapd",
466     "pgscan_direct",
467     "pgscan_direct_throttle",
468 };
469 
470 union vmstat {
471     struct {
472         int64_t nr_free_pages;
473         int64_t nr_inactive_file;
474         int64_t nr_active_file;
475         int64_t workingset_refault;
476         int64_t pgscan_kswapd;
477         int64_t pgscan_direct;
478         int64_t pgscan_direct_throttle;
479     } field;
480     int64_t arr[VS_FIELD_COUNT];
481 };
482 
483 enum field_match_result {
484     NO_MATCH,
485     PARSE_FAIL,
486     PARSE_SUCCESS
487 };
488 
489 struct adjslot_list {
490     struct adjslot_list *next;
491     struct adjslot_list *prev;
492 };
493 
494 struct proc {
495     struct adjslot_list asl;
496     int pid;
497     int pidfd;
498     uid_t uid;
499     int oomadj;
500     pid_t reg_pid; /* PID of the process that registered this record */
501     struct proc *pidhash_next;
502 };
503 
504 struct reread_data {
505     const char* const filename;
506     int fd;
507 };
508 
509 #define PIDHASH_SZ 1024
510 static struct proc *pidhash[PIDHASH_SZ];
511 #define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))
512 
513 #define ADJTOSLOT(adj) ((adj) + -OOM_SCORE_ADJ_MIN)
514 #define ADJTOSLOT_COUNT (ADJTOSLOT(OOM_SCORE_ADJ_MAX) + 1)
515 static struct adjslot_list procadjslot_list[ADJTOSLOT_COUNT];
516 
517 #define MAX_DISTINCT_OOM_ADJ 32
518 #define KILLCNT_INVALID_IDX 0xFF
519 /*
520  * Because killcnt array is sparse a two-level indirection is used
521  * to keep the size small. killcnt_idx stores index of the element in
522  * killcnt array. Index KILLCNT_INVALID_IDX indicates an unused slot.
523  */
524 static uint8_t killcnt_idx[ADJTOSLOT_COUNT];
525 static uint16_t killcnt[MAX_DISTINCT_OOM_ADJ];
526 static int killcnt_free_idx = 0;
527 static uint32_t killcnt_total = 0;
528 
529 /* PAGE_SIZE / 1024 */
530 static long page_k;
531 
532 static void update_props();
533 static bool init_monitors();
534 static void destroy_monitors();
535 
clamp(int low,int high,int value)536 static int clamp(int low, int high, int value) {
537     return max(min(value, high), low);
538 }
539 
parse_int64(const char * str,int64_t * ret)540 static bool parse_int64(const char* str, int64_t* ret) {
541     char* endptr;
542     long long val = strtoll(str, &endptr, 10);
543     if (str == endptr || val > INT64_MAX) {
544         return false;
545     }
546     *ret = (int64_t)val;
547     return true;
548 }
549 
find_field(const char * name,const char * const field_names[],int field_count)550 static int find_field(const char* name, const char* const field_names[], int field_count) {
551     for (int i = 0; i < field_count; i++) {
552         if (!strcmp(name, field_names[i])) {
553             return i;
554         }
555     }
556     return -1;
557 }
558 
match_field(const char * cp,const char * ap,const char * const field_names[],int field_count,int64_t * field,int * field_idx)559 static enum field_match_result match_field(const char* cp, const char* ap,
560                                    const char* const field_names[],
561                                    int field_count, int64_t* field,
562                                    int *field_idx) {
563     int i = find_field(cp, field_names, field_count);
564     if (i < 0) {
565         return NO_MATCH;
566     }
567     *field_idx = i;
568     return parse_int64(ap, field) ? PARSE_SUCCESS : PARSE_FAIL;
569 }
570 
571 /*
572  * Read file content from the beginning up to max_len bytes or EOF
573  * whichever happens first.
574  */
read_all(int fd,char * buf,size_t max_len)575 static ssize_t read_all(int fd, char *buf, size_t max_len)
576 {
577     ssize_t ret = 0;
578     off_t offset = 0;
579 
580     while (max_len > 0) {
581         ssize_t r = TEMP_FAILURE_RETRY(pread(fd, buf, max_len, offset));
582         if (r == 0) {
583             break;
584         }
585         if (r == -1) {
586             return -1;
587         }
588         ret += r;
589         buf += r;
590         offset += r;
591         max_len -= r;
592     }
593 
594     return ret;
595 }
596 
597 /*
598  * Read a new or already opened file from the beginning.
599  * If the file has not been opened yet data->fd should be set to -1.
600  * To be used with files which are read often and possibly during high
601  * memory pressure to minimize file opening which by itself requires kernel
602  * memory allocation and might result in a stall on memory stressed system.
603  */
reread_file(struct reread_data * data)604 static char *reread_file(struct reread_data *data) {
605     /* start with page-size buffer and increase if needed */
606     static ssize_t buf_size = PAGE_SIZE;
607     static char *new_buf, *buf = NULL;
608     ssize_t size;
609 
610     if (data->fd == -1) {
611         /* First-time buffer initialization */
612         if (!buf && (buf = static_cast<char*>(malloc(buf_size))) == nullptr) {
613             return NULL;
614         }
615 
616         data->fd = TEMP_FAILURE_RETRY(open(data->filename, O_RDONLY | O_CLOEXEC));
617         if (data->fd < 0) {
618             ALOGE("%s open: %s", data->filename, strerror(errno));
619             return NULL;
620         }
621     }
622 
623     while (true) {
624         size = read_all(data->fd, buf, buf_size - 1);
625         if (size < 0) {
626             ALOGE("%s read: %s", data->filename, strerror(errno));
627             close(data->fd);
628             data->fd = -1;
629             return NULL;
630         }
631         if (size < buf_size - 1) {
632             break;
633         }
634         /*
635          * Since we are reading /proc files we can't use fstat to find out
636          * the real size of the file. Double the buffer size and keep retrying.
637          */
638         if ((new_buf = static_cast<char*>(realloc(buf, buf_size * 2))) == nullptr) {
639             errno = ENOMEM;
640             return NULL;
641         }
642         buf = new_buf;
643         buf_size *= 2;
644     }
645     buf[size] = 0;
646 
647     return buf;
648 }
649 
claim_record(struct proc * procp,pid_t pid)650 static bool claim_record(struct proc* procp, pid_t pid) {
651     if (procp->reg_pid == pid) {
652         /* Record already belongs to the registrant */
653         return true;
654     }
655     if (procp->reg_pid == 0) {
656         /* Old registrant is gone, claim the record */
657         procp->reg_pid = pid;
658         return true;
659     }
660     /* The record is owned by another registrant */
661     return false;
662 }
663 
remove_claims(pid_t pid)664 static void remove_claims(pid_t pid) {
665     int i;
666 
667     for (i = 0; i < PIDHASH_SZ; i++) {
668         struct proc* procp = pidhash[i];
669         while (procp) {
670             if (procp->reg_pid == pid) {
671                 procp->reg_pid = 0;
672             }
673             procp = procp->pidhash_next;
674         }
675     }
676 }
677 
ctrl_data_close(int dsock_idx)678 static void ctrl_data_close(int dsock_idx) {
679     struct epoll_event epev;
680 
681     ALOGI("closing lmkd data connection");
682     if (epoll_ctl(epollfd, EPOLL_CTL_DEL, data_sock[dsock_idx].sock, &epev) == -1) {
683         // Log a warning and keep going
684         ALOGW("epoll_ctl for data connection socket failed; errno=%d", errno);
685     }
686     maxevents--;
687 
688     close(data_sock[dsock_idx].sock);
689     data_sock[dsock_idx].sock = -1;
690 
691     /* Mark all records of the old registrant as unclaimed */
692     remove_claims(data_sock[dsock_idx].pid);
693 }
694 
ctrl_data_read(int dsock_idx,char * buf,size_t bufsz,struct ucred * sender_cred)695 static ssize_t ctrl_data_read(int dsock_idx, char* buf, size_t bufsz, struct ucred* sender_cred) {
696     struct iovec iov = {buf, bufsz};
697     char control[CMSG_SPACE(sizeof(struct ucred))];
698     struct msghdr hdr = {
699             NULL, 0, &iov, 1, control, sizeof(control), 0,
700     };
701     ssize_t ret;
702     ret = TEMP_FAILURE_RETRY(recvmsg(data_sock[dsock_idx].sock, &hdr, 0));
703     if (ret == -1) {
704         ALOGE("control data socket read failed; %s", strerror(errno));
705         return -1;
706     }
707     if (ret == 0) {
708         ALOGE("Got EOF on control data socket");
709         return -1;
710     }
711 
712     struct ucred* cred = NULL;
713     struct cmsghdr* cmsg = CMSG_FIRSTHDR(&hdr);
714     while (cmsg != NULL) {
715         if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_CREDENTIALS) {
716             cred = (struct ucred*)CMSG_DATA(cmsg);
717             break;
718         }
719         cmsg = CMSG_NXTHDR(&hdr, cmsg);
720     }
721 
722     if (cred == NULL) {
723         ALOGE("Failed to retrieve sender credentials");
724         /* Close the connection */
725         ctrl_data_close(dsock_idx);
726         return -1;
727     }
728 
729     memcpy(sender_cred, cred, sizeof(struct ucred));
730 
731     /* Store PID of the peer */
732     data_sock[dsock_idx].pid = cred->pid;
733 
734     return ret;
735 }
736 
ctrl_data_write(int dsock_idx,char * buf,size_t bufsz)737 static int ctrl_data_write(int dsock_idx, char* buf, size_t bufsz) {
738     int ret = 0;
739 
740     ret = TEMP_FAILURE_RETRY(write(data_sock[dsock_idx].sock, buf, bufsz));
741 
742     if (ret == -1) {
743         ALOGE("control data socket write failed; errno=%d", errno);
744     } else if (ret == 0) {
745         ALOGE("Got EOF on control data socket");
746         ret = -1;
747     }
748 
749     return ret;
750 }
751 
752 /*
753  * Write the pid/uid pair over the data socket, note: all active clients
754  * will receive this unsolicited notification.
755  */
ctrl_data_write_lmk_kill_occurred(pid_t pid,uid_t uid)756 static void ctrl_data_write_lmk_kill_occurred(pid_t pid, uid_t uid) {
757     LMKD_CTRL_PACKET packet;
758     size_t len = lmkd_pack_set_prockills(packet, pid, uid);
759 
760     for (int i = 0; i < MAX_DATA_CONN; i++) {
761         if (data_sock[i].sock >= 0 && data_sock[i].async_event_mask & 1 << LMK_ASYNC_EVENT_KILL) {
762             ctrl_data_write(i, (char*)packet, len);
763         }
764     }
765 }
766 
poll_kernel(int poll_fd)767 static void poll_kernel(int poll_fd) {
768     if (poll_fd == -1) {
769         // not waiting
770         return;
771     }
772 
773     while (1) {
774         char rd_buf[256];
775         int bytes_read = TEMP_FAILURE_RETRY(pread(poll_fd, (void*)rd_buf, sizeof(rd_buf), 0));
776         if (bytes_read <= 0) break;
777         rd_buf[bytes_read] = '\0';
778 
779         int64_t pid;
780         int64_t uid;
781         int64_t group_leader_pid;
782         int64_t rss_in_pages;
783         struct memory_stat mem_st = {};
784         int16_t oom_score_adj;
785         int16_t min_score_adj;
786         int64_t starttime;
787         char* taskname = 0;
788 
789         int fields_read =
790                 sscanf(rd_buf,
791                        "%" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64 " %" SCNd64
792                        " %" SCNd16 " %" SCNd16 " %" SCNd64 "\n%m[^\n]",
793                        &pid, &uid, &group_leader_pid, &mem_st.pgfault, &mem_st.pgmajfault,
794                        &rss_in_pages, &oom_score_adj, &min_score_adj, &starttime, &taskname);
795 
796         /* only the death of the group leader process is logged */
797         if (fields_read == 10 && group_leader_pid == pid) {
798             ctrl_data_write_lmk_kill_occurred((pid_t)pid, (uid_t)uid);
799             mem_st.process_start_time_ns = starttime * (NS_PER_SEC / sysconf(_SC_CLK_TCK));
800             mem_st.rss_in_bytes = rss_in_pages * PAGE_SIZE;
801 
802             struct kill_stat kill_st = {
803                 .uid = static_cast<int32_t>(uid),
804                 .kill_reason = NONE,
805                 .oom_score = oom_score_adj,
806                 .min_oom_score = min_score_adj,
807                 .free_mem_kb = 0,
808                 .free_swap_kb = 0,
809                 .tasksize = 0,
810 
811             };
812             stats_write_lmk_kill_occurred_pid(pid, &kill_st, &mem_st);
813         }
814 
815         free(taskname);
816     }
817 }
818 
init_poll_kernel()819 static bool init_poll_kernel() {
820     kpoll_fd = TEMP_FAILURE_RETRY(open("/proc/lowmemorykiller", O_RDONLY | O_NONBLOCK | O_CLOEXEC));
821 
822     if (kpoll_fd < 0) {
823         ALOGE("kernel lmk event file could not be opened; errno=%d", errno);
824         return false;
825     }
826 
827     return true;
828 }
829 
pid_lookup(int pid)830 static struct proc *pid_lookup(int pid) {
831     struct proc *procp;
832 
833     for (procp = pidhash[pid_hashfn(pid)]; procp && procp->pid != pid;
834          procp = procp->pidhash_next)
835             ;
836 
837     return procp;
838 }
839 
adjslot_insert(struct adjslot_list * head,struct adjslot_list * new_element)840 static void adjslot_insert(struct adjslot_list *head, struct adjslot_list *new_element)
841 {
842     struct adjslot_list *next = head->next;
843     new_element->prev = head;
844     new_element->next = next;
845     next->prev = new_element;
846     head->next = new_element;
847 }
848 
adjslot_remove(struct adjslot_list * old)849 static void adjslot_remove(struct adjslot_list *old)
850 {
851     struct adjslot_list *prev = old->prev;
852     struct adjslot_list *next = old->next;
853     next->prev = prev;
854     prev->next = next;
855 }
856 
adjslot_tail(struct adjslot_list * head)857 static struct adjslot_list *adjslot_tail(struct adjslot_list *head) {
858     struct adjslot_list *asl = head->prev;
859 
860     return asl == head ? NULL : asl;
861 }
862 
proc_slot(struct proc * procp)863 static void proc_slot(struct proc *procp) {
864     int adjslot = ADJTOSLOT(procp->oomadj);
865 
866     adjslot_insert(&procadjslot_list[adjslot], &procp->asl);
867 }
868 
proc_unslot(struct proc * procp)869 static void proc_unslot(struct proc *procp) {
870     adjslot_remove(&procp->asl);
871 }
872 
proc_insert(struct proc * procp)873 static void proc_insert(struct proc *procp) {
874     int hval = pid_hashfn(procp->pid);
875 
876     procp->pidhash_next = pidhash[hval];
877     pidhash[hval] = procp;
878     proc_slot(procp);
879 }
880 
pid_remove(int pid)881 static int pid_remove(int pid) {
882     int hval = pid_hashfn(pid);
883     struct proc *procp;
884     struct proc *prevp;
885 
886     for (procp = pidhash[hval], prevp = NULL; procp && procp->pid != pid;
887          procp = procp->pidhash_next)
888             prevp = procp;
889 
890     if (!procp)
891         return -1;
892 
893     if (!prevp)
894         pidhash[hval] = procp->pidhash_next;
895     else
896         prevp->pidhash_next = procp->pidhash_next;
897 
898     proc_unslot(procp);
899     /*
900      * Close pidfd here if we are not waiting for corresponding process to die,
901      * in which case stop_wait_for_proc_kill() will close the pidfd later
902      */
903     if (procp->pidfd >= 0 && procp->pidfd != last_kill_pid_or_fd) {
904         close(procp->pidfd);
905     }
906     free(procp);
907     return 0;
908 }
909 
910 /*
911  * Write a string to a file.
912  * Returns false if the file does not exist.
913  */
writefilestring(const char * path,const char * s,bool err_if_missing)914 static bool writefilestring(const char *path, const char *s,
915                             bool err_if_missing) {
916     int fd = open(path, O_WRONLY | O_CLOEXEC);
917     ssize_t len = strlen(s);
918     ssize_t ret;
919 
920     if (fd < 0) {
921         if (err_if_missing) {
922             ALOGE("Error opening %s; errno=%d", path, errno);
923         }
924         return false;
925     }
926 
927     ret = TEMP_FAILURE_RETRY(write(fd, s, len));
928     if (ret < 0) {
929         ALOGE("Error writing %s; errno=%d", path, errno);
930     } else if (ret < len) {
931         ALOGE("Short write on %s; length=%zd", path, ret);
932     }
933 
934     close(fd);
935     return true;
936 }
937 
get_time_diff_ms(struct timespec * from,struct timespec * to)938 static inline long get_time_diff_ms(struct timespec *from,
939                                     struct timespec *to) {
940     return (to->tv_sec - from->tv_sec) * (long)MS_PER_SEC +
941            (to->tv_nsec - from->tv_nsec) / (long)NS_PER_MS;
942 }
943 
proc_get_tgid(int pid)944 static int proc_get_tgid(int pid) {
945     char path[PATH_MAX];
946     char buf[PAGE_SIZE];
947     int fd;
948     ssize_t size;
949     char *pos;
950     int64_t tgid = -1;
951 
952     snprintf(path, PATH_MAX, "/proc/%d/status", pid);
953     fd = open(path, O_RDONLY | O_CLOEXEC);
954     if (fd < 0) {
955         return -1;
956     }
957 
958     size = read_all(fd, buf, sizeof(buf) - 1);
959     if (size < 0) {
960         goto out;
961     }
962     buf[size] = 0;
963 
964     pos = buf;
965     while (true) {
966         pos = strstr(pos, PROC_STATUS_TGID_FIELD);
967         /* Stop if TGID tag not found or found at the line beginning */
968         if (pos == NULL || pos == buf || pos[-1] == '\n') {
969             break;
970         }
971         pos++;
972     }
973 
974     if (pos == NULL) {
975         goto out;
976     }
977 
978     pos += strlen(PROC_STATUS_TGID_FIELD);
979     while (*pos == ' ') pos++;
980     parse_int64(pos, &tgid);
981 
982 out:
983     close(fd);
984     return (int)tgid;
985 }
986 
proc_get_size(int pid)987 static int proc_get_size(int pid) {
988     char path[PATH_MAX];
989     char line[LINE_MAX];
990     int fd;
991     int rss = 0;
992     int total;
993     ssize_t ret;
994 
995     /* gid containing AID_READPROC required */
996     snprintf(path, PATH_MAX, "/proc/%d/statm", pid);
997     fd = open(path, O_RDONLY | O_CLOEXEC);
998     if (fd == -1)
999         return -1;
1000 
1001     ret = read_all(fd, line, sizeof(line) - 1);
1002     if (ret < 0) {
1003         close(fd);
1004         return -1;
1005     }
1006     line[ret] = '\0';
1007 
1008     sscanf(line, "%d %d ", &total, &rss);
1009     close(fd);
1010     return rss;
1011 }
1012 
proc_get_name(int pid,char * buf,size_t buf_size)1013 static char *proc_get_name(int pid, char *buf, size_t buf_size) {
1014     char path[PATH_MAX];
1015     int fd;
1016     char *cp;
1017     ssize_t ret;
1018 
1019     /* gid containing AID_READPROC required */
1020     snprintf(path, PATH_MAX, "/proc/%d/cmdline", pid);
1021     fd = open(path, O_RDONLY | O_CLOEXEC);
1022     if (fd == -1) {
1023         return NULL;
1024     }
1025     ret = read_all(fd, buf, buf_size - 1);
1026     close(fd);
1027     if (ret < 0) {
1028         return NULL;
1029     }
1030     buf[ret] = '\0';
1031 
1032     cp = strchr(buf, ' ');
1033     if (cp) {
1034         *cp = '\0';
1035     }
1036 
1037     return buf;
1038 }
1039 
cmd_procprio(LMKD_CTRL_PACKET packet,int field_count,struct ucred * cred)1040 static void cmd_procprio(LMKD_CTRL_PACKET packet, int field_count, struct ucred *cred) {
1041     struct proc *procp;
1042     char path[LINE_MAX];
1043     char val[20];
1044     int soft_limit_mult;
1045     struct lmk_procprio params;
1046     bool is_system_server;
1047     struct passwd *pwdrec;
1048     int tgid;
1049 
1050     lmkd_pack_get_procprio(packet, field_count, &params);
1051 
1052     if (params.oomadj < OOM_SCORE_ADJ_MIN ||
1053         params.oomadj > OOM_SCORE_ADJ_MAX) {
1054         ALOGE("Invalid PROCPRIO oomadj argument %d", params.oomadj);
1055         return;
1056     }
1057 
1058     if (params.ptype < PROC_TYPE_FIRST || params.ptype >= PROC_TYPE_COUNT) {
1059         ALOGE("Invalid PROCPRIO process type argument %d", params.ptype);
1060         return;
1061     }
1062 
1063     /* Check if registered process is a thread group leader */
1064     tgid = proc_get_tgid(params.pid);
1065     if (tgid >= 0 && tgid != params.pid) {
1066         ALOGE("Attempt to register a task that is not a thread group leader (tid %d, tgid %d)",
1067             params.pid, tgid);
1068         return;
1069     }
1070 
1071     /* gid containing AID_READPROC required */
1072     /* CAP_SYS_RESOURCE required */
1073     /* CAP_DAC_OVERRIDE required */
1074     snprintf(path, sizeof(path), "/proc/%d/oom_score_adj", params.pid);
1075     snprintf(val, sizeof(val), "%d", params.oomadj);
1076     if (!writefilestring(path, val, false)) {
1077         ALOGW("Failed to open %s; errno=%d: process %d might have been killed",
1078               path, errno, params.pid);
1079         /* If this file does not exist the process is dead. */
1080         return;
1081     }
1082 
1083     if (use_inkernel_interface) {
1084         stats_store_taskname(params.pid, proc_get_name(params.pid, path, sizeof(path)));
1085         return;
1086     }
1087 
1088     /* lmkd should not change soft limits for services */
1089     if (params.ptype == PROC_TYPE_APP && per_app_memcg) {
1090         if (params.oomadj >= 900) {
1091             soft_limit_mult = 0;
1092         } else if (params.oomadj >= 800) {
1093             soft_limit_mult = 0;
1094         } else if (params.oomadj >= 700) {
1095             soft_limit_mult = 0;
1096         } else if (params.oomadj >= 600) {
1097             // Launcher should be perceptible, don't kill it.
1098             params.oomadj = 200;
1099             soft_limit_mult = 1;
1100         } else if (params.oomadj >= 500) {
1101             soft_limit_mult = 0;
1102         } else if (params.oomadj >= 400) {
1103             soft_limit_mult = 0;
1104         } else if (params.oomadj >= 300) {
1105             soft_limit_mult = 1;
1106         } else if (params.oomadj >= 200) {
1107             soft_limit_mult = 8;
1108         } else if (params.oomadj >= 100) {
1109             soft_limit_mult = 10;
1110         } else if (params.oomadj >=   0) {
1111             soft_limit_mult = 20;
1112         } else {
1113             // Persistent processes will have a large
1114             // soft limit 512MB.
1115             soft_limit_mult = 64;
1116         }
1117 
1118         snprintf(path, sizeof(path), MEMCG_SYSFS_PATH
1119                  "apps/uid_%d/pid_%d/memory.soft_limit_in_bytes",
1120                  params.uid, params.pid);
1121         snprintf(val, sizeof(val), "%d", soft_limit_mult * EIGHT_MEGA);
1122 
1123         /*
1124          * system_server process has no memcg under /dev/memcg/apps but should be
1125          * registered with lmkd. This is the best way so far to identify it.
1126          */
1127         is_system_server = (params.oomadj == SYSTEM_ADJ &&
1128                             (pwdrec = getpwnam("system")) != NULL &&
1129                             params.uid == pwdrec->pw_uid);
1130         writefilestring(path, val, !is_system_server);
1131     }
1132 
1133     procp = pid_lookup(params.pid);
1134     if (!procp) {
1135         int pidfd = -1;
1136 
1137         if (pidfd_supported) {
1138             pidfd = TEMP_FAILURE_RETRY(sys_pidfd_open(params.pid, 0));
1139             if (pidfd < 0) {
1140                 ALOGE("pidfd_open for pid %d failed; errno=%d", params.pid, errno);
1141                 return;
1142             }
1143         }
1144 
1145         procp = static_cast<struct proc*>(calloc(1, sizeof(struct proc)));
1146         if (!procp) {
1147             // Oh, the irony.  May need to rebuild our state.
1148             return;
1149         }
1150 
1151         procp->pid = params.pid;
1152         procp->pidfd = pidfd;
1153         procp->uid = params.uid;
1154         procp->reg_pid = cred->pid;
1155         procp->oomadj = params.oomadj;
1156         proc_insert(procp);
1157     } else {
1158         if (!claim_record(procp, cred->pid)) {
1159             char buf[LINE_MAX];
1160             char *taskname = proc_get_name(cred->pid, buf, sizeof(buf));
1161             /* Only registrant of the record can remove it */
1162             ALOGE("%s (%d, %d) attempts to modify a process registered by another client",
1163                 taskname ? taskname : "A process ", cred->uid, cred->pid);
1164             return;
1165         }
1166         proc_unslot(procp);
1167         procp->oomadj = params.oomadj;
1168         proc_slot(procp);
1169     }
1170 }
1171 
cmd_procremove(LMKD_CTRL_PACKET packet,struct ucred * cred)1172 static void cmd_procremove(LMKD_CTRL_PACKET packet, struct ucred *cred) {
1173     struct lmk_procremove params;
1174     struct proc *procp;
1175 
1176     lmkd_pack_get_procremove(packet, &params);
1177 
1178     if (use_inkernel_interface) {
1179         /*
1180          * Perform an extra check before the pid is removed, after which it
1181          * will be impossible for poll_kernel to get the taskname. poll_kernel()
1182          * is potentially a long-running blocking function; however this method
1183          * handles AMS requests but does not block AMS.
1184          */
1185         poll_kernel(kpoll_fd);
1186 
1187         stats_remove_taskname(params.pid);
1188         return;
1189     }
1190 
1191     procp = pid_lookup(params.pid);
1192     if (!procp) {
1193         return;
1194     }
1195 
1196     if (!claim_record(procp, cred->pid)) {
1197         char buf[LINE_MAX];
1198         char *taskname = proc_get_name(cred->pid, buf, sizeof(buf));
1199         /* Only registrant of the record can remove it */
1200         ALOGE("%s (%d, %d) attempts to unregister a process registered by another client",
1201             taskname ? taskname : "A process ", cred->uid, cred->pid);
1202         return;
1203     }
1204 
1205     /*
1206      * WARNING: After pid_remove() procp is freed and can't be used!
1207      * Therefore placed at the end of the function.
1208      */
1209     pid_remove(params.pid);
1210 }
1211 
cmd_procpurge(struct ucred * cred)1212 static void cmd_procpurge(struct ucred *cred) {
1213     int i;
1214     struct proc *procp;
1215     struct proc *next;
1216 
1217     if (use_inkernel_interface) {
1218         stats_purge_tasknames();
1219         return;
1220     }
1221 
1222     for (i = 0; i < PIDHASH_SZ; i++) {
1223         procp = pidhash[i];
1224         while (procp) {
1225             next = procp->pidhash_next;
1226             /* Purge only records created by the requestor */
1227             if (claim_record(procp, cred->pid)) {
1228                 pid_remove(procp->pid);
1229             }
1230             procp = next;
1231         }
1232     }
1233 }
1234 
cmd_subscribe(int dsock_idx,LMKD_CTRL_PACKET packet)1235 static void cmd_subscribe(int dsock_idx, LMKD_CTRL_PACKET packet) {
1236     struct lmk_subscribe params;
1237 
1238     lmkd_pack_get_subscribe(packet, &params);
1239     data_sock[dsock_idx].async_event_mask |= 1 << params.evt_type;
1240 }
1241 
inc_killcnt(int oomadj)1242 static void inc_killcnt(int oomadj) {
1243     int slot = ADJTOSLOT(oomadj);
1244     uint8_t idx = killcnt_idx[slot];
1245 
1246     if (idx == KILLCNT_INVALID_IDX) {
1247         /* index is not assigned for this oomadj */
1248         if (killcnt_free_idx < MAX_DISTINCT_OOM_ADJ) {
1249             killcnt_idx[slot] = killcnt_free_idx;
1250             killcnt[killcnt_free_idx] = 1;
1251             killcnt_free_idx++;
1252         } else {
1253             ALOGW("Number of distinct oomadj levels exceeds %d",
1254                 MAX_DISTINCT_OOM_ADJ);
1255         }
1256     } else {
1257         /*
1258          * wraparound is highly unlikely and is detectable using total
1259          * counter because it has to be equal to the sum of all counters
1260          */
1261         killcnt[idx]++;
1262     }
1263     /* increment total kill counter */
1264     killcnt_total++;
1265 }
1266 
get_killcnt(int min_oomadj,int max_oomadj)1267 static int get_killcnt(int min_oomadj, int max_oomadj) {
1268     int slot;
1269     int count = 0;
1270 
1271     if (min_oomadj > max_oomadj)
1272         return 0;
1273 
1274     /* special case to get total kill count */
1275     if (min_oomadj > OOM_SCORE_ADJ_MAX)
1276         return killcnt_total;
1277 
1278     while (min_oomadj <= max_oomadj &&
1279            (slot = ADJTOSLOT(min_oomadj)) < ADJTOSLOT_COUNT) {
1280         uint8_t idx = killcnt_idx[slot];
1281         if (idx != KILLCNT_INVALID_IDX) {
1282             count += killcnt[idx];
1283         }
1284         min_oomadj++;
1285     }
1286 
1287     return count;
1288 }
1289 
cmd_getkillcnt(LMKD_CTRL_PACKET packet)1290 static int cmd_getkillcnt(LMKD_CTRL_PACKET packet) {
1291     struct lmk_getkillcnt params;
1292 
1293     if (use_inkernel_interface) {
1294         /* kernel driver does not expose this information */
1295         return 0;
1296     }
1297 
1298     lmkd_pack_get_getkillcnt(packet, &params);
1299 
1300     return get_killcnt(params.min_oomadj, params.max_oomadj);
1301 }
1302 
cmd_target(int ntargets,LMKD_CTRL_PACKET packet)1303 static void cmd_target(int ntargets, LMKD_CTRL_PACKET packet) {
1304     int i;
1305     struct lmk_target target;
1306     char minfree_str[PROPERTY_VALUE_MAX];
1307     char *pstr = minfree_str;
1308     char *pend = minfree_str + sizeof(minfree_str);
1309     static struct timespec last_req_tm;
1310     struct timespec curr_tm;
1311 
1312     if (ntargets < 1 || ntargets > (int)ARRAY_SIZE(lowmem_adj))
1313         return;
1314 
1315     /*
1316      * Ratelimit minfree updates to once per TARGET_UPDATE_MIN_INTERVAL_MS
1317      * to prevent DoS attacks
1318      */
1319     if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
1320         ALOGE("Failed to get current time");
1321         return;
1322     }
1323 
1324     if (get_time_diff_ms(&last_req_tm, &curr_tm) <
1325         TARGET_UPDATE_MIN_INTERVAL_MS) {
1326         ALOGE("Ignoring frequent updated to lmkd limits");
1327         return;
1328     }
1329 
1330     last_req_tm = curr_tm;
1331 
1332     for (i = 0; i < ntargets; i++) {
1333         lmkd_pack_get_target(packet, i, &target);
1334         lowmem_minfree[i] = target.minfree;
1335         lowmem_adj[i] = target.oom_adj_score;
1336 
1337         pstr += snprintf(pstr, pend - pstr, "%d:%d,", target.minfree,
1338             target.oom_adj_score);
1339         if (pstr >= pend) {
1340             /* if no more space in the buffer then terminate the loop */
1341             pstr = pend;
1342             break;
1343         }
1344     }
1345 
1346     lowmem_targets_size = ntargets;
1347 
1348     /* Override the last extra comma */
1349     pstr[-1] = '\0';
1350     property_set("sys.lmk.minfree_levels", minfree_str);
1351 
1352     if (has_inkernel_module) {
1353         char minfreestr[128];
1354         char killpriostr[128];
1355 
1356         minfreestr[0] = '\0';
1357         killpriostr[0] = '\0';
1358 
1359         for (i = 0; i < lowmem_targets_size; i++) {
1360             char val[40];
1361 
1362             if (i) {
1363                 strlcat(minfreestr, ",", sizeof(minfreestr));
1364                 strlcat(killpriostr, ",", sizeof(killpriostr));
1365             }
1366 
1367             snprintf(val, sizeof(val), "%d", use_inkernel_interface ? lowmem_minfree[i] : 0);
1368             strlcat(minfreestr, val, sizeof(minfreestr));
1369             snprintf(val, sizeof(val), "%d", use_inkernel_interface ? lowmem_adj[i] : 0);
1370             strlcat(killpriostr, val, sizeof(killpriostr));
1371         }
1372 
1373         writefilestring(INKERNEL_MINFREE_PATH, minfreestr, true);
1374         writefilestring(INKERNEL_ADJ_PATH, killpriostr, true);
1375     }
1376 }
1377 
ctrl_command_handler(int dsock_idx)1378 static void ctrl_command_handler(int dsock_idx) {
1379     LMKD_CTRL_PACKET packet;
1380     struct ucred cred;
1381     int len;
1382     enum lmk_cmd cmd;
1383     int nargs;
1384     int targets;
1385     int kill_cnt;
1386     int result;
1387 
1388     len = ctrl_data_read(dsock_idx, (char *)packet, CTRL_PACKET_MAX_SIZE, &cred);
1389     if (len <= 0)
1390         return;
1391 
1392     if (len < (int)sizeof(int)) {
1393         ALOGE("Wrong control socket read length len=%d", len);
1394         return;
1395     }
1396 
1397     cmd = lmkd_pack_get_cmd(packet);
1398     nargs = len / sizeof(int) - 1;
1399     if (nargs < 0)
1400         goto wronglen;
1401 
1402     switch(cmd) {
1403     case LMK_TARGET:
1404         targets = nargs / 2;
1405         if (nargs & 0x1 || targets > (int)ARRAY_SIZE(lowmem_adj))
1406             goto wronglen;
1407         cmd_target(targets, packet);
1408         break;
1409     case LMK_PROCPRIO:
1410         /* process type field is optional for backward compatibility */
1411         if (nargs < 3 || nargs > 4)
1412             goto wronglen;
1413         cmd_procprio(packet, nargs, &cred);
1414         break;
1415     case LMK_PROCREMOVE:
1416         if (nargs != 1)
1417             goto wronglen;
1418         cmd_procremove(packet, &cred);
1419         break;
1420     case LMK_PROCPURGE:
1421         if (nargs != 0)
1422             goto wronglen;
1423         cmd_procpurge(&cred);
1424         break;
1425     case LMK_GETKILLCNT:
1426         if (nargs != 2)
1427             goto wronglen;
1428         kill_cnt = cmd_getkillcnt(packet);
1429         len = lmkd_pack_set_getkillcnt_repl(packet, kill_cnt);
1430         if (ctrl_data_write(dsock_idx, (char *)packet, len) != len)
1431             return;
1432         break;
1433     case LMK_SUBSCRIBE:
1434         if (nargs != 1)
1435             goto wronglen;
1436         cmd_subscribe(dsock_idx, packet);
1437         break;
1438     case LMK_PROCKILL:
1439         /* This command code is NOT expected at all */
1440         ALOGE("Received unexpected command code %d", cmd);
1441         break;
1442     case LMK_UPDATE_PROPS:
1443         if (nargs != 0)
1444             goto wronglen;
1445         update_props();
1446         if (!use_inkernel_interface) {
1447             /* Reinitialize monitors to apply new settings */
1448             destroy_monitors();
1449             result = init_monitors() ? 0 : -1;
1450         } else {
1451             result = 0;
1452         }
1453         len = lmkd_pack_set_update_props_repl(packet, result);
1454         if (ctrl_data_write(dsock_idx, (char *)packet, len) != len) {
1455             ALOGE("Failed to report operation results");
1456         }
1457         if (!result) {
1458             ALOGI("Properties reinitilized");
1459         } else {
1460             /* New settings can't be supported, crash to be restarted */
1461             ALOGE("New configuration is not supported. Exiting...");
1462             exit(1);
1463         }
1464         break;
1465     default:
1466         ALOGE("Received unknown command code %d", cmd);
1467         return;
1468     }
1469 
1470     return;
1471 
1472 wronglen:
1473     ALOGE("Wrong control socket read length cmd=%d len=%d", cmd, len);
1474 }
1475 
ctrl_data_handler(int data,uint32_t events,struct polling_params * poll_params __unused)1476 static void ctrl_data_handler(int data, uint32_t events,
1477                               struct polling_params *poll_params __unused) {
1478     if (events & EPOLLIN) {
1479         ctrl_command_handler(data);
1480     }
1481 }
1482 
get_free_dsock()1483 static int get_free_dsock() {
1484     for (int i = 0; i < MAX_DATA_CONN; i++) {
1485         if (data_sock[i].sock < 0) {
1486             return i;
1487         }
1488     }
1489     return -1;
1490 }
1491 
ctrl_connect_handler(int data __unused,uint32_t events __unused,struct polling_params * poll_params __unused)1492 static void ctrl_connect_handler(int data __unused, uint32_t events __unused,
1493                                  struct polling_params *poll_params __unused) {
1494     struct epoll_event epev;
1495     int free_dscock_idx = get_free_dsock();
1496 
1497     if (free_dscock_idx < 0) {
1498         /*
1499          * Number of data connections exceeded max supported. This should not
1500          * happen but if it does we drop all existing connections and accept
1501          * the new one. This prevents inactive connections from monopolizing
1502          * data socket and if we drop ActivityManager connection it will
1503          * immediately reconnect.
1504          */
1505         for (int i = 0; i < MAX_DATA_CONN; i++) {
1506             ctrl_data_close(i);
1507         }
1508         free_dscock_idx = 0;
1509     }
1510 
1511     data_sock[free_dscock_idx].sock = accept(ctrl_sock.sock, NULL, NULL);
1512     if (data_sock[free_dscock_idx].sock < 0) {
1513         ALOGE("lmkd control socket accept failed; errno=%d", errno);
1514         return;
1515     }
1516 
1517     ALOGI("lmkd data connection established");
1518     /* use data to store data connection idx */
1519     data_sock[free_dscock_idx].handler_info.data = free_dscock_idx;
1520     data_sock[free_dscock_idx].handler_info.handler = ctrl_data_handler;
1521     data_sock[free_dscock_idx].async_event_mask = 0;
1522     epev.events = EPOLLIN;
1523     epev.data.ptr = (void *)&(data_sock[free_dscock_idx].handler_info);
1524     if (epoll_ctl(epollfd, EPOLL_CTL_ADD, data_sock[free_dscock_idx].sock, &epev) == -1) {
1525         ALOGE("epoll_ctl for data connection socket failed; errno=%d", errno);
1526         ctrl_data_close(free_dscock_idx);
1527         return;
1528     }
1529     maxevents++;
1530 }
1531 
1532 /*
1533  * /proc/zoneinfo parsing routines
1534  * Expected file format is:
1535  *
1536  *   Node <node_id>, zone   <zone_name>
1537  *   (
1538  *    per-node stats
1539  *       (<per-node field name> <value>)+
1540  *   )?
1541  *   (pages free     <value>
1542  *       (<per-zone field name> <value>)+
1543  *    pagesets
1544  *       (<unused fields>)*
1545  *   )+
1546  *   ...
1547  */
zoneinfo_parse_protection(char * buf,struct zoneinfo_zone * zone)1548 static void zoneinfo_parse_protection(char *buf, struct zoneinfo_zone *zone) {
1549     int zone_idx;
1550     int64_t max = 0;
1551     char *save_ptr;
1552 
1553     for (buf = strtok_r(buf, "(), ", &save_ptr), zone_idx = 0;
1554          buf && zone_idx < MAX_NR_ZONES;
1555          buf = strtok_r(NULL, "), ", &save_ptr), zone_idx++) {
1556         long long zoneval = strtoll(buf, &buf, 0);
1557         if (zoneval > max) {
1558             max = (zoneval > INT64_MAX) ? INT64_MAX : zoneval;
1559         }
1560         zone->protection[zone_idx] = zoneval;
1561     }
1562     zone->max_protection = max;
1563 }
1564 
zoneinfo_parse_zone(char ** buf,struct zoneinfo_zone * zone)1565 static int zoneinfo_parse_zone(char **buf, struct zoneinfo_zone *zone) {
1566     for (char *line = strtok_r(NULL, "\n", buf); line;
1567          line = strtok_r(NULL, "\n", buf)) {
1568         char *cp;
1569         char *ap;
1570         char *save_ptr;
1571         int64_t val;
1572         int field_idx;
1573         enum field_match_result match_res;
1574 
1575         cp = strtok_r(line, " ", &save_ptr);
1576         if (!cp) {
1577             return false;
1578         }
1579 
1580         field_idx = find_field(cp, zoneinfo_zone_spec_field_names, ZI_ZONE_SPEC_FIELD_COUNT);
1581         if (field_idx >= 0) {
1582             /* special field */
1583             if (field_idx == ZI_ZONE_SPEC_PAGESETS) {
1584                 /* no mode fields we are interested in */
1585                 return true;
1586             }
1587 
1588             /* protection field */
1589             ap = strtok_r(NULL, ")", &save_ptr);
1590             if (ap) {
1591                 zoneinfo_parse_protection(ap, zone);
1592             }
1593             continue;
1594         }
1595 
1596         ap = strtok_r(NULL, " ", &save_ptr);
1597         if (!ap) {
1598             continue;
1599         }
1600 
1601         match_res = match_field(cp, ap, zoneinfo_zone_field_names, ZI_ZONE_FIELD_COUNT,
1602             &val, &field_idx);
1603         if (match_res == PARSE_FAIL) {
1604             return false;
1605         }
1606         if (match_res == PARSE_SUCCESS) {
1607             zone->fields.arr[field_idx] = val;
1608         }
1609         if (field_idx == ZI_ZONE_PRESENT && val == 0) {
1610             /* zone is not populated, stop parsing it */
1611             return true;
1612         }
1613     }
1614     return false;
1615 }
1616 
zoneinfo_parse_node(char ** buf,struct zoneinfo_node * node)1617 static int zoneinfo_parse_node(char **buf, struct zoneinfo_node *node) {
1618     int fields_to_match = ZI_NODE_FIELD_COUNT;
1619 
1620     for (char *line = strtok_r(NULL, "\n", buf); line;
1621          line = strtok_r(NULL, "\n", buf)) {
1622         char *cp;
1623         char *ap;
1624         char *save_ptr;
1625         int64_t val;
1626         int field_idx;
1627         enum field_match_result match_res;
1628 
1629         cp = strtok_r(line, " ", &save_ptr);
1630         if (!cp) {
1631             return false;
1632         }
1633 
1634         ap = strtok_r(NULL, " ", &save_ptr);
1635         if (!ap) {
1636             return false;
1637         }
1638 
1639         match_res = match_field(cp, ap, zoneinfo_node_field_names, ZI_NODE_FIELD_COUNT,
1640             &val, &field_idx);
1641         if (match_res == PARSE_FAIL) {
1642             return false;
1643         }
1644         if (match_res == PARSE_SUCCESS) {
1645             node->fields.arr[field_idx] = val;
1646             fields_to_match--;
1647             if (!fields_to_match) {
1648                 return true;
1649             }
1650         }
1651     }
1652     return false;
1653 }
1654 
zoneinfo_parse(struct zoneinfo * zi)1655 static int zoneinfo_parse(struct zoneinfo *zi) {
1656     static struct reread_data file_data = {
1657         .filename = ZONEINFO_PATH,
1658         .fd = -1,
1659     };
1660     char *buf;
1661     char *save_ptr;
1662     char *line;
1663     char zone_name[LINE_MAX + 1];
1664     struct zoneinfo_node *node = NULL;
1665     int node_idx = 0;
1666     int zone_idx = 0;
1667 
1668     memset(zi, 0, sizeof(struct zoneinfo));
1669 
1670     if ((buf = reread_file(&file_data)) == NULL) {
1671         return -1;
1672     }
1673 
1674     for (line = strtok_r(buf, "\n", &save_ptr); line;
1675          line = strtok_r(NULL, "\n", &save_ptr)) {
1676         int node_id;
1677         if (sscanf(line, "Node %d, zone %" STRINGIFY(LINE_MAX) "s", &node_id, zone_name) == 2) {
1678             if (!node || node->id != node_id) {
1679                 /* new node is found */
1680                 if (node) {
1681                     node->zone_count = zone_idx + 1;
1682                     node_idx++;
1683                     if (node_idx == MAX_NR_NODES) {
1684                         /* max node count exceeded */
1685                         ALOGE("%s parse error", file_data.filename);
1686                         return -1;
1687                     }
1688                 }
1689                 node = &zi->nodes[node_idx];
1690                 node->id = node_id;
1691                 zone_idx = 0;
1692                 if (!zoneinfo_parse_node(&save_ptr, node)) {
1693                     ALOGE("%s parse error", file_data.filename);
1694                     return -1;
1695                 }
1696             } else {
1697                 /* new zone is found */
1698                 zone_idx++;
1699             }
1700             if (!zoneinfo_parse_zone(&save_ptr, &node->zones[zone_idx])) {
1701                 ALOGE("%s parse error", file_data.filename);
1702                 return -1;
1703             }
1704         }
1705     }
1706     if (!node) {
1707         ALOGE("%s parse error", file_data.filename);
1708         return -1;
1709     }
1710     node->zone_count = zone_idx + 1;
1711     zi->node_count = node_idx + 1;
1712 
1713     /* calculate totals fields */
1714     for (node_idx = 0; node_idx < zi->node_count; node_idx++) {
1715         node = &zi->nodes[node_idx];
1716         for (zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
1717             struct zoneinfo_zone *zone = &zi->nodes[node_idx].zones[zone_idx];
1718             zi->totalreserve_pages += zone->max_protection + zone->fields.field.high;
1719         }
1720         zi->total_inactive_file += node->fields.field.nr_inactive_file;
1721         zi->total_active_file += node->fields.field.nr_active_file;
1722         zi->total_workingset_refault += node->fields.field.workingset_refault;
1723     }
1724     return 0;
1725 }
1726 
1727 /* /proc/meminfo parsing routines */
meminfo_parse_line(char * line,union meminfo * mi)1728 static bool meminfo_parse_line(char *line, union meminfo *mi) {
1729     char *cp = line;
1730     char *ap;
1731     char *save_ptr;
1732     int64_t val;
1733     int field_idx;
1734     enum field_match_result match_res;
1735 
1736     cp = strtok_r(line, " ", &save_ptr);
1737     if (!cp) {
1738         return false;
1739     }
1740 
1741     ap = strtok_r(NULL, " ", &save_ptr);
1742     if (!ap) {
1743         return false;
1744     }
1745 
1746     match_res = match_field(cp, ap, meminfo_field_names, MI_FIELD_COUNT,
1747         &val, &field_idx);
1748     if (match_res == PARSE_SUCCESS) {
1749         mi->arr[field_idx] = val / page_k;
1750     }
1751     return (match_res != PARSE_FAIL);
1752 }
1753 
meminfo_parse(union meminfo * mi)1754 static int meminfo_parse(union meminfo *mi) {
1755     static struct reread_data file_data = {
1756         .filename = MEMINFO_PATH,
1757         .fd = -1,
1758     };
1759     char *buf;
1760     char *save_ptr;
1761     char *line;
1762 
1763     memset(mi, 0, sizeof(union meminfo));
1764 
1765     if ((buf = reread_file(&file_data)) == NULL) {
1766         return -1;
1767     }
1768 
1769     for (line = strtok_r(buf, "\n", &save_ptr); line;
1770          line = strtok_r(NULL, "\n", &save_ptr)) {
1771         if (!meminfo_parse_line(line, mi)) {
1772             ALOGE("%s parse error", file_data.filename);
1773             return -1;
1774         }
1775     }
1776     mi->field.nr_file_pages = mi->field.cached + mi->field.swap_cached +
1777         mi->field.buffers;
1778 
1779     return 0;
1780 }
1781 
1782 /* /proc/vmstat parsing routines */
vmstat_parse_line(char * line,union vmstat * vs)1783 static bool vmstat_parse_line(char *line, union vmstat *vs) {
1784     char *cp;
1785     char *ap;
1786     char *save_ptr;
1787     int64_t val;
1788     int field_idx;
1789     enum field_match_result match_res;
1790 
1791     cp = strtok_r(line, " ", &save_ptr);
1792     if (!cp) {
1793         return false;
1794     }
1795 
1796     ap = strtok_r(NULL, " ", &save_ptr);
1797     if (!ap) {
1798         return false;
1799     }
1800 
1801     match_res = match_field(cp, ap, vmstat_field_names, VS_FIELD_COUNT,
1802         &val, &field_idx);
1803     if (match_res == PARSE_SUCCESS) {
1804         vs->arr[field_idx] = val;
1805     }
1806     return (match_res != PARSE_FAIL);
1807 }
1808 
vmstat_parse(union vmstat * vs)1809 static int vmstat_parse(union vmstat *vs) {
1810     static struct reread_data file_data = {
1811         .filename = VMSTAT_PATH,
1812         .fd = -1,
1813     };
1814     char *buf;
1815     char *save_ptr;
1816     char *line;
1817 
1818     memset(vs, 0, sizeof(union vmstat));
1819 
1820     if ((buf = reread_file(&file_data)) == NULL) {
1821         return -1;
1822     }
1823 
1824     for (line = strtok_r(buf, "\n", &save_ptr); line;
1825          line = strtok_r(NULL, "\n", &save_ptr)) {
1826         if (!vmstat_parse_line(line, vs)) {
1827             ALOGE("%s parse error", file_data.filename);
1828             return -1;
1829         }
1830     }
1831 
1832     return 0;
1833 }
1834 
1835 enum wakeup_reason {
1836     Event,
1837     Polling
1838 };
1839 
1840 struct wakeup_info {
1841     struct timespec wakeup_tm;
1842     struct timespec prev_wakeup_tm;
1843     struct timespec last_event_tm;
1844     int wakeups_since_event;
1845     int skipped_wakeups;
1846 };
1847 
1848 /*
1849  * After the initial memory pressure event is received lmkd schedules periodic wakeups to check
1850  * the memory conditions and kill if needed (polling). This is done because pressure events are
1851  * rate-limited and memory conditions can change in between events. Therefore after the initial
1852  * event there might be multiple wakeups. This function records the wakeup information such as the
1853  * timestamps of the last event and the last wakeup, the number of wakeups since the last event
1854  * and how many of those wakeups were skipped (some wakeups are skipped if previously killed
1855  * process is still freeing its memory).
1856  */
record_wakeup_time(struct timespec * tm,enum wakeup_reason reason,struct wakeup_info * wi)1857 static void record_wakeup_time(struct timespec *tm, enum wakeup_reason reason,
1858                                struct wakeup_info *wi) {
1859     wi->prev_wakeup_tm = wi->wakeup_tm;
1860     wi->wakeup_tm = *tm;
1861     if (reason == Event) {
1862         wi->last_event_tm = *tm;
1863         wi->wakeups_since_event = 0;
1864         wi->skipped_wakeups = 0;
1865     } else {
1866         wi->wakeups_since_event++;
1867     }
1868 }
1869 
killinfo_log(struct proc * procp,int min_oom_score,int tasksize,int kill_reason,union meminfo * mi,struct wakeup_info * wi,struct timespec * tm)1870 static void killinfo_log(struct proc* procp, int min_oom_score, int tasksize,
1871                          int kill_reason, union meminfo *mi,
1872                          struct wakeup_info *wi, struct timespec *tm) {
1873     /* log process information */
1874     android_log_write_int32(ctx, procp->pid);
1875     android_log_write_int32(ctx, procp->uid);
1876     android_log_write_int32(ctx, procp->oomadj);
1877     android_log_write_int32(ctx, min_oom_score);
1878     android_log_write_int32(ctx, (int32_t)min(tasksize * page_k, INT32_MAX));
1879     android_log_write_int32(ctx, kill_reason);
1880 
1881     /* log meminfo fields */
1882     for (int field_idx = 0; field_idx < MI_FIELD_COUNT; field_idx++) {
1883         android_log_write_int32(ctx, (int32_t)min(mi->arr[field_idx] * page_k, INT32_MAX));
1884     }
1885 
1886     /* log lmkd wakeup information */
1887     android_log_write_int32(ctx, (int32_t)get_time_diff_ms(&wi->last_event_tm, tm));
1888     android_log_write_int32(ctx, (int32_t)get_time_diff_ms(&wi->prev_wakeup_tm, tm));
1889     android_log_write_int32(ctx, wi->wakeups_since_event);
1890     android_log_write_int32(ctx, wi->skipped_wakeups);
1891 
1892     android_log_write_list(ctx, LOG_ID_EVENTS);
1893     android_log_reset(ctx);
1894 }
1895 
proc_adj_lru(int oomadj)1896 static struct proc *proc_adj_lru(int oomadj) {
1897     return (struct proc *)adjslot_tail(&procadjslot_list[ADJTOSLOT(oomadj)]);
1898 }
1899 
proc_get_heaviest(int oomadj)1900 static struct proc *proc_get_heaviest(int oomadj) {
1901     struct adjslot_list *head = &procadjslot_list[ADJTOSLOT(oomadj)];
1902     struct adjslot_list *curr = head->next;
1903     struct proc *maxprocp = NULL;
1904     int maxsize = 0;
1905     while (curr != head) {
1906         int pid = ((struct proc *)curr)->pid;
1907         int tasksize = proc_get_size(pid);
1908         if (tasksize <= 0) {
1909             struct adjslot_list *next = curr->next;
1910             pid_remove(pid);
1911             curr = next;
1912         } else {
1913             if (tasksize > maxsize) {
1914                 maxsize = tasksize;
1915                 maxprocp = (struct proc *)curr;
1916             }
1917             curr = curr->next;
1918         }
1919     }
1920     return maxprocp;
1921 }
1922 
set_process_group_and_prio(int pid,SchedPolicy sp,int prio)1923 static void set_process_group_and_prio(int pid, SchedPolicy sp, int prio) {
1924     DIR* d;
1925     char proc_path[PATH_MAX];
1926     struct dirent* de;
1927 
1928     snprintf(proc_path, sizeof(proc_path), "/proc/%d/task", pid);
1929     if (!(d = opendir(proc_path))) {
1930         ALOGW("Failed to open %s; errno=%d: process pid(%d) might have died", proc_path, errno,
1931               pid);
1932         return;
1933     }
1934 
1935     while ((de = readdir(d))) {
1936         int t_pid;
1937 
1938         if (de->d_name[0] == '.') continue;
1939         t_pid = atoi(de->d_name);
1940 
1941         if (!t_pid) {
1942             ALOGW("Failed to get t_pid for '%s' of pid(%d)", de->d_name, pid);
1943             continue;
1944         }
1945 
1946         if (setpriority(PRIO_PROCESS, t_pid, prio) && errno != ESRCH) {
1947             ALOGW("Unable to raise priority of killing t_pid (%d): errno=%d", t_pid, errno);
1948         }
1949 
1950         if (set_cpuset_policy(t_pid, sp)) {
1951             ALOGW("Failed to set_cpuset_policy on pid(%d) t_pid(%d) to %d", pid, t_pid, (int)sp);
1952             continue;
1953         }
1954     }
1955     closedir(d);
1956 }
1957 
is_kill_pending(void)1958 static bool is_kill_pending(void) {
1959     char buf[24];
1960 
1961     if (last_kill_pid_or_fd < 0) {
1962         return false;
1963     }
1964 
1965     if (pidfd_supported) {
1966         return true;
1967     }
1968 
1969     /* when pidfd is not supported base the decision on /proc/<pid> existence */
1970     snprintf(buf, sizeof(buf), "/proc/%d/", last_kill_pid_or_fd);
1971     if (access(buf, F_OK) == 0) {
1972         return true;
1973     }
1974 
1975     return false;
1976 }
1977 
is_waiting_for_kill(void)1978 static bool is_waiting_for_kill(void) {
1979     return pidfd_supported && last_kill_pid_or_fd >= 0;
1980 }
1981 
stop_wait_for_proc_kill(bool finished)1982 static void stop_wait_for_proc_kill(bool finished) {
1983     struct epoll_event epev;
1984 
1985     if (last_kill_pid_or_fd < 0) {
1986         return;
1987     }
1988 
1989     if (debug_process_killing) {
1990         struct timespec curr_tm;
1991 
1992         if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
1993             /*
1994              * curr_tm is used here merely to report kill duration, so this failure is not fatal.
1995              * Log an error and continue.
1996              */
1997             ALOGE("Failed to get current time");
1998         }
1999 
2000         if (finished) {
2001             ALOGI("Process got killed in %ldms",
2002                 get_time_diff_ms(&last_kill_tm, &curr_tm));
2003         } else {
2004             ALOGI("Stop waiting for process kill after %ldms",
2005                 get_time_diff_ms(&last_kill_tm, &curr_tm));
2006         }
2007     }
2008 
2009     if (pidfd_supported) {
2010         /* unregister fd */
2011         if (epoll_ctl(epollfd, EPOLL_CTL_DEL, last_kill_pid_or_fd, &epev)) {
2012             // Log an error and keep going
2013             ALOGE("epoll_ctl for last killed process failed; errno=%d", errno);
2014         }
2015         maxevents--;
2016         close(last_kill_pid_or_fd);
2017     }
2018 
2019     last_kill_pid_or_fd = -1;
2020 }
2021 
kill_done_handler(int data __unused,uint32_t events __unused,struct polling_params * poll_params)2022 static void kill_done_handler(int data __unused, uint32_t events __unused,
2023                               struct polling_params *poll_params) {
2024     stop_wait_for_proc_kill(true);
2025     poll_params->update = POLLING_RESUME;
2026 }
2027 
start_wait_for_proc_kill(int pid_or_fd)2028 static void start_wait_for_proc_kill(int pid_or_fd) {
2029     static struct event_handler_info kill_done_hinfo = { 0, kill_done_handler };
2030     struct epoll_event epev;
2031 
2032     if (last_kill_pid_or_fd >= 0) {
2033         /* Should not happen but if it does we should stop previous wait */
2034         ALOGE("Attempt to wait for a kill while another wait is in progress");
2035         stop_wait_for_proc_kill(false);
2036     }
2037 
2038     last_kill_pid_or_fd = pid_or_fd;
2039 
2040     if (!pidfd_supported) {
2041         /* If pidfd is not supported just store PID and exit */
2042         return;
2043     }
2044 
2045     epev.events = EPOLLIN;
2046     epev.data.ptr = (void *)&kill_done_hinfo;
2047     if (epoll_ctl(epollfd, EPOLL_CTL_ADD, last_kill_pid_or_fd, &epev) != 0) {
2048         ALOGE("epoll_ctl for last kill failed; errno=%d", errno);
2049         close(last_kill_pid_or_fd);
2050         last_kill_pid_or_fd = -1;
2051         return;
2052     }
2053     maxevents++;
2054 }
2055 
2056 /* Kill one process specified by procp.  Returns the size of the process killed */
kill_one_process(struct proc * procp,int min_oom_score,enum kill_reasons kill_reason,const char * kill_desc,union meminfo * mi,struct wakeup_info * wi,struct timespec * tm)2057 static int kill_one_process(struct proc* procp, int min_oom_score, enum kill_reasons kill_reason,
2058                             const char *kill_desc, union meminfo *mi, struct wakeup_info *wi,
2059                             struct timespec *tm) {
2060     int pid = procp->pid;
2061     int pidfd = procp->pidfd;
2062     uid_t uid = procp->uid;
2063     int tgid;
2064     char *taskname;
2065     int tasksize;
2066     int r;
2067     int result = -1;
2068     struct memory_stat *mem_st;
2069     char buf[LINE_MAX];
2070     struct kill_stat kill_st;
2071 
2072     tgid = proc_get_tgid(pid);
2073     if (tgid >= 0 && tgid != pid) {
2074         ALOGE("Possible pid reuse detected (pid %d, tgid %d)!", pid, tgid);
2075         goto out;
2076     }
2077 
2078     taskname = proc_get_name(pid, buf, sizeof(buf));
2079     if (!taskname) {
2080         goto out;
2081     }
2082 
2083     tasksize = proc_get_size(pid);
2084     if (tasksize <= 0) {
2085         goto out;
2086     }
2087 
2088     mem_st = stats_read_memory_stat(per_app_memcg, pid, uid);
2089 
2090     TRACE_KILL_START(pid);
2091 
2092     /* CAP_KILL required */
2093     if (pidfd < 0) {
2094         start_wait_for_proc_kill(pid);
2095         r = kill(pid, SIGKILL);
2096     } else {
2097         start_wait_for_proc_kill(pidfd);
2098         r = sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0);
2099     }
2100 
2101     TRACE_KILL_END();
2102 
2103     if (r) {
2104         stop_wait_for_proc_kill(false);
2105         ALOGE("kill(%d): errno=%d", pid, errno);
2106         /* Delete process record even when we fail to kill so that we don't get stuck on it */
2107         goto out;
2108     }
2109 
2110     set_process_group_and_prio(pid, SP_FOREGROUND, ANDROID_PRIORITY_HIGHEST);
2111 
2112     last_kill_tm = *tm;
2113 
2114     inc_killcnt(procp->oomadj);
2115 
2116     killinfo_log(procp, min_oom_score, tasksize, kill_reason, mi, wi, tm);
2117 
2118     if (kill_desc) {
2119         ALOGI("Kill '%s' (%d), uid %d, oom_adj %d to free %ldkB; reason: %s", taskname, pid,
2120               uid, procp->oomadj, tasksize * page_k, kill_desc);
2121     } else {
2122         ALOGI("Kill '%s' (%d), uid %d, oom_adj %d to free %ldkB", taskname, pid,
2123               uid, procp->oomadj, tasksize * page_k);
2124     }
2125 
2126     kill_st.uid = static_cast<int32_t>(uid);
2127     kill_st.taskname = taskname;
2128     kill_st.kill_reason = kill_reason;
2129     kill_st.oom_score = procp->oomadj;
2130     kill_st.min_oom_score = min_oom_score;
2131     kill_st.free_mem_kb = mi->field.nr_free_pages * page_k;
2132     kill_st.free_swap_kb = mi->field.free_swap * page_k;
2133     kill_st.tasksize = tasksize;
2134     stats_write_lmk_kill_occurred(&kill_st, mem_st);
2135 
2136     ctrl_data_write_lmk_kill_occurred((pid_t)pid, uid);
2137 
2138     result = tasksize;
2139 
2140 out:
2141     /*
2142      * WARNING: After pid_remove() procp is freed and can't be used!
2143      * Therefore placed at the end of the function.
2144      */
2145     pid_remove(pid);
2146     return result;
2147 }
2148 
2149 /*
2150  * Find one process to kill at or above the given oom_adj level.
2151  * Returns size of the killed process.
2152  */
find_and_kill_process(int min_score_adj,enum kill_reasons kill_reason,const char * kill_desc,union meminfo * mi,struct wakeup_info * wi,struct timespec * tm)2153 static int find_and_kill_process(int min_score_adj, enum kill_reasons kill_reason,
2154                                  const char *kill_desc, union meminfo *mi,
2155                                  struct wakeup_info *wi, struct timespec *tm) {
2156     int i;
2157     int killed_size = 0;
2158     bool lmk_state_change_start = false;
2159     bool choose_heaviest_task = kill_heaviest_task;
2160 
2161     for (i = OOM_SCORE_ADJ_MAX; i >= min_score_adj; i--) {
2162         struct proc *procp;
2163 
2164         if (!choose_heaviest_task && i <= PERCEPTIBLE_APP_ADJ) {
2165             /*
2166              * If we have to choose a perceptible process, choose the heaviest one to
2167              * hopefully minimize the number of victims.
2168              */
2169             choose_heaviest_task = true;
2170         }
2171 
2172         while (true) {
2173             procp = choose_heaviest_task ?
2174                 proc_get_heaviest(i) : proc_adj_lru(i);
2175 
2176             if (!procp)
2177                 break;
2178 
2179             killed_size = kill_one_process(procp, min_score_adj, kill_reason, kill_desc,
2180                                            mi, wi, tm);
2181             if (killed_size >= 0) {
2182                 if (!lmk_state_change_start) {
2183                     lmk_state_change_start = true;
2184                     stats_write_lmk_state_changed(
2185                             android::lmkd::stats::LMK_STATE_CHANGED__STATE__START);
2186                 }
2187                 break;
2188             }
2189         }
2190         if (killed_size) {
2191             break;
2192         }
2193     }
2194 
2195     if (lmk_state_change_start) {
2196         stats_write_lmk_state_changed(android::lmkd::stats::LMK_STATE_CHANGED__STATE__STOP);
2197     }
2198 
2199     return killed_size;
2200 }
2201 
get_memory_usage(struct reread_data * file_data)2202 static int64_t get_memory_usage(struct reread_data *file_data) {
2203     int64_t mem_usage;
2204     char *buf;
2205 
2206     if ((buf = reread_file(file_data)) == NULL) {
2207         return -1;
2208     }
2209 
2210     if (!parse_int64(buf, &mem_usage)) {
2211         ALOGE("%s parse error", file_data->filename);
2212         return -1;
2213     }
2214     if (mem_usage == 0) {
2215         ALOGE("No memory!");
2216         return -1;
2217     }
2218     return mem_usage;
2219 }
2220 
record_low_pressure_levels(union meminfo * mi)2221 void record_low_pressure_levels(union meminfo *mi) {
2222     if (low_pressure_mem.min_nr_free_pages == -1 ||
2223         low_pressure_mem.min_nr_free_pages > mi->field.nr_free_pages) {
2224         if (debug_process_killing) {
2225             ALOGI("Low pressure min memory update from %" PRId64 " to %" PRId64,
2226                 low_pressure_mem.min_nr_free_pages, mi->field.nr_free_pages);
2227         }
2228         low_pressure_mem.min_nr_free_pages = mi->field.nr_free_pages;
2229     }
2230     /*
2231      * Free memory at low vmpressure events occasionally gets spikes,
2232      * possibly a stale low vmpressure event with memory already
2233      * freed up (no memory pressure should have been reported).
2234      * Ignore large jumps in max_nr_free_pages that would mess up our stats.
2235      */
2236     if (low_pressure_mem.max_nr_free_pages == -1 ||
2237         (low_pressure_mem.max_nr_free_pages < mi->field.nr_free_pages &&
2238          mi->field.nr_free_pages - low_pressure_mem.max_nr_free_pages <
2239          low_pressure_mem.max_nr_free_pages * 0.1)) {
2240         if (debug_process_killing) {
2241             ALOGI("Low pressure max memory update from %" PRId64 " to %" PRId64,
2242                 low_pressure_mem.max_nr_free_pages, mi->field.nr_free_pages);
2243         }
2244         low_pressure_mem.max_nr_free_pages = mi->field.nr_free_pages;
2245     }
2246 }
2247 
upgrade_level(enum vmpressure_level level)2248 enum vmpressure_level upgrade_level(enum vmpressure_level level) {
2249     return (enum vmpressure_level)((level < VMPRESS_LEVEL_CRITICAL) ?
2250         level + 1 : level);
2251 }
2252 
downgrade_level(enum vmpressure_level level)2253 enum vmpressure_level downgrade_level(enum vmpressure_level level) {
2254     return (enum vmpressure_level)((level > VMPRESS_LEVEL_LOW) ?
2255         level - 1 : level);
2256 }
2257 
2258 enum zone_watermark {
2259     WMARK_MIN = 0,
2260     WMARK_LOW,
2261     WMARK_HIGH,
2262     WMARK_NONE
2263 };
2264 
2265 struct zone_watermarks {
2266     long high_wmark;
2267     long low_wmark;
2268     long min_wmark;
2269 };
2270 
2271 /*
2272  * Returns lowest breached watermark or WMARK_NONE.
2273  */
get_lowest_watermark(union meminfo * mi,struct zone_watermarks * watermarks)2274 static enum zone_watermark get_lowest_watermark(union meminfo *mi,
2275                                                 struct zone_watermarks *watermarks)
2276 {
2277     int64_t nr_free_pages = mi->field.nr_free_pages - mi->field.cma_free;
2278 
2279     if (nr_free_pages < watermarks->min_wmark) {
2280         return WMARK_MIN;
2281     }
2282     if (nr_free_pages < watermarks->low_wmark) {
2283         return WMARK_LOW;
2284     }
2285     if (nr_free_pages < watermarks->high_wmark) {
2286         return WMARK_HIGH;
2287     }
2288     return WMARK_NONE;
2289 }
2290 
calc_zone_watermarks(struct zoneinfo * zi,struct zone_watermarks * watermarks)2291 void calc_zone_watermarks(struct zoneinfo *zi, struct zone_watermarks *watermarks) {
2292     memset(watermarks, 0, sizeof(struct zone_watermarks));
2293 
2294     for (int node_idx = 0; node_idx < zi->node_count; node_idx++) {
2295         struct zoneinfo_node *node = &zi->nodes[node_idx];
2296         for (int zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
2297             struct zoneinfo_zone *zone = &node->zones[zone_idx];
2298 
2299             if (!zone->fields.field.present) {
2300                 continue;
2301             }
2302 
2303             watermarks->high_wmark += zone->max_protection + zone->fields.field.high;
2304             watermarks->low_wmark += zone->max_protection + zone->fields.field.low;
2305             watermarks->min_wmark += zone->max_protection + zone->fields.field.min;
2306         }
2307     }
2308 }
2309 
mp_event_psi(int data,uint32_t events,struct polling_params * poll_params)2310 static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_params) {
2311     enum reclaim_state {
2312         NO_RECLAIM = 0,
2313         KSWAPD_RECLAIM,
2314         DIRECT_RECLAIM,
2315     };
2316     static int64_t init_ws_refault;
2317     static int64_t prev_workingset_refault;
2318     static int64_t base_file_lru;
2319     static int64_t init_pgscan_kswapd;
2320     static int64_t init_pgscan_direct;
2321     static int64_t swap_low_threshold;
2322     static bool killing;
2323     static int thrashing_limit = thrashing_limit_pct;
2324     static struct zone_watermarks watermarks;
2325     static struct timespec wmark_update_tm;
2326     static struct wakeup_info wi;
2327     static struct timespec thrashing_reset_tm;
2328     static int64_t prev_thrash_growth = 0;
2329 
2330     union meminfo mi;
2331     union vmstat vs;
2332     struct timespec curr_tm;
2333     int64_t thrashing = 0;
2334     bool swap_is_low = false;
2335     enum vmpressure_level level = (enum vmpressure_level)data;
2336     enum kill_reasons kill_reason = NONE;
2337     bool cycle_after_kill = false;
2338     enum reclaim_state reclaim = NO_RECLAIM;
2339     enum zone_watermark wmark = WMARK_NONE;
2340     char kill_desc[LINE_MAX];
2341     bool cut_thrashing_limit = false;
2342     int min_score_adj = 0;
2343     long since_thrashing_reset_ms;
2344 
2345     if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
2346         ALOGE("Failed to get current time");
2347         return;
2348     }
2349 
2350     record_wakeup_time(&curr_tm, events ? Event : Polling, &wi);
2351 
2352     bool kill_pending = is_kill_pending();
2353     if (kill_pending && (kill_timeout_ms == 0 ||
2354         get_time_diff_ms(&last_kill_tm, &curr_tm) < static_cast<long>(kill_timeout_ms))) {
2355         /* Skip while still killing a process */
2356         wi.skipped_wakeups++;
2357         goto no_kill;
2358     }
2359     /*
2360      * Process is dead or kill timeout is over, stop waiting. This has no effect if pidfds are
2361      * supported and death notification already caused waiting to stop.
2362      */
2363     stop_wait_for_proc_kill(!kill_pending);
2364 
2365     if (vmstat_parse(&vs) < 0) {
2366         ALOGE("Failed to parse vmstat!");
2367         return;
2368     }
2369 
2370     if (meminfo_parse(&mi) < 0) {
2371         ALOGE("Failed to parse meminfo!");
2372         return;
2373     }
2374 
2375     /* Reset states after process got killed */
2376     if (killing) {
2377         killing = false;
2378         cycle_after_kill = true;
2379         /* Reset file-backed pagecache size and refault amounts after a kill */
2380         base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
2381         init_ws_refault = vs.field.workingset_refault;
2382         thrashing_reset_tm = curr_tm;
2383         prev_thrash_growth = 0;
2384     }
2385 
2386     /* Check free swap levels */
2387     if (swap_free_low_percentage) {
2388         if (!swap_low_threshold) {
2389             swap_low_threshold = mi.field.total_swap * swap_free_low_percentage / 100;
2390         }
2391         swap_is_low = mi.field.free_swap < swap_low_threshold;
2392     }
2393 
2394     /* Identify reclaim state */
2395     if (vs.field.pgscan_direct > init_pgscan_direct) {
2396         init_pgscan_direct = vs.field.pgscan_direct;
2397         init_pgscan_kswapd = vs.field.pgscan_kswapd;
2398         reclaim = DIRECT_RECLAIM;
2399     } else if (vs.field.pgscan_kswapd > init_pgscan_kswapd) {
2400         init_pgscan_kswapd = vs.field.pgscan_kswapd;
2401         reclaim = KSWAPD_RECLAIM;
2402     } else if (vs.field.workingset_refault == prev_workingset_refault) {
2403         /* Device is not thrashing and not reclaiming, bail out early until we see these stats changing*/
2404         goto no_kill;
2405     }
2406 
2407     prev_workingset_refault = vs.field.workingset_refault;
2408 
2409      /*
2410      * It's possible we fail to find an eligible process to kill (ex. no process is
2411      * above oom_adj_min). When this happens, we should retry to find a new process
2412      * for a kill whenever a new eligible process is available. This is especially
2413      * important for a slow growing refault case. While retrying, we should keep
2414      * monitoring new thrashing counter as someone could release the memory to mitigate
2415      * the thrashing. Thus, when thrashing reset window comes, we decay the prev thrashing
2416      * counter by window counts. if the counter is still greater than thrashing limit,
2417      * we preserve the current prev_thrash counter so we will retry kill again. Otherwise,
2418      * we reset the prev_thrash counter so we will stop retrying.
2419      */
2420     since_thrashing_reset_ms = get_time_diff_ms(&thrashing_reset_tm, &curr_tm);
2421     if (since_thrashing_reset_ms > THRASHING_RESET_INTERVAL_MS) {
2422         long windows_passed;
2423         /* Calculate prev_thrash_growth if we crossed THRASHING_RESET_INTERVAL_MS */
2424         prev_thrash_growth = (vs.field.workingset_refault - init_ws_refault) * 100
2425                             / (base_file_lru + 1);
2426         windows_passed = (since_thrashing_reset_ms / THRASHING_RESET_INTERVAL_MS);
2427         /*
2428          * Decay prev_thrashing unless over-the-limit thrashing was registered in the window we
2429          * just crossed, which means there were no eligible processes to kill. We preserve the
2430          * counter in that case to ensure a kill if a new eligible process appears.
2431          */
2432         if (windows_passed > 1 || prev_thrash_growth < thrashing_limit) {
2433             prev_thrash_growth >>= windows_passed;
2434         }
2435 
2436         /* Record file-backed pagecache size when crossing THRASHING_RESET_INTERVAL_MS */
2437         base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
2438         init_ws_refault = vs.field.workingset_refault;
2439         thrashing_reset_tm = curr_tm;
2440         thrashing_limit = thrashing_limit_pct;
2441     } else {
2442         /* Calculate what % of the file-backed pagecache refaulted so far */
2443         thrashing = (vs.field.workingset_refault - init_ws_refault) * 100 / (base_file_lru + 1);
2444     }
2445     /* Add previous cycle's decayed thrashing amount */
2446     thrashing += prev_thrash_growth;
2447 
2448     /*
2449      * Refresh watermarks once per min in case user updated one of the margins.
2450      * TODO: b/140521024 replace this periodic update with an API for AMS to notify LMKD
2451      * that zone watermarks were changed by the system software.
2452      */
2453     if (watermarks.high_wmark == 0 || get_time_diff_ms(&wmark_update_tm, &curr_tm) > 60000) {
2454         struct zoneinfo zi;
2455 
2456         if (zoneinfo_parse(&zi) < 0) {
2457             ALOGE("Failed to parse zoneinfo!");
2458             return;
2459         }
2460 
2461         calc_zone_watermarks(&zi, &watermarks);
2462         wmark_update_tm = curr_tm;
2463     }
2464 
2465     /* Find out which watermark is breached if any */
2466     wmark = get_lowest_watermark(&mi, &watermarks);
2467 
2468     /*
2469      * TODO: move this logic into a separate function
2470      * Decide if killing a process is necessary and record the reason
2471      */
2472     if (cycle_after_kill && wmark < WMARK_LOW) {
2473         /*
2474          * Prevent kills not freeing enough memory which might lead to OOM kill.
2475          * This might happen when a process is consuming memory faster than reclaim can
2476          * free even after a kill. Mostly happens when running memory stress tests.
2477          */
2478         kill_reason = PRESSURE_AFTER_KILL;
2479         strncpy(kill_desc, "min watermark is breached even after kill", sizeof(kill_desc));
2480     } else if (level == VMPRESS_LEVEL_CRITICAL && events != 0) {
2481         /*
2482          * Device is too busy reclaiming memory which might lead to ANR.
2483          * Critical level is triggered when PSI complete stall (all tasks are blocked because
2484          * of the memory congestion) breaches the configured threshold.
2485          */
2486         kill_reason = NOT_RESPONDING;
2487         strncpy(kill_desc, "device is not responding", sizeof(kill_desc));
2488     } else if (swap_is_low && thrashing > thrashing_limit_pct) {
2489         /* Page cache is thrashing while swap is low */
2490         kill_reason = LOW_SWAP_AND_THRASHING;
2491         snprintf(kill_desc, sizeof(kill_desc), "device is low on swap (%" PRId64
2492             "kB < %" PRId64 "kB) and thrashing (%" PRId64 "%%)",
2493             mi.field.free_swap * page_k, swap_low_threshold * page_k, thrashing);
2494         /* Do not kill perceptible apps unless below min watermark or heavily thrashing */
2495         if (wmark > WMARK_MIN && thrashing < thrashing_critical_pct) {
2496             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2497         }
2498     } else if (swap_is_low && wmark < WMARK_HIGH) {
2499         /* Both free memory and swap are low */
2500         kill_reason = LOW_MEM_AND_SWAP;
2501         snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and swap is low (%"
2502             PRId64 "kB < %" PRId64 "kB)", wmark > WMARK_LOW ? "min" : "low",
2503             mi.field.free_swap * page_k, swap_low_threshold * page_k);
2504         /* Do not kill perceptible apps unless below min watermark or heavily thrashing */
2505         if (wmark > WMARK_MIN && thrashing < thrashing_critical_pct) {
2506             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2507         }
2508     } else if (wmark < WMARK_HIGH && thrashing > thrashing_limit) {
2509         /* Page cache is thrashing while memory is low */
2510         kill_reason = LOW_MEM_AND_THRASHING;
2511         snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and thrashing (%"
2512             PRId64 "%%)", wmark > WMARK_LOW ? "min" : "low", thrashing);
2513         cut_thrashing_limit = true;
2514         /* Do not kill perceptible apps unless thrashing at critical levels */
2515         if (thrashing < thrashing_critical_pct) {
2516             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2517         }
2518     } else if (reclaim == DIRECT_RECLAIM && thrashing > thrashing_limit) {
2519         /* Page cache is thrashing while in direct reclaim (mostly happens on lowram devices) */
2520         kill_reason = DIRECT_RECL_AND_THRASHING;
2521         snprintf(kill_desc, sizeof(kill_desc), "device is in direct reclaim and thrashing (%"
2522             PRId64 "%%)", thrashing);
2523         cut_thrashing_limit = true;
2524         /* Do not kill perceptible apps unless thrashing at critical levels */
2525         if (thrashing < thrashing_critical_pct) {
2526             min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
2527         }
2528     }
2529 
2530     /* Kill a process if necessary */
2531     if (kill_reason != NONE) {
2532         int pages_freed = find_and_kill_process(min_score_adj, kill_reason, kill_desc, &mi,
2533                                                 &wi, &curr_tm);
2534         if (pages_freed > 0) {
2535             killing = true;
2536             if (cut_thrashing_limit) {
2537                 /*
2538                  * Cut thrasing limit by thrashing_limit_decay_pct percentage of the current
2539                  * thrashing limit until the system stops thrashing.
2540                  */
2541                 thrashing_limit = (thrashing_limit * (100 - thrashing_limit_decay_pct)) / 100;
2542             }
2543         }
2544     }
2545 
2546 no_kill:
2547     /* Do not poll if kernel supports pidfd waiting */
2548     if (is_waiting_for_kill()) {
2549         /* Pause polling if we are waiting for process death notification */
2550         poll_params->update = POLLING_PAUSE;
2551         return;
2552     }
2553 
2554     /*
2555      * Start polling after initial PSI event;
2556      * extend polling while device is in direct reclaim or process is being killed;
2557      * do not extend when kswapd reclaims because that might go on for a long time
2558      * without causing memory pressure
2559      */
2560     if (events || killing || reclaim == DIRECT_RECLAIM) {
2561         poll_params->update = POLLING_START;
2562     }
2563 
2564     /* Decide the polling interval */
2565     if (swap_is_low || killing) {
2566         /* Fast polling during and after a kill or when swap is low */
2567         poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
2568     } else {
2569         /* By default use long intervals */
2570         poll_params->polling_interval_ms = PSI_POLL_PERIOD_LONG_MS;
2571     }
2572 }
2573 
mp_event_common(int data,uint32_t events,struct polling_params * poll_params)2574 static void mp_event_common(int data, uint32_t events, struct polling_params *poll_params) {
2575     unsigned long long evcount;
2576     int64_t mem_usage, memsw_usage;
2577     int64_t mem_pressure;
2578     union meminfo mi;
2579     struct zoneinfo zi;
2580     struct timespec curr_tm;
2581     static unsigned long kill_skip_count = 0;
2582     enum vmpressure_level level = (enum vmpressure_level)data;
2583     long other_free = 0, other_file = 0;
2584     int min_score_adj;
2585     int minfree = 0;
2586     static struct reread_data mem_usage_file_data = {
2587         .filename = MEMCG_MEMORY_USAGE,
2588         .fd = -1,
2589     };
2590     static struct reread_data memsw_usage_file_data = {
2591         .filename = MEMCG_MEMORYSW_USAGE,
2592         .fd = -1,
2593     };
2594     static struct wakeup_info wi;
2595 
2596     if (debug_process_killing) {
2597         ALOGI("%s memory pressure event is triggered", level_name[level]);
2598     }
2599 
2600     if (!use_psi_monitors) {
2601         /*
2602          * Check all event counters from low to critical
2603          * and upgrade to the highest priority one. By reading
2604          * eventfd we also reset the event counters.
2605          */
2606         for (int lvl = VMPRESS_LEVEL_LOW; lvl < VMPRESS_LEVEL_COUNT; lvl++) {
2607             if (mpevfd[lvl] != -1 &&
2608                 TEMP_FAILURE_RETRY(read(mpevfd[lvl],
2609                                    &evcount, sizeof(evcount))) > 0 &&
2610                 evcount > 0 && lvl > level) {
2611                 level = static_cast<vmpressure_level>(lvl);
2612             }
2613         }
2614     }
2615 
2616     /* Start polling after initial PSI event */
2617     if (use_psi_monitors && events) {
2618         /* Override polling params only if current event is more critical */
2619         if (!poll_params->poll_handler || data > poll_params->poll_handler->data) {
2620             poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
2621             poll_params->update = POLLING_START;
2622         }
2623     }
2624 
2625     if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
2626         ALOGE("Failed to get current time");
2627         return;
2628     }
2629 
2630     record_wakeup_time(&curr_tm, events ? Event : Polling, &wi);
2631 
2632     if (kill_timeout_ms &&
2633         get_time_diff_ms(&last_kill_tm, &curr_tm) < static_cast<long>(kill_timeout_ms)) {
2634         /*
2635          * If we're within the no-kill timeout, see if there's pending reclaim work
2636          * from the last killed process. If so, skip killing for now.
2637          */
2638         if (is_kill_pending()) {
2639             kill_skip_count++;
2640             wi.skipped_wakeups++;
2641             return;
2642         }
2643         /*
2644          * Process is dead, stop waiting. This has no effect if pidfds are supported and
2645          * death notification already caused waiting to stop.
2646          */
2647         stop_wait_for_proc_kill(true);
2648     } else {
2649         /*
2650          * Killing took longer than no-kill timeout. Stop waiting for the last process
2651          * to die because we are ready to kill again.
2652          */
2653         stop_wait_for_proc_kill(false);
2654     }
2655 
2656     if (kill_skip_count > 0) {
2657         ALOGI("%lu memory pressure events were skipped after a kill!",
2658               kill_skip_count);
2659         kill_skip_count = 0;
2660     }
2661 
2662     if (meminfo_parse(&mi) < 0 || zoneinfo_parse(&zi) < 0) {
2663         ALOGE("Failed to get free memory!");
2664         return;
2665     }
2666 
2667     if (use_minfree_levels) {
2668         int i;
2669 
2670         other_free = mi.field.nr_free_pages - zi.totalreserve_pages;
2671         if (mi.field.nr_file_pages > (mi.field.shmem + mi.field.unevictable + mi.field.swap_cached)) {
2672             other_file = (mi.field.nr_file_pages - mi.field.shmem -
2673                           mi.field.unevictable - mi.field.swap_cached);
2674         } else {
2675             other_file = 0;
2676         }
2677 
2678         min_score_adj = OOM_SCORE_ADJ_MAX + 1;
2679         for (i = 0; i < lowmem_targets_size; i++) {
2680             minfree = lowmem_minfree[i];
2681             if (other_free < minfree && other_file < minfree) {
2682                 min_score_adj = lowmem_adj[i];
2683                 break;
2684             }
2685         }
2686 
2687         if (min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
2688             if (debug_process_killing) {
2689                 ALOGI("Ignore %s memory pressure event "
2690                       "(free memory=%ldkB, cache=%ldkB, limit=%ldkB)",
2691                       level_name[level], other_free * page_k, other_file * page_k,
2692                       (long)lowmem_minfree[lowmem_targets_size - 1] * page_k);
2693             }
2694             return;
2695         }
2696 
2697         goto do_kill;
2698     }
2699 
2700     if (level == VMPRESS_LEVEL_LOW) {
2701         record_low_pressure_levels(&mi);
2702     }
2703 
2704     if (level_oomadj[level] > OOM_SCORE_ADJ_MAX) {
2705         /* Do not monitor this pressure level */
2706         return;
2707     }
2708 
2709     if ((mem_usage = get_memory_usage(&mem_usage_file_data)) < 0) {
2710         goto do_kill;
2711     }
2712     if ((memsw_usage = get_memory_usage(&memsw_usage_file_data)) < 0) {
2713         goto do_kill;
2714     }
2715 
2716     // Calculate percent for swappinness.
2717     mem_pressure = (mem_usage * 100) / memsw_usage;
2718 
2719     if (enable_pressure_upgrade && level != VMPRESS_LEVEL_CRITICAL) {
2720         // We are swapping too much.
2721         if (mem_pressure < upgrade_pressure) {
2722             level = upgrade_level(level);
2723             if (debug_process_killing) {
2724                 ALOGI("Event upgraded to %s", level_name[level]);
2725             }
2726         }
2727     }
2728 
2729     // If we still have enough swap space available, check if we want to
2730     // ignore/downgrade pressure events.
2731     if (mi.field.free_swap >=
2732         mi.field.total_swap * swap_free_low_percentage / 100) {
2733         // If the pressure is larger than downgrade_pressure lmk will not
2734         // kill any process, since enough memory is available.
2735         if (mem_pressure > downgrade_pressure) {
2736             if (debug_process_killing) {
2737                 ALOGI("Ignore %s memory pressure", level_name[level]);
2738             }
2739             return;
2740         } else if (level == VMPRESS_LEVEL_CRITICAL && mem_pressure > upgrade_pressure) {
2741             if (debug_process_killing) {
2742                 ALOGI("Downgrade critical memory pressure");
2743             }
2744             // Downgrade event, since enough memory available.
2745             level = downgrade_level(level);
2746         }
2747     }
2748 
2749 do_kill:
2750     if (low_ram_device) {
2751         /* For Go devices kill only one task */
2752         if (find_and_kill_process(level_oomadj[level], NONE, NULL, &mi, &wi, &curr_tm) == 0) {
2753             if (debug_process_killing) {
2754                 ALOGI("Nothing to kill");
2755             }
2756         }
2757     } else {
2758         int pages_freed;
2759         static struct timespec last_report_tm;
2760         static unsigned long report_skip_count = 0;
2761 
2762         if (!use_minfree_levels) {
2763             /* Free up enough memory to downgrate the memory pressure to low level */
2764             if (mi.field.nr_free_pages >= low_pressure_mem.max_nr_free_pages) {
2765                 if (debug_process_killing) {
2766                     ALOGI("Ignoring pressure since more memory is "
2767                         "available (%" PRId64 ") than watermark (%" PRId64 ")",
2768                         mi.field.nr_free_pages, low_pressure_mem.max_nr_free_pages);
2769                 }
2770                 return;
2771             }
2772             min_score_adj = level_oomadj[level];
2773         }
2774 
2775         pages_freed = find_and_kill_process(min_score_adj, NONE, NULL, &mi, &wi, &curr_tm);
2776 
2777         if (pages_freed == 0) {
2778             /* Rate limit kill reports when nothing was reclaimed */
2779             if (get_time_diff_ms(&last_report_tm, &curr_tm) < FAIL_REPORT_RLIMIT_MS) {
2780                 report_skip_count++;
2781                 return;
2782             }
2783         }
2784 
2785         /* Log whenever we kill or when report rate limit allows */
2786         if (use_minfree_levels) {
2787             ALOGI("Reclaimed %ldkB, cache(%ldkB) and "
2788                 "free(%" PRId64 "kB)-reserved(%" PRId64 "kB) below min(%ldkB) for oom_adj %d",
2789                 pages_freed * page_k,
2790                 other_file * page_k, mi.field.nr_free_pages * page_k,
2791                 zi.totalreserve_pages * page_k,
2792                 minfree * page_k, min_score_adj);
2793         } else {
2794             ALOGI("Reclaimed %ldkB at oom_adj %d",
2795                 pages_freed * page_k, min_score_adj);
2796         }
2797 
2798         if (report_skip_count > 0) {
2799             ALOGI("Suppressed %lu failed kill reports", report_skip_count);
2800             report_skip_count = 0;
2801         }
2802 
2803         last_report_tm = curr_tm;
2804     }
2805     if (is_waiting_for_kill()) {
2806         /* pause polling if we are waiting for process death notification */
2807         poll_params->update = POLLING_PAUSE;
2808     }
2809 }
2810 
init_mp_psi(enum vmpressure_level level,bool use_new_strategy)2811 static bool init_mp_psi(enum vmpressure_level level, bool use_new_strategy) {
2812     int fd;
2813 
2814     /* Do not register a handler if threshold_ms is not set */
2815     if (!psi_thresholds[level].threshold_ms) {
2816         return true;
2817     }
2818 
2819     fd = init_psi_monitor(psi_thresholds[level].stall_type,
2820         psi_thresholds[level].threshold_ms * US_PER_MS,
2821         PSI_WINDOW_SIZE_MS * US_PER_MS);
2822 
2823     if (fd < 0) {
2824         return false;
2825     }
2826 
2827     vmpressure_hinfo[level].handler = use_new_strategy ? mp_event_psi : mp_event_common;
2828     vmpressure_hinfo[level].data = level;
2829     if (register_psi_monitor(epollfd, fd, &vmpressure_hinfo[level]) < 0) {
2830         destroy_psi_monitor(fd);
2831         return false;
2832     }
2833     maxevents++;
2834     mpevfd[level] = fd;
2835 
2836     return true;
2837 }
2838 
destroy_mp_psi(enum vmpressure_level level)2839 static void destroy_mp_psi(enum vmpressure_level level) {
2840     int fd = mpevfd[level];
2841 
2842     if (fd < 0) {
2843         return;
2844     }
2845 
2846     if (unregister_psi_monitor(epollfd, fd) < 0) {
2847         ALOGE("Failed to unregister psi monitor for %s memory pressure; errno=%d",
2848             level_name[level], errno);
2849     }
2850     maxevents--;
2851     destroy_psi_monitor(fd);
2852     mpevfd[level] = -1;
2853 }
2854 
init_psi_monitors()2855 static bool init_psi_monitors() {
2856     /*
2857      * When PSI is used on low-ram devices or on high-end devices without memfree levels
2858      * use new kill strategy based on zone watermarks, free swap and thrashing stats
2859      */
2860     bool use_new_strategy =
2861         property_get_bool("ro.lmk.use_new_strategy", low_ram_device || !use_minfree_levels);
2862 
2863     /* In default PSI mode override stall amounts using system properties */
2864     if (use_new_strategy) {
2865         /* Do not use low pressure level */
2866         psi_thresholds[VMPRESS_LEVEL_LOW].threshold_ms = 0;
2867         psi_thresholds[VMPRESS_LEVEL_MEDIUM].threshold_ms = psi_partial_stall_ms;
2868         psi_thresholds[VMPRESS_LEVEL_CRITICAL].threshold_ms = psi_complete_stall_ms;
2869     }
2870 
2871     if (!init_mp_psi(VMPRESS_LEVEL_LOW, use_new_strategy)) {
2872         return false;
2873     }
2874     if (!init_mp_psi(VMPRESS_LEVEL_MEDIUM, use_new_strategy)) {
2875         destroy_mp_psi(VMPRESS_LEVEL_LOW);
2876         return false;
2877     }
2878     if (!init_mp_psi(VMPRESS_LEVEL_CRITICAL, use_new_strategy)) {
2879         destroy_mp_psi(VMPRESS_LEVEL_MEDIUM);
2880         destroy_mp_psi(VMPRESS_LEVEL_LOW);
2881         return false;
2882     }
2883     return true;
2884 }
2885 
init_mp_common(enum vmpressure_level level)2886 static bool init_mp_common(enum vmpressure_level level) {
2887     int mpfd;
2888     int evfd;
2889     int evctlfd;
2890     char buf[256];
2891     struct epoll_event epev;
2892     int ret;
2893     int level_idx = (int)level;
2894     const char *levelstr = level_name[level_idx];
2895 
2896     /* gid containing AID_SYSTEM required */
2897     mpfd = open(MEMCG_SYSFS_PATH "memory.pressure_level", O_RDONLY | O_CLOEXEC);
2898     if (mpfd < 0) {
2899         ALOGI("No kernel memory.pressure_level support (errno=%d)", errno);
2900         goto err_open_mpfd;
2901     }
2902 
2903     evctlfd = open(MEMCG_SYSFS_PATH "cgroup.event_control", O_WRONLY | O_CLOEXEC);
2904     if (evctlfd < 0) {
2905         ALOGI("No kernel memory cgroup event control (errno=%d)", errno);
2906         goto err_open_evctlfd;
2907     }
2908 
2909     evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
2910     if (evfd < 0) {
2911         ALOGE("eventfd failed for level %s; errno=%d", levelstr, errno);
2912         goto err_eventfd;
2913     }
2914 
2915     ret = snprintf(buf, sizeof(buf), "%d %d %s", evfd, mpfd, levelstr);
2916     if (ret >= (ssize_t)sizeof(buf)) {
2917         ALOGE("cgroup.event_control line overflow for level %s", levelstr);
2918         goto err;
2919     }
2920 
2921     ret = TEMP_FAILURE_RETRY(write(evctlfd, buf, strlen(buf) + 1));
2922     if (ret == -1) {
2923         ALOGE("cgroup.event_control write failed for level %s; errno=%d",
2924               levelstr, errno);
2925         goto err;
2926     }
2927 
2928     epev.events = EPOLLIN;
2929     /* use data to store event level */
2930     vmpressure_hinfo[level_idx].data = level_idx;
2931     vmpressure_hinfo[level_idx].handler = mp_event_common;
2932     epev.data.ptr = (void *)&vmpressure_hinfo[level_idx];
2933     ret = epoll_ctl(epollfd, EPOLL_CTL_ADD, evfd, &epev);
2934     if (ret == -1) {
2935         ALOGE("epoll_ctl for level %s failed; errno=%d", levelstr, errno);
2936         goto err;
2937     }
2938     maxevents++;
2939     mpevfd[level] = evfd;
2940     close(evctlfd);
2941     return true;
2942 
2943 err:
2944     close(evfd);
2945 err_eventfd:
2946     close(evctlfd);
2947 err_open_evctlfd:
2948     close(mpfd);
2949 err_open_mpfd:
2950     return false;
2951 }
2952 
destroy_mp_common(enum vmpressure_level level)2953 static void destroy_mp_common(enum vmpressure_level level) {
2954     struct epoll_event epev;
2955     int fd = mpevfd[level];
2956 
2957     if (fd < 0) {
2958         return;
2959     }
2960 
2961     if (epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &epev)) {
2962         // Log an error and keep going
2963         ALOGE("epoll_ctl for level %s failed; errno=%d", level_name[level], errno);
2964     }
2965     maxevents--;
2966     close(fd);
2967     mpevfd[level] = -1;
2968 }
2969 
kernel_event_handler(int data __unused,uint32_t events __unused,struct polling_params * poll_params __unused)2970 static void kernel_event_handler(int data __unused, uint32_t events __unused,
2971                                  struct polling_params *poll_params __unused) {
2972     poll_kernel(kpoll_fd);
2973 }
2974 
init_monitors()2975 static bool init_monitors() {
2976     /* Try to use psi monitor first if kernel has it */
2977     use_psi_monitors = property_get_bool("ro.lmk.use_psi", true) &&
2978         init_psi_monitors();
2979     /* Fall back to vmpressure */
2980     if (!use_psi_monitors &&
2981         (!init_mp_common(VMPRESS_LEVEL_LOW) ||
2982         !init_mp_common(VMPRESS_LEVEL_MEDIUM) ||
2983         !init_mp_common(VMPRESS_LEVEL_CRITICAL))) {
2984         ALOGE("Kernel does not support memory pressure events or in-kernel low memory killer");
2985         return false;
2986     }
2987     if (use_psi_monitors) {
2988         ALOGI("Using psi monitors for memory pressure detection");
2989     } else {
2990         ALOGI("Using vmpressure for memory pressure detection");
2991     }
2992     return true;
2993 }
2994 
destroy_monitors()2995 static void destroy_monitors() {
2996     if (use_psi_monitors) {
2997         destroy_mp_psi(VMPRESS_LEVEL_CRITICAL);
2998         destroy_mp_psi(VMPRESS_LEVEL_MEDIUM);
2999         destroy_mp_psi(VMPRESS_LEVEL_LOW);
3000     } else {
3001         destroy_mp_common(VMPRESS_LEVEL_CRITICAL);
3002         destroy_mp_common(VMPRESS_LEVEL_MEDIUM);
3003         destroy_mp_common(VMPRESS_LEVEL_LOW);
3004     }
3005 }
3006 
init(void)3007 static int init(void) {
3008     static struct event_handler_info kernel_poll_hinfo = { 0, kernel_event_handler };
3009     struct reread_data file_data = {
3010         .filename = ZONEINFO_PATH,
3011         .fd = -1,
3012     };
3013     struct epoll_event epev;
3014     int pidfd;
3015     int i;
3016     int ret;
3017 
3018     page_k = sysconf(_SC_PAGESIZE);
3019     if (page_k == -1)
3020         page_k = PAGE_SIZE;
3021     page_k /= 1024;
3022 
3023     epollfd = epoll_create(MAX_EPOLL_EVENTS);
3024     if (epollfd == -1) {
3025         ALOGE("epoll_create failed (errno=%d)", errno);
3026         return -1;
3027     }
3028 
3029     // mark data connections as not connected
3030     for (int i = 0; i < MAX_DATA_CONN; i++) {
3031         data_sock[i].sock = -1;
3032     }
3033 
3034     ctrl_sock.sock = android_get_control_socket("lmkd");
3035     if (ctrl_sock.sock < 0) {
3036         ALOGE("get lmkd control socket failed");
3037         return -1;
3038     }
3039 
3040     ret = listen(ctrl_sock.sock, MAX_DATA_CONN);
3041     if (ret < 0) {
3042         ALOGE("lmkd control socket listen failed (errno=%d)", errno);
3043         return -1;
3044     }
3045 
3046     epev.events = EPOLLIN;
3047     ctrl_sock.handler_info.handler = ctrl_connect_handler;
3048     epev.data.ptr = (void *)&(ctrl_sock.handler_info);
3049     if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ctrl_sock.sock, &epev) == -1) {
3050         ALOGE("epoll_ctl for lmkd control socket failed (errno=%d)", errno);
3051         return -1;
3052     }
3053     maxevents++;
3054 
3055     has_inkernel_module = !access(INKERNEL_MINFREE_PATH, W_OK);
3056     use_inkernel_interface = has_inkernel_module;
3057 
3058     if (use_inkernel_interface) {
3059         ALOGI("Using in-kernel low memory killer interface");
3060         if (init_poll_kernel()) {
3061             epev.events = EPOLLIN;
3062             epev.data.ptr = (void*)&kernel_poll_hinfo;
3063             if (epoll_ctl(epollfd, EPOLL_CTL_ADD, kpoll_fd, &epev) != 0) {
3064                 ALOGE("epoll_ctl for lmk events failed (errno=%d)", errno);
3065                 close(kpoll_fd);
3066                 kpoll_fd = -1;
3067             } else {
3068                 maxevents++;
3069                 /* let the others know it does support reporting kills */
3070                 property_set("sys.lmk.reportkills", "1");
3071             }
3072         }
3073     } else {
3074         if (!init_monitors()) {
3075             return -1;
3076         }
3077         /* let the others know it does support reporting kills */
3078         property_set("sys.lmk.reportkills", "1");
3079     }
3080 
3081     for (i = 0; i <= ADJTOSLOT(OOM_SCORE_ADJ_MAX); i++) {
3082         procadjslot_list[i].next = &procadjslot_list[i];
3083         procadjslot_list[i].prev = &procadjslot_list[i];
3084     }
3085 
3086     memset(killcnt_idx, KILLCNT_INVALID_IDX, sizeof(killcnt_idx));
3087 
3088     /*
3089      * Read zoneinfo as the biggest file we read to create and size the initial
3090      * read buffer and avoid memory re-allocations during memory pressure
3091      */
3092     if (reread_file(&file_data) == NULL) {
3093         ALOGE("Failed to read %s: %s", file_data.filename, strerror(errno));
3094     }
3095 
3096     /* check if kernel supports pidfd_open syscall */
3097     pidfd = TEMP_FAILURE_RETRY(sys_pidfd_open(getpid(), 0));
3098     if (pidfd < 0) {
3099         pidfd_supported = (errno != ENOSYS);
3100     } else {
3101         pidfd_supported = true;
3102         close(pidfd);
3103     }
3104     ALOGI("Process polling is %s", pidfd_supported ? "supported" : "not supported" );
3105 
3106     return 0;
3107 }
3108 
polling_paused(struct polling_params * poll_params)3109 static bool polling_paused(struct polling_params *poll_params) {
3110     return poll_params->paused_handler != NULL;
3111 }
3112 
resume_polling(struct polling_params * poll_params,struct timespec curr_tm)3113 static void resume_polling(struct polling_params *poll_params, struct timespec curr_tm) {
3114     poll_params->poll_start_tm = curr_tm;
3115     poll_params->poll_handler = poll_params->paused_handler;
3116     poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
3117     poll_params->paused_handler = NULL;
3118 }
3119 
call_handler(struct event_handler_info * handler_info,struct polling_params * poll_params,uint32_t events)3120 static void call_handler(struct event_handler_info* handler_info,
3121                          struct polling_params *poll_params, uint32_t events) {
3122     struct timespec curr_tm;
3123 
3124     poll_params->update = POLLING_DO_NOT_CHANGE;
3125     handler_info->handler(handler_info->data, events, poll_params);
3126     clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3127     if (poll_params->poll_handler == handler_info) {
3128         poll_params->last_poll_tm = curr_tm;
3129     }
3130 
3131     switch (poll_params->update) {
3132     case POLLING_START:
3133         /*
3134          * Poll for the duration of PSI_WINDOW_SIZE_MS after the
3135          * initial PSI event because psi events are rate-limited
3136          * at one per sec.
3137          */
3138         poll_params->poll_start_tm = curr_tm;
3139         poll_params->poll_handler = handler_info;
3140         break;
3141     case POLLING_PAUSE:
3142         poll_params->paused_handler = handler_info;
3143         poll_params->poll_handler = NULL;
3144         break;
3145     case POLLING_RESUME:
3146         resume_polling(poll_params, curr_tm);
3147         break;
3148     case POLLING_DO_NOT_CHANGE:
3149         if (get_time_diff_ms(&poll_params->poll_start_tm, &curr_tm) > PSI_WINDOW_SIZE_MS) {
3150             /* Polled for the duration of PSI window, time to stop */
3151             poll_params->poll_handler = NULL;
3152         }
3153         break;
3154     }
3155 }
3156 
mainloop(void)3157 static void mainloop(void) {
3158     struct event_handler_info* handler_info;
3159     struct polling_params poll_params;
3160     struct timespec curr_tm;
3161     struct epoll_event *evt;
3162     long delay = -1;
3163 
3164     poll_params.poll_handler = NULL;
3165     poll_params.paused_handler = NULL;
3166 
3167     while (1) {
3168         struct epoll_event events[MAX_EPOLL_EVENTS];
3169         int nevents;
3170         int i;
3171 
3172         if (poll_params.poll_handler) {
3173             bool poll_now;
3174 
3175             clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3176             if (poll_params.update == POLLING_RESUME) {
3177                 /* Just transitioned into POLLING_RESUME, poll immediately. */
3178                 poll_now = true;
3179                 nevents = 0;
3180             } else {
3181                 /* Calculate next timeout */
3182                 delay = get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm);
3183                 delay = (delay < poll_params.polling_interval_ms) ?
3184                     poll_params.polling_interval_ms - delay : poll_params.polling_interval_ms;
3185 
3186                 /* Wait for events until the next polling timeout */
3187                 nevents = epoll_wait(epollfd, events, maxevents, delay);
3188 
3189                 /* Update current time after wait */
3190                 clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3191                 poll_now = (get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm) >=
3192                     poll_params.polling_interval_ms);
3193             }
3194             if (poll_now) {
3195                 call_handler(poll_params.poll_handler, &poll_params, 0);
3196             }
3197         } else {
3198             if (kill_timeout_ms && is_waiting_for_kill()) {
3199                 clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3200                 delay = kill_timeout_ms - get_time_diff_ms(&last_kill_tm, &curr_tm);
3201                 /* Wait for pidfds notification or kill timeout to expire */
3202                 nevents = (delay > 0) ? epoll_wait(epollfd, events, maxevents, delay) : 0;
3203                 if (nevents == 0) {
3204                     /* Kill notification timed out */
3205                     stop_wait_for_proc_kill(false);
3206                     if (polling_paused(&poll_params)) {
3207                         clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
3208                         poll_params.update = POLLING_RESUME;
3209                         resume_polling(&poll_params, curr_tm);
3210                     }
3211                 }
3212             } else {
3213                 /* Wait for events with no timeout */
3214                 nevents = epoll_wait(epollfd, events, maxevents, -1);
3215             }
3216         }
3217 
3218         if (nevents == -1) {
3219             if (errno == EINTR)
3220                 continue;
3221             ALOGE("epoll_wait failed (errno=%d)", errno);
3222             continue;
3223         }
3224 
3225         /*
3226          * First pass to see if any data socket connections were dropped.
3227          * Dropped connection should be handled before any other events
3228          * to deallocate data connection and correctly handle cases when
3229          * connection gets dropped and reestablished in the same epoll cycle.
3230          * In such cases it's essential to handle connection closures first.
3231          */
3232         for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) {
3233             if ((evt->events & EPOLLHUP) && evt->data.ptr) {
3234                 ALOGI("lmkd data connection dropped");
3235                 handler_info = (struct event_handler_info*)evt->data.ptr;
3236                 ctrl_data_close(handler_info->data);
3237             }
3238         }
3239 
3240         /* Second pass to handle all other events */
3241         for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) {
3242             if (evt->events & EPOLLERR) {
3243                 ALOGD("EPOLLERR on event #%d", i);
3244             }
3245             if (evt->events & EPOLLHUP) {
3246                 /* This case was handled in the first pass */
3247                 continue;
3248             }
3249             if (evt->data.ptr) {
3250                 handler_info = (struct event_handler_info*)evt->data.ptr;
3251                 call_handler(handler_info, &poll_params, evt->events);
3252             }
3253         }
3254     }
3255 }
3256 
issue_reinit()3257 int issue_reinit() {
3258     int sock;
3259 
3260     sock = lmkd_connect();
3261     if (sock < 0) {
3262         ALOGE("failed to connect to lmkd: %s", strerror(errno));
3263         return -1;
3264     }
3265 
3266     enum update_props_result res = lmkd_update_props(sock);
3267     switch (res) {
3268     case UPDATE_PROPS_SUCCESS:
3269         ALOGI("lmkd updated properties successfully");
3270         break;
3271     case UPDATE_PROPS_SEND_ERR:
3272         ALOGE("failed to send lmkd request: %s", strerror(errno));
3273         break;
3274     case UPDATE_PROPS_RECV_ERR:
3275         ALOGE("failed to receive lmkd reply: %s", strerror(errno));
3276         break;
3277     case UPDATE_PROPS_FORMAT_ERR:
3278         ALOGE("lmkd reply is invalid");
3279         break;
3280     case UPDATE_PROPS_FAIL:
3281         ALOGE("lmkd failed to update its properties");
3282         break;
3283     }
3284 
3285     close(sock);
3286     return res == UPDATE_PROPS_SUCCESS ? 0 : -1;
3287 }
3288 
update_props()3289 static void update_props() {
3290     /* By default disable low level vmpressure events */
3291     level_oomadj[VMPRESS_LEVEL_LOW] =
3292         property_get_int32("ro.lmk.low", OOM_SCORE_ADJ_MAX + 1);
3293     level_oomadj[VMPRESS_LEVEL_MEDIUM] =
3294         property_get_int32("ro.lmk.medium", 800);
3295     level_oomadj[VMPRESS_LEVEL_CRITICAL] =
3296         property_get_int32("ro.lmk.critical", 0);
3297     debug_process_killing = property_get_bool("ro.lmk.debug", false);
3298 
3299     /* By default disable upgrade/downgrade logic */
3300     enable_pressure_upgrade =
3301         property_get_bool("ro.lmk.critical_upgrade", false);
3302     upgrade_pressure =
3303         (int64_t)property_get_int32("ro.lmk.upgrade_pressure", 100);
3304     downgrade_pressure =
3305         (int64_t)property_get_int32("ro.lmk.downgrade_pressure", 100);
3306     kill_heaviest_task =
3307         property_get_bool("ro.lmk.kill_heaviest_task", false);
3308     low_ram_device = property_get_bool("ro.config.low_ram", false);
3309     kill_timeout_ms =
3310         (unsigned long)property_get_int32("ro.lmk.kill_timeout_ms", 100);
3311     use_minfree_levels =
3312         property_get_bool("ro.lmk.use_minfree_levels", false);
3313     per_app_memcg =
3314         property_get_bool("ro.config.per_app_memcg", low_ram_device);
3315     swap_free_low_percentage = clamp(0, 100, property_get_int32("ro.lmk.swap_free_low_percentage",
3316         DEF_LOW_SWAP));
3317     psi_partial_stall_ms = property_get_int32("ro.lmk.psi_partial_stall_ms",
3318         low_ram_device ? DEF_PARTIAL_STALL_LOWRAM : DEF_PARTIAL_STALL);
3319     psi_complete_stall_ms = property_get_int32("ro.lmk.psi_complete_stall_ms",
3320         DEF_COMPLETE_STALL);
3321     thrashing_limit_pct = max(0, property_get_int32("ro.lmk.thrashing_limit",
3322         low_ram_device ? DEF_THRASHING_LOWRAM : DEF_THRASHING));
3323     thrashing_limit_decay_pct = clamp(0, 100, property_get_int32("ro.lmk.thrashing_limit_decay",
3324         low_ram_device ? DEF_THRASHING_DECAY_LOWRAM : DEF_THRASHING_DECAY));
3325     thrashing_critical_pct = max(0, property_get_int32("ro.lmk.thrashing_limit_critical",
3326         thrashing_limit_pct * 2));
3327 }
3328 
main(int argc,char ** argv)3329 int main(int argc, char **argv) {
3330     if ((argc > 1) && argv[1] && !strcmp(argv[1], "--reinit")) {
3331         if (property_set(LMKD_REINIT_PROP, "0")) {
3332             ALOGE("Failed to reset " LMKD_REINIT_PROP " property");
3333         }
3334         return issue_reinit();
3335     }
3336 
3337     update_props();
3338 
3339     ctx = create_android_logger(KILLINFO_LOG_TAG);
3340 
3341     if (!init()) {
3342         if (!use_inkernel_interface) {
3343             /*
3344              * MCL_ONFAULT pins pages as they fault instead of loading
3345              * everything immediately all at once. (Which would be bad,
3346              * because as of this writing, we have a lot of mapped pages we
3347              * never use.) Old kernels will see MCL_ONFAULT and fail with
3348              * EINVAL; we ignore this failure.
3349              *
3350              * N.B. read the man page for mlockall. MCL_CURRENT | MCL_ONFAULT
3351              * pins ⊆ MCL_CURRENT, converging to just MCL_CURRENT as we fault
3352              * in pages.
3353              */
3354             /* CAP_IPC_LOCK required */
3355             if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT) && (errno != EINVAL)) {
3356                 ALOGW("mlockall failed %s", strerror(errno));
3357             }
3358 
3359             /* CAP_NICE required */
3360             struct sched_param param = {
3361                     .sched_priority = 1,
3362             };
3363             if (sched_setscheduler(0, SCHED_FIFO, &param)) {
3364                 ALOGW("set SCHED_FIFO failed %s", strerror(errno));
3365             }
3366         }
3367 
3368         mainloop();
3369     }
3370 
3371     android_log_destroy(&ctx);
3372 
3373     ALOGI("exiting");
3374     return 0;
3375 }
3376