1 /* SPDX-License-Identifier: GPL-2.0 */
2 #ifndef _LINUX_SCHED_H
3 #define _LINUX_SCHED_H
4
5 /*
6 * Define 'struct task_struct' and provide the main scheduler
7 * APIs (schedule(), wakeup variants, etc.)
8 */
9
10 #include <uapi/linux/sched.h>
11
12 #include <asm/current.h>
13
14 #include <linux/pid.h>
15 #include <linux/sem.h>
16 #include <linux/shm.h>
17 #include <linux/kmsan_types.h>
18 #include <linux/mutex.h>
19 #include <linux/plist.h>
20 #include <linux/hrtimer.h>
21 #include <linux/irqflags.h>
22 #include <linux/seccomp.h>
23 #include <linux/nodemask.h>
24 #include <linux/rcupdate.h>
25 #include <linux/refcount.h>
26 #include <linux/resource.h>
27 #include <linux/latencytop.h>
28 #include <linux/sched/prio.h>
29 #include <linux/sched/types.h>
30 #include <linux/signal_types.h>
31 #include <linux/syscall_user_dispatch.h>
32 #include <linux/mm_types_task.h>
33 #include <linux/task_io_accounting.h>
34 #include <linux/posix-timers.h>
35 #include <linux/rseq.h>
36 #include <linux/seqlock.h>
37 #include <linux/kcsan.h>
38 #include <linux/rv.h>
39 #include <linux/livepatch_sched.h>
40 #include <linux/android_vendor.h>
41 #include <linux/android_kabi.h>
42 #include <asm/kmap_size.h>
43
44 /* task_struct member predeclarations (sorted alphabetically): */
45 struct audit_context;
46 struct bio_list;
47 struct blk_plug;
48 struct bpf_local_storage;
49 struct bpf_run_ctx;
50 struct capture_control;
51 struct cfs_rq;
52 struct fs_struct;
53 struct futex_pi_state;
54 struct io_context;
55 struct io_uring_task;
56 struct mempolicy;
57 struct nameidata;
58 struct nsproxy;
59 struct perf_event_context;
60 struct pid_namespace;
61 struct pipe_inode_info;
62 struct rcu_node;
63 struct reclaim_state;
64 struct robust_list_head;
65 struct root_domain;
66 struct rq;
67 struct sched_attr;
68 struct sched_param;
69 struct seq_file;
70 struct sighand_struct;
71 struct signal_struct;
72 struct task_delay_info;
73 struct task_group;
74 struct user_event_mm;
75
76 /*
77 * Task state bitmask. NOTE! These bits are also
78 * encoded in fs/proc/array.c: get_task_state().
79 *
80 * We have two separate sets of flags: task->__state
81 * is about runnability, while task->exit_state are
82 * about the task exiting. Confusing, but this way
83 * modifying one set can't modify the other one by
84 * mistake.
85 */
86
87 /* Used in tsk->__state: */
88 #define TASK_RUNNING 0x00000000
89 #define TASK_INTERRUPTIBLE 0x00000001
90 #define TASK_UNINTERRUPTIBLE 0x00000002
91 #define __TASK_STOPPED 0x00000004
92 #define __TASK_TRACED 0x00000008
93 /* Used in tsk->exit_state: */
94 #define EXIT_DEAD 0x00000010
95 #define EXIT_ZOMBIE 0x00000020
96 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD)
97 /* Used in tsk->__state again: */
98 #define TASK_PARKED 0x00000040
99 #define TASK_DEAD 0x00000080
100 #define TASK_WAKEKILL 0x00000100
101 #define TASK_WAKING 0x00000200
102 #define TASK_NOLOAD 0x00000400
103 #define TASK_NEW 0x00000800
104 #define TASK_RTLOCK_WAIT 0x00001000
105 #define TASK_FREEZABLE 0x00002000
106 #define __TASK_FREEZABLE_UNSAFE (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP))
107 #define TASK_FROZEN 0x00008000
108 #define TASK_STATE_MAX 0x00010000
109
110 #define TASK_ANY (TASK_STATE_MAX-1)
111
112 /*
113 * DO NOT ADD ANY NEW USERS !
114 */
115 #define TASK_FREEZABLE_UNSAFE (TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE)
116
117 /* Convenience macros for the sake of set_current_state: */
118 #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
119 #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED)
120 #define TASK_TRACED __TASK_TRACED
121
122 #define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD)
123
124 /* Convenience macros for the sake of wake_up(): */
125 #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
126
127 /* get_task_state(): */
128 #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
129 TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
130 __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
131 TASK_PARKED)
132
133 #define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING)
134
135 #define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0)
136 #define task_is_stopped(task) ((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0)
137 #define task_is_stopped_or_traced(task) ((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0)
138
139 /*
140 * Special states are those that do not use the normal wait-loop pattern. See
141 * the comment with set_special_state().
142 */
143 #define is_special_task_state(state) \
144 ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD))
145
146 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
147 # define debug_normal_state_change(state_value) \
148 do { \
149 WARN_ON_ONCE(is_special_task_state(state_value)); \
150 current->task_state_change = _THIS_IP_; \
151 } while (0)
152
153 # define debug_special_state_change(state_value) \
154 do { \
155 WARN_ON_ONCE(!is_special_task_state(state_value)); \
156 current->task_state_change = _THIS_IP_; \
157 } while (0)
158
159 # define debug_rtlock_wait_set_state() \
160 do { \
161 current->saved_state_change = current->task_state_change;\
162 current->task_state_change = _THIS_IP_; \
163 } while (0)
164
165 # define debug_rtlock_wait_restore_state() \
166 do { \
167 current->task_state_change = current->saved_state_change;\
168 } while (0)
169
170 #else
171 # define debug_normal_state_change(cond) do { } while (0)
172 # define debug_special_state_change(cond) do { } while (0)
173 # define debug_rtlock_wait_set_state() do { } while (0)
174 # define debug_rtlock_wait_restore_state() do { } while (0)
175 #endif
176
177 /*
178 * set_current_state() includes a barrier so that the write of current->__state
179 * is correctly serialised wrt the caller's subsequent test of whether to
180 * actually sleep:
181 *
182 * for (;;) {
183 * set_current_state(TASK_UNINTERRUPTIBLE);
184 * if (CONDITION)
185 * break;
186 *
187 * schedule();
188 * }
189 * __set_current_state(TASK_RUNNING);
190 *
191 * If the caller does not need such serialisation (because, for instance, the
192 * CONDITION test and condition change and wakeup are under the same lock) then
193 * use __set_current_state().
194 *
195 * The above is typically ordered against the wakeup, which does:
196 *
197 * CONDITION = 1;
198 * wake_up_state(p, TASK_UNINTERRUPTIBLE);
199 *
200 * where wake_up_state()/try_to_wake_up() executes a full memory barrier before
201 * accessing p->__state.
202 *
203 * Wakeup will do: if (@state & p->__state) p->__state = TASK_RUNNING, that is,
204 * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
205 * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING).
206 *
207 * However, with slightly different timing the wakeup TASK_RUNNING store can
208 * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not
209 * a problem either because that will result in one extra go around the loop
210 * and our @cond test will save the day.
211 *
212 * Also see the comments of try_to_wake_up().
213 */
214 #define __set_current_state(state_value) \
215 do { \
216 debug_normal_state_change((state_value)); \
217 WRITE_ONCE(current->__state, (state_value)); \
218 } while (0)
219
220 #define set_current_state(state_value) \
221 do { \
222 debug_normal_state_change((state_value)); \
223 smp_store_mb(current->__state, (state_value)); \
224 } while (0)
225
226 /*
227 * set_special_state() should be used for those states when the blocking task
228 * can not use the regular condition based wait-loop. In that case we must
229 * serialize against wakeups such that any possible in-flight TASK_RUNNING
230 * stores will not collide with our state change.
231 */
232 #define set_special_state(state_value) \
233 do { \
234 unsigned long flags; /* may shadow */ \
235 \
236 raw_spin_lock_irqsave(&current->pi_lock, flags); \
237 debug_special_state_change((state_value)); \
238 WRITE_ONCE(current->__state, (state_value)); \
239 raw_spin_unlock_irqrestore(&current->pi_lock, flags); \
240 } while (0)
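/*
 * A minimal sketch of the pattern set_special_state() is meant for, loosely
 * modelled on do_task_dead() in kernel/sched/core.c: the final TASK_DEAD
 * transition must not be overwritten by an in-flight TASK_RUNNING store from
 * a late wakeup, so a plain set_current_state() would not be enough here.
 *
 *	set_special_state(TASK_DEAD);
 *	current->flags |= PF_NOFREEZE;	// tell the freezer to ignore us
 *	__schedule(SM_NONE);		// internal to sched/core.c; never returns
 *	BUG();
 */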
241
242 /*
243 * PREEMPT_RT specific variants for "sleeping" spin/rwlocks
244 *
245 * RT's spin/rwlock substitutions are state preserving. The state of the
246 * task when blocking on the lock is saved in task_struct::saved_state and
247 * restored after the lock has been acquired. These operations are
248 * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT
249 * lock related wakeups while the task is blocked on the lock are
250 * redirected to operate on task_struct::saved_state to ensure that these
251 * are not dropped. On restore task_struct::saved_state is set to
252 * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail.
253 *
254 * The lock operation looks like this:
255 *
256 * current_save_and_set_rtlock_wait_state();
257 * for (;;) {
258 * if (try_lock())
259 * break;
260 * raw_spin_unlock_irq(&lock->wait_lock);
261 * schedule_rtlock();
262 * raw_spin_lock_irq(&lock->wait_lock);
263 * set_current_state(TASK_RTLOCK_WAIT);
264 * }
265 * current_restore_rtlock_saved_state();
266 */
267 #define current_save_and_set_rtlock_wait_state() \
268 do { \
269 lockdep_assert_irqs_disabled(); \
270 raw_spin_lock(&current->pi_lock); \
271 current->saved_state = current->__state; \
272 debug_rtlock_wait_set_state(); \
273 WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \
274 raw_spin_unlock(&current->pi_lock); \
275 } while (0);
276
277 #define current_restore_rtlock_saved_state() \
278 do { \
279 lockdep_assert_irqs_disabled(); \
280 raw_spin_lock(&current->pi_lock); \
281 debug_rtlock_wait_restore_state(); \
282 WRITE_ONCE(current->__state, current->saved_state); \
283 current->saved_state = TASK_RUNNING; \
284 raw_spin_unlock(&current->pi_lock); \
285 } while (0);
286
287 #define get_current_state() READ_ONCE(current->__state)
288
289 /*
290 * Define the task command name length as an enum so that it is visible to
291 * BPF programs.
292 */
293 enum {
294 TASK_COMM_LEN = 16,
295 };
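/*
 * Usage sketch: a stack buffer sized by TASK_COMM_LEN is the intended way to
 * snapshot the name; get_task_comm() (provided further down in this header)
 * copies it under task_lock():
 *
 *	char buf[TASK_COMM_LEN];
 *
 *	get_task_comm(buf, current);
 */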
296
297 extern void scheduler_tick(void);
298
299 #define MAX_SCHEDULE_TIMEOUT LONG_MAX
300
301 extern long schedule_timeout(long timeout);
302 extern long schedule_timeout_interruptible(long timeout);
303 extern long schedule_timeout_killable(long timeout);
304 extern long schedule_timeout_uninterruptible(long timeout);
305 extern long schedule_timeout_idle(long timeout);
306 asmlinkage void schedule(void);
307 extern void schedule_preempt_disabled(void);
308 asmlinkage void preempt_schedule_irq(void);
309 #ifdef CONFIG_PREEMPT_RT
310 extern void schedule_rtlock(void);
311 #endif
312
313 extern int __must_check io_schedule_prepare(void);
314 extern void io_schedule_finish(int token);
315 extern long io_schedule_timeout(long timeout);
316 extern void io_schedule(void);
317 extern struct task_struct *pick_migrate_task(struct rq *rq);
318 extern int select_fallback_rq(int cpu, struct task_struct *p);
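/*
 * Usage sketch for the schedule_timeout*() helpers: the argument is in
 * jiffies (msecs_to_jiffies() from <linux/jiffies.h> is assumed below), and
 * the return value is the number of jiffies left, or 0 if the full timeout
 * elapsed. For example, sleeping up to 100ms but waking early on a signal:
 *
 *	long left = schedule_timeout_interruptible(msecs_to_jiffies(100));
 *	if (left)
 *		;	// woken early, e.g. by a signal
 */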
319
320 /**
321 * struct prev_cputime - snapshot of system and user cputime
322 * @utime: time spent in user mode
323 * @stime: time spent in system mode
324 * @lock: protects the above two fields
325 *
326 * Stores previous user/system time values such that we can guarantee
327 * monotonicity.
328 */
329 struct prev_cputime {
330 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
331 u64 utime;
332 u64 stime;
333 raw_spinlock_t lock;
334 #endif
335 };
336
337 enum vtime_state {
338 /* Task is sleeping or running in a CPU with VTIME inactive: */
339 VTIME_INACTIVE = 0,
340 /* Task is idle */
341 VTIME_IDLE,
342 /* Task runs in kernelspace in a CPU with VTIME active: */
343 VTIME_SYS,
344 /* Task runs in userspace in a CPU with VTIME active: */
345 VTIME_USER,
346 /* Task runs as guests in a CPU with VTIME active: */
347 VTIME_GUEST,
348 };
349
350 struct vtime {
351 seqcount_t seqcount;
352 unsigned long long starttime;
353 enum vtime_state state;
354 unsigned int cpu;
355 u64 utime;
356 u64 stime;
357 u64 gtime;
358 };
359
360 /*
361 * Utilization clamp constraints.
362 * @UCLAMP_MIN: Minimum utilization
363 * @UCLAMP_MAX: Maximum utilization
364 * @UCLAMP_CNT: Utilization clamp constraints count
365 */
366 enum uclamp_id {
367 UCLAMP_MIN = 0,
368 UCLAMP_MAX,
369 UCLAMP_CNT
370 };
371
372 #ifdef CONFIG_SMP
373 extern struct root_domain def_root_domain;
374 extern struct mutex sched_domains_mutex;
375 #endif
376
377 struct sched_info {
378 #ifdef CONFIG_SCHED_INFO
379 /* Cumulative counters: */
380
381 /* # of times we have run on this CPU: */
382 unsigned long pcount;
383
384 /* Time spent waiting on a runqueue: */
385 unsigned long long run_delay;
386
387 /* Timestamps: */
388
389 /* When did we last run on a CPU? */
390 unsigned long long last_arrival;
391
392 /* When were we last queued to run? */
393 unsigned long long last_queued;
394
395 #endif /* CONFIG_SCHED_INFO */
396 };
397
398 /*
399 * Integer metrics need fixed point arithmetic, e.g., sched/fair
400 * has a few: load, load_avg, util_avg, freq, and capacity.
401 *
402 * We define a basic fixed point arithmetic range, and then formalize
403 * all these metrics based on that basic range.
404 */
405 # define SCHED_FIXEDPOINT_SHIFT 10
406 # define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT)
407
408 /* Increase resolution of cpu_capacity calculations */
409 # define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT
410 # define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
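/*
 * Worked example of the fixed point convention: with SCHED_FIXEDPOINT_SHIFT
 * == 10 the unit value is 1024, so "50% of capacity" is represented as 512
 * and converting back to a percentage is a multiply and a shift:
 *
 *	unsigned long half = SCHED_CAPACITY_SCALE / 2;			// 512
 *	unsigned long pct  = (half * 100) >> SCHED_CAPACITY_SHIFT;	// 50
 */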
411
412 struct load_weight {
413 unsigned long weight;
414 u32 inv_weight;
415 };
416
417 /*
418 * The load/runnable/util_avg accumulates an infinite geometric series
419 * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
420 *
421 * [load_avg definition]
422 *
423 * load_avg = runnable% * scale_load_down(load)
424 *
425 * [runnable_avg definition]
426 *
427 * runnable_avg = runnable% * SCHED_CAPACITY_SCALE
428 *
429 * [util_avg definition]
430 *
431 * util_avg = running% * SCHED_CAPACITY_SCALE
432 *
433 * where runnable% is the time ratio that a sched_entity is runnable and
434 * running% the time ratio that a sched_entity is running.
435 *
436 * For cfs_rq, they are the aggregated values of all runnable and blocked
437 * sched_entities.
438 *
439 * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU
440 * capacity scaling. The scaling is done through the rq_clock_pelt that is used
441 * for computing those signals (see update_rq_clock_pelt())
442 *
443 * N.B., the above ratios (runnable% and running%) themselves are in the
444 * range of [0, 1]. To do fixed point arithmetic, we therefore scale them
445 * to as large a range as necessary. This is for example reflected by
446 * util_avg's SCHED_CAPACITY_SCALE.
447 *
448 * [Overflow issue]
449 *
450 * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities
451 * with the highest load (=88761), always runnable on a single cfs_rq,
452 * and should not overflow as the number already hits PID_MAX_LIMIT.
453 *
454 * For all other cases (including 32-bit kernels), struct load_weight's
455 * weight will overflow first before we do, because:
456 *
457 * Max(load_avg) <= Max(load.weight)
458 *
459 * Then it is the load_weight's responsibility to consider overflow
460 * issues.
461 */
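/*
 * Arithmetic behind the bound quoted above: 2^64 ~= 1.845e19, the maximum of
 * the PELT geometric series is LOAD_AVG_MAX == 47742 and the highest NICE_0
 * weight is 88761, so 1.845e19 / 47742 / 88761 ~= 4.35e9 entities, i.e. the
 * 4353082796 figure, which is far above PID_MAX_LIMIT (4M on 64-bit).
 */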
462 struct sched_avg {
463 u64 last_update_time;
464 u64 load_sum;
465 u64 runnable_sum;
466 u32 util_sum;
467 u32 period_contrib;
468 unsigned long load_avg;
469 unsigned long runnable_avg;
470 unsigned long util_avg;
471 unsigned int util_est;
472 u32 reserved;
473 } ____cacheline_aligned;
474
475 /*
476 * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
477 * updates. When a task is dequeued, its util_est should not be updated if its
478 * util_avg has not been updated in the meantime.
479 * This information is mapped into the MSB bit of util_est at dequeue time.
480 * Since max value of util_est for a task is 1024 (PELT util_avg for a task)
481 * it is safe to use MSB.
482 */
483 #define UTIL_EST_WEIGHT_SHIFT 2
484 #define UTIL_AVG_UNCHANGED 0x80000000
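/*
 * Sketch of how the flag is consumed (the real helpers live in
 * kernel/sched/fair.c): the MSB is ORed into util_est at dequeue time and
 * masked off before the estimate is used, e.g.:
 *
 *	unsigned int enqueued  = READ_ONCE(p->se.avg.util_est);
 *	unsigned int estimate  = enqueued & ~UTIL_AVG_UNCHANGED;
 *	bool         unchanged = enqueued & UTIL_AVG_UNCHANGED;
 */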
485
486 struct sched_statistics {
487 #ifdef CONFIG_SCHEDSTATS
488 u64 wait_start;
489 u64 wait_max;
490 u64 wait_count;
491 u64 wait_sum;
492 u64 iowait_count;
493 u64 iowait_sum;
494
495 u64 sleep_start;
496 u64 sleep_max;
497 s64 sum_sleep_runtime;
498
499 u64 block_start;
500 u64 block_max;
501 s64 sum_block_runtime;
502
503 u64 exec_max;
504 u64 slice_max;
505
506 u64 nr_migrations_cold;
507 u64 nr_failed_migrations_affine;
508 u64 nr_failed_migrations_running;
509 u64 nr_failed_migrations_hot;
510 u64 nr_forced_migrations;
511
512 u64 nr_wakeups;
513 u64 nr_wakeups_sync;
514 u64 nr_wakeups_migrate;
515 u64 nr_wakeups_local;
516 u64 nr_wakeups_remote;
517 u64 nr_wakeups_affine;
518 u64 nr_wakeups_affine_attempts;
519 u64 nr_wakeups_passive;
520 u64 nr_wakeups_idle;
521
522 #ifdef CONFIG_SCHED_CORE
523 u64 core_forceidle_sum;
524 #endif
525 #endif /* CONFIG_SCHEDSTATS */
526 } ____cacheline_aligned;
527
528 struct sched_entity {
529 /* For load-balancing: */
530 struct load_weight load;
531 struct rb_node run_node;
532 u64 deadline;
533 u64 min_vruntime;
534
535 struct list_head group_node;
536 unsigned int on_rq;
537
538 u64 exec_start;
539 u64 sum_exec_runtime;
540 u64 prev_sum_exec_runtime;
541 u64 vruntime;
542 s64 vlag;
543 u64 slice;
544
545 u64 nr_migrations;
546
547 #ifdef CONFIG_FAIR_GROUP_SCHED
548 int depth;
549 struct sched_entity *parent;
550 /* rq on which this entity is (to be) queued: */
551 struct cfs_rq *cfs_rq;
552 /* rq "owned" by this entity/group: */
553 struct cfs_rq *my_q;
554 /* cached value of my_q->h_nr_running */
555 unsigned long runnable_weight;
556 #endif
557
558 #ifdef CONFIG_SMP
559 /*
560 * Per entity load average tracking.
561 *
562 * Put into separate cache line so it does not
563 * collide with read-mostly values above.
564 */
565 struct sched_avg avg;
566 #endif
567
568 ANDROID_KABI_RESERVE(1);
569 ANDROID_KABI_RESERVE(2);
570 ANDROID_KABI_RESERVE(3);
571 ANDROID_KABI_RESERVE(4);
572 };
573
574 struct sched_rt_entity {
575 struct list_head run_list;
576 unsigned long timeout;
577 unsigned long watchdog_stamp;
578 unsigned int time_slice;
579 unsigned short on_rq;
580 unsigned short on_list;
581
582 struct sched_rt_entity *back;
583 #ifdef CONFIG_RT_GROUP_SCHED
584 struct sched_rt_entity *parent;
585 /* rq on which this entity is (to be) queued: */
586 struct rt_rq *rt_rq;
587 /* rq "owned" by this entity/group: */
588 struct rt_rq *my_q;
589 #endif
590
591 ANDROID_KABI_RESERVE(1);
592 ANDROID_KABI_RESERVE(2);
593 ANDROID_KABI_RESERVE(3);
594 ANDROID_KABI_RESERVE(4);
595 } __randomize_layout;
596
597 struct sched_dl_entity {
598 struct rb_node rb_node;
599
600 /*
601 * Original scheduling parameters. Copied here from sched_attr
602 * during sched_setattr(), they will remain the same until
603 * the next sched_setattr().
604 */
605 u64 dl_runtime; /* Maximum runtime for each instance */
606 u64 dl_deadline; /* Relative deadline of each instance */
607 u64 dl_period; /* Separation of two instances (period) */
608 u64 dl_bw; /* dl_runtime / dl_period */
609 u64 dl_density; /* dl_runtime / dl_deadline */
610
611 /*
612 * Actual scheduling parameters. Initialized with the values above,
613 * they are continuously updated during task execution. Note that
614 * the remaining runtime could be < 0 in case we are in overrun.
615 */
616 s64 runtime; /* Remaining runtime for this instance */
617 u64 deadline; /* Absolute deadline for this instance */
618 unsigned int flags; /* Specifying the scheduler behaviour */
619
620 /*
621 * Some bool flags:
622 *
623 * @dl_throttled tells if we exhausted the runtime. If so, the
624 * task has to wait for a replenishment to be performed at the
625 * next firing of dl_timer.
626 *
627 * @dl_yielded tells if task gave up the CPU before consuming
628 * all its available runtime during the last job.
629 *
630 * @dl_non_contending tells if the task is inactive while still
631 * contributing to the active utilization. In other words, it
632 * indicates if the inactive timer has been armed and its handler
633 * has not been executed yet. This flag is useful to avoid race
634 * conditions between the inactive timer handler and the wakeup
635 * code.
636 *
637 * @dl_overrun tells if the task asked to be informed about runtime
638 * overruns.
639 */
640 unsigned int dl_throttled : 1;
641 unsigned int dl_yielded : 1;
642 unsigned int dl_non_contending : 1;
643 unsigned int dl_overrun : 1;
644
645 /*
646 * Bandwidth enforcement timer. Each -deadline task has its
647 * own bandwidth to be enforced, thus we need one timer per task.
648 */
649 struct hrtimer dl_timer;
650
651 /*
652 * Inactive timer, responsible for decreasing the active utilization
653 * at the "0-lag time". When a -deadline task blocks, it contributes
654 * to GRUB's active utilization until the "0-lag time", hence a
655 * timer is needed to decrease the active utilization at the correct
656 * time.
657 */
658 struct hrtimer inactive_timer;
659
660 #ifdef CONFIG_RT_MUTEXES
661 /*
662 * Priority Inheritance. When a DEADLINE scheduling entity is boosted
663 * pi_se points to the donor, otherwise points to the dl_se it belongs
664 * to (the original one/itself).
665 */
666 struct sched_dl_entity *pi_se;
667 #endif
668 };
669
670 #ifdef CONFIG_UCLAMP_TASK
671 /* Number of utilization clamp buckets (shorter alias) */
672 #define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT
673
674 /*
675 * Utilization clamp for a scheduling entity
676 * @value: clamp value "assigned" to a se
677 * @bucket_id: bucket index corresponding to the "assigned" value
678 * @active: the se is currently refcounted in a rq's bucket
679 * @user_defined: the requested clamp value comes from user-space
680 *
681 * The bucket_id is the index of the clamp bucket matching the clamp value
682 * which is pre-computed and stored to avoid expensive integer divisions from
683 * the fast path.
684 *
685 * The active bit is set whenever a task has got an "effective" value assigned,
686 * which can be different from the clamp value "requested" from user-space.
687 * This makes it possible to know that a task is refcounted in the rq's
688 * bucket corresponding to the "effective" bucket_id.
689 *
690 * The user_defined bit is set whenever a task has got a task-specific clamp
691 * value requested from userspace, i.e. the system defaults apply to this task
692 * just as a restriction. This allows default clamps to be relaxed when a less
693 * restrictive task-specific value has been requested, thus implementing a
694 * "nice" semantic. For example, a task running with a 20%
695 * default boost can still drop its own boosting to 0%.
696 */
697 struct uclamp_se {
698 unsigned int value : bits_per(SCHED_CAPACITY_SCALE);
699 unsigned int bucket_id : bits_per(UCLAMP_BUCKETS);
700 unsigned int active : 1;
701 unsigned int user_defined : 1;
702 };
703 #endif /* CONFIG_UCLAMP_TASK */
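/*
 * Illustration of the value -> bucket_id mapping described above; the actual
 * helper (uclamp_bucket_id() in kernel/sched/core.c) computes roughly:
 *
 *	delta     = DIV_ROUND_UP(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS);
 *	bucket_id = min(value / delta, UCLAMP_BUCKETS - 1);
 *
 * which is what lets the fast path reuse the pre-computed bucket_id instead
 * of dividing on every enqueue/dequeue.
 */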
704
705 union rcu_special {
706 struct {
707 u8 blocked;
708 u8 need_qs;
709 u8 exp_hint; /* Hint for performance. */
710 u8 need_mb; /* Readers need smp_mb(). */
711 } b; /* Bits. */
712 u32 s; /* Set of bits. */
713 };
714
715 enum perf_event_task_context {
716 perf_invalid_context = -1,
717 perf_hw_context = 0,
718 perf_sw_context,
719 perf_nr_task_contexts,
720 };
721
722 struct wake_q_node {
723 struct wake_q_node *next;
724 };
725
726 struct kmap_ctrl {
727 #ifdef CONFIG_KMAP_LOCAL
728 int idx;
729 pte_t pteval[KM_MAX_IDX];
730 #endif
731 };
732
733 struct task_struct {
734 #ifdef CONFIG_THREAD_INFO_IN_TASK
735 /*
736 * For reasons of header soup (see current_thread_info()), this
737 * must be the first element of task_struct.
738 */
739 struct thread_info thread_info;
740 #endif
741 unsigned int __state;
742
743 /* saved state for "spinlock sleepers" */
744 unsigned int saved_state;
745
746 /*
747 * This begins the randomizable portion of task_struct. Only
748 * scheduling-critical items should be added above here.
749 */
750 randomized_struct_fields_start
751
752 void *stack;
753 refcount_t usage;
754 /* Per task flags (PF_*), defined further below: */
755 unsigned int flags;
756 unsigned int ptrace;
757
758 #ifdef CONFIG_SMP
759 int on_cpu;
760 struct __call_single_node wake_entry;
761 unsigned int wakee_flips;
762 unsigned long wakee_flip_decay_ts;
763 struct task_struct *last_wakee;
764
765 /*
766 * recent_used_cpu is initially set as the last CPU used by a task
767 * that wakes affine another task. Waker/wakee relationships can
768 * push tasks around a CPU where each wakeup moves to the next one.
769 * Tracking a recently used CPU allows a quick search for a recently
770 * used CPU that may be idle.
771 */
772 int recent_used_cpu;
773 int wake_cpu;
774 #endif
775 int on_rq;
776
777 int prio;
778 int static_prio;
779 int normal_prio;
780 unsigned int rt_priority;
781
782 struct sched_entity se;
783 struct sched_rt_entity rt;
784 struct sched_dl_entity dl;
785 const struct sched_class *sched_class;
786
787 #ifdef CONFIG_SCHED_CORE
788 struct rb_node core_node;
789 unsigned long core_cookie;
790 unsigned int core_occupation;
791 #endif
792
793 #ifdef CONFIG_CGROUP_SCHED
794 struct task_group *sched_task_group;
795 #endif
796
797 #ifdef CONFIG_UCLAMP_TASK
798 /*
799 * Clamp values requested for a scheduling entity.
800 * Must be updated with task_rq_lock() held.
801 */
802 struct uclamp_se uclamp_req[UCLAMP_CNT];
803 /*
804 * Effective clamp values used for a scheduling entity.
805 * Must be updated with task_rq_lock() held.
806 */
807 struct uclamp_se uclamp[UCLAMP_CNT];
808 #endif
809
810 struct sched_statistics stats;
811
812 #ifdef CONFIG_PREEMPT_NOTIFIERS
813 /* List of struct preempt_notifier: */
814 struct hlist_head preempt_notifiers;
815 #endif
816
817 #ifdef CONFIG_BLK_DEV_IO_TRACE
818 unsigned int btrace_seq;
819 #endif
820
821 unsigned int policy;
822 int nr_cpus_allowed;
823 const cpumask_t *cpus_ptr;
824 cpumask_t *user_cpus_ptr;
825 cpumask_t cpus_mask;
826 void *migration_pending;
827 #ifdef CONFIG_SMP
828 unsigned short migration_disabled;
829 #endif
830 unsigned short migration_flags;
831
832 #ifdef CONFIG_PREEMPT_RCU
833 int rcu_read_lock_nesting;
834 union rcu_special rcu_read_unlock_special;
835 struct list_head rcu_node_entry;
836 struct rcu_node *rcu_blocked_node;
837 #endif /* #ifdef CONFIG_PREEMPT_RCU */
838
839 #ifdef CONFIG_TASKS_RCU
840 unsigned long rcu_tasks_nvcsw;
841 u8 rcu_tasks_holdout;
842 u8 rcu_tasks_idx;
843 int rcu_tasks_idle_cpu;
844 struct list_head rcu_tasks_holdout_list;
845 #endif /* #ifdef CONFIG_TASKS_RCU */
846
847 #ifdef CONFIG_TASKS_TRACE_RCU
848 int trc_reader_nesting;
849 int trc_ipi_to_cpu;
850 union rcu_special trc_reader_special;
851 struct list_head trc_holdout_list;
852 struct list_head trc_blkd_node;
853 int trc_blkd_cpu;
854 #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
855
856 struct sched_info sched_info;
857
858 struct list_head tasks;
859 #ifdef CONFIG_SMP
860 struct plist_node pushable_tasks;
861 struct rb_node pushable_dl_tasks;
862 #endif
863
864 struct mm_struct *mm;
865 struct mm_struct *active_mm;
866
867 int exit_state;
868 int exit_code;
869 int exit_signal;
870 /* The signal sent when the parent dies: */
871 int pdeath_signal;
872 /* JOBCTL_*, siglock protected: */
873 unsigned long jobctl;
874
875 /* Used for emulating ABI behavior of previous Linux versions: */
876 unsigned int personality;
877
878 /* Scheduler bits, serialized by scheduler locks: */
879 unsigned sched_reset_on_fork:1;
880 unsigned sched_contributes_to_load:1;
881 unsigned sched_migrated:1;
882
883 /* Force alignment to the next boundary: */
884 unsigned :0;
885
886 /* Unserialized, strictly 'current' */
887
888 /*
889 * This field must not be in the scheduler word above due to wakelist
890 * queueing no longer being serialized by p->on_cpu. However:
891 *
892 * p->XXX = X; ttwu()
893 * schedule() if (p->on_rq && ..) // false
894 * smp_mb__after_spinlock(); if (smp_load_acquire(&p->on_cpu) && //true
895 * deactivate_task() ttwu_queue_wakelist())
896 * p->on_rq = 0; p->sched_remote_wakeup = Y;
897 *
898 * guarantees all stores of 'current' are visible before
899 * ->sched_remote_wakeup gets used, so it can be in this word.
900 */
901 unsigned sched_remote_wakeup:1;
902
903 /* Bit to tell LSMs we're in execve(): */
904 unsigned in_execve:1;
905 unsigned in_iowait:1;
906 #ifndef TIF_RESTORE_SIGMASK
907 unsigned restore_sigmask:1;
908 #endif
909 #ifdef CONFIG_MEMCG
910 unsigned in_user_fault:1;
911 #endif
912 #ifdef CONFIG_LRU_GEN
913 /* whether the LRU algorithm may apply to this access */
914 unsigned in_lru_fault:1;
915 #endif
916 #ifdef CONFIG_COMPAT_BRK
917 unsigned brk_randomized:1;
918 #endif
919 #ifdef CONFIG_CGROUPS
920 /* disallow userland-initiated cgroup migration */
921 unsigned no_cgroup_migration:1;
922 /* task is frozen/stopped (used by the cgroup freezer) */
923 unsigned frozen:1;
924 #endif
925 #ifdef CONFIG_BLK_CGROUP
926 unsigned use_memdelay:1;
927 #endif
928 #ifdef CONFIG_PSI
929 /* Stalled due to lack of memory */
930 unsigned in_memstall:1;
931 #endif
932 #ifdef CONFIG_PAGE_OWNER
933 /* Used by page_owner=on to detect recursion in page tracking. */
934 unsigned in_page_owner:1;
935 #endif
936 #ifdef CONFIG_EVENTFD
937 /* Recursion prevention for eventfd_signal() */
938 unsigned in_eventfd:1;
939 #endif
940 #ifdef CONFIG_IOMMU_SVA
941 unsigned pasid_activated:1;
942 #endif
943 #ifdef CONFIG_CPU_SUP_INTEL
944 unsigned reported_split_lock:1;
945 #endif
946 #ifdef CONFIG_TASK_DELAY_ACCT
947 /* delay due to memory thrashing */
948 unsigned in_thrashing:1;
949 #endif
950
951 unsigned long atomic_flags; /* Flags requiring atomic access. */
952
953 struct restart_block restart_block;
954
955 pid_t pid;
956 pid_t tgid;
957
958 #ifdef CONFIG_STACKPROTECTOR
959 /* Canary value for the -fstack-protector GCC feature: */
960 unsigned long stack_canary;
961 #endif
962 /*
963 * Pointers to the (original) parent process, youngest child, younger sibling,
964 * older sibling, respectively. (p->father can be replaced with
965 * p->real_parent->pid)
966 */
967
968 /* Real parent process: */
969 struct task_struct __rcu *real_parent;
970
971 /* Recipient of SIGCHLD, wait4() reports: */
972 struct task_struct __rcu *parent;
973
974 /*
975 * Children/sibling form the list of natural children:
976 */
977 struct list_head children;
978 struct list_head sibling;
979 struct task_struct *group_leader;
980
981 /*
982 * 'ptraced' is the list of tasks this task is using ptrace() on.
983 *
984 * This includes both natural children and PTRACE_ATTACH targets.
985 * 'ptrace_entry' is this task's link on the p->parent->ptraced list.
986 */
987 struct list_head ptraced;
988 struct list_head ptrace_entry;
989
990 /* PID/PID hash table linkage. */
991 struct pid *thread_pid;
992 struct hlist_node pid_links[PIDTYPE_MAX];
993 struct list_head thread_group;
994 struct list_head thread_node;
995
996 struct completion *vfork_done;
997
998 /* CLONE_CHILD_SETTID: */
999 int __user *set_child_tid;
1000
1001 /* CLONE_CHILD_CLEARTID: */
1002 int __user *clear_child_tid;
1003
1004 /* PF_KTHREAD | PF_IO_WORKER */
1005 void *worker_private;
1006
1007 u64 utime;
1008 u64 stime;
1009 #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
1010 u64 utimescaled;
1011 u64 stimescaled;
1012 #endif
1013 u64 gtime;
1014 #ifdef CONFIG_CPU_FREQ_TIMES
1015 u64 *time_in_state;
1016 unsigned int max_state;
1017 #endif
1018 struct prev_cputime prev_cputime;
1019 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
1020 struct vtime vtime;
1021 #endif
1022
1023 #ifdef CONFIG_NO_HZ_FULL
1024 atomic_t tick_dep_mask;
1025 #endif
1026 /* Context switch counts: */
1027 unsigned long nvcsw;
1028 unsigned long nivcsw;
1029
1030 /* Monotonic time in nsecs: */
1031 u64 start_time;
1032
1033 /* Boot based time in nsecs: */
1034 u64 start_boottime;
1035
1036 /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
1037 unsigned long min_flt;
1038 unsigned long maj_flt;
1039
1040 /* Empty if CONFIG_POSIX_CPUTIMERS=n */
1041 struct posix_cputimers posix_cputimers;
1042
1043 #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
1044 struct posix_cputimers_work posix_cputimers_work;
1045 #endif
1046
1047 /* Process credentials: */
1048
1049 /* Tracer's credentials at attach: */
1050 const struct cred __rcu *ptracer_cred;
1051
1052 /* Objective and real subjective task credentials (COW): */
1053 const struct cred __rcu *real_cred;
1054
1055 /* Effective (overridable) subjective task credentials (COW): */
1056 const struct cred __rcu *cred;
1057
1058 #ifdef CONFIG_KEYS
1059 /* Cached requested key. */
1060 struct key *cached_requested_key;
1061 #endif
1062
1063 /*
1064 * executable name, excluding path.
1065 *
1066 * - normally initialized setup_new_exec()
1067 * - access it with [gs]et_task_comm()
1068 * - lock it with task_lock()
1069 */
1070 char comm[TASK_COMM_LEN];
1071
1072 struct nameidata *nameidata;
1073
1074 #ifdef CONFIG_SYSVIPC
1075 struct sysv_sem sysvsem;
1076 struct sysv_shm sysvshm;
1077 #endif
1078 #ifdef CONFIG_DETECT_HUNG_TASK
1079 unsigned long last_switch_count;
1080 unsigned long last_switch_time;
1081 #endif
1082 /* Filesystem information: */
1083 struct fs_struct *fs;
1084
1085 /* Open file information: */
1086 struct files_struct *files;
1087
1088 #ifdef CONFIG_IO_URING
1089 struct io_uring_task *io_uring;
1090 #endif
1091
1092 /* Namespaces: */
1093 struct nsproxy *nsproxy;
1094
1095 /* Signal handlers: */
1096 struct signal_struct *signal;
1097 struct sighand_struct __rcu *sighand;
1098 sigset_t blocked;
1099 sigset_t real_blocked;
1100 /* Restored if set_restore_sigmask() was used: */
1101 sigset_t saved_sigmask;
1102 struct sigpending pending;
1103 unsigned long sas_ss_sp;
1104 size_t sas_ss_size;
1105 unsigned int sas_ss_flags;
1106
1107 struct callback_head *task_works;
1108
1109 #ifdef CONFIG_AUDIT
1110 #ifdef CONFIG_AUDITSYSCALL
1111 struct audit_context *audit_context;
1112 #endif
1113 kuid_t loginuid;
1114 unsigned int sessionid;
1115 #endif
1116 struct seccomp seccomp;
1117 struct syscall_user_dispatch syscall_dispatch;
1118
1119 /* Thread group tracking: */
1120 u64 parent_exec_id;
1121 u64 self_exec_id;
1122
1123 /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
1124 spinlock_t alloc_lock;
1125
1126 /* Protection of the PI data structures: */
1127 raw_spinlock_t pi_lock;
1128
1129 struct wake_q_node wake_q;
1130 int wake_q_count;
1131
1132 #ifdef CONFIG_RT_MUTEXES
1133 /* PI waiters blocked on a rt_mutex held by this task: */
1134 struct rb_root_cached pi_waiters;
1135 /* Updated under owner's pi_lock and rq lock */
1136 struct task_struct *pi_top_task;
1137 /* Deadlock detection and priority inheritance handling: */
1138 struct rt_mutex_waiter *pi_blocked_on;
1139 #endif
1140
1141 #ifdef CONFIG_DEBUG_MUTEXES
1142 /* Mutex deadlock detection: */
1143 struct mutex_waiter *blocked_on;
1144 #endif
1145
1146 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
1147 int non_block_count;
1148 #endif
1149
1150 #ifdef CONFIG_TRACE_IRQFLAGS
1151 struct irqtrace_events irqtrace;
1152 unsigned int hardirq_threaded;
1153 u64 hardirq_chain_key;
1154 int softirqs_enabled;
1155 int softirq_context;
1156 int irq_config;
1157 #endif
1158 #ifdef CONFIG_PREEMPT_RT
1159 int softirq_disable_cnt;
1160 #endif
1161
1162 #ifdef CONFIG_LOCKDEP
1163 # define MAX_LOCK_DEPTH 48UL
1164 u64 curr_chain_key;
1165 int lockdep_depth;
1166 unsigned int lockdep_recursion;
1167 struct held_lock held_locks[MAX_LOCK_DEPTH];
1168 #endif
1169
1170 #if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP)
1171 unsigned int in_ubsan;
1172 #endif
1173
1174 /* Journalling filesystem info: */
1175 void *journal_info;
1176
1177 /* Stacked block device info: */
1178 struct bio_list *bio_list;
1179
1180 /* Stack plugging: */
1181 struct blk_plug *plug;
1182
1183 /* VM state: */
1184 struct reclaim_state *reclaim_state;
1185
1186 struct io_context *io_context;
1187
1188 #ifdef CONFIG_COMPACTION
1189 struct capture_control *capture_control;
1190 #endif
1191 /* Ptrace state: */
1192 unsigned long ptrace_message;
1193 kernel_siginfo_t *last_siginfo;
1194
1195 struct task_io_accounting ioac;
1196 #ifdef CONFIG_PSI
1197 /* Pressure stall state */
1198 unsigned int psi_flags;
1199 #endif
1200 #ifdef CONFIG_TASK_XACCT
1201 /* Accumulated RSS usage: */
1202 u64 acct_rss_mem1;
1203 /* Accumulated virtual memory usage: */
1204 u64 acct_vm_mem1;
1205 /* stime + utime since last update: */
1206 u64 acct_timexpd;
1207 #endif
1208 #ifdef CONFIG_CPUSETS
1209 /* Protected by ->alloc_lock: */
1210 nodemask_t mems_allowed;
1211 /* Sequence number to catch updates: */
1212 seqcount_spinlock_t mems_allowed_seq;
1213 int cpuset_mem_spread_rotor;
1214 int cpuset_slab_spread_rotor;
1215 #endif
1216 #ifdef CONFIG_CGROUPS
1217 /* Control Group info protected by css_set_lock: */
1218 struct css_set __rcu *cgroups;
1219 /* cg_list protected by css_set_lock and tsk->alloc_lock: */
1220 struct list_head cg_list;
1221 #endif
1222 #ifdef CONFIG_X86_CPU_RESCTRL
1223 u32 closid;
1224 u32 rmid;
1225 #endif
1226 #ifdef CONFIG_FUTEX
1227 struct robust_list_head __user *robust_list;
1228 #ifdef CONFIG_COMPAT
1229 struct compat_robust_list_head __user *compat_robust_list;
1230 #endif
1231 struct list_head pi_state_list;
1232 struct futex_pi_state *pi_state_cache;
1233 struct mutex futex_exit_mutex;
1234 unsigned int futex_state;
1235 #endif
1236 #ifdef CONFIG_PERF_EVENTS
1237 struct perf_event_context *perf_event_ctxp;
1238 struct mutex perf_event_mutex;
1239 struct list_head perf_event_list;
1240 #endif
1241 #ifdef CONFIG_DEBUG_PREEMPT
1242 unsigned long preempt_disable_ip;
1243 #endif
1244 #ifdef CONFIG_NUMA
1245 /* Protected by alloc_lock: */
1246 struct mempolicy *mempolicy;
1247 short il_prev;
1248 short pref_node_fork;
1249 #endif
1250 #ifdef CONFIG_NUMA_BALANCING
1251 int numa_scan_seq;
1252 unsigned int numa_scan_period;
1253 unsigned int numa_scan_period_max;
1254 int numa_preferred_nid;
1255 unsigned long numa_migrate_retry;
1256 /* Migration stamp: */
1257 u64 node_stamp;
1258 u64 last_task_numa_placement;
1259 u64 last_sum_exec_runtime;
1260 struct callback_head numa_work;
1261
1262 /*
1263 * This pointer is only modified for current in syscall and
1264 * pagefault context (and for tasks being destroyed), so it can be read
1265 * from any of the following contexts:
1266 * - RCU read-side critical section
1267 * - current->numa_group from everywhere
1268 * - task's runqueue locked, task not running
1269 */
1270 struct numa_group __rcu *numa_group;
1271
1272 /*
1273 * numa_faults is an array split into four regions:
1274 * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
1275 * in this precise order.
1276 *
1277 * faults_memory: Exponential decaying average of faults on a per-node
1278 * basis. Scheduling placement decisions are made based on these
1279 * counts. The values remain static for the duration of a PTE scan.
1280 * faults_cpu: Track the nodes the process was running on when a NUMA
1281 * hinting fault was incurred.
1282 * faults_memory_buffer and faults_cpu_buffer: Record faults per node
1283 * during the current scan window. When the scan completes, the counts
1284 * in faults_memory and faults_cpu decay and these values are copied.
1285 */
1286 unsigned long *numa_faults;
1287 unsigned long total_numa_faults;
1288
1289 /*
1290 * numa_faults_locality tracks if faults recorded during the last
1291 * scan window were remote/local or failed to migrate. The task scan
1292 * period is adapted based on the locality of the faults with different
1293 * weights depending on whether they were shared or private faults
1294 */
1295 unsigned long numa_faults_locality[3];
1296
1297 unsigned long numa_pages_migrated;
1298 #endif /* CONFIG_NUMA_BALANCING */
1299
1300 #ifdef CONFIG_RSEQ
1301 struct rseq __user *rseq;
1302 u32 rseq_len;
1303 u32 rseq_sig;
1304 /*
1305 * RmW on rseq_event_mask must be performed atomically
1306 * with respect to preemption.
1307 */
1308 unsigned long rseq_event_mask;
1309 #endif
1310
1311 #ifdef CONFIG_SCHED_MM_CID
1312 int mm_cid; /* Current cid in mm */
1313 int last_mm_cid; /* Most recent cid in mm */
1314 int migrate_from_cpu;
1315 int mm_cid_active; /* Whether cid bitmap is active */
1316 struct callback_head cid_work;
1317 #endif
1318
1319 struct tlbflush_unmap_batch tlb_ubc;
1320
1321 /* Cache last used pipe for splice(): */
1322 struct pipe_inode_info *splice_pipe;
1323
1324 struct page_frag task_frag;
1325
1326 #ifdef CONFIG_TASK_DELAY_ACCT
1327 struct task_delay_info *delays;
1328 #endif
1329
1330 #ifdef CONFIG_FAULT_INJECTION
1331 int make_it_fail;
1332 unsigned int fail_nth;
1333 #endif
1334 /*
1335 * When (nr_dirtied >= nr_dirtied_pause), it's time to call
1336 * balance_dirty_pages() for a dirty throttling pause:
1337 */
1338 int nr_dirtied;
1339 int nr_dirtied_pause;
1340 /* Start of a write-and-pause period: */
1341 unsigned long dirty_paused_when;
1342
1343 #ifdef CONFIG_LATENCYTOP
1344 int latency_record_count;
1345 struct latency_record latency_record[LT_SAVECOUNT];
1346 #endif
1347 /*
1348 * Time slack values; these are used to round up poll() and
1349 * select() etc timeout values. These are in nanoseconds.
1350 */
1351 u64 timer_slack_ns;
1352 u64 default_timer_slack_ns;
1353
1354 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
1355 unsigned int kasan_depth;
1356 #endif
1357
1358 #ifdef CONFIG_KCSAN
1359 struct kcsan_ctx kcsan_ctx;
1360 #ifdef CONFIG_TRACE_IRQFLAGS
1361 struct irqtrace_events kcsan_save_irqtrace;
1362 #endif
1363 #ifdef CONFIG_KCSAN_WEAK_MEMORY
1364 int kcsan_stack_depth;
1365 #endif
1366 #endif
1367
1368 #ifdef CONFIG_KMSAN
1369 struct kmsan_ctx kmsan_ctx;
1370 #endif
1371
1372 #if IS_ENABLED(CONFIG_KUNIT)
1373 struct kunit *kunit_test;
1374 #endif
1375
1376 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
1377 /* Index of current stored address in ret_stack: */
1378 int curr_ret_stack;
1379 int curr_ret_depth;
1380
1381 /* Stack of return addresses for return function tracing: */
1382 struct ftrace_ret_stack *ret_stack;
1383
1384 /* Timestamp for last schedule: */
1385 unsigned long long ftrace_timestamp;
1386
1387 /*
1388 * Number of functions that haven't been traced
1389 * because of depth overrun:
1390 */
1391 atomic_t trace_overrun;
1392
1393 /* Pause tracing: */
1394 atomic_t tracing_graph_pause;
1395 #endif
1396
1397 #ifdef CONFIG_TRACING
1398 /* Bitmask and counter of trace recursion: */
1399 unsigned long trace_recursion;
1400 #endif /* CONFIG_TRACING */
1401
1402 #ifdef CONFIG_KCOV
1403 /* See kernel/kcov.c for more details. */
1404
1405 /* Coverage collection mode enabled for this task (0 if disabled): */
1406 unsigned int kcov_mode;
1407
1408 /* Size of the kcov_area: */
1409 unsigned int kcov_size;
1410
1411 /* Buffer for coverage collection: */
1412 void *kcov_area;
1413
1414 /* KCOV descriptor wired with this task or NULL: */
1415 struct kcov *kcov;
1416
1417 /* KCOV common handle for remote coverage collection: */
1418 u64 kcov_handle;
1419
1420 /* KCOV sequence number: */
1421 int kcov_sequence;
1422
1423 /* Collect coverage from softirq context: */
1424 unsigned int kcov_softirq;
1425 #endif
1426
1427 #ifdef CONFIG_MEMCG
1428 struct mem_cgroup *memcg_in_oom;
1429 gfp_t memcg_oom_gfp_mask;
1430 int memcg_oom_order;
1431
1432 /* Number of pages to reclaim on returning to userland: */
1433 unsigned int memcg_nr_pages_over_high;
1434
1435 /* Used by memcontrol for targeted memcg charge: */
1436 struct mem_cgroup *active_memcg;
1437 #endif
1438
1439 #ifdef CONFIG_BLK_CGROUP
1440 struct gendisk *throttle_disk;
1441 #endif
1442
1443 #ifdef CONFIG_UPROBES
1444 struct uprobe_task *utask;
1445 #endif
1446 #if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
1447 unsigned int sequential_io;
1448 unsigned int sequential_io_avg;
1449 #endif
1450 struct kmap_ctrl kmap_ctrl;
1451 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
1452 unsigned long task_state_change;
1453 # ifdef CONFIG_PREEMPT_RT
1454 unsigned long saved_state_change;
1455 # endif
1456 #endif
1457 struct rcu_head rcu;
1458 refcount_t rcu_users;
1459 int pagefault_disabled;
1460 #ifdef CONFIG_MMU
1461 struct task_struct *oom_reaper_list;
1462 struct timer_list oom_reaper_timer;
1463 #endif
1464 #ifdef CONFIG_VMAP_STACK
1465 struct vm_struct *stack_vm_area;
1466 #endif
1467 #ifdef CONFIG_THREAD_INFO_IN_TASK
1468 /* A live task holds one reference: */
1469 refcount_t stack_refcount;
1470 #endif
1471 #ifdef CONFIG_LIVEPATCH
1472 int patch_state;
1473 #endif
1474 #ifdef CONFIG_SECURITY
1475 /* Used by LSM modules for access restriction: */
1476 void *security;
1477 #endif
1478 #ifdef CONFIG_BPF_SYSCALL
1479 /* Used by BPF task local storage */
1480 struct bpf_local_storage __rcu *bpf_storage;
1481 /* Used for BPF run context */
1482 struct bpf_run_ctx *bpf_ctx;
1483 #endif
1484
1485 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK
1486 unsigned long lowest_stack;
1487 unsigned long prev_lowest_stack;
1488 #endif
1489
1490 #ifdef CONFIG_X86_MCE
1491 void __user *mce_vaddr;
1492 __u64 mce_kflags;
1493 u64 mce_addr;
1494 __u64 mce_ripv : 1,
1495 mce_whole_page : 1,
1496 __mce_reserved : 62;
1497 struct callback_head mce_kill_me;
1498 int mce_count;
1499 #endif
1500 ANDROID_VENDOR_DATA_ARRAY(1, 64);
1501 ANDROID_OEM_DATA_ARRAY(1, 6);
1502
1503 #ifdef CONFIG_KRETPROBES
1504 struct llist_head kretprobe_instances;
1505 #endif
1506 #ifdef CONFIG_RETHOOK
1507 struct llist_head rethooks;
1508 #endif
1509
1510 #ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH
1511 /*
1512 * If L1D flush is supported on mm context switch
1513 * then we use this callback head to queue kill work
1514 * to kill tasks that are not running on SMT disabled
1515 * cores
1516 */
1517 struct callback_head l1d_flush_kill;
1518 #endif
1519 ANDROID_KABI_RESERVE(1);
1520 ANDROID_KABI_RESERVE(2);
1521 ANDROID_KABI_RESERVE(3);
1522 ANDROID_KABI_RESERVE(4);
1523 ANDROID_KABI_RESERVE(5);
1524 ANDROID_KABI_RESERVE(6);
1525 ANDROID_KABI_RESERVE(7);
1526 ANDROID_KABI_RESERVE(8);
1527
1528 #ifdef CONFIG_RV
1529 /*
1530 * Per-task RV monitor. Nowadays fixed in RV_PER_TASK_MONITORS.
1531 * If we find justification for more monitors, we can think
1532 * about adding more or developing a dynamic method. So far,
1533 * none of these are justified.
1534 */
1535 union rv_task_monitor rv[RV_PER_TASK_MONITORS];
1536 #endif
1537
1538 #ifdef CONFIG_USER_EVENTS
1539 struct user_event_mm *user_event_mm;
1540 #endif
1541
1542 /*
1543 * New fields for task_struct should be added above here, so that
1544 * they are included in the randomized portion of task_struct.
1545 */
1546 randomized_struct_fields_end
1547
1548 /* CPU-specific state of this task: */
1549 struct thread_struct thread;
1550
1551 /*
1552 * WARNING: on x86, 'thread_struct' contains a variable-sized
1553 * structure. It *MUST* be at the end of 'task_struct'.
1554 *
1555 * Do not put anything below here!
1556 */
1557 };
1558
1559 static inline struct pid *task_pid(struct task_struct *task)
1560 {
1561 return task->thread_pid;
1562 }
1563
1564 /*
1565 * the helpers to get the task's different pids as they are seen
1566 * from various namespaces
1567 *
1568 * task_xid_nr() : global id, i.e. the id seen from the init namespace;
1569 * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of
1570 * current.
1571 * task_xid_nr_ns() : id seen from the ns specified;
1572 *
1573 * see also pid_nr() etc in include/linux/pid.h
1574 */
1575 pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);
1576
1577 static inline pid_t task_pid_nr(struct task_struct *tsk)
1578 {
1579 return tsk->pid;
1580 }
1581
1582 static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
1583 {
1584 return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
1585 }
1586
1587 static inline pid_t task_pid_vnr(struct task_struct *tsk)
1588 {
1589 return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
1590 }
1591
1592
1593 static inline pid_t task_tgid_nr(struct task_struct *tsk)
1594 {
1595 return tsk->tgid;
1596 }
1597
1598 /**
1599 * pid_alive - check that a task structure is not stale
1600 * @p: Task structure to be checked.
1601 *
1602 * Test if a process is not yet dead (at most zombie state)
1603 * If pid_alive fails, then pointers within the task structure
1604 * can be stale and must not be dereferenced.
1605 *
1606 * Return: 1 if the process is alive. 0 otherwise.
1607 */
1608 static inline int pid_alive(const struct task_struct *p)
1609 {
1610 return p->thread_pid != NULL;
1611 }
1612
1613 static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
1614 {
1615 return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
1616 }
1617
1618 static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
1619 {
1620 return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
1621 }
1622
1623
1624 static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
1625 {
1626 return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
1627 }
1628
1629 static inline pid_t task_session_vnr(struct task_struct *tsk)
1630 {
1631 return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
1632 }
1633
1634 static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
1635 {
1636 return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns);
1637 }
1638
1639 static inline pid_t task_tgid_vnr(struct task_struct *tsk)
1640 {
1641 return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL);
1642 }
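/*
 * Example of the naming scheme documented above, assuming a task p whose
 * global PID is 4711 but which is PID 7 inside its own PID namespace
 * ns_of_p (a hypothetical namespace used only for illustration):
 *
 *	task_pid_nr(p)			-> 4711
 *	task_pid_nr_ns(p, ns_of_p)	-> 7
 *	task_pid_vnr(p)			-> the id as seen from current's
 *					   namespace (7 if current also lives
 *					   in ns_of_p)
 */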
1643
1644 static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
1645 {
1646 pid_t pid = 0;
1647
1648 rcu_read_lock();
1649 if (pid_alive(tsk))
1650 pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
1651 rcu_read_unlock();
1652
1653 return pid;
1654 }
1655
1656 static inline pid_t task_ppid_nr(const struct task_struct *tsk)
1657 {
1658 return task_ppid_nr_ns(tsk, &init_pid_ns);
1659 }
1660
1661 /* Obsolete, do not use: */
1662 static inline pid_t task_pgrp_nr(struct task_struct *tsk)
1663 {
1664 return task_pgrp_nr_ns(tsk, &init_pid_ns);
1665 }
1666
1667 #define TASK_REPORT_IDLE (TASK_REPORT + 1)
1668 #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1)
1669
1670 static inline unsigned int __task_state_index(unsigned int tsk_state,
1671 unsigned int tsk_exit_state)
1672 {
1673 unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT;
1674
1675 BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX);
1676
1677 if ((tsk_state & TASK_IDLE) == TASK_IDLE)
1678 state = TASK_REPORT_IDLE;
1679
1680 /*
1681 * We're lying here, but rather than expose a completely new task state
1682 * to userspace, we can make this appear as if the task has gone through
1683 * a regular rt_mutex_lock() call.
1684 */
1685 if (tsk_state & TASK_RTLOCK_WAIT)
1686 state = TASK_UNINTERRUPTIBLE;
1687
1688 return fls(state);
1689 }
1690
1691 static inline unsigned int task_state_index(struct task_struct *tsk)
1692 {
1693 return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state);
1694 }
1695
1696 static inline char task_index_to_char(unsigned int state)
1697 {
1698 static const char state_char[] = "RSDTtXZPI";
1699
1700 BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1);
1701
1702 return state_char[state];
1703 }
1704
1705 static inline char task_state_to_char(struct task_struct *tsk)
1706 {
1707 return task_index_to_char(task_state_index(tsk));
1708 }
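/*
 * Worked example of the index/char mapping: TASK_RUNNING (0x0) yields index 0
 * -> 'R', TASK_INTERRUPTIBLE (0x1) yields fls(0x1) == 1 -> 'S',
 * TASK_UNINTERRUPTIBLE (0x2) yields 2 -> 'D', and a TASK_IDLE sleeper is
 * promoted to TASK_REPORT_IDLE (0x80), i.e. index 8 -> 'I'.
 */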
1709
1710 /**
1711 * is_global_init - check if a task structure is init. Since init
1712 * is free to have sub-threads we need to check tgid.
1713 * @tsk: Task structure to be checked.
1714 *
1715 * Check if a task structure is the first user space task the kernel created.
1716 *
1717 * Return: 1 if the task structure is init. 0 otherwise.
1718 */
1719 static inline int is_global_init(struct task_struct *tsk)
1720 {
1721 return task_tgid_nr(tsk) == 1;
1722 }
1723
1724 extern struct pid *cad_pid;
1725
1726 /*
1727 * Per process flags
1728 */
1729 #define PF_VCPU 0x00000001 /* I'm a virtual CPU */
1730 #define PF_IDLE 0x00000002 /* I am an IDLE thread */
1731 #define PF_EXITING 0x00000004 /* Getting shut down */
1732 #define PF_POSTCOREDUMP 0x00000008 /* Coredumps should ignore this task */
1733 #define PF_IO_WORKER 0x00000010 /* Task is an IO worker */
1734 #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
1735 #define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */
1736 #define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */
1737 #define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */
1738 #define PF_DUMPCORE 0x00000200 /* Dumped core */
1739 #define PF_SIGNALED 0x00000400 /* Killed by a signal */
1740 #define PF_MEMALLOC 0x00000800 /* Allocating memory */
1741 #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */
1742 #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */
1743 #define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */
1744 #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */
1745 #define PF__HOLE__00010000 0x00010000
1746 #define PF_KSWAPD 0x00020000 /* I am kswapd */
1747 #define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */
1748 #define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */
1749 #define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to,
1750 * I am cleaning dirty pages from some other bdi. */
1751 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */
1752 #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */
1753 #define PF__HOLE__00800000 0x00800000
1754 #define PF__HOLE__01000000 0x01000000
1755 #define PF__HOLE__02000000 0x02000000
1756 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
1757 #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
1758 #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */
1759 #define PF__HOLE__20000000 0x20000000
1760 #define PF__HOLE__40000000 0x40000000
1761 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
1762
1763 /*
1764 * Only the _current_ task can read/write to tsk->flags, but other
1765 * tasks can access tsk->flags in readonly mode for example
1766 * with tsk_used_math (like during threaded core dumping).
1767 * There is however an exception to this rule during ptrace
1768 * or during fork: the ptracer task is allowed to write to the
1769 * child->flags of its traced child (same goes for fork, the parent
1770 * can write to the child->flags), because we're guaranteed the
1771 * child is not running and in turn not changing child->flags
1772 * at the same time the parent does it.
1773 */
1774 #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0)
1775 #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0)
1776 #define clear_used_math() clear_stopped_child_used_math(current)
1777 #define set_used_math() set_stopped_child_used_math(current)
1778
1779 #define conditional_stopped_child_used_math(condition, child) \
1780 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)
1781
1782 #define conditional_used_math(condition) conditional_stopped_child_used_math(condition, current)
1783
1784 #define copy_to_stopped_child_used_math(child) \
1785 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)
1786
1787 /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
1788 #define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
1789 #define used_math() tsk_used_math(current)
1790
1791 static __always_inline bool is_percpu_thread(void)
1792 {
1793 #ifdef CONFIG_SMP
1794 return (current->flags & PF_NO_SETAFFINITY) &&
1795 (current->nr_cpus_allowed == 1);
1796 #else
1797 return true;
1798 #endif
1799 }
1800
1801 /* Per-process atomic flags. */
1802 #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */
1803 #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */
1804 #define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */
1805 #define PFA_SPEC_SSB_DISABLE 3 /* Speculative Store Bypass disabled */
1806 #define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/
1807 #define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */
1808 #define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */
1809 #define PFA_SPEC_SSB_NOEXEC 7 /* Speculative Store Bypass clear on execve() */
1810
1811 #define TASK_PFA_TEST(name, func) \
1812 static inline bool task_##func(struct task_struct *p) \
1813 { return test_bit(PFA_##name, &p->atomic_flags); }
1814
1815 #define TASK_PFA_SET(name, func) \
1816 static inline void task_set_##func(struct task_struct *p) \
1817 { set_bit(PFA_##name, &p->atomic_flags); }
1818
1819 #define TASK_PFA_CLEAR(name, func) \
1820 static inline void task_clear_##func(struct task_struct *p) \
1821 { clear_bit(PFA_##name, &p->atomic_flags); }
1822
1823 TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs)
1824 TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs)
1825
1826 TASK_PFA_TEST(SPREAD_PAGE, spread_page)
1827 TASK_PFA_SET(SPREAD_PAGE, spread_page)
1828 TASK_PFA_CLEAR(SPREAD_PAGE, spread_page)
1829
1830 TASK_PFA_TEST(SPREAD_SLAB, spread_slab)
1831 TASK_PFA_SET(SPREAD_SLAB, spread_slab)
1832 TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)
1833
1834 TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable)
1835 TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable)
1836 TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable)
1837
1838 TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec)
1839 TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec)
1840 TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec)
1841
1842 TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
1843 TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
1844
1845 TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable)
1846 TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable)
1847 TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable)
1848
1849 TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
1850 TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
1851
1852 static inline void
1853 current_restore_flags(unsigned long orig_flags, unsigned long flags)
1854 {
1855 current->flags &= ~flags;
1856 current->flags |= orig_flags & flags;
1857 }
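
/*
 * Illustrative sketch (not part of this header's API): the usual pattern
 * for temporarily setting a PF_* flag on the current task and restoring
 * its previous value, shown here with PF_MEMALLOC_NOFS. Real code should
 * normally use the memalloc_nofs_save()/memalloc_nofs_restore() helpers
 * from <linux/sched/mm.h>, which wrap exactly this sequence.
 *
 *	unsigned int pflags = current->flags & PF_MEMALLOC_NOFS;
 *
 *	current->flags |= PF_MEMALLOC_NOFS;
 *	// ... allocations in this region implicitly behave as GFP_NOFS ...
 *	current_restore_flags(pflags, PF_MEMALLOC_NOFS);
 */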
1858
1859 extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
1860 extern int task_can_attach(struct task_struct *p);
1861 extern int dl_bw_alloc(int cpu, u64 dl_bw);
1862 extern void dl_bw_free(int cpu, u64 dl_bw);
1863 #ifdef CONFIG_SMP
1864
1865 /* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */
1866 extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
1867
1868 /**
1869 * set_cpus_allowed_ptr - set CPU affinity mask of a task
1870 * @p: the task
1871 * @new_mask: CPU affinity mask
1872 *
1873 * Return: zero if successful, or a negative error code
1874 */
1875 extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
1876 extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node);
1877 extern void release_user_cpus_ptr(struct task_struct *p);
1878 extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask);
1879 extern void force_compatible_cpus_allowed_ptr(struct task_struct *p);
1880 extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p);
1881 #else
1882 static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1883 {
1884 }
1885 static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
1886 {
1887 if (!cpumask_test_cpu(0, new_mask))
1888 return -EINVAL;
1889 return 0;
1890 }
1891 static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node)
1892 {
1893 if (src->user_cpus_ptr)
1894 return -EINVAL;
1895 return 0;
1896 }
1897 static inline void release_user_cpus_ptr(struct task_struct *p)
1898 {
1899 WARN_ON(p->user_cpus_ptr);
1900 }
1901
1902 static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
1903 {
1904 return 0;
1905 }
1906 #endif
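
/*
 * Illustrative sketch (not part of this header's API): pinning a task to
 * a single CPU with set_cpus_allowed_ptr(). The helper name is
 * hypothetical; the return value is zero on success or a negative error
 * code (e.g. -EINVAL if the new mask contains no usable CPU).
 *
 *	static int example_pin_to_cpu(struct task_struct *p, int cpu)
 *	{
 *		return set_cpus_allowed_ptr(p, cpumask_of(cpu));
 *	}
 */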
1907
1908 extern int yield_to(struct task_struct *p, bool preempt);
1909 extern void set_user_nice(struct task_struct *p, long nice);
1910 extern int task_prio(const struct task_struct *p);
1911
1912 /**
1913 * task_nice - return the nice value of a given task.
1914 * @p: the task in question.
1915 *
1916 * Return: The nice value [ -20 ... 0 ... 19 ].
1917 */
1918 static inline int task_nice(const struct task_struct *p)
1919 {
1920 return PRIO_TO_NICE((p)->static_prio);
1921 }
1922
1923 extern int can_nice(const struct task_struct *p, const int nice);
1924 extern int task_curr(const struct task_struct *p);
1925 extern int idle_cpu(int cpu);
1926 extern int available_idle_cpu(int cpu);
1927 extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
1928 extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
1929 extern void sched_set_fifo(struct task_struct *p);
1930 extern void sched_set_fifo_low(struct task_struct *p);
1931 extern void sched_set_normal(struct task_struct *p, int nice);
1932 extern int sched_setattr(struct task_struct *, const struct sched_attr *);
1933 extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
1934 extern struct task_struct *idle_task(int cpu);
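
/*
 * Illustrative sketch (not part of this header's API): a kernel thread
 * giving itself a fixed real-time priority via the simplified helpers
 * above instead of raw sched_setscheduler(). The thread function and the
 * work callback are hypothetical.
 *
 *	static int example_rt_thread_fn(void *data)
 *	{
 *		sched_set_fifo(current);	// SCHED_FIFO, mid-range priority
 *		while (!kthread_should_stop())
 *			example_handle_one_event();
 *		sched_set_normal(current, 0);	// back to SCHED_NORMAL, nice 0
 *		return 0;
 *	}
 */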
1935
1936 /**
1937 * is_idle_task - is the specified task an idle task?
1938 * @p: the task in question.
1939 *
1940 * Return: 1 if @p is an idle task. 0 otherwise.
1941 */
1942 static __always_inline bool is_idle_task(const struct task_struct *p)
1943 {
1944 return !!(p->flags & PF_IDLE);
1945 }
1946
1947 extern struct task_struct *curr_task(int cpu);
1948 extern void ia64_set_curr_task(int cpu, struct task_struct *p);
1949
1950 void yield(void);
1951
1952 union thread_union {
1953 #ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK
1954 struct task_struct task;
1955 #endif
1956 #ifndef CONFIG_THREAD_INFO_IN_TASK
1957 struct thread_info thread_info;
1958 #endif
1959 unsigned long stack[THREAD_SIZE/sizeof(long)];
1960 };
1961
1962 #ifndef CONFIG_THREAD_INFO_IN_TASK
1963 extern struct thread_info init_thread_info;
1964 #endif
1965
1966 extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)];
1967
1968 #ifdef CONFIG_THREAD_INFO_IN_TASK
1969 # define task_thread_info(task) (&(task)->thread_info)
1970 #elif !defined(__HAVE_THREAD_FUNCTIONS)
1971 # define task_thread_info(task) ((struct thread_info *)(task)->stack)
1972 #endif
1973
1974 /*
1975 * find a task by one of its numerical ids
1976 *
1977 * find_task_by_pid_ns():
1978 * finds a task by its pid in the specified namespace
1979 * find_task_by_vpid():
1980 * finds a task by its virtual pid
1981 *
1982 * see also find_vpid() etc in include/linux/pid.h
1983 */
1984
1985 extern struct task_struct *find_task_by_vpid(pid_t nr);
1986 extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns);
1987
1988 /*
1989 * find a task by its virtual pid and get the task struct
1990 */
1991 extern struct task_struct *find_get_task_by_vpid(pid_t nr);
1992
1993 extern int wake_up_state(struct task_struct *tsk, unsigned int state);
1994 extern int wake_up_process(struct task_struct *tsk);
1995 extern void wake_up_new_task(struct task_struct *tsk);
1996
1997 #ifdef CONFIG_SMP
1998 extern void kick_process(struct task_struct *tsk);
1999 #else
2000 static inline void kick_process(struct task_struct *tsk) { }
2001 #endif
2002
2003 extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec);
2004
2005 static inline void set_task_comm(struct task_struct *tsk, const char *from)
2006 {
2007 __set_task_comm(tsk, from, false);
2008 }
2009
2010 extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk);
2011 #define get_task_comm(buf, tsk) ({ \
2012 BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN); \
2013 __get_task_comm(buf, sizeof(buf), tsk); \
2014 })
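
/*
 * Illustrative sketch (not part of this header's API): get_task_comm()
 * requires a buffer of exactly TASK_COMM_LEN bytes, enforced at build
 * time by the BUILD_BUG_ON() above. 'tsk' stands for any task pointer.
 *
 *	char comm[TASK_COMM_LEN];
 *
 *	get_task_comm(comm, tsk);
 *	pr_info("task comm: %s\n", comm);
 */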
2015
2016 #ifdef CONFIG_SMP
2017 static __always_inline void scheduler_ipi(void)
2018 {
2019 /*
2020 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
2021 * TIF_NEED_RESCHED remotely (for the first time) will also send
2022 * this IPI.
2023 */
2024 preempt_fold_need_resched();
2025 }
2026 #else
2027 static inline void scheduler_ipi(void) { }
2028 #endif
2029
2030 extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);
2031
2032 /*
2033 * Set thread flags in other task's structures.
2034 * See asm/thread_info.h for TIF_xxxx flags available:
2035 */
2036 static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
2037 {
2038 set_ti_thread_flag(task_thread_info(tsk), flag);
2039 }
2040
2041 static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
2042 {
2043 clear_ti_thread_flag(task_thread_info(tsk), flag);
2044 }
2045
2046 static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag,
2047 bool value)
2048 {
2049 update_ti_thread_flag(task_thread_info(tsk), flag, value);
2050 }
2051
2052 static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
2053 {
2054 return test_and_set_ti_thread_flag(task_thread_info(tsk), flag);
2055 }
2056
2057 static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
2058 {
2059 return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag);
2060 }
2061
2062 static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
2063 {
2064 return test_ti_thread_flag(task_thread_info(tsk), flag);
2065 }
2066
2067 static inline void set_tsk_need_resched(struct task_struct *tsk)
2068 {
2069 set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
2070 }
2071
2072 static inline void clear_tsk_need_resched(struct task_struct *tsk)
2073 {
2074 clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
2075 }
2076
2077 static inline int test_tsk_need_resched(struct task_struct *tsk)
2078 {
2079 return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
2080 }
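
/*
 * Illustrative sketch (not part of this header's API): requesting a
 * reschedule of a remote task by setting TIF_NEED_RESCHED and kicking the
 * CPU it runs on so the flag is noticed promptly. Core scheduler code
 * does this via resched_curr(); the helper below only demonstrates the
 * building blocks and is hypothetical.
 *
 *	static void example_request_resched(struct task_struct *p)
 *	{
 *		if (!test_tsk_need_resched(p)) {
 *			set_tsk_need_resched(p);
 *			kick_process(p);
 *		}
 *	}
 */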
2081
2082 /*
2083 * cond_resched() and cond_resched_lock(): latency reduction via
2084 * explicit rescheduling in places that are safe. The return
2085 * value indicates whether a reschedule was done in fact.
2086 * cond_resched_lock() will drop the spinlock before scheduling, call schedule, and on return reacquire the lock.
2087 */
2088 #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
2089 extern int __cond_resched(void);
2090
2091 #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
2092
2093 void sched_dynamic_klp_enable(void);
2094 void sched_dynamic_klp_disable(void);
2095
2096 DECLARE_STATIC_CALL(cond_resched, __cond_resched);
2097
2098 static __always_inline int _cond_resched(void)
2099 {
2100 return static_call_mod(cond_resched)();
2101 }
2102
2103 #elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
2104
2105 extern int dynamic_cond_resched(void);
2106
2107 static __always_inline int _cond_resched(void)
2108 {
2109 return dynamic_cond_resched();
2110 }
2111
2112 #else /* !CONFIG_PREEMPTION */
2113
2114 static inline int _cond_resched(void)
2115 {
2116 klp_sched_try_switch();
2117 return __cond_resched();
2118 }
2119
2120 #endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
2121
2122 #else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */
2123
2124 static inline int _cond_resched(void)
2125 {
2126 klp_sched_try_switch();
2127 return 0;
2128 }
2129
2130 #endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */
2131
2132 #define cond_resched() ({ \
2133 __might_resched(__FILE__, __LINE__, 0); \
2134 _cond_resched(); \
2135 })
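
/*
 * Illustrative sketch (not part of this header's API): the canonical use
 * of cond_resched() is inside a long-running loop in process context with
 * no locks held, so that non-preemptible kernels still see reasonable
 * scheduling latency. The item type and handler are hypothetical.
 *
 *	static void example_process_many(struct list_head *head)
 *	{
 *		struct example_item *item;
 *
 *		list_for_each_entry(item, head, node) {
 *			example_handle(item);
 *			cond_resched();		// may sleep; no locks held here
 *		}
 *	}
 */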
2136
2137 extern int __cond_resched_lock(spinlock_t *lock);
2138 extern int __cond_resched_rwlock_read(rwlock_t *lock);
2139 extern int __cond_resched_rwlock_write(rwlock_t *lock);
2140
2141 #define MIGHT_RESCHED_RCU_SHIFT 8
2142 #define MIGHT_RESCHED_PREEMPT_MASK ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1)
2143
2144 #ifndef CONFIG_PREEMPT_RT
2145 /*
2146 * Non-RT kernels have an elevated preempt count due to the held lock,
2147 * but are not allowed to be inside an RCU read-side critical section.
2148 */
2149 # define PREEMPT_LOCK_RESCHED_OFFSETS PREEMPT_LOCK_OFFSET
2150 #else
2151 /*
2152 * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in
2153 * cond_resched*lock() has to take that into account because it checks for
2154 * preempt_count() and rcu_preempt_depth().
2155 */
2156 # define PREEMPT_LOCK_RESCHED_OFFSETS \
2157 (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT))
2158 #endif
2159
2160 #define cond_resched_lock(lock) ({ \
2161 __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \
2162 __cond_resched_lock(lock); \
2163 })
2164
2165 #define cond_resched_rwlock_read(lock) ({ \
2166 __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \
2167 __cond_resched_rwlock_read(lock); \
2168 })
2169
2170 #define cond_resched_rwlock_write(lock) ({ \
2171 __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \
2172 __cond_resched_rwlock_write(lock); \
2173 })
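
/*
 * Illustrative sketch (not part of this header's API): cond_resched_lock()
 * lets a loop that must hold a spinlock yield the CPU without open-coding
 * the unlock/resched/relock sequence. The lock, list and worker are
 * hypothetical.
 *
 *	spin_lock(&example_lock);
 *	while (!list_empty(&example_list)) {
 *		example_process_one(&example_list);
 *		// Drops example_lock, reschedules if needed, then reacquires it.
 *		cond_resched_lock(&example_lock);
 *	}
 *	spin_unlock(&example_lock);
 */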
2174
2175 static inline void cond_resched_rcu(void)
2176 {
2177 #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
2178 rcu_read_unlock();
2179 cond_resched();
2180 rcu_read_lock();
2181 #endif
2182 }
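
/*
 * Illustrative sketch (not part of this header's API): cond_resched_rcu()
 * suits long scans under rcu_read_lock() at points where no RCU-protected
 * pointer is carried across the call, since RCU protection may be dropped
 * briefly. The table, its size and the inspection hook are hypothetical.
 *
 *	rcu_read_lock();
 *	for (i = 0; i < EXAMPLE_TABLE_SIZE; i++) {
 *		struct example_entry *e = rcu_dereference(example_table[i]);
 *
 *		if (e)
 *			example_inspect(e);
 *		// Safe: the next iteration re-dereferences the table slot.
 *		cond_resched_rcu();
 *	}
 *	rcu_read_unlock();
 */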
2183
2184 #ifdef CONFIG_PREEMPT_DYNAMIC
2185
2186 extern bool preempt_model_none(void);
2187 extern bool preempt_model_voluntary(void);
2188 extern bool preempt_model_full(void);
2189
2190 #else
2191
2192 static inline bool preempt_model_none(void)
2193 {
2194 return IS_ENABLED(CONFIG_PREEMPT_NONE);
2195 }
2196 static inline bool preempt_model_voluntary(void)
2197 {
2198 return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY);
2199 }
2200 static inline bool preempt_model_full(void)
2201 {
2202 return IS_ENABLED(CONFIG_PREEMPT);
2203 }
2204
2205 #endif
2206
2207 static inline bool preempt_model_rt(void)
2208 {
2209 return IS_ENABLED(CONFIG_PREEMPT_RT);
2210 }
2211
2212 /*
2213 * Does the preemption model allow non-cooperative preemption?
2214 *
2215 * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with
2216 * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the
2217 * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the
2218 * PREEMPT_NONE model.
2219 */
2220 static inline bool preempt_model_preemptible(void)
2221 {
2222 return preempt_model_full() || preempt_model_rt();
2223 }
2224
2225 /*
2226 * Does a critical section need to be broken due to another
2227 * task waiting? (Technically this does not depend on CONFIG_PREEMPTION,
2228 * but reflects a general need for low latency.)
2229 */
2230 static inline int spin_needbreak(spinlock_t *lock)
2231 {
2232 #ifdef CONFIG_PREEMPTION
2233 return spin_is_contended(lock);
2234 #else
2235 return 0;
2236 #endif
2237 }
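
/*
 * Illustrative sketch (not part of this header's API): the classic
 * "lock break" pattern, dropping a contended spinlock mid-loop when a
 * reschedule is pending or another task is spinning on the lock. The
 * lock and work helpers are hypothetical.
 *
 *	spin_lock(&example_lock);
 *	while (example_more_work()) {
 *		example_do_unit_of_work();
 *		if (need_resched() || spin_needbreak(&example_lock)) {
 *			spin_unlock(&example_lock);
 *			cond_resched();
 *			spin_lock(&example_lock);
 *		}
 *	}
 *	spin_unlock(&example_lock);
 */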
2238
2239 /*
2240 * Check if a rwlock is contended.
2241 * Returns non-zero if there is another task waiting on the rwlock.
2242 * Returns zero if the lock is not contended or the system / underlying
2243 * rwlock implementation does not support contention detection.
2244 * Technically does not depend on CONFIG_PREEMPTION, but a general need
2245 * for low latency.
2246 */
2247 static inline int rwlock_needbreak(rwlock_t *lock)
2248 {
2249 #ifdef CONFIG_PREEMPTION
2250 return rwlock_is_contended(lock);
2251 #else
2252 return 0;
2253 #endif
2254 }
2255
2256 static __always_inline bool need_resched(void)
2257 {
2258 return unlikely(tif_need_resched());
2259 }
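
/*
 * Illustrative sketch (not part of this header's API): bounding a batch
 * of work by backing off as soon as a reschedule is pending, leaving the
 * caller to reschedule and retry. The work helpers are hypothetical.
 *
 *	while (example_has_work()) {
 *		example_do_one();
 *		if (need_resched())
 *			break;		// let the caller reschedule and come back
 *	}
 */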
2260
2261 /*
2262 * Wrappers for p->thread_info->cpu access. No-op on UP.
2263 */
2264 #ifdef CONFIG_SMP
2265
2266 static inline unsigned int task_cpu(const struct task_struct *p)
2267 {
2268 return READ_ONCE(task_thread_info(p)->cpu);
2269 }
2270
2271 extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
2272
2273 #else
2274
2275 static inline unsigned int task_cpu(const struct task_struct *p)
2276 {
2277 return 0;
2278 }
2279
2280 static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
2281 {
2282 }
2283
2284 #endif /* CONFIG_SMP */
2285
2286 extern bool sched_task_on_rq(struct task_struct *p);
2287 extern unsigned long get_wchan(struct task_struct *p);
2288 extern struct task_struct *cpu_curr_snapshot(int cpu);
2289
2290 /*
2291 * In order to reduce various lock holder preemption latencies provide an
2292 * interface to see if a vCPU is currently running or not.
2293 *
2294 * This allows us to terminate optimistic spin loops and block, analogous to
2295 * the native optimistic spin heuristic of testing if the lock owner task is
2296 * running or not.
2297 */
2298 #ifndef vcpu_is_preempted
2299 static inline bool vcpu_is_preempted(int cpu)
2300 {
2301 return false;
2302 }
2303 #endif
2304
2305 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
2306 extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
2307
2308 #ifndef TASK_SIZE_OF
2309 #define TASK_SIZE_OF(tsk) TASK_SIZE
2310 #endif
2311
2312 #ifdef CONFIG_SMP
2313 static inline bool owner_on_cpu(struct task_struct *owner)
2314 {
2315 /*
2316 * Due to the lock holder preemption issue, skip spinning if the owner
2317 * task is not running on a CPU or its CPU has been preempted.
2318 */
2319 return READ_ONCE(owner->on_cpu) && !vcpu_is_preempted(task_cpu(owner));
2320 }
2321
2322 /* Returns effective CPU energy utilization, as seen by the scheduler */
2323 unsigned long sched_cpu_util(int cpu);
2324 #endif /* CONFIG_SMP */
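
/*
 * Illustrative sketch (not part of this header's API): an optimistic spin
 * loop in the style of the mutex/rwsem slow paths, spinning only while
 * the lock owner is actually running on a CPU. The trylock and owner
 * accessor are hypothetical.
 *
 *	rcu_read_lock();
 *	while (!example_trylock(lock)) {
 *		struct task_struct *owner = example_lock_owner(lock);
 *
 *		if (!owner || !owner_on_cpu(owner))
 *			break;		// owner sleeps or is preempted: stop spinning
 *		cpu_relax();
 *	}
 *	rcu_read_unlock();
 */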
2325
2326 #ifdef CONFIG_RSEQ
2327
2328 /*
2329 * Map the event mask on the user-space ABI enum rseq_cs_flags
2330 * for direct mask checks.
2331 */
2332 enum rseq_event_mask_bits {
2333 RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
2334 RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
2335 RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
2336 };
2337
2338 enum rseq_event_mask {
2339 RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT),
2340 RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT),
2341 RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT),
2342 };
2343
2344 static inline void rseq_set_notify_resume(struct task_struct *t)
2345 {
2346 if (t->rseq)
2347 set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
2348 }
2349
2350 void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
2351
2352 static inline void rseq_handle_notify_resume(struct ksignal *ksig,
2353 struct pt_regs *regs)
2354 {
2355 if (current->rseq)
2356 __rseq_handle_notify_resume(ksig, regs);
2357 }
2358
2359 static inline void rseq_signal_deliver(struct ksignal *ksig,
2360 struct pt_regs *regs)
2361 {
2362 preempt_disable();
2363 __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
2364 preempt_enable();
2365 rseq_handle_notify_resume(ksig, regs);
2366 }
2367
2368 /* rseq_preempt() requires preemption to be disabled. */
2369 static inline void rseq_preempt(struct task_struct *t)
2370 {
2371 __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
2372 rseq_set_notify_resume(t);
2373 }
2374
2375 /* rseq_migrate() requires preemption to be disabled. */
2376 static inline void rseq_migrate(struct task_struct *t)
2377 {
2378 __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
2379 rseq_set_notify_resume(t);
2380 }
2381
2382 /*
2383 * If parent process has a registered restartable sequences area, the
2384 * child inherits. Unregister rseq for a clone with CLONE_VM set.
2385 */
2386 static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
2387 {
2388 if (clone_flags & CLONE_VM) {
2389 t->rseq = NULL;
2390 t->rseq_len = 0;
2391 t->rseq_sig = 0;
2392 t->rseq_event_mask = 0;
2393 } else {
2394 t->rseq = current->rseq;
2395 t->rseq_len = current->rseq_len;
2396 t->rseq_sig = current->rseq_sig;
2397 t->rseq_event_mask = current->rseq_event_mask;
2398 }
2399 }
2400
2401 static inline void rseq_execve(struct task_struct *t)
2402 {
2403 t->rseq = NULL;
2404 t->rseq_len = 0;
2405 t->rseq_sig = 0;
2406 t->rseq_event_mask = 0;
2407 }
2408
2409 #else
2410
2411 static inline void rseq_set_notify_resume(struct task_struct *t)
2412 {
2413 }
2414 static inline void rseq_handle_notify_resume(struct ksignal *ksig,
2415 struct pt_regs *regs)
2416 {
2417 }
2418 static inline void rseq_signal_deliver(struct ksignal *ksig,
2419 struct pt_regs *regs)
2420 {
2421 }
2422 static inline void rseq_preempt(struct task_struct *t)
2423 {
2424 }
2425 static inline void rseq_migrate(struct task_struct *t)
2426 {
2427 }
2428 static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
2429 {
2430 }
2431 static inline void rseq_execve(struct task_struct *t)
2432 {
2433 }
2434
2435 #endif
2436
2437 #ifdef CONFIG_DEBUG_RSEQ
2438
2439 void rseq_syscall(struct pt_regs *regs);
2440
2441 #else
2442
2443 static inline void rseq_syscall(struct pt_regs *regs)
2444 {
2445 }
2446
2447 #endif
2448
2449 #ifdef CONFIG_SCHED_CORE
2450 extern void sched_core_free(struct task_struct *tsk);
2451 extern void sched_core_fork(struct task_struct *p);
2452 extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
2453 unsigned long uaddr);
2454 extern int sched_core_idle_cpu(int cpu);
2455 #else
2456 static inline void sched_core_free(struct task_struct *tsk) { }
2457 static inline void sched_core_fork(struct task_struct *p) { }
2458 static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); }
2459 #endif
2460
2461 extern void sched_set_stop_task(int cpu, struct task_struct *stop);
2462
2463 #endif
2464