// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple CPU accounting cgroup controller
 */
#include <linux/cpufreq_times.h>
#include <trace/hooks/sched.h>
#undef TRACE_INCLUDE_PATH

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
#include <asm/cputime.h>
#endif

#ifdef CONFIG_IRQ_TIME_ACCOUNTING

/*
 * There are no locks covering percpu hardirq/softirq time.
 * They are only modified in vtime_account, on the corresponding CPU
 * with interrupts disabled. So, writes are safe.
 * They are read and saved off onto struct rq in update_rq_clock().
 * This may result in another CPU reading this CPU's IRQ time and racing
 * with irq/vtime_account on this CPU. We would either get the old or the
 * new value, with the side effect of accounting a slice of IRQ time to
 * the wrong task when an IRQ is in progress while we read rq->clock.
 * That is a reasonable compromise compared to taking locks on each IRQ
 * in account_system_time.
 */
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
EXPORT_PER_CPU_SYMBOL_GPL(cpu_irqtime);
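
/*
 * Reader-side sketch (illustrative, not part of this file): the percpu
 * totals are consumed under the same u64_stats seqcount that
 * irqtime_account_delta() bumps, retrying if a writer was in flight.
 * The in-tree reader lives in kernel/sched/sched.h; the pattern looks
 * roughly like this:
 *
 *	static inline u64 irq_time_read(int cpu)
 *	{
 *		struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
 *		unsigned int seq;
 *		u64 total;
 *
 *		do {
 *			seq = __u64_stats_fetch_begin(&irqtime->sync);
 *			total = irqtime->total;
 *		} while (__u64_stats_fetch_retry(&irqtime->sync, seq));
 *
 *		return total;
 *	}
 */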

static int sched_clock_irqtime;

void enable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 1;
}

void disable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 0;
}

static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
				  enum cpu_usage_stat idx)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	u64_stats_update_begin(&irqtime->sync);
	cpustat[idx] += delta;
	irqtime->total += delta;
	irqtime->tick_delta += delta;
	u64_stats_update_end(&irqtime->sync);
}

/*
 * Called after incrementing preempt_count on {soft,}irq_enter
 * and before decrementing preempt_count on {soft,}irq_exit.
 */
void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
{
	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
	unsigned int pc;
	s64 delta;
	int cpu;
	bool irq_start = true;

	if (!sched_clock_irqtime)
		return;

	cpu = smp_processor_id();
	delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
	irqtime->irq_start_time += delta;
	pc = irq_count() - offset;

	/*
	 * We do not account for softirq time from ksoftirqd here.
	 * We want to continue accounting softirq time to ksoftirqd thread
	 * in that case, so as not to confuse the scheduler with a special
	 * task that does not consume any time but still wants to run.
	 */
	if (pc & HARDIRQ_MASK) {
		irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
		irq_start = false;
	} else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd()) {
		irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
		irq_start = false;
	}

	trace_android_rvh_account_irq(curr, cpu, delta, irq_start);
}
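
/*
 * Worked example (illustrative; the exact callers live in
 * include/linux/hardirq.h and may differ between kernel versions):
 * with the usual preempt_count layout (PREEMPT in bits 0-7, SOFTIRQ
 * count in bits 8-15, HARDIRQ count in bits 16-19), an entry path that
 * has just added HARDIRQ_OFFSET passes that same value as @offset, so
 *
 *	pc = irq_count() - offset
 *
 * describes the context being left: the elapsed delta stays with the
 * interrupted task, or goes to CPUTIME_SOFTIRQ if a softirq was
 * interrupted. The matching exit path passes @offset == 0 before
 * dropping the count, so pc still carries the HARDIRQ bits and the
 * hardirq's own runtime lands in CPUTIME_IRQ.
 */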

static u64 irqtime_tick_accounted(u64 maxtime)
{
	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
	u64 delta;

	delta = min(irqtime->tick_delta, maxtime);
	irqtime->tick_delta -= delta;

	return delta;
}

#else /* CONFIG_IRQ_TIME_ACCOUNTING */

#define sched_clock_irqtime	(0)

static u64 irqtime_tick_accounted(u64 dummy)
{
	return 0;
}

#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */

static inline void task_group_account_field(struct task_struct *p, int index,
					    u64 tmp)
{
	/*
	 * Since all updates are sure to touch the root cgroup, we
	 * get ourselves ahead and touch it first. If the root cgroup
	 * is the only cgroup, then nothing else should be necessary.
	 */
	__this_cpu_add(kernel_cpustat.cpustat[index], tmp);

	cgroup_account_cputime_field(p, index, tmp);
}

/*
 * Account user CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in user space since the last update
 */
void account_user_time(struct task_struct *p, u64 cputime)
{
	int index;

	/* Add user time to process. */
	p->utime += cputime;
	account_group_user_time(p, cputime);

	index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;

	/* Add user time to cpustat. */
	task_group_account_field(p, index, cputime);

	/* Account for user time used */
	acct_account_cputime(p);

	/* Account power usage for user time */
	cpufreq_acct_update_power(p, cputime);
}

/*
 * Account guest CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in virtual machine since the last update
 */
void account_guest_time(struct task_struct *p, u64 cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	/* Add guest time to process. */
	p->utime += cputime;
	account_group_user_time(p, cputime);
	p->gtime += cputime;

	/* Add guest time to cpustat. */
	if (task_nice(p) > 0) {
		task_group_account_field(p, CPUTIME_NICE, cputime);
		cpustat[CPUTIME_GUEST_NICE] += cputime;
	} else {
		task_group_account_field(p, CPUTIME_USER, cputime);
		cpustat[CPUTIME_GUEST] += cputime;
	}
}

/*
 * Account system CPU time to a process and desired cpustat field
 * @p: the process that the CPU time gets accounted to
 * @cputime: the CPU time spent in kernel space since the last update
 * @index: pointer to cpustat field that has to be updated
 */
void account_system_index_time(struct task_struct *p,
			       u64 cputime, enum cpu_usage_stat index)
{
	/* Add system time to process. */
	p->stime += cputime;
	account_group_system_time(p, cputime);

	/* Add system time to cpustat. */
	task_group_account_field(p, index, cputime);

	/* Account for system time used */
	acct_account_cputime(p);

	/* Account power usage for system time */
	cpufreq_acct_update_power(p, cputime);
}

/*
 * Account system CPU time to a process.
 * @p: the process that the CPU time gets accounted to
 * @hardirq_offset: the offset to subtract from hardirq_count()
 * @cputime: the CPU time spent in kernel space since the last update
 */
void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
{
	int index;

	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
		account_guest_time(p, cputime);
		return;
	}

	if (hardirq_count() - hardirq_offset)
		index = CPUTIME_IRQ;
	else if (in_serving_softirq())
		index = CPUTIME_SOFTIRQ;
	else
		index = CPUTIME_SYSTEM;

	account_system_index_time(p, cputime, index);
}

/*
 * Account for involuntary wait time.
 * @cputime: the CPU time spent in involuntary wait
 */
void account_steal_time(u64 cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;

	cpustat[CPUTIME_STEAL] += cputime;
}

/*
 * Account for idle time.
 * @cputime: the CPU time spent in idle wait
 */
void account_idle_time(u64 cputime)
{
	u64 *cpustat = kcpustat_this_cpu->cpustat;
	struct rq *rq = this_rq();

	if (atomic_read(&rq->nr_iowait) > 0)
		cpustat[CPUTIME_IOWAIT] += cputime;
	else
		cpustat[CPUTIME_IDLE] += cputime;
}


#ifdef CONFIG_SCHED_CORE
/*
 * Account for forceidle time due to core scheduling.
 *
 * REQUIRES: schedstat is enabled.
 */
void __account_forceidle_time(struct task_struct *p, u64 delta)
{
	__schedstat_add(p->stats.core_forceidle_sum, delta);

	task_group_account_field(p, CPUTIME_FORCEIDLE, delta);
}
#endif

/*
 * When a guest is interrupted for a longer amount of time, missed clock
 * ticks are not redelivered later. Due to that, this function may on
 * occasion account more time than the calling functions think elapsed.
 */
static __always_inline u64 steal_account_process_time(u64 maxtime)
{
#ifdef CONFIG_PARAVIRT
	if (static_key_false(&paravirt_steal_enabled)) {
		u64 steal;

		steal = paravirt_steal_clock(smp_processor_id());
		steal -= this_rq()->prev_steal_time;
		steal = min(steal, maxtime);
		account_steal_time(steal);
		this_rq()->prev_steal_time += steal;

		return steal;
	}
#endif
	return 0;
}

/*
 * Account how much elapsed time was spent in steal, IRQ, or softirq time.
 */
static inline u64 account_other_time(u64 max)
{
	u64 accounted;

	lockdep_assert_irqs_disabled();

	accounted = steal_account_process_time(max);

	if (accounted < max)
		accounted += irqtime_tick_accounted(max - accounted);

	return accounted;
}

#ifdef CONFIG_64BIT
static inline u64 read_sum_exec_runtime(struct task_struct *t)
{
	return t->se.sum_exec_runtime;
}
#else
static u64 read_sum_exec_runtime(struct task_struct *t)
{
	u64 ns;
	struct rq_flags rf;
	struct rq *rq;

	rq = task_rq_lock(t, &rf);
	ns = t->se.sum_exec_runtime;
	task_rq_unlock(rq, t, &rf);

	return ns;
}
#endif
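
/*
 * Why the locked variant matters on 32-bit (illustrative example): the
 * owning CPU updates sum_exec_runtime, a u64, without any lock. A 32-bit
 * reader performs two loads, so around a carry into the upper word, e.g.
 *
 *	0x00000000FFFFFFF0 -> 0x0000000100000010
 *
 * it could pair the new high half with the old low half and report
 * roughly four extra seconds of runtime that were never consumed.
 * Taking the task's rq lock keeps the two halves consistent; on 64-bit
 * a single load is already atomic.
 */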

/*
 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
 * tasks (sum on group iteration) belonging to @tsk's group.
 */
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
{
	struct signal_struct *sig = tsk->signal;
	u64 utime, stime;
	struct task_struct *t;
	unsigned int seq, nextseq;
	unsigned long flags;

	/*
	 * Update current task runtime to account pending time since last
	 * scheduler action or thread_group_cputime() call. This thread group
	 * might have other running tasks on different CPUs, but updating
	 * their runtime can affect syscall performance, so we skip accounting
	 * those pending times and rely only on values updated on tick or
	 * other scheduler action.
	 */
	if (same_thread_group(current, tsk))
		(void) task_sched_runtime(current);

	rcu_read_lock();
	/* Attempt a lockless read on the first round. */
	nextseq = 0;
	do {
		seq = nextseq;
		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
		times->utime = sig->utime;
		times->stime = sig->stime;
		times->sum_exec_runtime = sig->sum_sched_runtime;

		for_each_thread(tsk, t) {
			task_cputime(t, &utime, &stime);
			times->utime += utime;
			times->stime += stime;
			times->sum_exec_runtime += read_sum_exec_runtime(t);
		}
		/* If lockless access failed, take the lock. */
		nextseq = 1;
	} while (need_seqretry(&sig->stats_lock, seq));
	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
	rcu_read_unlock();
}

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
 * Account a tick to a process and cpustat
 * @p: the process that the CPU time gets accounted to
 * @user_tick: is the tick from userspace
 * @ticks: number of ticks to account
 *
 * Tick demultiplexing follows the order
 * - pending hardirq update
 * - pending softirq update
 * - user_time
 * - idle_time
 * - system time
 *   - check for guest_time
 *   - else account as system_time
 *
 * The check for hardirq is done for both system and user time, as there
 * is no timer going off while we are in a hardirq and hence we may never
 * get an opportunity to update it solely in system time.
 * p->stime and friends are only updated on system time and not on IRQ or
 * softirq time, as those no longer count toward task exec_runtime.
 */
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
					 int ticks)
{
	u64 other, cputime = TICK_NSEC * ticks;

	/*
	 * When returning from idle, many ticks can get accounted at
	 * once, including some ticks of steal, IRQ, and softirq time.
	 * Subtract those ticks from the amount of time accounted to
	 * idle, or potentially user or system time. Due to rounding,
	 * other time can exceed ticks occasionally.
	 */
	other = account_other_time(ULONG_MAX);
	if (other >= cputime)
		return;

	cputime -= other;

	if (this_cpu_ksoftirqd() == p) {
		/*
		 * ksoftirqd time does not get accounted in cpu_softirq_time,
		 * so we have to handle it separately here.
		 * Also, p->stime needs to be updated for ksoftirqd.
		 */
		account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
	} else if (user_tick) {
		account_user_time(p, cputime);
	} else if (p == this_rq()->idle) {
		account_idle_time(cputime);
	} else if (p->flags & PF_VCPU) { /* System time or guest time */
		account_guest_time(p, cputime);
	} else {
		account_system_index_time(p, cputime, CPUTIME_SYSTEM);
	}
	trace_android_vh_irqtime_account_process_tick(p, this_rq(), user_tick, ticks);
}
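
/*
 * Numeric sketch (illustrative numbers): coming out of a long idle stretch
 * with ticks = 4 and HZ = 1000, cputime starts at 4 * TICK_NSEC =
 * 4,000,000 ns. If account_other_time() reports 2,500,000 ns of pending
 * steal/IRQ/softirq time, only the remaining 1,500,000 ns reaches the
 * bucket chosen above (idle in that scenario, since the multi-tick case
 * comes in via irqtime_account_idle_ticks()). Had the pending "other"
 * time reached 4,000,000 ns, nothing further would be accounted for this
 * batch of ticks.
 */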

static void irqtime_account_idle_ticks(int ticks)
{
	irqtime_account_process_tick(current, 0, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) { }
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
						int nr_ticks) { }
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

/*
 * Use precise platform statistics if available:
 */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE

void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
{
	unsigned int pc = irq_count() - offset;

	if (pc & HARDIRQ_OFFSET) {
		vtime_account_hardirq(tsk);
	} else if (pc & SOFTIRQ_OFFSET) {
		vtime_account_softirq(tsk);
	} else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) &&
		   is_idle_task(tsk)) {
		vtime_account_idle(tsk);
	} else {
		vtime_account_kernel(tsk);
	}
}

void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
		    u64 *ut, u64 *st)
{
	*ut = curr->utime;
	*st = curr->stime;
}

void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
	*ut = p->utime;
	*st = p->stime;
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);

void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
	struct task_cputime cputime;

	thread_group_cputime(p, &cputime);

	*ut = cputime.utime;
	*st = cputime.stime;
}
EXPORT_SYMBOL_GPL(thread_group_cputime_adjusted);

#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */

/*
 * Account a single tick of CPU time.
 * @p: the process that the CPU time gets accounted to
 * @user_tick: indicates if the tick is a user or a system tick
 */
void account_process_tick(struct task_struct *p, int user_tick)
{
	u64 cputime, steal;

	if (vtime_accounting_enabled_this_cpu())
		return;
	trace_android_vh_account_task_time(p, this_rq(), user_tick);

	if (sched_clock_irqtime) {
		irqtime_account_process_tick(p, user_tick, 1);
		return;
	}

	cputime = TICK_NSEC;
	steal = steal_account_process_time(ULONG_MAX);

	if (steal >= cputime)
		return;

	cputime -= steal;

	if (user_tick)
		account_user_time(p, cputime);
	else if ((p != this_rq()->idle) || (irq_count() != HARDIRQ_OFFSET))
		account_system_time(p, HARDIRQ_OFFSET, cputime);
	else
		account_idle_time(cputime);
}

/*
 * Account multiple ticks of idle time.
 * @ticks: number of stolen ticks
 */
void account_idle_ticks(unsigned long ticks)
{
	u64 cputime, steal;

	if (sched_clock_irqtime) {
		irqtime_account_idle_ticks(ticks);
		return;
	}

	cputime = ticks * TICK_NSEC;
	steal = steal_account_process_time(ULONG_MAX);

	if (steal >= cputime)
		return;

	cputime -= steal;
	account_idle_time(cputime);
}

/*
 * Adjust tick based cputime random precision against scheduler runtime
 * accounting.
 *
 * Tick based cputime accounting depends on whether the random scheduling
 * timeslices of a task happen to be interrupted by the timer or not.
 * Depending on these circumstances, the number of these interrupts may be
 * over- or under-estimated, matching the real user and system cputime
 * only with variable precision.
 *
 * Fix this by scaling these tick based values against the total runtime
 * accounted by the CFS scheduler.
 *
 * This code provides the following guarantees:
 *
 *   stime + utime == rtime
 *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
 *
 * Assuming that rtime_i+1 >= rtime_i.
 */
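/*
 * Worked example (illustrative numbers): suppose the tick samples so far
 * give stime = 600 and utime = 200 (ns), while the scheduler says
 * rtime = 1000 ns actually ran. The scaling below yields
 *
 *	stime = 600 * 1000 / (600 + 200) = 750, utime = 1000 - 750 = 250
 *
 * If a previous call had already published prev->stime = 800, the
 * monotonicity clamp keeps stime at 800 and reports utime = 200, so
 * neither value ever goes backwards while their sum still equals rtime.
 */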
void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
		    u64 *ut, u64 *st)
{
	u64 rtime, stime, utime;
	unsigned long flags;

	/* Serialize concurrent callers such that we can honour our guarantees */
	raw_spin_lock_irqsave(&prev->lock, flags);
	rtime = curr->sum_exec_runtime;

	/*
	 * This is possible under two circumstances:
	 *  - rtime isn't monotonic after all (a bug);
	 *  - we got reordered by the lock.
	 *
	 * In both cases this acts as a filter such that the rest of the code
	 * can assume it is monotonic regardless of anything else.
	 */
	if (prev->stime + prev->utime >= rtime)
		goto out;

	stime = curr->stime;
	utime = curr->utime;

	/*
	 * If either stime or utime are 0, assume all runtime is userspace.
	 * Once a task gets some ticks, the monotonicity code at 'update:'
	 * will ensure things converge to the observed ratio.
	 */
	if (stime == 0) {
		utime = rtime;
		goto update;
	}

	if (utime == 0) {
		stime = rtime;
		goto update;
	}

	stime = mul_u64_u64_div_u64(stime, rtime, stime + utime);
	/*
	 * Because mul_u64_u64_div_u64() can approximate on some
	 * architectures; enforce the constraint that: a*b/(b+c) <= a.
	 */
	if (unlikely(stime > rtime))
		stime = rtime;

update:
	/*
	 * Make sure stime doesn't go backwards; this preserves monotonicity
	 * for utime because rtime is monotonic.
	 *
	 *  utime_i+1 = rtime_i+1 - stime_i
	 *            = rtime_i+1 - (rtime_i - utime_i)
	 *            = (rtime_i+1 - rtime_i) + utime_i
	 *            >= utime_i
	 */
	if (stime < prev->stime)
		stime = prev->stime;
	utime = rtime - stime;

	/*
	 * Make sure utime doesn't go backwards; this still preserves
	 * monotonicity for stime, analogous argument to above.
	 */
	if (utime < prev->utime) {
		utime = prev->utime;
		stime = rtime - utime;
	}

	prev->stime = stime;
	prev->utime = utime;
out:
	*ut = prev->utime;
	*st = prev->stime;
	raw_spin_unlock_irqrestore(&prev->lock, flags);
}

void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
	struct task_cputime cputime = {
		.sum_exec_runtime = p->se.sum_exec_runtime,
	};

	if (task_cputime(p, &cputime.utime, &cputime.stime))
		cputime.sum_exec_runtime = task_sched_runtime(p);
	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
EXPORT_SYMBOL_GPL(task_cputime_adjusted);

void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
	struct task_cputime cputime;

	thread_group_cputime(p, &cputime);
	cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
}
EXPORT_SYMBOL_GPL(thread_group_cputime_adjusted);

#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
static u64 vtime_delta(struct vtime *vtime)
{
	unsigned long long clock;

	clock = sched_clock();
	if (clock < vtime->starttime)
		return 0;

	return clock - vtime->starttime;
}

static u64 get_vtime_delta(struct vtime *vtime)
{
	u64 delta = vtime_delta(vtime);
	u64 other;

	/*
	 * Unlike tick based timing, vtime based timing never has lost
	 * ticks, so there is no need for steal time accounting to make
	 * up for them. Vtime accounts a rounded version of the actual
	 * elapsed time. Limit account_other_time to prevent rounding
	 * errors from causing elapsed vtime to go negative.
	 */
	other = account_other_time(delta);
	WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
	vtime->starttime += delta;

	return delta - other;
}

static void vtime_account_system(struct task_struct *tsk,
				 struct vtime *vtime)
{
	vtime->stime += get_vtime_delta(vtime);
	if (vtime->stime >= TICK_NSEC) {
		account_system_time(tsk, irq_count(), vtime->stime);
		vtime->stime = 0;
	}
}

static void vtime_account_guest(struct task_struct *tsk,
				struct vtime *vtime)
{
	vtime->gtime += get_vtime_delta(vtime);
	if (vtime->gtime >= TICK_NSEC) {
		account_guest_time(tsk, vtime->gtime);
		vtime->gtime = 0;
	}
}

static void __vtime_account_kernel(struct task_struct *tsk,
				   struct vtime *vtime)
{
	/* We might have scheduled out from guest path */
	if (vtime->state == VTIME_GUEST)
		vtime_account_guest(tsk, vtime);
	else
		vtime_account_system(tsk, vtime);
}

void vtime_account_kernel(struct task_struct *tsk)
{
	struct vtime *vtime = &tsk->vtime;

	if (!vtime_delta(vtime))
		return;

	write_seqcount_begin(&vtime->seqcount);
	__vtime_account_kernel(tsk, vtime);
	write_seqcount_end(&vtime->seqcount);
}

void vtime_user_enter(struct task_struct *tsk)
{
	struct vtime *vtime = &tsk->vtime;

	write_seqcount_begin(&vtime->seqcount);
	vtime_account_system(tsk, vtime);
	vtime->state = VTIME_USER;
	write_seqcount_end(&vtime->seqcount);
}

void vtime_user_exit(struct task_struct *tsk)
{
	struct vtime *vtime = &tsk->vtime;

	write_seqcount_begin(&vtime->seqcount);
	vtime->utime += get_vtime_delta(vtime);
	if (vtime->utime >= TICK_NSEC) {
		account_user_time(tsk, vtime->utime);
		vtime->utime = 0;
	}
	vtime->state = VTIME_SYS;
	write_seqcount_end(&vtime->seqcount);
}

void vtime_guest_enter(struct task_struct *tsk)
{
	struct vtime *vtime = &tsk->vtime;
	/*
	 * The flags must be updated under the lock, together with
	 * the vtime_starttime flush and update.
	 * That enforces the right ordering and update sequence
	 * synchronization against the reader (task_gtime()),
	 * which can thus safely catch up with a tickless delta.
	 */
	write_seqcount_begin(&vtime->seqcount);
	vtime_account_system(tsk, vtime);
	tsk->flags |= PF_VCPU;
	vtime->state = VTIME_GUEST;
	write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);

void vtime_guest_exit(struct task_struct *tsk)
{
	struct vtime *vtime = &tsk->vtime;

	write_seqcount_begin(&vtime->seqcount);
	vtime_account_guest(tsk, vtime);
	tsk->flags &= ~PF_VCPU;
	vtime->state = VTIME_SYS;
	write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);

void vtime_account_idle(struct task_struct *tsk)
{
	account_idle_time(get_vtime_delta(&tsk->vtime));
}

void vtime_task_switch_generic(struct task_struct *prev)
{
	struct vtime *vtime = &prev->vtime;

	write_seqcount_begin(&vtime->seqcount);
	if (vtime->state == VTIME_IDLE)
		vtime_account_idle(prev);
	else
		__vtime_account_kernel(prev, vtime);
	vtime->state = VTIME_INACTIVE;
	vtime->cpu = -1;
	write_seqcount_end(&vtime->seqcount);

	vtime = &current->vtime;

	write_seqcount_begin(&vtime->seqcount);
	if (is_idle_task(current))
		vtime->state = VTIME_IDLE;
	else if (current->flags & PF_VCPU)
		vtime->state = VTIME_GUEST;
	else
		vtime->state = VTIME_SYS;
	vtime->starttime = sched_clock();
	vtime->cpu = smp_processor_id();
	write_seqcount_end(&vtime->seqcount);
}

void vtime_init_idle(struct task_struct *t, int cpu)
{
	struct vtime *vtime = &t->vtime;
	unsigned long flags;

	local_irq_save(flags);
	write_seqcount_begin(&vtime->seqcount);
	vtime->state = VTIME_IDLE;
	vtime->starttime = sched_clock();
	vtime->cpu = cpu;
	write_seqcount_end(&vtime->seqcount);
	local_irq_restore(flags);
}

u64 task_gtime(struct task_struct *t)
{
	struct vtime *vtime = &t->vtime;
	unsigned int seq;
	u64 gtime;

	if (!vtime_accounting_enabled())
		return t->gtime;

	do {
		seq = read_seqcount_begin(&vtime->seqcount);

		gtime = t->gtime;
		if (vtime->state == VTIME_GUEST)
			gtime += vtime->gtime + vtime_delta(vtime);

	} while (read_seqcount_retry(&vtime->seqcount, seq));

	return gtime;
}

/*
 * Fetch cputime raw values from fields of task_struct and
 * add up the pending nohz execution time since the last
 * cputime snapshot.
 */
bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
{
	struct vtime *vtime = &t->vtime;
	unsigned int seq;
	u64 delta;
	int ret;

	if (!vtime_accounting_enabled()) {
		*utime = t->utime;
		*stime = t->stime;
		return false;
	}

	do {
		ret = false;
		seq = read_seqcount_begin(&vtime->seqcount);

		*utime = t->utime;
		*stime = t->stime;

		/* Task is sleeping or idle, nothing to add */
		if (vtime->state < VTIME_SYS)
			continue;

		ret = true;
		delta = vtime_delta(vtime);

		/*
		 * Task runs either in user (including guest) or kernel space,
		 * add pending nohz time to the right place.
		 */
		if (vtime->state == VTIME_SYS)
			*stime += vtime->stime + delta;
		else
			*utime += vtime->utime + delta;
	} while (read_seqcount_retry(&vtime->seqcount, seq));

	return ret;
}

static int vtime_state_fetch(struct vtime *vtime, int cpu)
{
	int state = READ_ONCE(vtime->state);

	/*
	 * We raced against a context switch, fetch the
	 * kcpustat task again.
	 */
	if (vtime->cpu != cpu && vtime->cpu != -1)
		return -EAGAIN;

	/*
	 * Two possible things here:
	 * 1) We are seeing the scheduling out task (prev) or any past one.
	 * 2) We are seeing the scheduling in task (next) but it hasn't
	 *    passed through vtime_task_switch() yet, so the pending
	 *    cputime of the prev task may not be flushed yet.
	 *
	 * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
	 */
	if (state == VTIME_INACTIVE)
		return -EAGAIN;

	return state;
}

static u64 kcpustat_user_vtime(struct vtime *vtime)
{
	if (vtime->state == VTIME_USER)
		return vtime->utime + vtime_delta(vtime);
	else if (vtime->state == VTIME_GUEST)
		return vtime->gtime + vtime_delta(vtime);
	return 0;
}

static int kcpustat_field_vtime(u64 *cpustat,
				struct task_struct *tsk,
				enum cpu_usage_stat usage,
				int cpu, u64 *val)
{
	struct vtime *vtime = &tsk->vtime;
	unsigned int seq;

	do {
		int state;

		seq = read_seqcount_begin(&vtime->seqcount);

		state = vtime_state_fetch(vtime, cpu);
		if (state < 0)
			return state;

		*val = cpustat[usage];

		/*
		 * Nice vs. unnice cputime accounting may be inaccurate if
		 * the nice value has changed since the last vtime update.
		 * But a proper fix would involve interrupting the target on
		 * nice updates, which is a no-go on nohz_full (although the
		 * scheduler may still interrupt the target if rescheduling
		 * is needed...)
		 */
		switch (usage) {
		case CPUTIME_SYSTEM:
			if (state == VTIME_SYS)
				*val += vtime->stime + vtime_delta(vtime);
			break;
		case CPUTIME_USER:
			if (task_nice(tsk) <= 0)
				*val += kcpustat_user_vtime(vtime);
			break;
		case CPUTIME_NICE:
			if (task_nice(tsk) > 0)
				*val += kcpustat_user_vtime(vtime);
			break;
		case CPUTIME_GUEST:
			if (state == VTIME_GUEST && task_nice(tsk) <= 0)
				*val += vtime->gtime + vtime_delta(vtime);
			break;
		case CPUTIME_GUEST_NICE:
			if (state == VTIME_GUEST && task_nice(tsk) > 0)
				*val += vtime->gtime + vtime_delta(vtime);
			break;
		default:
			break;
		}
	} while (read_seqcount_retry(&vtime->seqcount, seq));

	return 0;
}

u64 kcpustat_field(struct kernel_cpustat *kcpustat,
		   enum cpu_usage_stat usage, int cpu)
{
	u64 *cpustat = kcpustat->cpustat;
	u64 val = cpustat[usage];
	struct rq *rq;
	int err;

	if (!vtime_accounting_enabled_cpu(cpu))
		return val;

	rq = cpu_rq(cpu);

	for (;;) {
		struct task_struct *curr;

		rcu_read_lock();
		curr = rcu_dereference(rq->curr);
		if (WARN_ON_ONCE(!curr)) {
			rcu_read_unlock();
			return cpustat[usage];
		}

		err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val);
		rcu_read_unlock();

		if (!err)
			return val;

		cpu_relax();
	}
}
EXPORT_SYMBOL_GPL(kcpustat_field);
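
/*
 * Usage sketch (illustrative, not a caller from this file): a /proc/stat
 * style reader that wants an up-to-date user-time figure for a nohz_full
 * CPU, where the tick may not have folded in the current task's pending
 * vtime, could do:
 *
 *	u64 user = kcpustat_field(&kcpustat_cpu(cpu), CPUTIME_USER, cpu);
 *
 * On CPUs without generic vtime accounting this degenerates to a plain
 * read of the cpustat array.
 */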

static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
				    const struct kernel_cpustat *src,
				    struct task_struct *tsk, int cpu)
{
	struct vtime *vtime = &tsk->vtime;
	unsigned int seq;

	do {
		u64 *cpustat;
		u64 delta;
		int state;

		seq = read_seqcount_begin(&vtime->seqcount);

		state = vtime_state_fetch(vtime, cpu);
		if (state < 0)
			return state;

		*dst = *src;
		cpustat = dst->cpustat;

		/* Task is sleeping, dead or idle, nothing to add */
		if (state < VTIME_SYS)
			continue;

		delta = vtime_delta(vtime);

		/*
		 * Task runs either in user (including guest) or kernel space,
		 * add pending nohz time to the right place.
		 */
		if (state == VTIME_SYS) {
			cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
		} else if (state == VTIME_USER) {
			if (task_nice(tsk) > 0)
				cpustat[CPUTIME_NICE] += vtime->utime + delta;
			else
				cpustat[CPUTIME_USER] += vtime->utime + delta;
		} else {
			WARN_ON_ONCE(state != VTIME_GUEST);
			if (task_nice(tsk) > 0) {
				cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
				cpustat[CPUTIME_NICE] += vtime->gtime + delta;
			} else {
				cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
				cpustat[CPUTIME_USER] += vtime->gtime + delta;
			}
		}
	} while (read_seqcount_retry(&vtime->seqcount, seq));

	return 0;
}

void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
{
	const struct kernel_cpustat *src = &kcpustat_cpu(cpu);
	struct rq *rq;
	int err;

	if (!vtime_accounting_enabled_cpu(cpu)) {
		*dst = *src;
		return;
	}

	rq = cpu_rq(cpu);

	for (;;) {
		struct task_struct *curr;

		rcu_read_lock();
		curr = rcu_dereference(rq->curr);
		if (WARN_ON_ONCE(!curr)) {
			rcu_read_unlock();
			*dst = *src;
			return;
		}

		err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu);
		rcu_read_unlock();

		if (!err)
			return;

		cpu_relax();
	}
}
EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);
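
/*
 * Usage sketch (illustrative, not a caller from this file): a reader that
 * wants a whole per-CPU snapshot, e.g. to print one "cpuN ..." line of
 * /proc/stat, would take a local copy and then index it:
 *
 *	struct kernel_cpustat kcpustat;
 *
 *	kcpustat_cpu_fetch(&kcpustat, cpu);
 *	user   = kcpustat.cpustat[CPUTIME_USER];
 *	system = kcpustat.cpustat[CPUTIME_SYSTEM];
 *
 * The retry loop above keeps the snapshot consistent against concurrent
 * vtime updates and context switches on a nohz_full CPU.
 */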

#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */