#ifdef CONFIG_SCHEDSTATS
/*
 * bump this up when changing the output format or the meaning of an existing
 * format, so that tools can adapt (or abort)
 */
#define SCHEDSTAT_VERSION 14

static int show_schedstat(struct seq_file *seq, void *v)
{
	int cpu;
	int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
	char *mask_str = kmalloc(mask_len, GFP_KERNEL);

	if (mask_str == NULL)
		return -ENOMEM;

	seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
	seq_printf(seq, "timestamp %lu\n", jiffies);
	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
#ifdef CONFIG_SMP
		struct sched_domain *sd;
		int dcount = 0;
#endif

		/* runqueue-specific stats */
		seq_printf(seq,
		    "cpu%d %u %u %u %u %u %u %u %u %u %llu %llu %lu",
		    cpu, rq->yld_both_empty,
		    rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
		    rq->sched_switch, rq->sched_count, rq->sched_goidle,
		    rq->ttwu_count, rq->ttwu_local,
		    rq->rq_cpu_time,
		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);

		seq_printf(seq, "\n");

#ifdef CONFIG_SMP
		/* domain-specific stats */
		preempt_disable();
		for_each_domain(cpu, sd) {
			enum cpu_idle_type itype;

			cpumask_scnprintf(mask_str, mask_len,
					  sched_domain_span(sd));
			seq_printf(seq, "domain%d %s", dcount++, mask_str);
			for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
					itype++) {
				seq_printf(seq, " %u %u %u %u %u %u %u %u",
				    sd->lb_count[itype],
				    sd->lb_balanced[itype],
				    sd->lb_failed[itype],
				    sd->lb_imbalance[itype],
				    sd->lb_gained[itype],
				    sd->lb_hot_gained[itype],
				    sd->lb_nobusyq[itype],
				    sd->lb_nobusyg[itype]);
			}
			seq_printf(seq,
				   " %u %u %u %u %u %u %u %u %u %u %u %u\n",
			    sd->alb_count, sd->alb_failed, sd->alb_pushed,
			    sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
			    sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
			    sd->ttwu_wake_remote, sd->ttwu_move_affine,
			    sd->ttwu_move_balance);
		}
		preempt_enable();
#endif
	}
	kfree(mask_str);
	return 0;
}
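
/*
 * Resulting /proc/schedstat layout (the field values below are
 * illustrative only, not taken from a real machine):
 *
 *	version 14
 *	timestamp 4294937412
 *	cpu0 0 0 0 2045 317 10913 3508 5731 2589 142352494 81410214 12437
 *	domain0 00000003 <8 load_balance counters per idle type> <12 more counters>
 *
 * One "cpu<N>" line is printed per online cpu (twelve counters after the
 * cpu name, in the order of the seq_printf() above), followed on SMP by
 * one "domain<N>" line per sched domain attached to that cpu.
 */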

static int schedstat_open(struct inode *inode, struct file *file)
{
	unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
	char *buf = kmalloc(size, GFP_KERNEL);
	struct seq_file *m;
	int res;

	if (!buf)
		return -ENOMEM;
	res = single_open(file, show_schedstat, NULL);
	if (!res) {
		m = file->private_data;
		m->buf = buf;
		m->size = size;
	} else
		kfree(buf);
	return res;
}

static const struct file_operations proc_schedstat_operations = {
	.open    = schedstat_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};

static int __init proc_schedstat_init(void)
{
	proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
	return 0;
}
module_init(proc_schedstat_init);

/*
 * Expects runqueue lock to be held for atomicity of update
 */
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
{
	if (rq) {
		rq->rq_sched_info.run_delay += delta;
		rq->rq_sched_info.pcount++;
	}
}

/*
 * Expects runqueue lock to be held for atomicity of update
 */
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{
	if (rq)
		rq->rq_cpu_time += delta;
}

static inline void
rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
{
	if (rq)
		rq->rq_sched_info.run_delay += delta;
}
# define schedstat_inc(rq, field)	do { (rq)->field++; } while (0)
# define schedstat_add(rq, field, amt)	do { (rq)->field += (amt); } while (0)
# define schedstat_set(var, val)	do { var = (val); } while (0)
#else /* !CONFIG_SCHEDSTATS */
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
{}
static inline void
rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
{}
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{}
# define schedstat_inc(rq, field)	do { } while (0)
# define schedstat_add(rq, field, amt)	do { } while (0)
# define schedstat_set(var, val)	do { } while (0)
#endif
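
/*
 * Usage sketch (the call site below is hypothetical, shown only to
 * illustrate the pattern): callers bump a counter by naming the
 * runqueue and the field, e.g.
 *
 *	schedstat_inc(rq, yld_count);
 *
 * and need no #ifdef of their own, because with CONFIG_SCHEDSTATS
 * disabled the macros above expand to empty do { } while (0) statements.
 */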

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
static inline void sched_info_reset_dequeued(struct task_struct *t)
{
	t->sched_info.last_queued = 0;
}

/*
 * Called when a process is dequeued from the active array and given
 * the cpu.  We should note that with the exception of interactive
 * tasks, the expired queue will become the active queue after the active
 * queue is empty, without explicitly dequeuing and requeuing tasks in the
 * expired queue.  (Interactive tasks may be requeued directly to the
 * active queue, thus delaying tasks in the expired queue from running;
 * see scheduler_tick()).
 *
 * Though we are interested in knowing how long it was from the *first* time a
 * task was queued to the time that it finally hit a cpu, we call this routine
 * from dequeue_task() to account for possible rq->clock skew across cpus. The
 * delta taken on each cpu would annul the skew.
 */
static inline void sched_info_dequeued(struct task_struct *t)
{
	unsigned long long now = task_rq(t)->clock, delta = 0;

	if (unlikely(sched_info_on()))
		if (t->sched_info.last_queued)
			delta = now - t->sched_info.last_queued;
	sched_info_reset_dequeued(t);
	t->sched_info.run_delay += delta;

	rq_sched_info_dequeued(task_rq(t), delta);
}
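
/*
 * Worked example of the skew argument above (clock values are made up):
 * a task is queued on CPU0 at rq->clock == 1000 and dequeued for
 * migration at 1400, so sched_info_dequeued() adds 400 to run_delay.
 * It is then queued on CPU1, whose clock happens to read 900000, and
 * finally runs at 900250, so sched_info_arrive() adds another 250.
 * run_delay grows by 650 in total, and the offset between the two rq
 * clocks never enters the result because each delta is taken against a
 * single cpu's clock.
 */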

/*
 * Called when a task finally hits the cpu.  We can now calculate how
 * long it was waiting to run.  We also note when it began so that we
 * can keep stats on how long its timeslice is.
 */
static void sched_info_arrive(struct task_struct *t)
{
	unsigned long long now = task_rq(t)->clock, delta = 0;

	if (t->sched_info.last_queued)
		delta = now - t->sched_info.last_queued;
	sched_info_reset_dequeued(t);
	t->sched_info.run_delay += delta;
	t->sched_info.last_arrival = now;
	t->sched_info.pcount++;

	rq_sched_info_arrive(task_rq(t), delta);
}

/*
 * Called when a process is queued into either the active or expired
 * array.  The time is noted and later used to determine how long the
 * task had to wait before reaching the cpu.  Since the expired queue
 * will become the active queue after the active queue is empty, without
 * dequeuing and requeuing any tasks, we are interested in queuing to
 * either.  It is unusual but not impossible for tasks to be dequeued
 * and immediately requeued in the same or another array: this can happen
 * in sched_yield(), set_user_nice(), and even load_balance() as it moves
 * tasks from runqueue to runqueue.
 *
 * This function is only called from enqueue_task(), but also only updates
 * the timestamp if it is not already set.  It's assumed that
 * sched_info_dequeued() will clear that stamp when appropriate.
 */
static inline void sched_info_queued(struct task_struct *t)
{
	if (unlikely(sched_info_on()))
		if (!t->sched_info.last_queued)
			t->sched_info.last_queued = task_rq(t)->clock;
}

/*
 * Called when a process ceases being the active-running process, either
 * voluntarily or involuntarily.  Now we can calculate how long we ran.
 * Also, if the process is still in the TASK_RUNNING state, call
 * sched_info_queued() to mark that it has now again started waiting on
 * the runqueue.
 */
static inline void sched_info_depart(struct task_struct *t)
{
	unsigned long long delta = task_rq(t)->clock -
					t->sched_info.last_arrival;

	rq_sched_info_depart(task_rq(t), delta);

	if (t->state == TASK_RUNNING)
		sched_info_queued(t);
}

/*
 * Called when tasks are switched involuntarily due, typically, to expiring
 * their time slice.  (This may also be called when switching to or from
 * the idle task.)  We are only called when prev != next.
 */
static inline void
__sched_info_switch(struct task_struct *prev, struct task_struct *next)
{
	struct rq *rq = task_rq(prev);

	/*
	 * prev now departs the cpu.  It's not interesting to record
	 * stats about how efficient we were at scheduling the idle
	 * process, however.
	 */
	if (prev != rq->idle)
		sched_info_depart(prev);

	if (next != rq->idle)
		sched_info_arrive(next);
}
static inline void
sched_info_switch(struct task_struct *prev, struct task_struct *next)
{
	if (unlikely(sched_info_on()))
		__sched_info_switch(prev, next);
}
#else
#define sched_info_queued(t)			do { } while (0)
#define sched_info_reset_dequeued(t)	do { } while (0)
#define sched_info_dequeued(t)			do { } while (0)
#define sched_info_switch(t, next)		do { } while (0)
#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */

/*
 * The following are functions that support scheduler-internal time accounting.
 * These functions are generally called at the timer tick.  None of this depends
 * on CONFIG_SCHEDSTATS.
 */

/**
 * account_group_user_time - Maintain utime for a thread group.
 *
 * @tsk:	Pointer to task structure.
 * @cputime:	Time value by which to increment the utime field of the
 *		thread_group_cputime structure.
 *
 * If thread group time is being maintained, take the group's cputimer
 * lock and add @cputime to the shared utime total.
 */
static inline void account_group_user_time(struct task_struct *tsk,
					   cputime_t cputime)
{
	struct thread_group_cputimer *cputimer;

	/* tsk == current, ensure it is safe to use ->signal */
	if (unlikely(tsk->exit_state))
		return;

	cputimer = &tsk->signal->cputimer;

	if (!cputimer->running)
		return;

	spin_lock(&cputimer->lock);
	cputimer->cputime.utime =
		cputime_add(cputimer->cputime.utime, cputime);
	spin_unlock(&cputimer->lock);
}

/**
 * account_group_system_time - Maintain stime for a thread group.
 *
 * @tsk:	Pointer to task structure.
 * @cputime:	Time value by which to increment the stime field of the
 *		thread_group_cputime structure.
 *
 * If thread group time is being maintained, take the group's cputimer
 * lock and add @cputime to the shared stime total.
 */
static inline void account_group_system_time(struct task_struct *tsk,
					     cputime_t cputime)
{
	struct thread_group_cputimer *cputimer;

	/* tsk == current, ensure it is safe to use ->signal */
	if (unlikely(tsk->exit_state))
		return;

	cputimer = &tsk->signal->cputimer;

	if (!cputimer->running)
		return;

	spin_lock(&cputimer->lock);
	cputimer->cputime.stime =
		cputime_add(cputimer->cputime.stime, cputime);
	spin_unlock(&cputimer->lock);
}

/**
 * account_group_exec_runtime - Maintain exec runtime for a thread group.
 *
 * @tsk:	Pointer to task structure.
 * @ns:		Time value by which to increment the sum_exec_runtime field
 *		of the thread_group_cputime structure.
 *
 * If thread group time is being maintained, take the group's cputimer
 * lock and add @ns to the shared sum_exec_runtime total.
 */
static inline void account_group_exec_runtime(struct task_struct *tsk,
					      unsigned long long ns)
{
	struct thread_group_cputimer *cputimer;
	struct signal_struct *sig;

	sig = tsk->signal;
	/* see __exit_signal()->task_rq_unlock_wait() */
	barrier();
	if (unlikely(!sig))
		return;

	cputimer = &sig->cputimer;

	if (!cputimer->running)
		return;

	spin_lock(&cputimer->lock);
	cputimer->cputime.sum_exec_runtime += ns;
	spin_unlock(&cputimer->lock);
}
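
/*
 * Sketch of how these group-accounting helpers are meant to be used
 * (the caller below is simplified and hypothetical; the real callers
 * live in the scheduler's tick and update_curr() paths):
 *
 *	void account_user_time(struct task_struct *p, cputime_t cputime)
 *	{
 *		p->utime = cputime_add(p->utime, cputime);
 *		account_group_user_time(p, cputime);
 *		...
 *	}
 *
 * i.e. each per-thread accounting step also advances the thread group's
 * shared totals, so that process-wide CPU timers see a consistent sum.
 */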