1 /*
2  * Common Block IO controller cgroup interface
3  *
4  * Based on ideas and code from CFQ, CFS and BFQ:
5  * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
6  *
7  * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
8  *		      Paolo Valente <paolo.valente@unimore.it>
9  *
10  * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
11  * 	              Nauman Rafique <nauman@google.com>
12  */
13 #include <linux/ioprio.h>
14 #include <linux/kdev_t.h>
15 #include <linux/module.h>
16 #include <linux/err.h>
17 #include <linux/blkdev.h>
18 #include <linux/slab.h>
19 #include <linux/genhd.h>
20 #include <linux/delay.h>
21 #include <linux/atomic.h>
22 #include "blk-cgroup.h"
23 #include "blk.h"
24 
25 #define MAX_KEY_LEN 100
26 
27 static DEFINE_MUTEX(blkcg_pol_mutex);
28 
29 struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT,
30 			    .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, };
31 EXPORT_SYMBOL_GPL(blkcg_root);
32 
33 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
34 
35 static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
36 				      struct request_queue *q, bool update_hint);
37 
38 /**
39  * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
40  * @d_blkg: loop cursor pointing to the current descendant
41  * @pos_cgrp: cgroup cursor used internally by the iteration
42  * @p_blkg: target blkg to walk descendants of
43  *
44  * Walk @d_blkg through the descendants of @p_blkg.  Must be used with the
45  * RCU read lock held.  If called under either the blkcg or queue lock, the
46  * iteration is guaranteed to include all and only online blkgs.  The
47  * caller may update @pos_cgrp by calling cgroup_rightmost_descendant() to
48  * skip a subtree.
49  */
50 #define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg)		\
51 	cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \
52 		if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \
53 					      (p_blkg)->q, false)))
54 
55 static bool blkcg_policy_enabled(struct request_queue *q,
56 				 const struct blkcg_policy *pol)
57 {
58 	return pol && test_bit(pol->plid, q->blkcg_pols);
59 }
60 
61 /**
62  * blkg_free - free a blkg
63  * @blkg: blkg to free
64  *
65  * Free @blkg which may be partially allocated.
66  */
67 static void blkg_free(struct blkcg_gq *blkg)
68 {
69 	int i;
70 
71 	if (!blkg)
72 		return;
73 
74 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
75 		struct blkcg_policy *pol = blkcg_policy[i];
76 		struct blkg_policy_data *pd = blkg->pd[i];
77 
78 		if (!pd)
79 			continue;
80 
81 		if (pol && pol->pd_exit_fn)
82 			pol->pd_exit_fn(blkg);
83 
84 		kfree(pd);
85 	}
86 
87 	blk_exit_rl(&blkg->rl);
88 	kfree(blkg);
89 }
90 
91 /**
92  * blkg_alloc - allocate a blkg
93  * @blkcg: block cgroup the new blkg is associated with
94  * @q: request_queue the new blkg is associated with
95  * @gfp_mask: allocation mask to use
96  *
97  * Allocate a new blkg associating @blkcg and @q.
98  */
99 static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
100 				   gfp_t gfp_mask)
101 {
102 	struct blkcg_gq *blkg;
103 	int i;
104 
105 	/* alloc and init base part */
106 	blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
107 	if (!blkg)
108 		return NULL;
109 
110 	blkg->q = q;
111 	INIT_LIST_HEAD(&blkg->q_node);
112 	blkg->blkcg = blkcg;
113 	blkg->refcnt = 1;
114 
115 	/* root blkg uses @q->root_rl, init rl only for !root blkgs */
116 	if (blkcg != &blkcg_root) {
117 		if (blk_init_rl(&blkg->rl, q, gfp_mask))
118 			goto err_free;
119 		blkg->rl.blkg = blkg;
120 	}
121 
122 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
123 		struct blkcg_policy *pol = blkcg_policy[i];
124 		struct blkg_policy_data *pd;
125 
126 		if (!blkcg_policy_enabled(q, pol))
127 			continue;
128 
129 		/* alloc per-policy data and attach it to blkg */
130 		pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);
131 		if (!pd)
132 			goto err_free;
133 
134 		blkg->pd[i] = pd;
135 		pd->blkg = blkg;
136 		pd->plid = i;
137 
138 		/* invoke per-policy init */
139 		if (pol->pd_init_fn)
140 			pol->pd_init_fn(blkg);
141 	}
142 
143 	return blkg;
144 
145 err_free:
146 	blkg_free(blkg);
147 	return NULL;
148 }
149 
150 /**
151  * __blkg_lookup - internal version of blkg_lookup()
152  * @blkcg: blkcg of interest
153  * @q: request_queue of interest
154  * @update_hint: whether to update lookup hint with the result or not
155  *
156  * This is the internal version and shouldn't be used by policy
157  * implementations.  Looks up blkgs for the @blkcg - @q pair regardless of
158  * @q's bypass state.  If @update_hint is %true, the caller should be
159  * holding @q->queue_lock and lookup hint is updated on success.
160  */
161 static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
162 				      struct request_queue *q, bool update_hint)
163 {
164 	struct blkcg_gq *blkg;
165 
166 	blkg = rcu_dereference(blkcg->blkg_hint);
167 	if (blkg && blkg->q == q)
168 		return blkg;
169 
170 	/*
171 	 * Hint didn't match.  Look up from the radix tree.  Note that the
172 	 * hint can only be updated under queue_lock as otherwise @blkg
173 	 * could have already been removed from blkg_tree.  The caller is
174 	 * responsible for grabbing queue_lock if @update_hint.
175 	 */
176 	blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
177 	if (blkg && blkg->q == q) {
178 		if (update_hint) {
179 			lockdep_assert_held(q->queue_lock);
180 			rcu_assign_pointer(blkcg->blkg_hint, blkg);
181 		}
182 		return blkg;
183 	}
184 
185 	return NULL;
186 }
187 
188 /**
189  * blkg_lookup - lookup blkg for the specified blkcg - q pair
190  * @blkcg: blkcg of interest
191  * @q: request_queue of interest
192  *
193  * Lookup blkg for the @blkcg - @q pair.  This function should be called
194  * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
195  * - see blk_queue_bypass_start() for details.
196  */
197 struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
198 {
199 	WARN_ON_ONCE(!rcu_read_lock_held());
200 
201 	if (unlikely(blk_queue_bypass(q)))
202 		return NULL;
203 	return __blkg_lookup(blkcg, q, false);
204 }
205 EXPORT_SYMBOL_GPL(blkg_lookup);
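
/*
 * A minimal usage sketch, assuming the caller already holds a valid blkcg
 * pointer; example_peek_blkg is a hypothetical helper.  blkg_lookup() must
 * run under the RCU read lock, and the returned blkg is only stable within
 * that section unless a reference is taken with blkg_get().
 */
static void example_peek_blkg(struct blkcg *blkcg, struct request_queue *q)
{
	struct blkcg_gq *blkg;

	rcu_read_lock();
	blkg = blkg_lookup(blkcg, q);	/* NULL if absent or @q is bypassing */
	if (blkg)
		blkg_get(blkg);		/* pin it beyond the RCU section */
	rcu_read_unlock();

	if (blkg)
		blkg_put(blkg);		/* drop the temporary reference */
}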
206 
207 /*
208  * If @new_blkg is %NULL, this function tries to allocate a new one as
209  * necessary using %GFP_ATOMIC.  @new_blkg is always consumed on return.
210  */
211 static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
212 				    struct request_queue *q,
213 				    struct blkcg_gq *new_blkg)
214 {
215 	struct blkcg_gq *blkg;
216 	int i, ret;
217 
218 	WARN_ON_ONCE(!rcu_read_lock_held());
219 	lockdep_assert_held(q->queue_lock);
220 
221 	/* blkg holds a reference to blkcg */
222 	if (!css_tryget(&blkcg->css)) {
223 		ret = -EINVAL;
224 		goto err_free_blkg;
225 	}
226 
227 	/* allocate */
228 	if (!new_blkg) {
229 		new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
230 		if (unlikely(!new_blkg)) {
231 			ret = -ENOMEM;
232 			goto err_put_css;
233 		}
234 	}
235 	blkg = new_blkg;
236 
237 	/* link parent and insert */
238 	if (blkcg_parent(blkcg)) {
239 		blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
240 		if (WARN_ON_ONCE(!blkg->parent)) {
241 			blkg = ERR_PTR(-EINVAL);
242 			goto err_put_css;
243 		}
244 		blkg_get(blkg->parent);
245 	}
246 
247 	spin_lock(&blkcg->lock);
248 	ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
249 	if (likely(!ret)) {
250 		hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
251 		list_add(&blkg->q_node, &q->blkg_list);
252 
253 		for (i = 0; i < BLKCG_MAX_POLS; i++) {
254 			struct blkcg_policy *pol = blkcg_policy[i];
255 
256 			if (blkg->pd[i] && pol->pd_online_fn)
257 				pol->pd_online_fn(blkg);
258 		}
259 	}
260 	blkg->online = true;
261 	spin_unlock(&blkcg->lock);
262 
263 	if (!ret)
264 		return blkg;
265 
266 	/* @blkg failed to be fully initialized, use the usual release path */
267 	blkg_put(blkg);
268 	return ERR_PTR(ret);
269 
270 err_put_css:
271 	css_put(&blkcg->css);
272 err_free_blkg:
273 	blkg_free(new_blkg);
274 	return ERR_PTR(ret);
275 }
276 
277 /**
278  * blkg_lookup_create - lookup blkg, try to create one if not there
279  * @blkcg: blkcg of interest
280  * @q: request_queue of interest
281  *
282  * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
283  * create one.  blkg creation is performed recursively from blkcg_root such
284  * that all non-root blkg's have access to the parent blkg.  This function
285  * should be called under RCU read lock and @q->queue_lock.
286  *
287  * Returns pointer to the looked up or created blkg on success, ERR_PTR()
288  * value on error.  If @q is dead, returns ERR_PTR(-EINVAL).  If @q is not
289  * dead and bypassing, returns ERR_PTR(-EBUSY).
290  */
291 struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
292 				    struct request_queue *q)
293 {
294 	struct blkcg_gq *blkg;
295 
296 	WARN_ON_ONCE(!rcu_read_lock_held());
297 	lockdep_assert_held(q->queue_lock);
298 
299 	/*
300 	 * This could be the first entry point of blkcg implementation and
301 	 * we shouldn't allow anything to go through for a bypassing queue.
302 	 */
303 	if (unlikely(blk_queue_bypass(q)))
304 		return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY);
305 
306 	blkg = __blkg_lookup(blkcg, q, true);
307 	if (blkg)
308 		return blkg;
309 
310 	/*
311 	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
312 	 * non-root blkgs have access to their parents.
313 	 */
314 	while (true) {
315 		struct blkcg *pos = blkcg;
316 		struct blkcg *parent = blkcg_parent(blkcg);
317 
318 		while (parent && !__blkg_lookup(parent, q, false)) {
319 			pos = parent;
320 			parent = blkcg_parent(parent);
321 		}
322 
323 		blkg = blkg_create(pos, q, NULL);
324 		if (pos == blkcg || IS_ERR(blkg))
325 			return blkg;
326 	}
327 }
328 EXPORT_SYMBOL_GPL(blkg_lookup_create);
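
/*
 * A sketch of the locking convention described above, using the
 * hypothetical helper name example_get_or_create_blkg: blkg_lookup_create()
 * needs both the RCU read lock and @q->queue_lock, and may return
 * ERR_PTR(-EBUSY) while @q is bypassing, in which case callers typically
 * retry.
 */
static struct blkcg_gq *example_get_or_create_blkg(struct blkcg *blkcg,
						   struct request_queue *q)
{
	struct blkcg_gq *blkg;

	rcu_read_lock();
	spin_lock_irq(q->queue_lock);

	blkg = blkg_lookup_create(blkcg, q);
	if (!IS_ERR(blkg))
		blkg_get(blkg);		/* keep it valid after the locks drop */

	spin_unlock_irq(q->queue_lock);
	rcu_read_unlock();

	return blkg;			/* ERR_PTR() on failure, see above */
}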
329 
330 static void blkg_destroy(struct blkcg_gq *blkg)
331 {
332 	struct blkcg *blkcg = blkg->blkcg;
333 	int i;
334 
335 	lockdep_assert_held(blkg->q->queue_lock);
336 	lockdep_assert_held(&blkcg->lock);
337 
338 	/* Something is wrong if we are trying to remove the same group twice */
339 	WARN_ON_ONCE(list_empty(&blkg->q_node));
340 	WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
341 
342 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
343 		struct blkcg_policy *pol = blkcg_policy[i];
344 
345 		if (blkg->pd[i] && pol->pd_offline_fn)
346 			pol->pd_offline_fn(blkg);
347 	}
348 	blkg->online = false;
349 
350 	radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
351 	list_del_init(&blkg->q_node);
352 	hlist_del_init_rcu(&blkg->blkcg_node);
353 
354 	/*
355 	 * Both setting lookup hint to and clearing it from @blkg are done
356 	 * under queue_lock.  If it's not pointing to @blkg now, it never
357 	 * will.  Hint assignment itself can race safely.
358 	 */
359 	if (rcu_dereference_raw(blkcg->blkg_hint) == blkg)
360 		rcu_assign_pointer(blkcg->blkg_hint, NULL);
361 
362 	/*
363 	 * Put the reference taken at the time of creation so that when all
364 	 * queues are gone, group can be destroyed.
365 	 */
366 	blkg_put(blkg);
367 }
368 
369 /**
370  * blkg_destroy_all - destroy all blkgs associated with a request_queue
371  * @q: request_queue of interest
372  *
373  * Destroy all blkgs associated with @q.
374  */
375 static void blkg_destroy_all(struct request_queue *q)
376 {
377 	struct blkcg_gq *blkg, *n;
378 
379 	lockdep_assert_held(q->queue_lock);
380 
381 	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
382 		struct blkcg *blkcg = blkg->blkcg;
383 
384 		spin_lock(&blkcg->lock);
385 		blkg_destroy(blkg);
386 		spin_unlock(&blkcg->lock);
387 	}
388 
389 	/*
390 	 * The root blkg is destroyed.  Just clear the pointer since
391 	 * root_rl does not take a reference on the root blkg.
392 	 */
393 	q->root_blkg = NULL;
394 	q->root_rl.blkg = NULL;
395 }
396 
397 static void blkg_rcu_free(struct rcu_head *rcu_head)
398 {
399 	blkg_free(container_of(rcu_head, struct blkcg_gq, rcu_head));
400 }
401 
402 void __blkg_release(struct blkcg_gq *blkg)
403 {
404 	/* release the blkcg and parent blkg refs this blkg has been holding */
405 	css_put(&blkg->blkcg->css);
406 	if (blkg->parent)
407 		blkg_put(blkg->parent);
408 
409 	/*
410 	 * A group is freed in an RCU manner.  But holding the RCU read lock
411 	 * does not mean that one can access all the fields of the blkg and
412 	 * assume they are valid.  For example, don't try to follow throtl_data
413 	 * and request queue links.
414 	 *
415 	 * Having a reference to a blkg under RCU only allows access to values
416 	 * local to groups, like group stats and group rate limits.
417 	 */
418 	call_rcu(&blkg->rcu_head, blkg_rcu_free);
419 }
420 EXPORT_SYMBOL_GPL(__blkg_release);
421 
422 /*
423  * The "next" iteration function used by blk_queue_for_each_rl().  It's a
424  * bit tricky because the root blkg uses @q->root_rl instead of its own rl.
425  */
426 struct request_list *__blk_queue_next_rl(struct request_list *rl,
427 					 struct request_queue *q)
428 {
429 	struct list_head *ent;
430 	struct blkcg_gq *blkg;
431 
432 	/*
433 	 * Determine the current blkg list_head.  The first entry is
434 	 * root_rl which is off @q->blkg_list and mapped to the head.
435 	 */
436 	if (rl == &q->root_rl) {
437 		ent = &q->blkg_list;
438 		/* There are no more block groups, hence no request lists */
439 		if (list_empty(ent))
440 			return NULL;
441 	} else {
442 		blkg = container_of(rl, struct blkcg_gq, rl);
443 		ent = &blkg->q_node;
444 	}
445 
446 	/* walk to the next list_head, skip root blkcg */
447 	ent = ent->next;
448 	if (ent == &q->root_blkg->q_node)
449 		ent = ent->next;
450 	if (ent == &q->blkg_list)
451 		return NULL;
452 
453 	blkg = container_of(ent, struct blkcg_gq, q_node);
454 	return &blkg->rl;
455 }
456 
457 static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype,
458 			     u64 val)
459 {
460 	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
461 	struct blkcg_gq *blkg;
462 	int i;
463 
464 	mutex_lock(&blkcg_pol_mutex);
465 	spin_lock_irq(&blkcg->lock);
466 
467 	/*
468 	 * Note that stat reset is racy - it doesn't synchronize against
469 	 * stat updates.  This is a debug feature which shouldn't exist
470 	 * anyway.  If you get hit by a race, retry.
471 	 */
472 	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
473 		for (i = 0; i < BLKCG_MAX_POLS; i++) {
474 			struct blkcg_policy *pol = blkcg_policy[i];
475 
476 			if (blkcg_policy_enabled(blkg->q, pol) &&
477 			    pol->pd_reset_stats_fn)
478 				pol->pd_reset_stats_fn(blkg);
479 		}
480 	}
481 
482 	spin_unlock_irq(&blkcg->lock);
483 	mutex_unlock(&blkcg_pol_mutex);
484 	return 0;
485 }
486 
487 static const char *blkg_dev_name(struct blkcg_gq *blkg)
488 {
489 	/* some drivers (floppy) instantiate a queue w/o disk registered */
490 	if (blkg->q->backing_dev_info.dev)
491 		return dev_name(blkg->q->backing_dev_info.dev);
492 	return NULL;
493 }
494 
495 /**
496  * blkcg_print_blkgs - helper for printing per-blkg data
497  * @sf: seq_file to print to
498  * @blkcg: blkcg of interest
499  * @prfill: fill function to print out a blkg
500  * @pol: policy in question
501  * @data: data to be passed to @prfill
502  * @show_total: to print out sum of prfill return values or not
503  *
504  * This function invokes @prfill on each blkg of @blkcg if the pd for the
505  * policy specified by @pol exists.  @prfill is invoked with @sf, the
506  * policy data and @data, with the matching queue lock held.  If @show_total
507  * is %true, the sum of the return values from @prfill is printed with a
508  * "Total" label at the end.
509  *
510  * This is to be used to construct print functions for the
511  * cftype->read_seq_string method.
512  */
513 void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
514 		       u64 (*prfill)(struct seq_file *,
515 				     struct blkg_policy_data *, int),
516 		       const struct blkcg_policy *pol, int data,
517 		       bool show_total)
518 {
519 	struct blkcg_gq *blkg;
520 	u64 total = 0;
521 
522 	rcu_read_lock();
523 	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
524 		spin_lock_irq(blkg->q->queue_lock);
525 		if (blkcg_policy_enabled(blkg->q, pol))
526 			total += prfill(sf, blkg->pd[pol->plid], data);
527 		spin_unlock_irq(blkg->q->queue_lock);
528 	}
529 	rcu_read_unlock();
530 
531 	if (show_total)
532 		seq_printf(sf, "Total %llu\n", (unsigned long long)total);
533 }
534 EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
535 
536 /**
537  * __blkg_prfill_u64 - prfill helper for a single u64 value
538  * @sf: seq_file to print to
539  * @pd: policy private data of interest
540  * @v: value to print
541  *
542  * Print @v to @sf for the device associated with @pd.
543  */
544 u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
545 {
546 	const char *dname = blkg_dev_name(pd->blkg);
547 
548 	if (!dname)
549 		return 0;
550 
551 	seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
552 	return v;
553 }
554 EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
555 
556 /**
557  * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
558  * @sf: seq_file to print to
559  * @pd: policy private data of interest
560  * @rwstat: rwstat to print
561  *
562  * Print @rwstat to @sf for the device associated with @pd.
563  */
564 u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
565 			 const struct blkg_rwstat *rwstat)
566 {
567 	static const char *rwstr[] = {
568 		[BLKG_RWSTAT_READ]	= "Read",
569 		[BLKG_RWSTAT_WRITE]	= "Write",
570 		[BLKG_RWSTAT_SYNC]	= "Sync",
571 		[BLKG_RWSTAT_ASYNC]	= "Async",
572 	};
573 	const char *dname = blkg_dev_name(pd->blkg);
574 	u64 v;
575 	int i;
576 
577 	if (!dname)
578 		return 0;
579 
580 	for (i = 0; i < BLKG_RWSTAT_NR; i++)
581 		seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
582 			   (unsigned long long)rwstat->cnt[i]);
583 
584 	v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
585 	seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
586 	return v;
587 }
588 EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
589 
590 /**
591  * blkg_prfill_stat - prfill callback for blkg_stat
592  * @sf: seq_file to print to
593  * @pd: policy private data of interest
594  * @off: offset to the blkg_stat in @pd
595  *
596  * prfill callback for printing a blkg_stat.
597  */
598 u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
599 {
600 	return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
601 }
602 EXPORT_SYMBOL_GPL(blkg_prfill_stat);
603 
604 /**
605  * blkg_prfill_rwstat - prfill callback for blkg_rwstat
606  * @sf: seq_file to print to
607  * @pd: policy private data of interest
608  * @off: offset to the blkg_rwstat in @pd
609  *
610  * prfill callback for printing a blkg_rwstat.
611  */
612 u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
613 		       int off)
614 {
615 	struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);
616 
617 	return __blkg_prfill_rwstat(sf, pd, &rwstat);
618 }
619 EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
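
/*
 * A sketch of how the prfill helpers above plug into blkcg_print_blkgs()
 * from a cftype->read_seq_string callback.  "example_policy" is a
 * hypothetical registered blkcg_policy and cft->private is assumed to
 * carry the offset of the stat inside the policy data.
 */
static int example_print_stat(struct cgroup *cgrp, struct cftype *cft,
			      struct seq_file *sf)
{
	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);

	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &example_policy,
			  cft->private, false);
	return 0;
}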
620 
621 /**
622  * blkg_stat_recursive_sum - collect hierarchical blkg_stat
623  * @pd: policy private data of interest
624  * @off: offset to the blkg_stat in @pd
625  *
626  * Collect the blkg_stat specified by @off from @pd and all its online
627  * descendants and return the sum.  The caller must be holding the queue
628  * lock for online tests.
629  */
630 u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off)
631 {
632 	struct blkcg_policy *pol = blkcg_policy[pd->plid];
633 	struct blkcg_gq *pos_blkg;
634 	struct cgroup *pos_cgrp;
635 	u64 sum;
636 
637 	lockdep_assert_held(pd->blkg->q->queue_lock);
638 
639 	sum = blkg_stat_read((void *)pd + off);
640 
641 	rcu_read_lock();
642 	blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) {
643 		struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
644 		struct blkg_stat *stat = (void *)pos_pd + off;
645 
646 		if (pos_blkg->online)
647 			sum += blkg_stat_read(stat);
648 	}
649 	rcu_read_unlock();
650 
651 	return sum;
652 }
653 EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
654 
655 /**
656  * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
657  * @pd: policy private data of interest
658  * @off: offset to the blkg_rwstat in @pd
659  *
660  * Collect the blkg_rwstat specified by @off from @pd and all its online
661  * descendants and return the sum.  The caller must be holding the queue
662  * lock for online tests.
663  */
664 struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
665 					     int off)
666 {
667 	struct blkcg_policy *pol = blkcg_policy[pd->plid];
668 	struct blkcg_gq *pos_blkg;
669 	struct cgroup *pos_cgrp;
670 	struct blkg_rwstat sum;
671 	int i;
672 
673 	lockdep_assert_held(pd->blkg->q->queue_lock);
674 
675 	sum = blkg_rwstat_read((void *)pd + off);
676 
677 	rcu_read_lock();
678 	blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) {
679 		struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
680 		struct blkg_rwstat *rwstat = (void *)pos_pd + off;
681 		struct blkg_rwstat tmp;
682 
683 		if (!pos_blkg->online)
684 			continue;
685 
686 		tmp = blkg_rwstat_read(rwstat);
687 
688 		for (i = 0; i < BLKG_RWSTAT_NR; i++)
689 			sum.cnt[i] += tmp.cnt[i];
690 	}
691 	rcu_read_unlock();
692 
693 	return sum;
694 }
695 EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
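
/*
 * A sketch of hierarchical prfill callbacks built on the recursive helpers
 * above; the example_prfill_* names are hypothetical.  The queue lock
 * requirement is satisfied because blkcg_print_blkgs() invokes prfill with
 * the matching queue lock held.
 */
static u64 example_prfill_stat_recursive(struct seq_file *sf,
					 struct blkg_policy_data *pd, int off)
{
	return __blkg_prfill_u64(sf, pd, blkg_stat_recursive_sum(pd, off));
}

static u64 example_prfill_rwstat_recursive(struct seq_file *sf,
					   struct blkg_policy_data *pd, int off)
{
	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd, off);

	return __blkg_prfill_rwstat(sf, pd, &sum);
}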
696 
697 /**
698  * blkg_conf_prep - parse and prepare for per-blkg config update
699  * @blkcg: target block cgroup
700  * @pol: target policy
701  * @input: input string
702  * @ctx: blkg_conf_ctx to be filled
703  *
704  * Parse per-blkg config update from @input and initialize @ctx with the
705  * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
706  * value.  This function returns with RCU read lock and queue lock held and
707  * must be paired with blkg_conf_finish().
708  */
709 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
710 		   const char *input, struct blkg_conf_ctx *ctx)
711 	__acquires(rcu) __acquires(disk->queue->queue_lock)
712 {
713 	struct gendisk *disk;
714 	struct blkcg_gq *blkg;
715 	unsigned int major, minor;
716 	unsigned long long v;
717 	int part, ret;
718 
719 	if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
720 		return -EINVAL;
721 
722 	disk = get_gendisk(MKDEV(major, minor), &part);
723 	if (!disk || part)
724 		return -EINVAL;
725 
726 	rcu_read_lock();
727 	spin_lock_irq(disk->queue->queue_lock);
728 
729 	if (blkcg_policy_enabled(disk->queue, pol))
730 		blkg = blkg_lookup_create(blkcg, disk->queue);
731 	else
732 		blkg = ERR_PTR(-EINVAL);
733 
734 	if (IS_ERR(blkg)) {
735 		ret = PTR_ERR(blkg);
736 		rcu_read_unlock();
737 		spin_unlock_irq(disk->queue->queue_lock);
738 		put_disk(disk);
739 		/*
740 		 * If queue was bypassing, we should retry.  Do so after a
741 		 * short msleep().  It isn't strictly necessary but queue
742 		 * can be bypassing for some time and it's always nice to
743 		 * avoid busy looping.
744 		 */
745 		if (ret == -EBUSY) {
746 			msleep(10);
747 			ret = restart_syscall();
748 		}
749 		return ret;
750 	}
751 
752 	ctx->disk = disk;
753 	ctx->blkg = blkg;
754 	ctx->v = v;
755 	return 0;
756 }
757 EXPORT_SYMBOL_GPL(blkg_conf_prep);
758 
759 /**
760  * blkg_conf_finish - finish up per-blkg config update
761  * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
762  *
763  * Finish up after per-blkg config update.  This function must be paired
764  * with blkg_conf_prep().
765  */
766 void blkg_conf_finish(struct blkg_conf_ctx *ctx)
767 	__releases(ctx->disk->queue->queue_lock) __releases(rcu)
768 {
769 	spin_unlock_irq(ctx->disk->queue->queue_lock);
770 	rcu_read_unlock();
771 	put_disk(ctx->disk);
772 }
773 EXPORT_SYMBOL_GPL(blkg_conf_finish);
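
/*
 * A sketch of the prep/finish pairing for a "MAJ:MIN value" cgroup file,
 * assuming a cftype->write_string style handler; example_set_limit and
 * example_policy are hypothetical.  Between the two calls the RCU read
 * lock and the queue lock are held, ctx.blkg is the target blkg and
 * ctx.v is the parsed value.
 */
static int example_set_limit(struct cgroup *cgrp, struct cftype *cft,
			     const char *buf)
{
	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
	struct blkg_conf_ctx ctx;
	int ret;

	ret = blkg_conf_prep(blkcg, &example_policy, buf, &ctx);
	if (ret)
		return ret;

	/* a real policy would update its data in ctx.blkg (via blkg_to_pd()) */

	blkg_conf_finish(&ctx);
	return 0;
}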
774 
775 struct cftype blkcg_files[] = {
776 	{
777 		.name = "reset_stats",
778 		.write_u64 = blkcg_reset_stats,
779 	},
780 	{ }	/* terminate */
781 };
782 
783 /**
784  * blkcg_css_offline - cgroup css_offline callback
785  * @cgroup: cgroup of interest
786  *
787  * This function is called when @cgroup is about to go away and is
788  * responsible for shooting down all blkgs associated with @cgroup.  blkgs
789  * should be removed while holding both q and blkcg locks.  As the blkcg
790  * lock is nested inside the q lock, this performs reverse double lock dancing.
791  *
792  * This is the blkcg counterpart of ioc_release_fn().
793  */
794 static void blkcg_css_offline(struct cgroup *cgroup)
795 {
796 	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
797 
798 	spin_lock_irq(&blkcg->lock);
799 
800 	while (!hlist_empty(&blkcg->blkg_list)) {
801 		struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
802 						struct blkcg_gq, blkcg_node);
803 		struct request_queue *q = blkg->q;
804 
805 		if (spin_trylock(q->queue_lock)) {
806 			blkg_destroy(blkg);
807 			spin_unlock(q->queue_lock);
808 		} else {
809 			spin_unlock_irq(&blkcg->lock);
810 			cpu_relax();
811 			spin_lock_irq(&blkcg->lock);
812 		}
813 	}
814 
815 	spin_unlock_irq(&blkcg->lock);
816 }
817 
818 static void blkcg_css_free(struct cgroup *cgroup)
819 {
820 	struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
821 
822 	if (blkcg != &blkcg_root)
823 		kfree(blkcg);
824 }
825 
826 static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup)
827 {
828 	static atomic64_t id_seq = ATOMIC64_INIT(0);
829 	struct blkcg *blkcg;
830 	struct cgroup *parent = cgroup->parent;
831 
832 	if (!parent) {
833 		blkcg = &blkcg_root;
834 		goto done;
835 	}
836 
837 	blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
838 	if (!blkcg)
839 		return ERR_PTR(-ENOMEM);
840 
841 	blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
842 	blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT;
843 	blkcg->id = atomic64_inc_return(&id_seq); /* root is 0, start from 1 */
844 done:
845 	spin_lock_init(&blkcg->lock);
846 	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
847 	INIT_HLIST_HEAD(&blkcg->blkg_list);
848 
849 	return &blkcg->css;
850 }
851 
852 /**
853  * blkcg_init_queue - initialize blkcg part of request queue
854  * @q: request_queue to initialize
855  *
856  * Called from blk_alloc_queue_node().  Responsible for initializing the
857  * blkcg part of a new request_queue @q.
858  *
859  * RETURNS:
860  * 0 on success, -errno on failure.
861  */
862 int blkcg_init_queue(struct request_queue *q)
863 {
864 	might_sleep();
865 
866 	return blk_throtl_init(q);
867 }
868 
869 /**
870  * blkcg_drain_queue - drain blkcg part of request_queue
871  * @q: request_queue to drain
872  *
873  * Called from blk_drain_queue().  Responsible for draining blkcg part.
874  */
875 void blkcg_drain_queue(struct request_queue *q)
876 {
877 	lockdep_assert_held(q->queue_lock);
878 
879 	blk_throtl_drain(q);
880 }
881 
882 /**
883  * blkcg_exit_queue - exit and release blkcg part of request_queue
884  * @q: request_queue being released
885  *
886  * Called from blk_release_queue().  Responsible for exiting blkcg part.
887  */
888 void blkcg_exit_queue(struct request_queue *q)
889 {
890 	spin_lock_irq(q->queue_lock);
891 	blkg_destroy_all(q);
892 	spin_unlock_irq(q->queue_lock);
893 
894 	blk_throtl_exit(q);
895 }
896 
897 /*
898  * We cannot support shared io contexts, as we have no means to support
899  * two tasks with the same ioc in two different groups without major rework
900  * of the main cic data structures.  For now we allow a task to change
901  * its cgroup only if it's the only owner of its ioc.
902  */
903 static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
904 {
905 	struct task_struct *task;
906 	struct io_context *ioc;
907 	int ret = 0;
908 
909 	/* task_lock() is needed to avoid races with exit_io_context() */
910 	cgroup_taskset_for_each(task, cgrp, tset) {
911 		task_lock(task);
912 		ioc = task->io_context;
913 		if (ioc && atomic_read(&ioc->nr_tasks) > 1)
914 			ret = -EINVAL;
915 		task_unlock(task);
916 		if (ret)
917 			break;
918 	}
919 	return ret;
920 }
921 
922 struct cgroup_subsys blkio_subsys = {
923 	.name = "blkio",
924 	.css_alloc = blkcg_css_alloc,
925 	.css_offline = blkcg_css_offline,
926 	.css_free = blkcg_css_free,
927 	.can_attach = blkcg_can_attach,
928 	.subsys_id = blkio_subsys_id,
929 	.base_cftypes = blkcg_files,
930 	.module = THIS_MODULE,
931 
932 	/*
933 	 * blkio subsystem is utterly broken in terms of hierarchy support.
934 	 * It treats all cgroups equally regardless of where they're
935 	 * located in the hierarchy - all cgroups are treated as if they're
936 	 * right below the root.  Fix it and remove the following.
937 	 */
938 	.broken_hierarchy = true,
939 };
940 EXPORT_SYMBOL_GPL(blkio_subsys);
941 
942 /**
943  * blkcg_activate_policy - activate a blkcg policy on a request_queue
944  * @q: request_queue of interest
945  * @pol: blkcg policy to activate
946  *
947  * Activate @pol on @q.  Requires %GFP_KERNEL context.  @q goes through
948  * bypass mode to populate its blkgs with policy_data for @pol.
949  *
950  * Activation happens with @q bypassed, so nobody would be accessing blkgs
951  * from IO path.  Update of each blkg is protected by both queue and blkcg
952  * locks so that holding either lock and testing blkcg_policy_enabled() is
953  * always enough for dereferencing policy data.
954  *
955  * The caller is responsible for synchronizing [de]activations and policy
956  * [un]registrations.  Returns 0 on success, -errno on failure.
957  */
958 int blkcg_activate_policy(struct request_queue *q,
959 			  const struct blkcg_policy *pol)
960 {
961 	LIST_HEAD(pds);
962 	struct blkcg_gq *blkg, *new_blkg;
963 	struct blkg_policy_data *pd, *n;
964 	int cnt = 0, ret;
965 	bool preloaded;
966 
967 	if (blkcg_policy_enabled(q, pol))
968 		return 0;
969 
970 	/* preallocations for root blkg */
971 	new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
972 	if (!new_blkg)
973 		return -ENOMEM;
974 
975 	blk_queue_bypass_start(q);
976 
977 	preloaded = !radix_tree_preload(GFP_KERNEL);
978 
979 	/*
980 	 * Make sure the root blkg exists and count the existing blkgs.  As
981 	 * @q is bypassing at this point, blkg_lookup_create() can't be
982 	 * used.  Open code it.
983 	 */
984 	spin_lock_irq(q->queue_lock);
985 
986 	rcu_read_lock();
987 	blkg = __blkg_lookup(&blkcg_root, q, false);
988 	if (blkg)
989 		blkg_free(new_blkg);
990 	else
991 		blkg = blkg_create(&blkcg_root, q, new_blkg);
992 	rcu_read_unlock();
993 
994 	if (preloaded)
995 		radix_tree_preload_end();
996 
997 	if (IS_ERR(blkg)) {
998 		ret = PTR_ERR(blkg);
999 		goto out_unlock;
1000 	}
1001 	q->root_blkg = blkg;
1002 	q->root_rl.blkg = blkg;
1003 
1004 	list_for_each_entry(blkg, &q->blkg_list, q_node)
1005 		cnt++;
1006 
1007 	spin_unlock_irq(q->queue_lock);
1008 
1009 	/* allocate policy_data for all existing blkgs */
1010 	while (cnt--) {
1011 		pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
1012 		if (!pd) {
1013 			ret = -ENOMEM;
1014 			goto out_free;
1015 		}
1016 		list_add_tail(&pd->alloc_node, &pds);
1017 	}
1018 
1019 	/*
1020 	 * Install the allocated pds.  With @q bypassing, no new blkg
1021 	 * should have been created while the queue lock was dropped.
1022 	 */
1023 	spin_lock_irq(q->queue_lock);
1024 
1025 	list_for_each_entry(blkg, &q->blkg_list, q_node) {
1026 		if (WARN_ON(list_empty(&pds))) {
1027 			/* umm... this shouldn't happen, just abort */
1028 			ret = -ENOMEM;
1029 			goto out_unlock;
1030 		}
1031 		pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
1032 		list_del_init(&pd->alloc_node);
1033 
1034 		/* grab blkcg lock too while installing @pd on @blkg */
1035 		spin_lock(&blkg->blkcg->lock);
1036 
1037 		blkg->pd[pol->plid] = pd;
1038 		pd->blkg = blkg;
1039 		pd->plid = pol->plid;
1040 		pol->pd_init_fn(blkg);
1041 
1042 		spin_unlock(&blkg->blkcg->lock);
1043 	}
1044 
1045 	__set_bit(pol->plid, q->blkcg_pols);
1046 	ret = 0;
1047 out_unlock:
1048 	spin_unlock_irq(q->queue_lock);
1049 out_free:
1050 	blk_queue_bypass_end(q);
1051 	list_for_each_entry_safe(pd, n, &pds, alloc_node)
1052 		kfree(pd);
1053 	return ret;
1054 }
1055 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
1056 
1057 /**
1058  * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
1059  * @q: request_queue of interest
1060  * @pol: blkcg policy to deactivate
1061  *
1062  * Deactivate @pol on @q.  Follows the same synchronization rules as
1063  * blkcg_activate_policy().
1064  */
1065 void blkcg_deactivate_policy(struct request_queue *q,
1066 			     const struct blkcg_policy *pol)
1067 {
1068 	struct blkcg_gq *blkg;
1069 
1070 	if (!blkcg_policy_enabled(q, pol))
1071 		return;
1072 
1073 	blk_queue_bypass_start(q);
1074 	spin_lock_irq(q->queue_lock);
1075 
1076 	__clear_bit(pol->plid, q->blkcg_pols);
1077 
1078 	/* if no policy is left, no need for blkgs - shoot them down */
1079 	if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS))
1080 		blkg_destroy_all(q);
1081 
1082 	list_for_each_entry(blkg, &q->blkg_list, q_node) {
1083 		/* grab blkcg lock too while removing @pd from @blkg */
1084 		spin_lock(&blkg->blkcg->lock);
1085 
1086 		if (pol->pd_offline_fn)
1087 			pol->pd_offline_fn(blkg);
1088 		if (pol->pd_exit_fn)
1089 			pol->pd_exit_fn(blkg);
1090 
1091 		kfree(blkg->pd[pol->plid]);
1092 		blkg->pd[pol->plid] = NULL;
1093 
1094 		spin_unlock(&blkg->blkcg->lock);
1095 	}
1096 
1097 	spin_unlock_irq(q->queue_lock);
1098 	blk_queue_bypass_end(q);
1099 }
1100 EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
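
/*
 * A sketch of the activate/deactivate pairing: a policy typically enables
 * itself on a queue from its per-queue initialization path and tears down
 * from the corresponding exit path.  example_policy and the example_*
 * hooks are hypothetical.
 */
static int example_init_queue(struct request_queue *q)
{
	/* populates every existing blkg on @q with example_policy's pd */
	return blkcg_activate_policy(q, &example_policy);
}

static void example_exit_queue(struct request_queue *q)
{
	/* frees example_policy's pd on every blkg of @q */
	blkcg_deactivate_policy(q, &example_policy);
}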
1101 
1102 /**
1103  * blkcg_policy_register - register a blkcg policy
1104  * @pol: blkcg policy to register
1105  *
1106  * Register @pol with blkcg core.  Might sleep and @pol may be modified on
1107  * successful registration.  Returns 0 on success and -errno on failure.
1108  */
1109 int blkcg_policy_register(struct blkcg_policy *pol)
1110 {
1111 	int i, ret;
1112 
1113 	if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
1114 		return -EINVAL;
1115 
1116 	mutex_lock(&blkcg_pol_mutex);
1117 
1118 	/* find an empty slot */
1119 	ret = -ENOSPC;
1120 	for (i = 0; i < BLKCG_MAX_POLS; i++)
1121 		if (!blkcg_policy[i])
1122 			break;
1123 	if (i >= BLKCG_MAX_POLS)
1124 		goto out_unlock;
1125 
1126 	/* register and update blkgs */
1127 	pol->plid = i;
1128 	blkcg_policy[i] = pol;
1129 
1130 	/* everything is in place, add intf files for the new policy */
1131 	if (pol->cftypes)
1132 		WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes));
1133 	ret = 0;
1134 out_unlock:
1135 	mutex_unlock(&blkcg_pol_mutex);
1136 	return ret;
1137 }
1138 EXPORT_SYMBOL_GPL(blkcg_policy_register);
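
/*
 * A registration sketch with hypothetical example_* names.  The per-blkg
 * data must embed struct blkg_policy_data (and pd_size must cover it, as
 * enforced by the size check above); the callbacks are all optional.
 */
struct example_pd {
	struct blkg_policy_data pd;	/* must be the first member */
	u64 limit;			/* policy-private state */
};

static struct blkcg_policy example_policy = {
	.pd_size	= sizeof(struct example_pd),
	.cftypes	= NULL,		/* or an array of cgroup control files */
	.pd_init_fn	= NULL,		/* optional per-blkg callbacks */
	.pd_exit_fn	= NULL,
};

static int __init example_module_init(void)
{
	/* assigns ->plid and adds ->cftypes, if any, to the blkio subsys */
	return blkcg_policy_register(&example_policy);
}

static void __exit example_module_exit(void)
{
	blkcg_policy_unregister(&example_policy);
}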
1139 
1140 /**
1141  * blkcg_policy_unregister - unregister a blkcg policy
1142  * @pol: blkcg policy to unregister
1143  *
1144  * Undo blkcg_policy_register(@pol).  Might sleep.
1145  */
1146 void blkcg_policy_unregister(struct blkcg_policy *pol)
1147 {
1148 	mutex_lock(&blkcg_pol_mutex);
1149 
1150 	if (WARN_ON(blkcg_policy[pol->plid] != pol))
1151 		goto out_unlock;
1152 
1153 	/* kill the intf files first */
1154 	if (pol->cftypes)
1155 		cgroup_rm_cftypes(&blkio_subsys, pol->cftypes);
1156 
1157 	/* unregister and update blkgs */
1158 	blkcg_policy[pol->plid] = NULL;
1159 out_unlock:
1160 	mutex_unlock(&blkcg_pol_mutex);
1161 }
1162 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
1163