/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/lockdep.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing discipline manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box
   that can enqueue packets and dequeue them (when the device
   is ready to send something), in an order and at times
   determined by the algorithm hidden inside it.

   qdiscs fall into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all packets into "traffic classes"
     using "packet classifiers" (see cls_api.c).

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on recursively.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into
   a form more intelligible to the kernel, to perform some sanity
   checks and the part of the work common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty: it only means that the
   discipline does not want to send anything at this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not
   the real packet queue, but q->q.qlen must still be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP 	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by police.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---peek

   like dequeue, but without removing the packet from the queue.

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers, counters (except for statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys the resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
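
/*
 * A minimal sketch of the enqueue/dequeue contract described above,
 * for illustration only (not compiled): "example_fifo" and its limit
 * of 128 packets are hypothetical, but the helpers and NET_XMIT_*
 * codes are the real ones. sch->q is the built-in skb list;
 * __skb_queue_tail() and __skb_dequeue() keep q.qlen up to date.
 */
#if 0
static int example_fifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	if (skb_queue_len(&sch->q) >= 128) {
		/* A drop must be reported with a NET_XMIT_* code. */
		kfree_skb(skb);
		sch->qstats.drops++;
		return NET_XMIT_DROP;
	}
	__skb_queue_tail(&sch->q, skb);
	sch->qstats.backlog += qdisc_pkt_len(skb);
	return 0;
}

static struct sk_buff *example_fifo_dequeue(struct Qdisc *sch)
{
	struct sk_buff *skb = __skb_dequeue(&sch->q);

	/* NULL here need not mean "empty"; q->q.qlen is authoritative. */
	if (skb)
		sch->qstats.backlog -= qdisc_pkt_len(skb);
	return skb;
}
#endif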

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL) {
			qops->peek = noop_qdisc_ops.peek;
		} else {
			rc = -EINVAL;
			goto out;
		}
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;
}
EXPORT_SYMBOL(register_qdisc);

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);
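
/*
 * A hedged sketch of how a qdisc module would use the two calls above
 * (cf. sch_fifo.c for a real example; not compiled). The ops struct,
 * the example_fifo_* callbacks from the sketch earlier in this file,
 * and the trivial peek below are all hypothetical. Note that
 * register_qdisc() rejects ops with a dequeue but no peek, so a peek
 * must be supplied here.
 */
#if 0
static struct sk_buff *example_fifo_peek(struct Qdisc *sch)
{
	return skb_peek(&sch->q);
}

static struct Qdisc_ops example_fifo_qdisc_ops __read_mostly = {
	.id		= "example_fifo",
	.priv_size	= 0,
	.enqueue	= example_fifo_enqueue,
	.dequeue	= example_fifo_dequeue,
	.peek		= example_fifo_peek,
	.owner		= THIS_MODULE,
};

static int __init example_fifo_module_init(void)
{
	return register_qdisc(&example_fifo_qdisc_ops);
}

static void __exit example_fifo_module_exit(void)
{
	unregister_qdisc(&example_fifo_qdisc_ops);
}

module_init(example_fifo_module_init);
module_exit(example_fifo_module_exit);
MODULE_LICENSE("GPL");
#endif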

/* We know the handle. Find the qdisc among all qdiscs attached to the device
   (root qdisc, all its children, children of children, etc.)
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	list_for_each_entry(q, &root->list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

static void qdisc_list_add(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_add_tail(&q->list, &qdisc_root_sleeping(q)->list);
}

void qdisc_list_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_del(&q->list);
}
EXPORT_SYMBOL(qdisc_list_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	unsigned int i;
	struct Qdisc *q;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
		struct Qdisc *txq_root = txq->qdisc_sleeping;

		q = qdisc_match_from_root(txq_root, handle);
		if (q)
			goto out;
	}

	q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
out:
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);

static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE])
		return ERR_PTR(-EINVAL);

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA])
			return ERR_PTR(-EINVAL);
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (!s || tsize != s->tsize || (!tab && tsize > 0))
		return ERR_PTR(-EINVAL);

	spin_lock(&qdisc_stab_lock);

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		spin_unlock(&qdisc_stab_lock);
		return stab;
	}

	spin_unlock(&qdisc_stab_lock);

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	spin_lock(&qdisc_stab_lock);
	list_add_tail(&stab->list, &qdisc_stab_list);
	spin_unlock(&qdisc_stab_lock);

	return stab;
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		kfree(tab);
	}

	spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(qdisc_calculate_pkt_len);
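
/*
 * Worked example with a hypothetical size table: cell_log = 6,
 * size_log = 0, cell_align = -1, overhead = 0, tsize = 512.
 * For skb->len = 100: pkt_len = 100, slot = (100 - 1) >> 6 = 1, so the
 * effective length becomes stab->data[1] << 0. In other words, each
 * 64-byte cell of the raw length maps to a user-supplied link-layer
 * cost, and lengths past the end of the table are extrapolated from
 * its last entry.
 */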

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
	__netif_schedule(qdisc_root(wd->qdisc));

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
{
	ktime_t time;

	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	wd->qdisc->flags |= TCQ_F_THROTTLED;
	time = ktime_set(0, 0);
	time = ktime_add_ns(time, PSCHED_US2NS(expires));
	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
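
/*
 * A hedged sketch of how a rate-limiting qdisc typically uses the
 * watchdog (cf. sch_tbf for a real user; not compiled).
 * "example_sched_data" and its fields are hypothetical; the point is
 * that a throttled dequeue returns NULL and arms the watchdog, whose
 * expiry clears TCQ_F_THROTTLED and reschedules the device.
 */
#if 0
struct example_sched_data {
	struct qdisc_watchdog	watchdog;	/* qdisc_watchdog_init() in ->init */
	psched_time_t		next_send;	/* earliest allowed send time */
};

static struct sk_buff *example_rate_dequeue(struct Qdisc *sch)
{
	struct example_sched_data *priv = qdisc_priv(sch);

	if (psched_get_time() < priv->next_send) {
		/* Throttled, not empty: q.qlen may still be non-zero. */
		qdisc_watchdog_schedule(&priv->watchdog, priv->next_send);
		return NULL;
	}
	return __skb_dequeue(&sch->q);
}
#endif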

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head), i;
	struct hlist_head *h;

	if (size <= PAGE_SIZE)
		h = kmalloc(size, GFP_KERNEL);
	else
		h = (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(size));

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head);

	if (size <= PAGE_SIZE)
		kfree(h);
	else
		free_pages((unsigned long)h, get_order(size));
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *n, *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
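	/* (the integer form of hashelems / hashsize > 3/4; e.g. with the
	 * initial 4 buckets, the 4th insertion is the first to trigger a
	 * grow, since 4 * 4 > 4 * 3) */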
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (clhash->hash == NULL)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);

/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while (qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}

/* Attach toplevel qdisc to device queue. */

static struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
				     struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* Prune old scheduler */
	if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
		qdisc_reset(oqdisc);

	/* ... and graft new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}

void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);

static void notify_and_destroy(struct sk_buff *skb, struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy the old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = &dev->rx_queue;

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			notify_and_destroy(skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			}
		}
		if (!err)
			notify_and_destroy(skb, n, classid, old, new);
	}
	return err;
}

/* lockdep annotation is needed for ingress; egress gets it only for name */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     u32 parent, u32 handle, struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* The replay will retry qdisc_lookup_ops,
				 * so don't keep a reference here.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out3;
			}
			sch->stab = stab;
		}
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;

			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);

			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						root_lock, tca[TCA_RATE]);
			if (err) {
				/*
				 * Any broken qdiscs that would require
				 * a ops->reset() here? The qdisc was never
				 * in action so it shouldn't be necessary.
				 */
				if (ops->destroy)
					ops->destroy(sch);
				goto err_out3;
			}
		}

		qdisc_list_add(sch);

		return sch;
	}
err_out3:
	qdisc_put_stab(sch->stab);
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	qdisc_put_stab(sch->stab);
	sch->stab = stab;

	if (tca[TCA_RATE])
		/* NB: ignores errors from replace_estimator
		   because the change can't be undone. */
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      qdisc_root_sleeping_lock(sch),
				      tca[TCA_RATE]);

	return 0;
}

struct check_loop_arg
{
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if (net != &init_net)
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
			return err;
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}

/*
 * Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (net != &init_net)
		return -EINVAL;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}

		/* It may be the default qdisc; ignore it. */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child qdisc is already
				 *   attached to this parent, and we have a choice:
				 *   either to change it or to create/graft a new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if both the CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requestor meant that
				 *   a qdisc with handle tcm_handle is not expected
				 *   to exist, so we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, this is a sort of hole in the API; we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND that does not match the existing one.
				 */
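				/*
				 * For illustration, the usual iproute2
				 * mapping (an assumption about common
				 * userspace, not enforced here):
				 *   tc qdisc add     -> NLM_F_CREATE|NLM_F_EXCL    (case 2)
				 *   tc qdisc replace -> NLM_F_CREATE|NLM_F_REPLACE (case 1)
				 *   tc qdisc change  -> no flags; falls through
				 *                      to qdisc_change() below.
				 */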
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, &dev->rx_queue,
				 tcm->tcm_parent, tcm->tcm_parent,
				 tca, &err);
	else
		q = qdisc_create(dev, netdev_get_tx_queue(dev, 0),
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	q->qstats.qlen = q->q.qlen;

	if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			u32 clid, struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && old->handle) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q)
{
	return (q->flags & TCQ_F_BUILTIN) ? true : false;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	if (net != &init_net)
		return 0;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	idx = 0;
	for_each_netdev(&init_net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		dev_queue = netdev_get_tx_queue(dev, 0);
		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = &dev->rx_queue;
		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/



static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if (net != &init_net)
		return -EINVAL;

	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - fully specified.
	   handle == X:0	 - root class.
	 */
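	/*
	 * The X:Y notation is plain bit arithmetic on the 32-bit handle
	 * (hypothetical values for illustration):
	 *   TC_H_MAKE(0x10000, 0x2) == 0x00010002   ("1:2")
	 *   TC_H_MAJ(0x00010002)    == 0x00010000   (the "1:" part)
	 *   TC_H_MIN(0x00010002)    == 0x00000002   (the ":2" part)
	 */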

	/* Step 1. Determine qdisc handle X:0 */

	dev_queue = netdev_get_tx_queue(dev, 0);
	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;

		/* Now qid is a genuine qdisc handle consistent with
		   both parent and child.

		   TC_H_MAJ(pid) may still be unspecified; complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;
	}

	/* OK. Locate the qdisc. */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes. */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get the class. */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
			 struct Qdisc *q, unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, &init_net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}

struct qdisc_dump_args
{
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	list_for_each_entry(q, &root->list, list) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (net != &init_net)
		return 0;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	dev_queue = netdev_get_tx_queue(dev, 0);
	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = &dev->rx_queue;
	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

/* Main classifier routine: scans the classifier chain attached
   to this qdisc, (optionally) tests for protocol and asks
   specific classifiers.
 */
int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
		       struct tcf_result *res)
{
	__be16 protocol = skb->protocol;
	int err = 0;

	for (; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
		     tp->protocol == htons(ETH_P_ALL)) &&
		    (err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
#endif
			return err;
		}
	}
	return -1;
}
EXPORT_SYMBOL(tc_classify_compat);

int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
	__be16 protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			printk("rule prio %u protocol %02x reclassify loop, "
			       "packet dropped\n",
			       tp->prio&0xffff, ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);
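
/*
 * A hedged sketch of how a classful qdisc's enqueue path typically
 * consumes tc_classify() (cf. sch_prio/sch_htb for real users; not
 * compiled). "example_class", "example_sched_data", "filter_list" and
 * "default_class" are hypothetical names; res.class carries the class
 * cookie stored by the matching filter. Real users also inspect the
 * TC_ACT_* verdicts under CONFIG_NET_CLS_ACT before trusting res.
 */
#if 0
static struct example_class *example_classify(struct sk_buff *skb,
					      struct Qdisc *sch)
{
	struct example_sched_data *priv = qdisc_priv(sch);
	struct tcf_result res;

	if (tc_classify(skb, priv->filter_list, &res) >= 0)
		return (struct example_class *)res.class;
	return priv->default_class;	/* no filter matched */
}
#endif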

void tcf_destroy(struct tcf_proto *tp)
{
	tp->ops->destroy(tp);
	module_put(tp->ops->owner);
	kfree(tp);
}

void tcf_destroy_chain(struct tcf_proto **fl)
{
	struct tcf_proto *tp;

	while ((tp = *fl) != NULL) {
		*fl = tp->next;
		tcf_destroy(tp);
	}
}
EXPORT_SYMBOL(tcf_destroy_chain);

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_US2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, PDE(inode)->data);
}

static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};
#endif

static int __init pktsched_init(void)
{
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

	return 0;
}

subsys_initcall(pktsched_init);
1710