• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * net/sched/sch_api.c	Packet scheduler API.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17 
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32 
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37 
38 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
39 			struct nlmsghdr *n, u32 clid,
40 			struct Qdisc *old, struct Qdisc *new);
41 static int tclass_notify(struct net *net, struct sk_buff *oskb,
42 			 struct nlmsghdr *n, struct Qdisc *q,
43 			 unsigned long cl, int event);
44 
45 /*
46 
47    Short review.
48    -------------
49 
50    This file consists of two interrelated parts:
51 
52    1. queueing disciplines manager frontend.
53    2. traffic classes manager frontend.
54 
55    Generally, queueing discipline ("qdisc") is a black box,
56    which is able to enqueue packets and to dequeue them (when
57    device is ready to send something) in order and at times
58    determined by algorithm hidden in it.
59 
60    qdisc's are divided to two categories:
61    - "queues", which have no internal structure visible from outside.
62    - "schedulers", which split all the packets to "traffic classes",
63      using "packet classifiers" (look at cls_api.c)
64 
65    In turn, classes may have child qdiscs (as rule, queues)
66    attached to them etc. etc. etc.
67 
68    The goal of the routines in this file is to translate
69    information supplied by user in the form of handles
70    to more intelligible for kernel form, to make some sanity
71    checks and part of work, which is common to all qdiscs
72    and to provide rtnetlink notifications.
73 
74    All real intelligent work is done inside qdisc modules.
75 
76 
77 
78    Every discipline has two major routines: enqueue and dequeue.
79 
80    ---dequeue
81 
82    dequeue usually returns a skb to send. It is allowed to return NULL,
83    but it does not mean that queue is empty, it just means that
84    discipline does not want to send anything this time.
85    Queue is really empty if q->q.qlen == 0.
86    For complicated disciplines with multiple queues q->q is not
87    real packet queue, but however q->q.qlen must be valid.
88 
89    ---enqueue
90 
91    enqueue returns 0 if the packet was enqueued successfully.
92    If a packet (this one or another one) was dropped, it returns
93    a non-zero error code.
94    NET_XMIT_DROP 	- this packet dropped
95      Expected action: do not backoff, but wait until queue will clear.
96    NET_XMIT_CN	 	- probably this packet enqueued, but another one dropped.
97      Expected action: backoff or ignore
98    NET_XMIT_POLICED	- dropped by police.
99      Expected action: backoff or error to real-time apps.
100 
101    Auxiliary routines:
102 
103    ---peek
104 
105    like dequeue but without removing a packet from the queue
106 
107    ---reset
108 
109    returns qdisc to initial state: purge all buffers, clear all
110    timers, counters (except for statistics) etc.
111 
112    ---init
113 
114    initializes newly created qdisc.
115 
116    ---destroy
117 
118    destroys resources allocated by init and during lifetime of qdisc.
119 
120    ---change
121 
122    changes qdisc parameters.
123  */
124 
125 /* Protects list of registered TC modules. It is pure SMP lock. */
126 static DEFINE_RWLOCK(qdisc_mod_lock);
127 
128 
129 /************************************************
130  *	Queueing disciplines manipulation.	*
131  ************************************************/
132 
133 
134 /* The list of all installed queueing disciplines. */
135 
136 static struct Qdisc_ops *qdisc_base;
137 
138 /* Register/unregister queueing discipline */
139 
register_qdisc(struct Qdisc_ops * qops)140 int register_qdisc(struct Qdisc_ops *qops)
141 {
142 	struct Qdisc_ops *q, **qp;
143 	int rc = -EEXIST;
144 
145 	write_lock(&qdisc_mod_lock);
146 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
147 		if (!strcmp(qops->id, q->id))
148 			goto out;
149 
150 	if (qops->enqueue == NULL)
151 		qops->enqueue = noop_qdisc_ops.enqueue;
152 	if (qops->peek == NULL) {
153 		if (qops->dequeue == NULL)
154 			qops->peek = noop_qdisc_ops.peek;
155 		else
156 			goto out_einval;
157 	}
158 	if (qops->dequeue == NULL)
159 		qops->dequeue = noop_qdisc_ops.dequeue;
160 
161 	if (qops->cl_ops) {
162 		const struct Qdisc_class_ops *cops = qops->cl_ops;
163 
164 		if (!(cops->get && cops->put && cops->walk && cops->leaf))
165 			goto out_einval;
166 
167 		if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
168 			goto out_einval;
169 	}
170 
171 	qops->next = NULL;
172 	*qp = qops;
173 	rc = 0;
174 out:
175 	write_unlock(&qdisc_mod_lock);
176 	return rc;
177 
178 out_einval:
179 	rc = -EINVAL;
180 	goto out;
181 }
182 EXPORT_SYMBOL(register_qdisc);
183 
unregister_qdisc(struct Qdisc_ops * qops)184 int unregister_qdisc(struct Qdisc_ops *qops)
185 {
186 	struct Qdisc_ops *q, **qp;
187 	int err = -ENOENT;
188 
189 	write_lock(&qdisc_mod_lock);
190 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
191 		if (q == qops)
192 			break;
193 	if (q) {
194 		*qp = q->next;
195 		q->next = NULL;
196 		err = 0;
197 	}
198 	write_unlock(&qdisc_mod_lock);
199 	return err;
200 }
201 EXPORT_SYMBOL(unregister_qdisc);
202 
203 /* We know handle. Find qdisc among all qdisc's attached to device
204    (root qdisc, all its children, children of children etc.)
205  */
206 
/*
 * Return the qdisc with the given @handle: either @root itself (unless it
 * carries TCQ_F_BUILTIN) or an entry on @root's list.  NULL if none match.
 */
static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *entry;

	if (root->handle == handle && !(root->flags & TCQ_F_BUILTIN))
		return root;

	list_for_each_entry(entry, &root->list, list) {
		if (entry->handle == handle)
			return entry;
	}

	return NULL;
}
221 
qdisc_list_add(struct Qdisc * q)222 static void qdisc_list_add(struct Qdisc *q)
223 {
224 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
225 		list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
226 }
227 
qdisc_list_del(struct Qdisc * q)228 void qdisc_list_del(struct Qdisc *q)
229 {
230 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
231 		list_del(&q->list);
232 }
233 EXPORT_SYMBOL(qdisc_list_del);
234 
/*
 * Find the qdisc with @handle on @dev.  dev->qdisc and its list are
 * searched first; if nothing matches and the device has an ingress queue,
 * that queue's sleeping qdisc is searched as well.  Returns NULL if the
 * handle is not found.
 */
struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *found;

	found = qdisc_match_from_root(dev->qdisc, handle);
	if (found)
		return found;

	if (!dev_ingress_queue(dev))
		return NULL;

	return qdisc_match_from_root(dev_ingress_queue(dev)->qdisc_sleeping,
				     handle);
}
250 
/*
 * Return what the class ops' leaf() callback reports for class @classid
 * of @p.  NULL when @p has no class ops or get() returns 0 for the class.
 */
static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;
	struct Qdisc *child;
	unsigned long cl;

	if (!cops)
		return NULL;

	cl = cops->get(p, classid);
	if (!cl)
		return NULL;

	/* The class reference from get() is held only across leaf(). */
	child = cops->leaf(p, cl);
	cops->put(p, cl);

	return child;
}
267 
268 /* Find queueing discipline by name */
269 
qdisc_lookup_ops(struct nlattr * kind)270 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
271 {
272 	struct Qdisc_ops *q = NULL;
273 
274 	if (kind) {
275 		read_lock(&qdisc_mod_lock);
276 		for (q = qdisc_base; q; q = q->next) {
277 			if (nla_strcmp(kind, q->id) == 0) {
278 				if (!try_module_get(q->owner))
279 					q = NULL;
280 				break;
281 			}
282 		}
283 		read_unlock(&qdisc_mod_lock);
284 	}
285 	return q;
286 }
287 
288 static struct qdisc_rate_table *qdisc_rtab_list;
289 
qdisc_get_rtab(struct tc_ratespec * r,struct nlattr * tab)290 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
291 {
292 	struct qdisc_rate_table *rtab;
293 
294 	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
295 	    nla_len(tab) != TC_RTAB_SIZE)
296 		return NULL;
297 
298 	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
299 		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
300 		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
301 			rtab->refcnt++;
302 			return rtab;
303 		}
304 	}
305 
306 	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
307 	if (rtab) {
308 		rtab->rate = *r;
309 		rtab->refcnt = 1;
310 		memcpy(rtab->data, nla_data(tab), 1024);
311 		rtab->next = qdisc_rtab_list;
312 		qdisc_rtab_list = rtab;
313 	}
314 	return rtab;
315 }
316 EXPORT_SYMBOL(qdisc_get_rtab);
317 
qdisc_put_rtab(struct qdisc_rate_table * tab)318 void qdisc_put_rtab(struct qdisc_rate_table *tab)
319 {
320 	struct qdisc_rate_table *rtab, **rtabp;
321 
322 	if (!tab || --tab->refcnt)
323 		return;
324 
325 	for (rtabp = &qdisc_rtab_list;
326 	     (rtab = *rtabp) != NULL;
327 	     rtabp = &rtab->next) {
328 		if (rtab == tab) {
329 			*rtabp = rtab->next;
330 			kfree(rtab);
331 			return;
332 		}
333 	}
334 }
335 EXPORT_SYMBOL(qdisc_put_rtab);
336 
337 static LIST_HEAD(qdisc_stab_list);
338 static DEFINE_SPINLOCK(qdisc_stab_lock);
339 
340 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
341 	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
342 	[TCA_STAB_DATA] = { .type = NLA_BINARY },
343 };
344 
qdisc_get_stab(struct nlattr * opt)345 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
346 {
347 	struct nlattr *tb[TCA_STAB_MAX + 1];
348 	struct qdisc_size_table *stab;
349 	struct tc_sizespec *s;
350 	unsigned int tsize = 0;
351 	u16 *tab = NULL;
352 	int err;
353 
354 	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
355 	if (err < 0)
356 		return ERR_PTR(err);
357 	if (!tb[TCA_STAB_BASE])
358 		return ERR_PTR(-EINVAL);
359 
360 	s = nla_data(tb[TCA_STAB_BASE]);
361 
362 	if (s->tsize > 0) {
363 		if (!tb[TCA_STAB_DATA])
364 			return ERR_PTR(-EINVAL);
365 		tab = nla_data(tb[TCA_STAB_DATA]);
366 		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
367 	}
368 
369 	if (tsize != s->tsize || (!tab && tsize > 0))
370 		return ERR_PTR(-EINVAL);
371 
372 	spin_lock(&qdisc_stab_lock);
373 
374 	list_for_each_entry(stab, &qdisc_stab_list, list) {
375 		if (memcmp(&stab->szopts, s, sizeof(*s)))
376 			continue;
377 		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
378 			continue;
379 		stab->refcnt++;
380 		spin_unlock(&qdisc_stab_lock);
381 		return stab;
382 	}
383 
384 	spin_unlock(&qdisc_stab_lock);
385 
386 	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
387 	if (!stab)
388 		return ERR_PTR(-ENOMEM);
389 
390 	stab->refcnt = 1;
391 	stab->szopts = *s;
392 	if (tsize > 0)
393 		memcpy(stab->data, tab, tsize * sizeof(u16));
394 
395 	spin_lock(&qdisc_stab_lock);
396 	list_add_tail(&stab->list, &qdisc_stab_list);
397 	spin_unlock(&qdisc_stab_lock);
398 
399 	return stab;
400 }
401 
stab_kfree_rcu(struct rcu_head * head)402 static void stab_kfree_rcu(struct rcu_head *head)
403 {
404 	kfree(container_of(head, struct qdisc_size_table, rcu));
405 }
406 
qdisc_put_stab(struct qdisc_size_table * tab)407 void qdisc_put_stab(struct qdisc_size_table *tab)
408 {
409 	if (!tab)
410 		return;
411 
412 	spin_lock(&qdisc_stab_lock);
413 
414 	if (--tab->refcnt == 0) {
415 		list_del(&tab->list);
416 		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
417 	}
418 
419 	spin_unlock(&qdisc_stab_lock);
420 }
421 EXPORT_SYMBOL(qdisc_put_stab);
422 
qdisc_dump_stab(struct sk_buff * skb,struct qdisc_size_table * stab)423 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
424 {
425 	struct nlattr *nest;
426 
427 	nest = nla_nest_start(skb, TCA_STAB);
428 	if (nest == NULL)
429 		goto nla_put_failure;
430 	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
431 		goto nla_put_failure;
432 	nla_nest_end(skb, nest);
433 
434 	return skb->len;
435 
436 nla_put_failure:
437 	return -1;
438 }
439 
__qdisc_calculate_pkt_len(struct sk_buff * skb,const struct qdisc_size_table * stab)440 void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
441 {
442 	int pkt_len, slot;
443 
444 	pkt_len = skb->len + stab->szopts.overhead;
445 	if (unlikely(!stab->szopts.tsize))
446 		goto out;
447 
448 	slot = pkt_len + stab->szopts.cell_align;
449 	if (unlikely(slot < 0))
450 		slot = 0;
451 
452 	slot >>= stab->szopts.cell_log;
453 	if (likely(slot < stab->szopts.tsize))
454 		pkt_len = stab->data[slot];
455 	else
456 		pkt_len = stab->data[stab->szopts.tsize - 1] *
457 				(slot / stab->szopts.tsize) +
458 				stab->data[slot % stab->szopts.tsize];
459 
460 	pkt_len <<= stab->szopts.size_log;
461 out:
462 	if (unlikely(pkt_len < 1))
463 		pkt_len = 1;
464 	qdisc_skb_cb(skb)->pkt_len = pkt_len;
465 }
466 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
467 
qdisc_warn_nonwc(char * txt,struct Qdisc * qdisc)468 void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
469 {
470 	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
471 		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
472 			txt, qdisc->ops->id, qdisc->handle >> 16);
473 		qdisc->flags |= TCQ_F_WARN_NONWC;
474 	}
475 }
476 EXPORT_SYMBOL(qdisc_warn_nonwc);
477 
qdisc_watchdog(struct hrtimer * timer)478 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
479 {
480 	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
481 						 timer);
482 
483 	qdisc_unthrottled(wd->qdisc);
484 	__netif_schedule(qdisc_root(wd->qdisc));
485 
486 	return HRTIMER_NORESTART;
487 }
488 
qdisc_watchdog_init(struct qdisc_watchdog * wd,struct Qdisc * qdisc)489 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
490 {
491 	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
492 	wd->timer.function = qdisc_watchdog;
493 	wd->qdisc = qdisc;
494 }
495 EXPORT_SYMBOL(qdisc_watchdog_init);
496 
/*
 * Arm watchdog @wd to fire at absolute time @expires (nanoseconds) and
 * call qdisc_throttled() on its qdisc.  Does nothing when the sleeping
 * root qdisc has __QDISC_STATE_DEACTIVATED set.
 */
void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
{
	struct Qdisc *root = qdisc_root_sleeping(wd->qdisc);

	if (test_bit(__QDISC_STATE_DEACTIVATED, &root->state))
		return;

	qdisc_throttled(wd->qdisc);
	hrtimer_start(&wd->timer, ns_to_ktime(expires), HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
510 
qdisc_watchdog_cancel(struct qdisc_watchdog * wd)511 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
512 {
513 	hrtimer_cancel(&wd->timer);
514 	qdisc_unthrottled(wd->qdisc);
515 }
516 EXPORT_SYMBOL(qdisc_watchdog_cancel);
517 
qdisc_class_hash_alloc(unsigned int n)518 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
519 {
520 	unsigned int size = n * sizeof(struct hlist_head), i;
521 	struct hlist_head *h;
522 
523 	if (size <= PAGE_SIZE)
524 		h = kmalloc(size, GFP_KERNEL);
525 	else
526 		h = (struct hlist_head *)
527 			__get_free_pages(GFP_KERNEL, get_order(size));
528 
529 	if (h != NULL) {
530 		for (i = 0; i < n; i++)
531 			INIT_HLIST_HEAD(&h[i]);
532 	}
533 	return h;
534 }
535 
qdisc_class_hash_free(struct hlist_head * h,unsigned int n)536 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
537 {
538 	unsigned int size = n * sizeof(struct hlist_head);
539 
540 	if (size <= PAGE_SIZE)
541 		kfree(h);
542 	else
543 		free_pages((unsigned long)h, get_order(size));
544 }
545 
qdisc_class_hash_grow(struct Qdisc * sch,struct Qdisc_class_hash * clhash)546 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
547 {
548 	struct Qdisc_class_common *cl;
549 	struct hlist_node *next;
550 	struct hlist_head *nhash, *ohash;
551 	unsigned int nsize, nmask, osize;
552 	unsigned int i, h;
553 
554 	/* Rehash when load factor exceeds 0.75 */
555 	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
556 		return;
557 	nsize = clhash->hashsize * 2;
558 	nmask = nsize - 1;
559 	nhash = qdisc_class_hash_alloc(nsize);
560 	if (nhash == NULL)
561 		return;
562 
563 	ohash = clhash->hash;
564 	osize = clhash->hashsize;
565 
566 	sch_tree_lock(sch);
567 	for (i = 0; i < osize; i++) {
568 		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
569 			h = qdisc_class_hash(cl->classid, nmask);
570 			hlist_add_head(&cl->hnode, &nhash[h]);
571 		}
572 	}
573 	clhash->hash     = nhash;
574 	clhash->hashsize = nsize;
575 	clhash->hashmask = nmask;
576 	sch_tree_unlock(sch);
577 
578 	qdisc_class_hash_free(ohash, osize);
579 }
580 EXPORT_SYMBOL(qdisc_class_hash_grow);
581 
qdisc_class_hash_init(struct Qdisc_class_hash * clhash)582 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
583 {
584 	unsigned int size = 4;
585 
586 	clhash->hash = qdisc_class_hash_alloc(size);
587 	if (clhash->hash == NULL)
588 		return -ENOMEM;
589 	clhash->hashsize  = size;
590 	clhash->hashmask  = size - 1;
591 	clhash->hashelems = 0;
592 	return 0;
593 }
594 EXPORT_SYMBOL(qdisc_class_hash_init);
595 
qdisc_class_hash_destroy(struct Qdisc_class_hash * clhash)596 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
597 {
598 	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
599 }
600 EXPORT_SYMBOL(qdisc_class_hash_destroy);
601 
qdisc_class_hash_insert(struct Qdisc_class_hash * clhash,struct Qdisc_class_common * cl)602 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
603 			     struct Qdisc_class_common *cl)
604 {
605 	unsigned int h;
606 
607 	INIT_HLIST_NODE(&cl->hnode);
608 	h = qdisc_class_hash(cl->classid, clhash->hashmask);
609 	hlist_add_head(&cl->hnode, &clhash->hash[h]);
610 	clhash->hashelems++;
611 }
612 EXPORT_SYMBOL(qdisc_class_hash_insert);
613 
qdisc_class_hash_remove(struct Qdisc_class_hash * clhash,struct Qdisc_class_common * cl)614 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
615 			     struct Qdisc_class_common *cl)
616 {
617 	hlist_del(&cl->hnode);
618 	clhash->hashelems--;
619 }
620 EXPORT_SYMBOL(qdisc_class_hash_remove);
621 
622 /* Allocate a unique handle from the space managed by the kernel.
623  * Possible range is [8000-FFFF]:0000 (0x8000 values)
624  */
qdisc_alloc_handle(struct net_device * dev)625 static u32 qdisc_alloc_handle(struct net_device *dev)
626 {
627 	int i = 0x8000;
628 	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
629 
630 	do {
631 		autohandle += TC_H_MAKE(0x10000U, 0);
632 		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
633 			autohandle = TC_H_MAKE(0x80000000U, 0);
634 		if (!qdisc_lookup(dev, autohandle))
635 			return autohandle;
636 		cond_resched();
637 	} while	(--i > 0);
638 
639 	return 0;
640 }
641 
qdisc_tree_decrease_qlen(struct Qdisc * sch,unsigned int n)642 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
643 {
644 	const struct Qdisc_class_ops *cops;
645 	unsigned long cl;
646 	u32 parentid;
647 
648 	if (n == 0)
649 		return;
650 	while ((parentid = sch->parent)) {
651 		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
652 			return;
653 
654 		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
655 		if (sch == NULL) {
656 			WARN_ON(parentid != TC_H_ROOT);
657 			return;
658 		}
659 		cops = sch->ops->cl_ops;
660 		if (cops->qlen_notify) {
661 			cl = cops->get(sch, parentid);
662 			cops->qlen_notify(sch, cl);
663 			cops->put(sch, cl);
664 		}
665 		sch->q.qlen -= n;
666 	}
667 }
668 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
669 
/*
 * Send a qdisc notification for the @old -> @new transition when at least
 * one of them is non-NULL, then destroy @old if there is one.
 */
static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (old != NULL || new != NULL)
		qdisc_notify(net, skb, n, clid, old, new);

	if (old != NULL)
		qdisc_destroy(old);
}
680 
681 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
682  * to device "dev".
683  *
684  * When appropriate send a netlink notification using 'skb'
685  * and "n".
686  *
687  * On success, destroy old qdisc.
688  */
689 
/*
 * Two cases, selected by @parent:
 *
 *  - @parent != NULL: ask the parent's class ops to graft @new onto class
 *    @classid.  -EOPNOTSUPP if the parent has no graft callback, -ENOENT
 *    if the class is not found, otherwise the callback's result.
 *
 *  - @parent == NULL: attach @new directly to @dev.  If either @old or
 *    @new is an ingress qdisc only the ingress queue is touched (-ENOENT
 *    if the device has none); otherwise every TX queue is grafted and
 *    dev->qdisc is replaced.  A device that is IFF_UP is deactivated
 *    around the operation and re-activated afterwards.
 *
 * Returns 0 on success or a negative errno.
 */
static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct net *net = dev_net(dev);
	unsigned int qidx, nqueues;
	bool on_ingress;

	if (parent != NULL) {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
		unsigned long cl;
		int err;

		if (!cops || !cops->graft)
			return -EOPNOTSUPP;

		cl = cops->get(parent, classid);
		if (!cl)
			return -ENOENT;

		err = cops->graft(parent, cl, new, &old);
		cops->put(parent, cl);
		if (err)
			return err;

		notify_and_destroy(net, skb, n, classid, old, new);
		return 0;
	}

	/* Device-level graft. */
	on_ingress = (old && (old->flags & TCQ_F_INGRESS)) ||
		     (new && (new->flags & TCQ_F_INGRESS));
	if (on_ingress && !dev_ingress_queue(dev))
		return -ENOENT;

	nqueues = on_ingress ? 1 : dev->num_tx_queues;

	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	/* A qdisc with its own attach() does the per-queue work itself. */
	if (new && new->ops->attach) {
		new->ops->attach(new);
		nqueues = 0;
	}

	for (qidx = 0; qidx < nqueues; qidx++) {
		struct netdev_queue *dev_queue;

		if (on_ingress)
			dev_queue = dev_ingress_queue(dev);
		else
			dev_queue = netdev_get_tx_queue(dev, qidx);

		old = dev_graft_qdisc(dev_queue, new);

		/* One extra reference per additional queue sharing @new. */
		if (new && qidx > 0)
			atomic_inc(&new->refcnt);

		if (!on_ingress)
			qdisc_destroy(old);
	}

	if (on_ingress) {
		notify_and_destroy(net, skb, n, classid, old, new);
	} else {
		notify_and_destroy(net, skb, n, classid, dev->qdisc, new);
		if (new && !new->ops->attach)
			atomic_inc(&new->refcnt);
		dev->qdisc = new ? new : &noop_qdisc;
	}

	if (dev->flags & IFF_UP)
		dev_activate(dev);

	return 0;
}
762 
763 /* lockdep annotation is needed for ingress; egress gets it only for name */
764 static struct lock_class_key qdisc_tx_lock;
765 static struct lock_class_key qdisc_rx_lock;
766 
767 /*
768    Allocate and initialize new qdisc.
769 
770    Parameters are passed via opt.
771  */
772 
773 static struct Qdisc *
qdisc_create(struct net_device * dev,struct netdev_queue * dev_queue,struct Qdisc * p,u32 parent,u32 handle,struct nlattr ** tca,int * errp)774 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
775 	     struct Qdisc *p, u32 parent, u32 handle,
776 	     struct nlattr **tca, int *errp)
777 {
778 	int err;
779 	struct nlattr *kind = tca[TCA_KIND];
780 	struct Qdisc *sch;
781 	struct Qdisc_ops *ops;
782 	struct qdisc_size_table *stab;
783 
784 	ops = qdisc_lookup_ops(kind);
785 #ifdef CONFIG_MODULES
786 	if (ops == NULL && kind != NULL) {
787 		char name[IFNAMSIZ];
788 		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
789 			/* We dropped the RTNL semaphore in order to
790 			 * perform the module load.  So, even if we
791 			 * succeeded in loading the module we have to
792 			 * tell the caller to replay the request.  We
793 			 * indicate this using -EAGAIN.
794 			 * We replay the request because the device may
795 			 * go away in the mean time.
796 			 */
797 			rtnl_unlock();
798 			request_module("sch_%s", name);
799 			rtnl_lock();
800 			ops = qdisc_lookup_ops(kind);
801 			if (ops != NULL) {
802 				/* We will try again qdisc_lookup_ops,
803 				 * so don't keep a reference.
804 				 */
805 				module_put(ops->owner);
806 				err = -EAGAIN;
807 				goto err_out;
808 			}
809 		}
810 	}
811 #endif
812 
813 	err = -ENOENT;
814 	if (ops == NULL)
815 		goto err_out;
816 
817 	sch = qdisc_alloc(dev_queue, ops);
818 	if (IS_ERR(sch)) {
819 		err = PTR_ERR(sch);
820 		goto err_out2;
821 	}
822 
823 	sch->parent = parent;
824 
825 	if (handle == TC_H_INGRESS) {
826 		sch->flags |= TCQ_F_INGRESS;
827 		handle = TC_H_MAKE(TC_H_INGRESS, 0);
828 		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
829 	} else {
830 		if (handle == 0) {
831 			handle = qdisc_alloc_handle(dev);
832 			err = -ENOMEM;
833 			if (handle == 0)
834 				goto err_out3;
835 		}
836 		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
837 		if (!netif_is_multiqueue(dev))
838 			sch->flags |= TCQ_F_ONETXQUEUE;
839 	}
840 
841 	sch->handle = handle;
842 
843 	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
844 		if (tca[TCA_STAB]) {
845 			stab = qdisc_get_stab(tca[TCA_STAB]);
846 			if (IS_ERR(stab)) {
847 				err = PTR_ERR(stab);
848 				goto err_out4;
849 			}
850 			rcu_assign_pointer(sch->stab, stab);
851 		}
852 		if (tca[TCA_RATE]) {
853 			spinlock_t *root_lock;
854 
855 			err = -EOPNOTSUPP;
856 			if (sch->flags & TCQ_F_MQROOT)
857 				goto err_out4;
858 
859 			if ((sch->parent != TC_H_ROOT) &&
860 			    !(sch->flags & TCQ_F_INGRESS) &&
861 			    (!p || !(p->flags & TCQ_F_MQROOT)))
862 				root_lock = qdisc_root_sleeping_lock(sch);
863 			else
864 				root_lock = qdisc_lock(sch);
865 
866 			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
867 						root_lock, tca[TCA_RATE]);
868 			if (err)
869 				goto err_out4;
870 		}
871 
872 		qdisc_list_add(sch);
873 
874 		return sch;
875 	}
876 err_out3:
877 	dev_put(dev);
878 	kfree((char *) sch - sch->padded);
879 err_out2:
880 	module_put(ops->owner);
881 err_out:
882 	*errp = err;
883 	return NULL;
884 
885 err_out4:
886 	/*
887 	 * Any broken qdiscs that would require a ops->reset() here?
888 	 * The qdisc was never in action so it shouldn't be necessary.
889 	 */
890 	qdisc_put_stab(rtnl_dereference(sch->stab));
891 	if (ops->destroy)
892 		ops->destroy(sch);
893 	goto err_out3;
894 }
895 
qdisc_change(struct Qdisc * sch,struct nlattr ** tca)896 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
897 {
898 	struct qdisc_size_table *ostab, *stab = NULL;
899 	int err = 0;
900 
901 	if (tca[TCA_OPTIONS]) {
902 		if (sch->ops->change == NULL)
903 			return -EINVAL;
904 		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
905 		if (err)
906 			return err;
907 	}
908 
909 	if (tca[TCA_STAB]) {
910 		stab = qdisc_get_stab(tca[TCA_STAB]);
911 		if (IS_ERR(stab))
912 			return PTR_ERR(stab);
913 	}
914 
915 	ostab = rtnl_dereference(sch->stab);
916 	rcu_assign_pointer(sch->stab, stab);
917 	qdisc_put_stab(ostab);
918 
919 	if (tca[TCA_RATE]) {
920 		/* NB: ignores errors from replace_estimator
921 		   because change can't be undone. */
922 		if (sch->flags & TCQ_F_MQROOT)
923 			goto out;
924 		gen_replace_estimator(&sch->bstats, &sch->rate_est,
925 					    qdisc_root_sleeping_lock(sch),
926 					    tca[TCA_RATE]);
927 	}
928 out:
929 	return 0;
930 }
931 
932 struct check_loop_arg {
933 	struct qdisc_walker	w;
934 	struct Qdisc		*p;
935 	int			depth;
936 };
937 
938 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
939 
check_loop(struct Qdisc * q,struct Qdisc * p,int depth)940 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
941 {
942 	struct check_loop_arg	arg;
943 
944 	if (q->ops->cl_ops == NULL)
945 		return 0;
946 
947 	arg.w.stop = arg.w.skip = arg.w.count = 0;
948 	arg.w.fn = check_loop_fn;
949 	arg.depth = depth;
950 	arg.p = p;
951 	q->ops->cl_ops->walk(q, &arg.w);
952 	return arg.w.stop ? -ELOOP : 0;
953 }
954 
955 static int
check_loop_fn(struct Qdisc * q,unsigned long cl,struct qdisc_walker * w)956 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
957 {
958 	struct Qdisc *leaf;
959 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
960 	struct check_loop_arg *arg = (struct check_loop_arg *)w;
961 
962 	leaf = cops->leaf(q, cl);
963 	if (leaf) {
964 		if (leaf == arg->p || arg->depth > 7)
965 			return -ELOOP;
966 		return check_loop(leaf, arg->p, arg->depth + 1);
967 	}
968 	return 0;
969 }
970 
971 /*
972  * Delete/get qdisc.
973  */
974 
tc_get_qdisc(struct sk_buff * skb,struct nlmsghdr * n)975 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
976 {
977 	struct net *net = sock_net(skb->sk);
978 	struct tcmsg *tcm = nlmsg_data(n);
979 	struct nlattr *tca[TCA_MAX + 1];
980 	struct net_device *dev;
981 	u32 clid;
982 	struct Qdisc *q = NULL;
983 	struct Qdisc *p = NULL;
984 	int err;
985 
986 	if ((n->nlmsg_type != RTM_GETQDISC) && !capable(CAP_NET_ADMIN))
987 		return -EPERM;
988 
989 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
990 	if (err < 0)
991 		return err;
992 
993 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
994 	if (!dev)
995 		return -ENODEV;
996 
997 	clid = tcm->tcm_parent;
998 	if (clid) {
999 		if (clid != TC_H_ROOT) {
1000 			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1001 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1002 				if (!p)
1003 					return -ENOENT;
1004 				q = qdisc_leaf(p, clid);
1005 			} else if (dev_ingress_queue(dev)) {
1006 				q = dev_ingress_queue(dev)->qdisc_sleeping;
1007 			}
1008 		} else {
1009 			q = dev->qdisc;
1010 		}
1011 		if (!q)
1012 			return -ENOENT;
1013 
1014 		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1015 			return -EINVAL;
1016 	} else {
1017 		q = qdisc_lookup(dev, tcm->tcm_handle);
1018 		if (!q)
1019 			return -ENOENT;
1020 	}
1021 
1022 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1023 		return -EINVAL;
1024 
1025 	if (n->nlmsg_type == RTM_DELQDISC) {
1026 		if (!clid)
1027 			return -EINVAL;
1028 		if (q->handle == 0)
1029 			return -ENOENT;
1030 		err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1031 		if (err != 0)
1032 			return err;
1033 	} else {
1034 		qdisc_notify(net, skb, n, clid, NULL, q);
1035 	}
1036 	return 0;
1037 }
1038 
1039 /*
1040  * Create/change qdisc.
1041  */
1042 
tc_modify_qdisc(struct sk_buff * skb,struct nlmsghdr * n)1043 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
1044 {
1045 	struct net *net = sock_net(skb->sk);
1046 	struct tcmsg *tcm;
1047 	struct nlattr *tca[TCA_MAX + 1];
1048 	struct net_device *dev;
1049 	u32 clid;
1050 	struct Qdisc *q, *p;
1051 	int err;
1052 
1053 	if (!capable(CAP_NET_ADMIN))
1054 		return -EPERM;
1055 
1056 replay:
1057 	/* Reinit, just in case something touches this. */
1058 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1059 	if (err < 0)
1060 		return err;
1061 
1062 	tcm = nlmsg_data(n);
1063 	clid = tcm->tcm_parent;
1064 	q = p = NULL;
1065 
1066 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1067 	if (!dev)
1068 		return -ENODEV;
1069 
1070 
1071 	if (clid) {
1072 		if (clid != TC_H_ROOT) {
1073 			if (clid != TC_H_INGRESS) {
1074 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1075 				if (!p)
1076 					return -ENOENT;
1077 				q = qdisc_leaf(p, clid);
1078 			} else if (dev_ingress_queue_create(dev)) {
1079 				q = dev_ingress_queue(dev)->qdisc_sleeping;
1080 			}
1081 		} else {
1082 			q = dev->qdisc;
1083 		}
1084 
1085 		/* It may be default qdisc, ignore it */
1086 		if (q && q->handle == 0)
1087 			q = NULL;
1088 
1089 		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1090 			if (tcm->tcm_handle) {
1091 				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
1092 					return -EEXIST;
1093 				if (TC_H_MIN(tcm->tcm_handle))
1094 					return -EINVAL;
1095 				q = qdisc_lookup(dev, tcm->tcm_handle);
1096 				if (!q)
1097 					goto create_n_graft;
1098 				if (n->nlmsg_flags & NLM_F_EXCL)
1099 					return -EEXIST;
1100 				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1101 					return -EINVAL;
1102 				if (q == p ||
1103 				    (p && check_loop(q, p, 0)))
1104 					return -ELOOP;
1105 				atomic_inc(&q->refcnt);
1106 				goto graft;
1107 			} else {
1108 				if (!q)
1109 					goto create_n_graft;
1110 
1111 				/* This magic test requires explanation.
1112 				 *
1113 				 *   We know, that some child q is already
1114 				 *   attached to this parent and have choice:
1115 				 *   either to change it or to create/graft new one.
1116 				 *
1117 				 *   1. We are allowed to create/graft only
1118 				 *   if CREATE and REPLACE flags are set.
1119 				 *
1120 				 *   2. If EXCL is set, requestor wanted to say,
1121 				 *   that qdisc tcm_handle is not expected
1122 				 *   to exist, so that we choose create/graft too.
1123 				 *
1124 				 *   3. The last case is when no flags are set.
1125 				 *   Alas, it is sort of hole in API, we
1126 				 *   cannot decide what to do unambiguously.
1127 				 *   For now we select create/graft, if
1128 				 *   user gave KIND, which does not match existing.
1129 				 */
1130 				if ((n->nlmsg_flags & NLM_F_CREATE) &&
1131 				    (n->nlmsg_flags & NLM_F_REPLACE) &&
1132 				    ((n->nlmsg_flags & NLM_F_EXCL) ||
1133 				     (tca[TCA_KIND] &&
1134 				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1135 					goto create_n_graft;
1136 			}
1137 		}
1138 	} else {
1139 		if (!tcm->tcm_handle)
1140 			return -EINVAL;
1141 		q = qdisc_lookup(dev, tcm->tcm_handle);
1142 	}
1143 
1144 	/* Change qdisc parameters */
1145 	if (q == NULL)
1146 		return -ENOENT;
1147 	if (n->nlmsg_flags & NLM_F_EXCL)
1148 		return -EEXIST;
1149 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1150 		return -EINVAL;
1151 	err = qdisc_change(q, tca);
1152 	if (err == 0)
1153 		qdisc_notify(net, skb, n, clid, NULL, q);
1154 	return err;
1155 
1156 create_n_graft:
1157 	if (!(n->nlmsg_flags & NLM_F_CREATE))
1158 		return -ENOENT;
1159 	if (clid == TC_H_INGRESS) {
1160 		if (dev_ingress_queue(dev))
1161 			q = qdisc_create(dev, dev_ingress_queue(dev), p,
1162 					 tcm->tcm_parent, tcm->tcm_parent,
1163 					 tca, &err);
1164 		else
1165 			err = -ENOENT;
1166 	} else {
1167 		struct netdev_queue *dev_queue;
1168 
1169 		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1170 			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1171 		else if (p)
1172 			dev_queue = p->dev_queue;
1173 		else
1174 			dev_queue = netdev_get_tx_queue(dev, 0);
1175 
1176 		q = qdisc_create(dev, dev_queue, p,
1177 				 tcm->tcm_parent, tcm->tcm_handle,
1178 				 tca, &err);
1179 	}
1180 	if (q == NULL) {
1181 		if (err == -EAGAIN)
1182 			goto replay;
1183 		return err;
1184 	}
1185 
1186 graft:
1187 	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1188 	if (err) {
1189 		if (q)
1190 			qdisc_destroy(q);
1191 		return err;
1192 	}
1193 
1194 	return 0;
1195 }
1196 
/*
 * Append one netlink message describing qdisc @q to @skb.
 *
 * The message consists of a tcmsg header (parent = @clid, handle and
 * refcount taken from @q), the TCA_KIND string, whatever the qdisc's own
 * dump callback adds, the size table if one is attached, and statistics.
 *
 * Returns skb->len on success.  On any failure the skb is trimmed back to
 * where it was on entry and -1 is returned.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	unsigned char *start = skb_tail_pointer(skb);
	struct qdisc_size_table *stab;
	struct gnet_dump dump;
	struct nlmsghdr *nlh;
	struct tcmsg *tcm;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto cancel;

	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);

	if (nla_put_string(skb, TCA_KIND, q->ops->id) ||
	    (q->ops->dump && q->ops->dump(q, skb) < 0))
		goto cancel;

	/* Refresh the queue length in the stats before they are copied out. */
	q->qstats.qlen = q->q.qlen;

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto cancel;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q),
					 &dump) < 0)
		goto cancel;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &dump) < 0)
		goto cancel;

	if (gnet_stats_copy_basic(&dump, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&dump, &q->bstats, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&dump, &q->qstats) < 0 ||
	    gnet_stats_finish_copy(&dump) < 0)
		goto cancel;

	nlh->nlmsg_len = skb_tail_pointer(skb) - start;
	return skb->len;

cancel:
	nlmsg_trim(skb, start);
	return -1;
}
1250 
tc_qdisc_dump_ignore(struct Qdisc * q)1251 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1252 {
1253 	return (q->flags & TCQ_F_BUILTIN) ? true : false;
1254 }
1255 
/*
 * Build and send a qdisc notification to the RTNLGRP_TC group.
 *
 * @old, if set and not ignored, is reported with RTM_DELQDISC; @new, if set
 * and not ignored, is reported with RTM_NEWQDISC (flagged NLM_F_REPLACE
 * when @old was given).  @oskb may be NULL, in which case port id 0 is used.
 *
 * Returns the result of rtnetlink_send(), -ENOBUFS if no skb could be
 * allocated, or -EINVAL if filling failed or nothing was written.
 */
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
	struct sk_buff *skb;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old) &&
	    tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
			  0, RTM_DELQDISC) < 0)
		goto err_out;

	if (new && !tc_qdisc_dump_ignore(new) &&
	    tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
			  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
		goto err_out;

	/* Only send when at least one of the two messages was written. */
	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}
1286 
tc_dump_qdisc_root(struct Qdisc * root,struct sk_buff * skb,struct netlink_callback * cb,int * q_idx_p,int s_q_idx)1287 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1288 			      struct netlink_callback *cb,
1289 			      int *q_idx_p, int s_q_idx)
1290 {
1291 	int ret = 0, q_idx = *q_idx_p;
1292 	struct Qdisc *q;
1293 
1294 	if (!root)
1295 		return 0;
1296 
1297 	q = root;
1298 	if (q_idx < s_q_idx) {
1299 		q_idx++;
1300 	} else {
1301 		if (!tc_qdisc_dump_ignore(q) &&
1302 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1303 				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1304 			goto done;
1305 		q_idx++;
1306 	}
1307 	list_for_each_entry(q, &root->list, list) {
1308 		if (q_idx < s_q_idx) {
1309 			q_idx++;
1310 			continue;
1311 		}
1312 		if (!tc_qdisc_dump_ignore(q) &&
1313 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1314 				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1315 			goto done;
1316 		q_idx++;
1317 	}
1318 
1319 out:
1320 	*q_idx_p = q_idx;
1321 	return ret;
1322 done:
1323 	ret = -1;
1324 	goto out;
1325 }
1326 
tc_dump_qdisc(struct sk_buff * skb,struct netlink_callback * cb)1327 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1328 {
1329 	struct net *net = sock_net(skb->sk);
1330 	int idx, q_idx;
1331 	int s_idx, s_q_idx;
1332 	struct net_device *dev;
1333 
1334 	s_idx = cb->args[0];
1335 	s_q_idx = q_idx = cb->args[1];
1336 
1337 	rcu_read_lock();
1338 	idx = 0;
1339 	for_each_netdev_rcu(net, dev) {
1340 		struct netdev_queue *dev_queue;
1341 
1342 		if (idx < s_idx)
1343 			goto cont;
1344 		if (idx > s_idx)
1345 			s_q_idx = 0;
1346 		q_idx = 0;
1347 
1348 		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1349 			goto done;
1350 
1351 		dev_queue = dev_ingress_queue(dev);
1352 		if (dev_queue &&
1353 		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1354 				       &q_idx, s_q_idx) < 0)
1355 			goto done;
1356 
1357 cont:
1358 		idx++;
1359 	}
1360 
1361 done:
1362 	rcu_read_unlock();
1363 
1364 	cb->args[0] = idx;
1365 	cb->args[1] = q_idx;
1366 
1367 	return skb->len;
1368 }
1369 
1370 
1371 
1372 /************************************************
1373  *	Traffic classes manipulation.		*
1374  ************************************************/
1375 
1376 
1377 
tc_ctl_tclass(struct sk_buff * skb,struct nlmsghdr * n)1378 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
1379 {
1380 	struct net *net = sock_net(skb->sk);
1381 	struct tcmsg *tcm = nlmsg_data(n);
1382 	struct nlattr *tca[TCA_MAX + 1];
1383 	struct net_device *dev;
1384 	struct Qdisc *q = NULL;
1385 	const struct Qdisc_class_ops *cops;
1386 	unsigned long cl = 0;
1387 	unsigned long new_cl;
1388 	u32 portid;
1389 	u32 clid;
1390 	u32 qid;
1391 	int err;
1392 
1393 	if ((n->nlmsg_type != RTM_GETTCLASS) && !capable(CAP_NET_ADMIN))
1394 		return -EPERM;
1395 
1396 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1397 	if (err < 0)
1398 		return err;
1399 
1400 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1401 	if (!dev)
1402 		return -ENODEV;
1403 
1404 	/*
1405 	   parent == TC_H_UNSPEC - unspecified parent.
1406 	   parent == TC_H_ROOT   - class is root, which has no parent.
1407 	   parent == X:0	 - parent is root class.
1408 	   parent == X:Y	 - parent is a node in hierarchy.
1409 	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.
1410 
1411 	   handle == 0:0	 - generate handle from kernel pool.
1412 	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
1413 	   handle == X:Y	 - clear.
1414 	   handle == X:0	 - root class.
1415 	 */
1416 
1417 	/* Step 1. Determine qdisc handle X:0 */
1418 
1419 	portid = tcm->tcm_parent;
1420 	clid = tcm->tcm_handle;
1421 	qid = TC_H_MAJ(clid);
1422 
1423 	if (portid != TC_H_ROOT) {
1424 		u32 qid1 = TC_H_MAJ(portid);
1425 
1426 		if (qid && qid1) {
1427 			/* If both majors are known, they must be identical. */
1428 			if (qid != qid1)
1429 				return -EINVAL;
1430 		} else if (qid1) {
1431 			qid = qid1;
1432 		} else if (qid == 0)
1433 			qid = dev->qdisc->handle;
1434 
1435 		/* Now qid is genuine qdisc handle consistent
1436 		 * both with parent and child.
1437 		 *
1438 		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
1439 		 */
1440 		if (portid)
1441 			portid = TC_H_MAKE(qid, portid);
1442 	} else {
1443 		if (qid == 0)
1444 			qid = dev->qdisc->handle;
1445 	}
1446 
1447 	/* OK. Locate qdisc */
1448 	q = qdisc_lookup(dev, qid);
1449 	if (!q)
1450 		return -ENOENT;
1451 
1452 	/* An check that it supports classes */
1453 	cops = q->ops->cl_ops;
1454 	if (cops == NULL)
1455 		return -EINVAL;
1456 
1457 	/* Now try to get class */
1458 	if (clid == 0) {
1459 		if (portid == TC_H_ROOT)
1460 			clid = qid;
1461 	} else
1462 		clid = TC_H_MAKE(qid, clid);
1463 
1464 	if (clid)
1465 		cl = cops->get(q, clid);
1466 
1467 	if (cl == 0) {
1468 		err = -ENOENT;
1469 		if (n->nlmsg_type != RTM_NEWTCLASS ||
1470 		    !(n->nlmsg_flags & NLM_F_CREATE))
1471 			goto out;
1472 	} else {
1473 		switch (n->nlmsg_type) {
1474 		case RTM_NEWTCLASS:
1475 			err = -EEXIST;
1476 			if (n->nlmsg_flags & NLM_F_EXCL)
1477 				goto out;
1478 			break;
1479 		case RTM_DELTCLASS:
1480 			err = -EOPNOTSUPP;
1481 			if (cops->delete)
1482 				err = cops->delete(q, cl);
1483 			if (err == 0)
1484 				tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1485 			goto out;
1486 		case RTM_GETTCLASS:
1487 			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1488 			goto out;
1489 		default:
1490 			err = -EINVAL;
1491 			goto out;
1492 		}
1493 	}
1494 
1495 	new_cl = cl;
1496 	err = -EOPNOTSUPP;
1497 	if (cops->change)
1498 		err = cops->change(q, clid, portid, tca, &new_cl);
1499 	if (err == 0)
1500 		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1501 
1502 out:
1503 	if (cl)
1504 		cops->put(q, cl);
1505 
1506 	return err;
1507 }
1508 
1509 
/*
 * Append one netlink message describing class @cl of qdisc @q to @skb.
 *
 * tcm_parent and tcm_handle are both preset to the qdisc's handle before
 * the class dump callback is handed the tcmsg (presumably the callback
 * fills in the real class ids -- confirm against the qdisc in question).
 *
 * Returns skb->len on success.  On any failure the skb is trimmed back to
 * where it was on entry and -1 is returned.
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
	unsigned char *start = skb_tail_pointer(skb);
	struct gnet_dump dump;
	struct nlmsghdr *nlh;
	struct tcmsg *tcm;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto cancel;

	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;

	if (nla_put_string(skb, TCA_KIND, q->ops->id) ||
	    (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0))
		goto cancel;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q),
					 &dump) < 0)
		goto cancel;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &dump) < 0)
		goto cancel;

	if (gnet_stats_finish_copy(&dump) < 0)
		goto cancel;

	nlh->nlmsg_len = skb_tail_pointer(skb) - start;
	return skb->len;

cancel:
	nlmsg_trim(skb, start);
	return -1;
}
1554 
tclass_notify(struct net * net,struct sk_buff * oskb,struct nlmsghdr * n,struct Qdisc * q,unsigned long cl,int event)1555 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1556 			 struct nlmsghdr *n, struct Qdisc *q,
1557 			 unsigned long cl, int event)
1558 {
1559 	struct sk_buff *skb;
1560 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1561 
1562 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1563 	if (!skb)
1564 		return -ENOBUFS;
1565 
1566 	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1567 		kfree_skb(skb);
1568 		return -EINVAL;
1569 	}
1570 
1571 	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1572 			      n->nlmsg_flags & NLM_F_ECHO);
1573 }
1574 
1575 struct qdisc_dump_args {
1576 	struct qdisc_walker	w;
1577 	struct sk_buff		*skb;
1578 	struct netlink_callback	*cb;
1579 };
1580 
qdisc_class_dump(struct Qdisc * q,unsigned long cl,struct qdisc_walker * arg)1581 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1582 {
1583 	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1584 
1585 	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1586 			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1587 }
1588 
tc_dump_tclass_qdisc(struct Qdisc * q,struct sk_buff * skb,struct tcmsg * tcm,struct netlink_callback * cb,int * t_p,int s_t)1589 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1590 				struct tcmsg *tcm, struct netlink_callback *cb,
1591 				int *t_p, int s_t)
1592 {
1593 	struct qdisc_dump_args arg;
1594 
1595 	if (tc_qdisc_dump_ignore(q) ||
1596 	    *t_p < s_t || !q->ops->cl_ops ||
1597 	    (tcm->tcm_parent &&
1598 	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1599 		(*t_p)++;
1600 		return 0;
1601 	}
1602 	if (*t_p > s_t)
1603 		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1604 	arg.w.fn = qdisc_class_dump;
1605 	arg.skb = skb;
1606 	arg.cb = cb;
1607 	arg.w.stop  = 0;
1608 	arg.w.skip = cb->args[1];
1609 	arg.w.count = 0;
1610 	q->ops->cl_ops->walk(q, &arg.w);
1611 	cb->args[1] = arg.w.count;
1612 	if (arg.w.stop)
1613 		return -1;
1614 	(*t_p)++;
1615 	return 0;
1616 }
1617 
tc_dump_tclass_root(struct Qdisc * root,struct sk_buff * skb,struct tcmsg * tcm,struct netlink_callback * cb,int * t_p,int s_t)1618 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1619 			       struct tcmsg *tcm, struct netlink_callback *cb,
1620 			       int *t_p, int s_t)
1621 {
1622 	struct Qdisc *q;
1623 
1624 	if (!root)
1625 		return 0;
1626 
1627 	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1628 		return -1;
1629 
1630 	list_for_each_entry(q, &root->list, list) {
1631 		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1632 			return -1;
1633 	}
1634 
1635 	return 0;
1636 }
1637 
tc_dump_tclass(struct sk_buff * skb,struct netlink_callback * cb)1638 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1639 {
1640 	struct tcmsg *tcm = nlmsg_data(cb->nlh);
1641 	struct net *net = sock_net(skb->sk);
1642 	struct netdev_queue *dev_queue;
1643 	struct net_device *dev;
1644 	int t, s_t;
1645 
1646 	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
1647 		return 0;
1648 	dev = dev_get_by_index(net, tcm->tcm_ifindex);
1649 	if (!dev)
1650 		return 0;
1651 
1652 	s_t = cb->args[0];
1653 	t = 0;
1654 
1655 	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1656 		goto done;
1657 
1658 	dev_queue = dev_ingress_queue(dev);
1659 	if (dev_queue &&
1660 	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1661 				&t, s_t) < 0)
1662 		goto done;
1663 
1664 done:
1665 	cb->args[0] = t;
1666 
1667 	dev_put(dev);
1668 	return skb->len;
1669 }
1670 
1671 /* Main classifier routine: scans classifier chain attached
1672  * to this qdisc, (optionally) tests for protocol and asks
1673  * specific classifiers.
1674  */
tc_classify_compat(struct sk_buff * skb,const struct tcf_proto * tp,struct tcf_result * res)1675 int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
1676 		       struct tcf_result *res)
1677 {
1678 	__be16 protocol = skb->protocol;
1679 	int err;
1680 
1681 	for (; tp; tp = tp->next) {
1682 		if (tp->protocol != protocol &&
1683 		    tp->protocol != htons(ETH_P_ALL))
1684 			continue;
1685 		err = tp->classify(skb, tp, res);
1686 
1687 		if (err >= 0) {
1688 #ifdef CONFIG_NET_CLS_ACT
1689 			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1690 				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1691 #endif
1692 			return err;
1693 		}
1694 	}
1695 	return -1;
1696 }
1697 EXPORT_SYMBOL(tc_classify_compat);
1698 
/*
 * Classify @skb against the chain starting at @tp.
 *
 * With CONFIG_NET_CLS_ACT, a TC_ACT_RECLASSIFY result restarts
 * classification from the head of the chain.  The restart count is kept in
 * the verdict field of skb->tc_verd; once it has reached MAX_REC_LOOP a
 * rate-limited notice is logged and TC_ACT_SHOT is returned.
 *
 * Otherwise returns the result of tc_classify_compat().
 */
int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
		struct tcf_result *res)
{
#ifdef CONFIG_NET_CLS_ACT
	const struct tcf_proto *head = tp;
#endif
	int err = 0;

	for (;;) {
		err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
		if (err == TC_ACT_RECLASSIFY) {
			u32 verd = G_TC_VERD(skb->tc_verd);

			tp = head;
			if (verd >= MAX_REC_LOOP) {
				net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
						       tp->q->ops->id,
						       tp->prio & 0xffff,
						       ntohs(tp->protocol));
				return TC_ACT_SHOT;
			}
			verd++;
			skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
			continue;
		}
#endif
		return err;
	}
}
EXPORT_SYMBOL(tc_classify);
1728 
tcf_destroy(struct tcf_proto * tp)1729 void tcf_destroy(struct tcf_proto *tp)
1730 {
1731 	tp->ops->destroy(tp);
1732 	module_put(tp->ops->owner);
1733 	kfree(tp);
1734 }
1735 
tcf_destroy_chain(struct tcf_proto ** fl)1736 void tcf_destroy_chain(struct tcf_proto **fl)
1737 {
1738 	struct tcf_proto *tp;
1739 
1740 	while ((tp = *fl) != NULL) {
1741 		*fl = tp->next;
1742 		tcf_destroy(tp);
1743 	}
1744 }
1745 EXPORT_SYMBOL(tcf_destroy_chain);
1746 
1747 #ifdef CONFIG_PROC_FS
psched_show(struct seq_file * seq,void * v)1748 static int psched_show(struct seq_file *seq, void *v)
1749 {
1750 	struct timespec ts;
1751 
1752 	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1753 	seq_printf(seq, "%08x %08x %08x %08x\n",
1754 		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1755 		   1000000,
1756 		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1757 
1758 	return 0;
1759 }
1760 
psched_open(struct inode * inode,struct file * file)1761 static int psched_open(struct inode *inode, struct file *file)
1762 {
1763 	return single_open(file, psched_show, NULL);
1764 }
1765 
1766 static const struct file_operations psched_fops = {
1767 	.owner = THIS_MODULE,
1768 	.open = psched_open,
1769 	.read  = seq_read,
1770 	.llseek = seq_lseek,
1771 	.release = single_release,
1772 };
1773 
psched_net_init(struct net * net)1774 static int __net_init psched_net_init(struct net *net)
1775 {
1776 	struct proc_dir_entry *e;
1777 
1778 	e = proc_create("psched", 0, net->proc_net, &psched_fops);
1779 	if (e == NULL)
1780 		return -ENOMEM;
1781 
1782 	return 0;
1783 }
1784 
psched_net_exit(struct net * net)1785 static void __net_exit psched_net_exit(struct net *net)
1786 {
1787 	remove_proc_entry("psched", net->proc_net);
1788 }
1789 #else
psched_net_init(struct net * net)1790 static int __net_init psched_net_init(struct net *net)
1791 {
1792 	return 0;
1793 }
1794 
psched_net_exit(struct net * net)1795 static void __net_exit psched_net_exit(struct net *net)
1796 {
1797 }
1798 #endif
1799 
1800 static struct pernet_operations psched_net_ops = {
1801 	.init = psched_net_init,
1802 	.exit = psched_net_exit,
1803 };
1804 
pktsched_init(void)1805 static int __init pktsched_init(void)
1806 {
1807 	int err;
1808 
1809 	err = register_pernet_subsys(&psched_net_ops);
1810 	if (err) {
1811 		pr_err("pktsched_init: "
1812 		       "cannot initialize per netns operations\n");
1813 		return err;
1814 	}
1815 
1816 	register_qdisc(&pfifo_qdisc_ops);
1817 	register_qdisc(&bfifo_qdisc_ops);
1818 	register_qdisc(&pfifo_head_drop_qdisc_ops);
1819 	register_qdisc(&mq_qdisc_ops);
1820 
1821 	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1822 	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1823 	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1824 	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1825 	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1826 	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1827 
1828 	return 0;
1829 }
1830 
1831 subsys_initcall(pktsched_init);
1832