/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/lockdep.h>
#include <linux/slab.h>
#include <linux/hashtable.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when
   the device is ready to send something), in the order and at the
   times determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some
   sanity checks and the parts of the work common to all qdiscs,
   and to provide rtnetlink notifications.

   All the really intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it just means that the
   discipline does not want to send anything at this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not the
   real packet queue, but q->q.qlen must nevertheless be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
   NET_XMIT_DROP 	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- probably this packet was enqueued, but another one was dropped.
     Expected action: back off or ignore

   Auxiliary routines:

   ---peek

   like dequeue but without removing a packet from the queue

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers, counters (except for statistics) etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
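
/* A minimal, hypothetical sketch of a qdisc honouring the enqueue/dequeue
 * contract described above, built on the generic helpers from
 * <net/sch_generic.h>; the "sketch_*" names are illustrative only.
 * Kept under #if 0 so it is never compiled.
 */
#if 0
static int sketch_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			  struct sk_buff **to_free)
{
	/* Queue on the built-in sch->q list; the helper keeps q.qlen and
	 * the backlog counters valid and returns NET_XMIT_SUCCESS (0).
	 */
	return qdisc_enqueue_tail(skb, sch);
}

static struct sk_buff *sketch_dequeue(struct Qdisc *sch)
{
	/* May also return NULL without the queue being empty; callers must
	 * judge emptiness by q->q.qlen, as explained above.
	 */
	return qdisc_dequeue_head(sch);
}

static struct Qdisc_ops sketch_qdisc_ops __read_mostly = {
	.id		= "sketch",
	.enqueue	= sketch_enqueue,
	.dequeue	= sketch_dequeue,
	.peek		= qdisc_peek_head,
	.owner		= THIS_MODULE,
};
#endif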

/* Protects list of registered TC modules. It is pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		if (!(cops->find && cops->walk && cops->leaf))
			goto out_einval;

		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);
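
/* A hypothetical sketch of the usual register/unregister pairing in a
 * qdisc module's init/exit hooks, reusing sketch_qdisc_ops from above.
 * Kept under #if 0 so it is never compiled.
 */
#if 0
static int __init sketch_module_init(void)
{
	/* Fails with -EEXIST if another qdisc already claimed the id. */
	return register_qdisc(&sketch_qdisc_ops);
}

static void __exit sketch_module_exit(void)
{
	unregister_qdisc(&sketch_qdisc_ops);
}

module_init(sketch_module_init);
module_exit(sketch_module_exit);
MODULE_LICENSE("GPL");
#endif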

/* Get default qdisc if not otherwise specified */
void qdisc_get_default(char *name, size_t len)
{
	read_lock(&qdisc_mod_lock);
	strlcpy(name, default_qdisc_ops->id, len);
	read_unlock(&qdisc_mod_lock);
}

static struct Qdisc_ops *qdisc_lookup_default(const char *name)
{
	struct Qdisc_ops *q = NULL;

	for (q = qdisc_base; q; q = q->next) {
		if (!strcmp(name, q->id)) {
			if (!try_module_get(q->owner))
				q = NULL;
			break;
		}
	}

	return q;
}

/* Set new default qdisc to use */
int qdisc_set_default(const char *name)
{
	const struct Qdisc_ops *ops;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	write_lock(&qdisc_mod_lock);
	ops = qdisc_lookup_default(name);
	if (!ops) {
		/* Not found, drop lock and try to load module */
		write_unlock(&qdisc_mod_lock);
		request_module("sch_%s", name);
		write_lock(&qdisc_mod_lock);

		ops = qdisc_lookup_default(name);
	}

	if (ops) {
		/* Set new default */
		module_put(default_qdisc_ops->owner);
		default_qdisc_ops = ops;
	}
	write_unlock(&qdisc_mod_lock);

	return ops ? 0 : -ENOENT;
}

#ifdef CONFIG_NET_SCH_DEFAULT
/* Set default value from kernel config */
static int __init sch_default_qdisc(void)
{
	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif
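
/* Note: qdisc_get_default()/qdisc_set_default() back the
 * net.core.default_qdisc sysctl (see net/core/sysctl_net_core.c), so e.g.
 * "sysctl -w net.core.default_qdisc=fq_codel" lands in
 * qdisc_set_default("fq_codel"), loading sch_fq_codel on demand.
 */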

/* We know the handle. Find the qdisc among all qdiscs attached to the
 * device (root qdisc, all its children, children of children etc.)
 * Note: caller either uses rtnl or rcu_read_lock()
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!qdisc_dev(root))
		return (root->handle == handle ? root : NULL);

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

void qdisc_hash_add(struct Qdisc *q, bool invisible)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
		if (invisible)
			q->flags |= TCQ_F_INVISIBLE;
	}
}
EXPORT_SYMBOL(qdisc_hash_add);

void qdisc_hash_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_del_rcu(&q->hash);
	}
}
EXPORT_SYMBOL(qdisc_hash_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	if (!handle)
		return NULL;
	q = qdisc_match_from_root(dev->qdisc, handle);
	if (q)
		goto out;

	if (dev_ingress_queue(dev))
		q = qdisc_match_from_root(
			dev_ingress_queue(dev)->qdisc_sleeping,
			handle);
out:
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->find(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	return leaf;
}

/* Find queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}
/* In older iproute2 versions the linklayer setting was not transferred,
 * and the rate table lookup system has been dropped from the kernel.
 * To stay backward compatible with older iproute2 tc utilities, we
 * detect the linklayer setting by detecting whether the rate table was
 * modified.
 *
 * For linklayer ATM table entries, the rate table will be aligned to
 * 48 bytes, thus some table entries will contain the same value.  The
 * mpu (min packet unit) is also encoded into the old rate table, so
 * starting from the mpu, we find the low and high table entries for
 * mapping this cell.  If these entries contain the same value, then
 * the rate table has been modified for linklayer ATM.
 *
 * This is done by rounding the mpu to the nearest 48-byte cell/entry,
 * then rounding up to the next cell, calculating the table entry one
 * below, and comparing the two.
 */
static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
{
	int low       = roundup(r->mpu, 48);
	int high      = roundup(low+1, 48);
	int cell_low  = low >> r->cell_log;
	int cell_high = (high >> r->cell_log) - 1;

	/* rtab is too inaccurate at rates > 100Mbit/s */
	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
		pr_debug("TC linklayer: Giving up ATM detection\n");
		return TC_LINKLAYER_ETHERNET;
	}

	if ((cell_high > cell_low) && (cell_high < 256)
	    && (rtab[cell_low] == rtab[cell_high])) {
		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
			 cell_low, cell_high, rtab[cell_high]);
		return TC_LINKLAYER_ATM;
	}
	return TC_LINKLAYER_ETHERNET;
}
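
/* A worked example of the detection above, assuming mpu = 0 and
 * cell_log = 3 (8-byte cells): low = roundup(0, 48) = 0 and
 * high = roundup(1, 48) = 48, giving cell_low = 0 and
 * cell_high = (48 >> 3) - 1 = 5.  An ATM-aligned table has
 * rtab[0] == rtab[5], since sizes 1..48 all cost one 48-byte ATM cell,
 * whereas an unmodified Ethernet table normally differs between the two.
 */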

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
					struct nlattr *tab,
					struct netlink_ext_ack *extack)
{
	struct qdisc_rate_table *rtab;

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE) {
		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
		return NULL;
	}

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
			rtab->refcnt++;
			return rtab;
		}
	}

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		if (r->linklayer == TC_LINKLAYER_UNAWARE)
			r->linklayer = __detect_linklayer(r, rtab->data);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	} else {
		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list;
	     (rtab = *rtabp) != NULL;
	     rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);
static LIST_HEAD(qdisc_stab_list);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
					       struct netlink_ext_ack *extack)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE]) {
		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
		return ERR_PTR(-EINVAL);
	}

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA]) {
			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
			return ERR_PTR(-EINVAL);
		}
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (tsize != s->tsize || (!tab && tsize > 0)) {
		NL_SET_ERR_MSG(extack, "Invalid size of size table");
		return ERR_PTR(-EINVAL);
	}

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		return stab;
	}

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	list_add_tail(&stab->list, &qdisc_stab_list);

	return stab;
}

static void stab_kfree_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct qdisc_size_table, rcu));
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
	}
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
		goto nla_put_failure;
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void __qdisc_calculate_pkt_len(struct sk_buff *skb,
			       const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
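
/* A worked example of the lookup above, assuming size table options
 * overhead = 24, cell_align = -1, cell_log = 6, size_log = 0 and
 * tsize = 512: an skb with skb->len = 1000 gives pkt_len = 1024, then
 * slot = (1024 - 1) >> 6 = 15, so pkt_len becomes stab->data[15].
 * Slots at or beyond tsize are extrapolated from the last entry, as
 * coded in the else branch.
 */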

void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
			txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	rcu_read_lock();
	__netif_schedule(qdisc_root(wd->qdisc));
	rcu_read_unlock();

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
				 clockid_t clockid)
{
	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init_clockid);

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
{
	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	if (wd->last_expires == expires)
		return;

	wd->last_expires = expires;
	hrtimer_start(&wd->timer,
		      ns_to_ktime(expires),
		      HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
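
/* A hypothetical sketch of the usual watchdog life cycle in a
 * rate-limiting qdisc; "sketch_sched" and its fields are illustrative
 * only.  Kept under #if 0 so it is never compiled.
 */
#if 0
struct sketch_sched {
	struct qdisc_watchdog	watchdog;
	u64			next_send_ns;	/* earliest transmit time */
};

static int sketch_init(struct Qdisc *sch, struct nlattr *opt,
		       struct netlink_ext_ack *extack)
{
	struct sketch_sched *q = qdisc_priv(sch);

	qdisc_watchdog_init(&q->watchdog, sch);
	return 0;
}

static struct sk_buff *sketch_rl_dequeue(struct Qdisc *sch)
{
	struct sketch_sched *q = qdisc_priv(sch);

	if (ktime_get_ns() < q->next_send_ns) {
		/* Nothing may be sent yet: arm the watchdog so the qdisc
		 * gets rescheduled once the next packet becomes eligible.
		 */
		qdisc_watchdog_schedule_ns(&q->watchdog, q->next_send_ns);
		return NULL;
	}
	return qdisc_dequeue_head(sch);
}

static void sketch_destroy(struct Qdisc *sch)
{
	struct sketch_sched *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
}
#endif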

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	struct hlist_head *h;
	unsigned int i;

	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	kvfree(ohash);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (!clhash->hash)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
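
/* A hypothetical sketch of how a classful qdisc typically uses the class
 * hash helpers above (compare htb_change_class()); "sketch_class" is an
 * illustrative type.  Kept under #if 0 so it is never compiled.
 */
#if 0
struct sketch_class {
	struct Qdisc_class_common common;	/* classid + hash linkage */
};

static int sketch_add_class(struct Qdisc *sch,
			    struct Qdisc_class_hash *clhash, u32 classid)
{
	struct sketch_class *cl = kzalloc(sizeof(*cl), GFP_KERNEL);

	if (!cl)
		return -ENOMEM;
	cl->common.classid = classid;

	sch_tree_lock(sch);
	qdisc_class_hash_insert(clhash, &cl->common);
	sch_tree_unlock(sch);

	/* Called outside the tree lock; rehashes once the load factor
	 * exceeds 0.75, as implemented above.
	 */
	qdisc_class_hash_grow(sch, clhash);
	return 0;
}
#endif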

/* Allocate a unique handle from space managed by kernel
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x8000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while (--i > 0);

	return 0;
}

void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
			       unsigned int len)
{
	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	bool notify;
	int drops;

	if (n == 0 && len == 0)
		return;
	drops = max_t(int, n, 0);
	rcu_read_lock();
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			break;

		if (sch->flags & TCQ_F_NOPARENT)
			break;
		/* Notify parent qdisc only if child qdisc becomes empty.
		 *
		 * If the child was empty even before this update, then the
		 * backlog counter is screwed and we skip the notification
		 * because the parent class is already passive.
		 *
		 * If the original child was offloaded then it is allowed
		 * to be seen as empty, so the parent is notified anyway.
		 */
		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
						       !qdisc_is_offloaded);
		/* TODO: perform the search on a per txq basis */
		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON_ONCE(parentid != TC_H_ROOT);
			break;
		}
		cops = sch->ops->cl_ops;
		if (notify && cops->qlen_notify) {
			cl = cops->find(sch, parentid);
			cops->qlen_notify(sch, cl);
		}
		sch->q.qlen -= n;
		sch->qstats.backlog -= len;
		__qdisc_qstats_drop(sch, drops);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
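
/* A hypothetical sketch of the usual caller pattern for
 * qdisc_tree_reduce_backlog() (compare fq_codel_change()): after dropping
 * packets outside the normal dequeue path, a qdisc must tell its
 * ancestors so their qlen/backlog counters stay coherent.  Kept under
 * #if 0 so it is never compiled.
 */
#if 0
static void sketch_shrink(struct Qdisc *sch, unsigned int new_limit)
{
	unsigned int dropped_pkts = 0, dropped_bytes = 0;

	while (sch->q.qlen > new_limit) {
		struct sk_buff *skb = qdisc_dequeue_head(sch);

		if (!skb)
			break;
		dropped_pkts++;
		dropped_bytes += qdisc_pkt_len(skb);
		kfree_skb(skb);
	}
	/* Propagate the deficit up the tree of parent qdiscs/classes. */
	qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes);
}
#endif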

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;
	u32 block_index;
	__u32 qlen;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = refcount_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (q->ops->ingress_block_get) {
		block_index = q->ops->ingress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->egress_block_get) {
		block_index = q->ops->egress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
		goto nla_put_failure;
	qlen = qdisc_qlen_sum(q);

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (qdisc_is_percpu_stats(q)) {
		cpu_bstats = q->cpu_bstats;
		cpu_qstats = q->cpu_qstats;
	}

	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
				  &d, cpu_bstats, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
{
	if (q->flags & TCQ_F_BUILTIN)
		return true;
	if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
		return true;

	return false;
}

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old, false)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new, false)) {
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev)) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		if (new && new->ops->attach)
			goto skip;

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				qdisc_refcount_inc(new);

			if (!ingress)
				qdisc_destroy(old);
		}

skip:
		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			dev->qdisc = new ? : &noop_qdisc;

			if (new && new->ops->attach)
				new->ops->attach(new);
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		/* Only support running class lockless if parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) &&
		    parent && !(parent->flags & TCQ_F_NOLOCK))
			new->flags &= ~TCQ_F_NOLOCK;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->find(parent, classid);

			if (cl) {
				if (new && new->ops == &noqueue_qdisc_ops) {
					NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
					err = -EINVAL;
				} else {
					err = cops->graft(parent, cl, new, &old, extack);
				}
			} else {
				NL_SET_ERR_MSG(extack, "Specified class not found");
				err = -ENOENT;
			}
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}

static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
				   struct netlink_ext_ack *extack)
{
	u32 block_index;

	if (tca[TCA_INGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->ingress_block_set) {
			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->ingress_block_set(sch, block_index);
	}
	if (tca[TCA_EGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->egress_block_set) {
			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->egress_block_set(sch, block_index);
	}
	return 0;
}

/* lockdep annotation is needed for ingress; egress gets it only for name */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;

/*
   Allocate and initialize new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  struct Qdisc *p, u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the meantime.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* qdisc_lookup_ops() will run again when
				 * the request is replayed, so don't keep
				 * a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		goto err_out;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;
	/* This exists to keep backward compatibility with a userspace
	 * loophole that allowed userspace to get the IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init) and then forgetting to reinit tx_queue_len
	 * before attaching a qdisc again.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out5;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out4;
		}
		rcu_assign_pointer(sch->stab, stab);
	}
	if (tca[TCA_RATE]) {
		seqcount_t *running;

		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		if (sch->parent != TC_H_ROOT &&
		    !(sch->flags & TCQ_F_INGRESS) &&
		    (!p || !(p->flags & TCQ_F_MQROOT)))
			running = qdisc_root_sleeping_running(sch);
		else
			running = &sch->running;

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					running,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);

	return sch;

err_out5:
	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
	if (ops->destroy)
		ops->destroy(sch);
err_out3:
	dev_put(dev);
	qdisc_free(sch);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
			struct netlink_ext_ack *extack)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (!sch->ops->change) {
			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
			return -EINVAL;
		}
		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
			return -EOPNOTSUPP;
		}
		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
				      NULL,
				      qdisc_root_sleeping_running(sch),
				      tca[TCA_RATE]);
	}
out:
	return 0;
}

struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl,
			 struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
	[TCA_KIND]		= { .type = NLA_NUL_STRING,
				    .len = IFNAMSIZ - 1 },
	[TCA_RATE]		= { .type = NLA_BINARY,
				    .len = sizeof(struct tc_estimator) },
	[TCA_STAB]		= { .type = NLA_NESTED },
	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
	[TCA_CHAIN]		= { .type = NLA_U32 },
	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
};

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((n->nlmsg_type != RTM_GETQDISC) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
			  extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q) {
			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
			return -ENOENT;
		}

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Invalid handle");
			return -EINVAL;
		}
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q) {
			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
			return -ENOENT;
		}
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid) {
			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
			return -EINVAL;
		}
		if (q->handle == 0) {
			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
			return -ENOENT;
		}
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}

/*
 * Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
			  extack);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
					return -EEXIST;
				}
				if (TC_H_MIN(tcm->tcm_handle)) {
					NL_SET_ERR_MSG(extack, "Invalid minor handle");
					return -EINVAL;
				}
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL) {
					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
					return -EEXIST;
				}
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
					return -EINVAL;
				}
				if (q == p ||
				    (p && check_loop(q, p, 0))) {
					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
					return -ELOOP;
				}
				qdisc_refcount_inc(q);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and have a choice:
				 *   either to change it or to create/graft a
				 *   new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if both the CREATE and REPLACE flags are
				 *   set.
				 *
				 *   2. If EXCL is set, the requestor intended
				 *   to say that a qdisc with tcm_handle is not
				 *   expected to exist, so we choose
				 *   create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of a hole in the API; we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND that does not match the
				 *   existing one.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
			return -EINVAL;
		}
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (!q) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		return -ENOENT;
	}
	if (n->nlmsg_flags & NLM_F_EXCL) {
		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
		return -EEXIST;
	}
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}
	err = qdisc_change(q, tca, extack);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
		return -ENOENT;
	}
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev)) {
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err, extack);
		} else {
			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
			err = -ENOENT;
		}
	} else {
		struct netdev_queue *dev_queue;

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err, extack);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}
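
/* For orientation: a command such as "tc qdisc add dev eth0 root handle
 * 1: htb" arrives here as an RTM_NEWQDISC message with NLM_F_CREATE set.
 * tc_modify_qdisc() resolves the device and parent, falls through to
 * create_n_graft, builds the qdisc via qdisc_create() (possibly replaying
 * after -EAGAIN from a module load) and finally attaches it with
 * qdisc_graft().
 */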

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx, bool recur,
			      bool dump_invisible)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
	 * itself has already been dumped.
	 *
	 * If we've already dumped the top-level (ingress) qdisc above and the global
	 * qdisc hashtable, we don't want to hit it again
	 */
	if (!qdisc_dev(root) || !recur)
		goto out;

	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	const struct nlmsghdr *nlh = cb->nlh;
	struct nlattr *tca[TCA_MAX + 1];
	int err;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	idx = 0;
	ASSERT_RTNL();

	err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
			  rtm_tca_policy, NULL);
	if (err < 0)
		return err;

	for_each_netdev(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx, false,
				       tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

cont:
		idx++;
	}

done:
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/

static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

static int tclass_del_notify(struct net *net,
			     const struct Qdisc_class_ops *cops,
			     struct sk_buff *oskb, struct nlmsghdr *n,
			     struct Qdisc *q, unsigned long cl)
{
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
	struct sk_buff *skb;
	int err = 0;

	if (!cops->delete)
		return -EOPNOTSUPP;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
			   RTM_DELTCLASS) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	err = cops->delete(q, cl);
	if (err) {
		kfree_skb(skb);
		return err;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}
1805 
1806 #ifdef CONFIG_NET_CLS
1807 
1808 struct tcf_bind_args {
1809 	struct tcf_walker w;
1810 	unsigned long base;
1811 	unsigned long cl;
1812 	u32 classid;
1813 };
1814 
tcf_node_bind(struct tcf_proto * tp,void * n,struct tcf_walker * arg)1815 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1816 {
1817 	struct tcf_bind_args *a = (void *)arg;
1818 
1819 	if (tp->ops->bind_class) {
1820 		struct Qdisc *q = tcf_block_q(tp->chain->block);
1821 
1822 		sch_tree_lock(q);
1823 		tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1824 		sch_tree_unlock(q);
1825 	}
1826 	return 0;
1827 }
1828 
tc_bind_tclass(struct Qdisc * q,u32 portid,u32 clid,unsigned long new_cl)1829 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1830 			   unsigned long new_cl)
1831 {
1832 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1833 	struct tcf_block *block;
1834 	struct tcf_chain *chain;
1835 	unsigned long cl;
1836 
1837 	cl = cops->find(q, portid);
1838 	if (!cl)
1839 		return;
1840 	if (!cops->tcf_block)
1841 		return;
1842 	block = cops->tcf_block(q, cl, NULL);
1843 	if (!block)
1844 		return;
1845 	list_for_each_entry(chain, &block->chain_list, list) {
1846 		struct tcf_proto *tp;
1847 
1848 		for (tp = rtnl_dereference(chain->filter_chain);
1849 		     tp; tp = rtnl_dereference(tp->next)) {
1850 			struct tcf_bind_args arg = {};
1851 
1852 			arg.w.fn = tcf_node_bind;
1853 			arg.classid = clid;
1854 			arg.base = cl;
1855 			arg.cl = new_cl;
1856 			tp->ops->walk(tp, &arg.w);
1857 		}
1858 	}
1859 }
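/*
 * Illustrative sketch only (not part of the original file): the shape
 * of a filter ->walk() implementation that tc_bind_tclass() relies on.
 * "struct my_filter" and "my_head()" are hypothetical names; the
 * skip/count/stop protocol on struct tcf_walker is the real contract.
 * Each node is handed to w->fn - tcf_node_bind() above - and a
 * negative return stops the walk.
 */
static void my_filter_walk(struct tcf_proto *tp, struct tcf_walker *w)
{
	struct my_filter *f;	/* hypothetical per-filter type */

	list_for_each_entry(f, &my_head(tp)->filters, list) {
		if (w->count < w->skip) {
			w->count++;	/* not yet at the resume point */
			continue;
		}
		if (w->fn(tp, f, w) < 0) {
			w->stop = 1;	/* callback asked us to abort */
			return;
		}
		w->count++;
	}
}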
1860 
1861 #else
1862 
1863 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1864 			   unsigned long new_cl)
1865 {
1866 }
1867 
1868 #endif
1869 
1870 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1871 			 struct netlink_ext_ack *extack)
1872 {
1873 	struct net *net = sock_net(skb->sk);
1874 	struct tcmsg *tcm = nlmsg_data(n);
1875 	struct nlattr *tca[TCA_MAX + 1];
1876 	struct net_device *dev;
1877 	struct Qdisc *q = NULL;
1878 	const struct Qdisc_class_ops *cops;
1879 	unsigned long cl = 0;
1880 	unsigned long new_cl;
1881 	u32 portid;
1882 	u32 clid;
1883 	u32 qid;
1884 	int err;
1885 
1886 	if ((n->nlmsg_type != RTM_GETTCLASS) &&
1887 	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1888 		return -EPERM;
1889 
1890 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
1891 			  extack);
1892 	if (err < 0)
1893 		return err;
1894 
1895 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1896 	if (!dev)
1897 		return -ENODEV;
1898 
1899 	/*
1900 	   parent == TC_H_UNSPEC - unspecified parent.
1901 	   parent == TC_H_ROOT   - class is root, which has no parent.
1902 	   parent == X:0	 - parent is root class.
1903 	   parent == X:Y	 - parent is a node in hierarchy.
1904 	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.
1905 
1906 	   handle == 0:0	 - generate handle from kernel pool.
1907 	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
1908 	   handle == X:Y	 - fully specified class X:Y.
1909 	   handle == X:0	 - root class.
1910 	 */
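	/*
	 * Worked example (added for illustration): the command
	 * "tc class add dev eth0 parent 1: classid 1:10" arrives with
	 * tcm_parent = 0x00010000 and tcm_handle = 0x00010010 (tc parses
	 * the minor "10" as hex).  TC_H_MAJ() extracts the 16-bit major
	 * half and TC_H_MAKE() recombines major and minor, which is all
	 * the arithmetic the steps below perform.
	 */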
1911 
1912 	/* Step 1. Determine qdisc handle X:0 */
1913 
1914 	portid = tcm->tcm_parent;
1915 	clid = tcm->tcm_handle;
1916 	qid = TC_H_MAJ(clid);
1917 
1918 	if (portid != TC_H_ROOT) {
1919 		u32 qid1 = TC_H_MAJ(portid);
1920 
1921 		if (qid && qid1) {
1922 			/* If both majors are known, they must be identical. */
1923 			if (qid != qid1)
1924 				return -EINVAL;
1925 		} else if (qid1) {
1926 			qid = qid1;
1927 		} else if (qid == 0)
1928 			qid = dev->qdisc->handle;
1929 
1930 		/* Now qid is a genuine qdisc handle consistent
1931 		 * with both parent and child.
1932 		 *
1933 		 * TC_H_MAJ(portid) may still be unspecified, complete it now.
1934 		 */
1935 		if (portid)
1936 			portid = TC_H_MAKE(qid, portid);
1937 	} else {
1938 		if (qid == 0)
1939 			qid = dev->qdisc->handle;
1940 	}
1941 
1942 	/* OK. Locate qdisc */
1943 	q = qdisc_lookup(dev, qid);
1944 	if (!q)
1945 		return -ENOENT;
1946 
1947 	/* And check that it supports classes */
1948 	cops = q->ops->cl_ops;
1949 	if (cops == NULL)
1950 		return -EINVAL;
1951 
1952 	/* Now try to get class */
1953 	if (clid == 0) {
1954 		if (portid == TC_H_ROOT)
1955 			clid = qid;
1956 	} else
1957 		clid = TC_H_MAKE(qid, clid);
1958 
1959 	if (clid)
1960 		cl = cops->find(q, clid);
1961 
1962 	if (cl == 0) {
1963 		err = -ENOENT;
1964 		if (n->nlmsg_type != RTM_NEWTCLASS ||
1965 		    !(n->nlmsg_flags & NLM_F_CREATE))
1966 			goto out;
1967 	} else {
1968 		switch (n->nlmsg_type) {
1969 		case RTM_NEWTCLASS:
1970 			err = -EEXIST;
1971 			if (n->nlmsg_flags & NLM_F_EXCL)
1972 				goto out;
1973 			break;
1974 		case RTM_DELTCLASS:
1975 			err = tclass_del_notify(net, cops, skb, n, q, cl);
1976 			/* Unbind the class's filters by re-binding them to class 0 */
1977 			tc_bind_tclass(q, portid, clid, 0);
1978 			goto out;
1979 		case RTM_GETTCLASS:
1980 			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1981 			goto out;
1982 		default:
1983 			err = -EINVAL;
1984 			goto out;
1985 		}
1986 	}
1987 
1988 	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1989 		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
1990 		return -EOPNOTSUPP;
1991 	}
1992 
1993 	new_cl = cl;
1994 	err = -EOPNOTSUPP;
1995 	if (cops->change)
1996 		err = cops->change(q, clid, portid, tca, &new_cl, extack);
1997 	if (err == 0) {
1998 		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1999 		/* We just created a new class, need to do the reverse binding. */
2000 		if (cl != new_cl)
2001 			tc_bind_tclass(q, portid, clid, new_cl);
2002 	}
2003 out:
2004 	return err;
2005 }
2006 
2007 struct qdisc_dump_args {
2008 	struct qdisc_walker	w;
2009 	struct sk_buff		*skb;
2010 	struct netlink_callback	*cb;
2011 };
2012 
2013 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2014 			    struct qdisc_walker *arg)
2015 {
2016 	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2017 
2018 	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2019 			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2020 			      RTM_NEWTCLASS);
2021 }
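/*
 * qdisc_class_dump() adapts the generic class walker callback to
 * tc_fill_tclass(): each visited class becomes one RTM_NEWTCLASS
 * record, flagged NLM_F_MULTI because it is part of a multi-part dump.
 */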
2022 
2023 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2024 				struct tcmsg *tcm, struct netlink_callback *cb,
2025 				int *t_p, int s_t)
2026 {
2027 	struct qdisc_dump_args arg;
2028 
2029 	if (tc_qdisc_dump_ignore(q, false) ||
2030 	    *t_p < s_t || !q->ops->cl_ops ||
2031 	    (tcm->tcm_parent &&
2032 	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2033 		(*t_p)++;
2034 		return 0;
2035 	}
2036 	if (*t_p > s_t)
2037 		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2038 	arg.w.fn = qdisc_class_dump;
2039 	arg.skb = skb;
2040 	arg.cb = cb;
2041 	arg.w.stop  = 0;
2042 	arg.w.skip = cb->args[1];
2043 	arg.w.count = 0;
2044 	q->ops->cl_ops->walk(q, &arg.w);
2045 	cb->args[1] = arg.w.count;
2046 	if (arg.w.stop)
2047 		return -1;
2048 	(*t_p)++;
2049 	return 0;
2050 }
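/*
 * The dump is resumable: *t_p counts qdiscs visited so far and is
 * compared against s_t (saved in cb->args[0] between dump rounds),
 * while cb->args[1] records how many classes were already dumped from
 * the qdisc where the previous round stopped; the memset above clears
 * that per-qdisc state once the walk moves past the resume point.
 */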
2051 
2052 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2053 			       struct tcmsg *tcm, struct netlink_callback *cb,
2054 			       int *t_p, int s_t)
2055 {
2056 	struct Qdisc *q;
2057 	int b;
2058 
2059 	if (!root)
2060 		return 0;
2061 
2062 	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2063 		return -1;
2064 
2065 	if (!qdisc_dev(root))
2066 		return 0;
2067 
2068 	if (tcm->tcm_parent) {
2069 		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2070 		if (q && q != root &&
2071 		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2072 			return -1;
2073 		return 0;
2074 	}
2075 	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2076 		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2077 			return -1;
2078 	}
2079 
2080 	return 0;
2081 }
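/*
 * tc_dump_tclass_root() covers a whole hierarchy: the root qdisc
 * itself first, then either the single qdisc selected by tcm_parent
 * or, when no parent filter was given, every qdisc linked into the
 * device's qdisc_hash.
 */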
2082 
2083 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2084 {
2085 	struct tcmsg *tcm = nlmsg_data(cb->nlh);
2086 	struct net *net = sock_net(skb->sk);
2087 	struct netdev_queue *dev_queue;
2088 	struct net_device *dev;
2089 	int t, s_t;
2090 
2091 	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2092 		return 0;
2093 	dev = dev_get_by_index(net, tcm->tcm_ifindex);
2094 	if (!dev)
2095 		return 0;
2096 
2097 	s_t = cb->args[0];
2098 	t = 0;
2099 
2100 	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
2101 		goto done;
2102 
2103 	dev_queue = dev_ingress_queue(dev);
2104 	if (dev_queue &&
2105 	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2106 				&t, s_t) < 0)
2107 		goto done;
2108 
2109 done:
2110 	cb->args[0] = t;
2111 
2112 	dev_put(dev);
2113 	return skb->len;
2114 }
2115 
2116 #ifdef CONFIG_PROC_FS
2117 static int psched_show(struct seq_file *seq, void *v)
2118 {
2119 	seq_printf(seq, "%08x %08x %08x %08x\n",
2120 		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2121 		   1000000,
2122 		   (u32)NSEC_PER_SEC / hrtimer_resolution);
2123 
2124 	return 0;
2125 }
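/*
 * /proc/net/psched exports four hex u32 values that userspace tc uses
 * to convert between its tick unit and real time: nanoseconds per
 * microsecond, nanoseconds per psched tick, a constant 1000000 kept
 * for compatibility with older tc binaries, and the timer frequency
 * in Hz.  On a typical high-resolution-timer system this reads
 * "000003e8 00000040 000f4240 3b9aca00"; the second value depends on
 * PSCHED_SHIFT.
 */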
2126 
2127 static int __net_init psched_net_init(struct net *net)
2128 {
2129 	struct proc_dir_entry *e;
2130 
2131 	e = proc_create_single("psched", 0, net->proc_net, psched_show);
2132 	if (e == NULL)
2133 		return -ENOMEM;
2134 
2135 	return 0;
2136 }
2137 
2138 static void __net_exit psched_net_exit(struct net *net)
2139 {
2140 	remove_proc_entry("psched", net->proc_net);
2141 }
2142 #else
2143 static int __net_init psched_net_init(struct net *net)
2144 {
2145 	return 0;
2146 }
2147 
2148 static void __net_exit psched_net_exit(struct net *net)
2149 {
2150 }
2151 #endif
2152 
2153 static struct pernet_operations psched_net_ops = {
2154 	.init = psched_net_init,
2155 	.exit = psched_net_exit,
2156 };
2157 
2158 static int __init pktsched_init(void)
2159 {
2160 	int err;
2161 
2162 	err = register_pernet_subsys(&psched_net_ops);
2163 	if (err) {
2164 		pr_err("pktsched_init: "
2165 		       "cannot initialize per netns operations\n");
2166 		return err;
2167 	}
2168 
2169 	register_qdisc(&pfifo_fast_ops);
2170 	register_qdisc(&pfifo_qdisc_ops);
2171 	register_qdisc(&bfifo_qdisc_ops);
2172 	register_qdisc(&pfifo_head_drop_qdisc_ops);
2173 	register_qdisc(&mq_qdisc_ops);
2174 	register_qdisc(&noqueue_qdisc_ops);
2175 
2176 	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2177 	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2178 	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2179 		      0);
2180 	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2181 	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2182 	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2183 		      0);
2184 
2185 	return 0;
2186 }
2187 
2188 subsys_initcall(pktsched_init);
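/*
 * Once this initcall has run, the handlers registered above are live:
 * e.g. "tc qdisc add ..." issues RTM_NEWQDISC, served by
 * tc_modify_qdisc(), and "tc class show dev eth0" issues an
 * RTM_GETTCLASS dump, served by tc_dump_tclass().
 */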
2189