1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c	Packet scheduler API.
4  *
5  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13 
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28 
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34 
35 /*
36 
37    Short review.
38    -------------
39 
40    This file consists of two interrelated parts:
41 
42    1. the queueing discipline manager frontend.
43    2. the traffic class manager frontend.
44 
45    Generally, a queueing discipline ("qdisc") is a black box that
46    can enqueue packets and dequeue them (when the device is ready
47    to send something) in an order and at times determined by the
48    algorithm hidden inside it.
49 
50    qdiscs are divided into two categories:
51    - "queues", which have no internal structure visible from outside.
52    - "schedulers", which split packets into "traffic classes",
53      using "packet classifiers" (see cls_api.c).
54 
55    In turn, classes may have child qdiscs (as a rule, queues)
56    attached to them, and so on recursively.
57 
58    The goal of the routines in this file is to translate the
59    information supplied by the user in the form of handles into
60    a form more intelligible to the kernel, to perform sanity
61    checks and the work common to all qdiscs, and to provide
62    rtnetlink notifications.
63 
64    All real intelligent work is done inside qdisc modules.
65 
66 
67 
68    Every discipline has two major routines: enqueue and dequeue.
69 
70    ---dequeue
71 
72    dequeue usually returns an skb to send. It is allowed to return NULL,
73    but that does not mean the queue is empty; it just means that the
74    discipline does not want to send anything at this time.
75    The queue is really empty only if q->q.qlen == 0.
76    For complicated disciplines with multiple queues, q->q is not the
77    real packet queue, but q->q.qlen must nevertheless be valid.
78 
79    ---enqueue
80 
81    enqueue returns 0 if the packet was enqueued successfully.
82    If a packet (this one or another one) was dropped, it returns
83    a non-zero error code:
84    NET_XMIT_DROP 	- this packet was dropped
85      Expected action: do not back off, but wait until the queue clears.
86    NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
87      Expected action: back off or ignore
88 
89    Auxiliary routines:
90 
91    ---peek
92 
93    like dequeue but without removing a packet from the queue
94 
95    ---reset
96 
97    returns the qdisc to its initial state: purges all buffers, clears all
98    timers and counters (except for statistics), etc.
99 
100    ---init
101 
102    initializes a newly created qdisc.
103 
104    ---destroy
105 
106    destroys resources allocated by init and during the lifetime of the qdisc.
107 
108    ---change
109 
110    changes qdisc parameters.
111  */
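/* Illustrative sketch (an editorial addition, not part of this file):
 * the smallest possible "queue"-category qdisc obeying the contract
 * described above.  Compare sch_fifo.c for the real thing; all
 * example_* names are hypothetical.
 */
static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			   struct sk_buff **to_free)
{
	if (likely(sch->q.qlen < sch->limit))
		return qdisc_enqueue_tail(skb, sch);	/* NET_XMIT_SUCCESS */

	return qdisc_drop(skb, sch, to_free);		/* NET_XMIT_DROP */
}

static struct sk_buff *example_dequeue(struct Qdisc *sch)
{
	/* Returning NULL only means "nothing to send right now";
	 * emptiness is always judged by sch->q.qlen, as noted above.
	 */
	return qdisc_dequeue_head(sch);
}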
112 
113 /* Protects the list of registered TC modules. It is a pure SMP lock. */
114 static DEFINE_RWLOCK(qdisc_mod_lock);
115 
116 
117 /************************************************
118  *	Queueing disciplines manipulation.	*
119  ************************************************/
120 
121 
122 /* The list of all installed queueing disciplines. */
123 
124 static struct Qdisc_ops *qdisc_base;
125 
126 /* Register/unregister queueing discipline */
127 
128 int register_qdisc(struct Qdisc_ops *qops)
129 {
130 	struct Qdisc_ops *q, **qp;
131 	int rc = -EEXIST;
132 
133 	write_lock(&qdisc_mod_lock);
134 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
135 		if (!strcmp(qops->id, q->id))
136 			goto out;
137 
138 	if (qops->enqueue == NULL)
139 		qops->enqueue = noop_qdisc_ops.enqueue;
140 	if (qops->peek == NULL) {
141 		if (qops->dequeue == NULL)
142 			qops->peek = noop_qdisc_ops.peek;
143 		else
144 			goto out_einval;
145 	}
146 	if (qops->dequeue == NULL)
147 		qops->dequeue = noop_qdisc_ops.dequeue;
148 
149 	if (qops->cl_ops) {
150 		const struct Qdisc_class_ops *cops = qops->cl_ops;
151 
152 		if (!(cops->find && cops->walk && cops->leaf))
153 			goto out_einval;
154 
155 		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
156 			goto out_einval;
157 	}
158 
159 	qops->next = NULL;
160 	*qp = qops;
161 	rc = 0;
162 out:
163 	write_unlock(&qdisc_mod_lock);
164 	return rc;
165 
166 out_einval:
167 	rc = -EINVAL;
168 	goto out;
169 }
170 EXPORT_SYMBOL(register_qdisc);
171 
172 int unregister_qdisc(struct Qdisc_ops *qops)
173 {
174 	struct Qdisc_ops *q, **qp;
175 	int err = -ENOENT;
176 
177 	write_lock(&qdisc_mod_lock);
178 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
179 		if (q == qops)
180 			break;
181 	if (q) {
182 		*qp = q->next;
183 		q->next = NULL;
184 		err = 0;
185 	}
186 	write_unlock(&qdisc_mod_lock);
187 	return err;
188 }
189 EXPORT_SYMBOL(unregister_qdisc);
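/* Illustrative sketch (hypothetical example_* names, continuing the
 * example_enqueue()/example_dequeue() sketch above): how a scheduler
 * module typically uses the registration API from its module init/exit
 * hooks; compare the bottom of any sch_*.c file.
 */
static struct Qdisc_ops example_qdisc_ops __read_mostly = {
	.id		= "example",
	.enqueue	= example_enqueue,
	.dequeue	= example_dequeue,
	.peek		= qdisc_peek_head,	/* required when dequeue is set */
	.owner		= THIS_MODULE,
};

static int __init example_module_init(void)
{
	return register_qdisc(&example_qdisc_ops);
}

static void __exit example_module_exit(void)
{
	unregister_qdisc(&example_qdisc_ops);
}
module_init(example_module_init);
module_exit(example_module_exit);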
190 
191 /* Get the name of the default qdisc if not otherwise specified */
192 void qdisc_get_default(char *name, size_t len)
193 {
194 	read_lock(&qdisc_mod_lock);
195 	strlcpy(name, default_qdisc_ops->id, len);
196 	read_unlock(&qdisc_mod_lock);
197 }
198 
199 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
200 {
201 	struct Qdisc_ops *q = NULL;
202 
203 	for (q = qdisc_base; q; q = q->next) {
204 		if (!strcmp(name, q->id)) {
205 			if (!try_module_get(q->owner))
206 				q = NULL;
207 			break;
208 		}
209 	}
210 
211 	return q;
212 }
213 
214 /* Set new default qdisc to use */
215 int qdisc_set_default(const char *name)
216 {
217 	const struct Qdisc_ops *ops;
218 
219 	if (!capable(CAP_NET_ADMIN))
220 		return -EPERM;
221 
222 	write_lock(&qdisc_mod_lock);
223 	ops = qdisc_lookup_default(name);
224 	if (!ops) {
225 		/* Not found, drop lock and try to load module */
226 		write_unlock(&qdisc_mod_lock);
227 		request_module("sch_%s", name);
228 		write_lock(&qdisc_mod_lock);
229 
230 		ops = qdisc_lookup_default(name);
231 	}
232 
233 	if (ops) {
234 		/* Set new default */
235 		module_put(default_qdisc_ops->owner);
236 		default_qdisc_ops = ops;
237 	}
238 	write_unlock(&qdisc_mod_lock);
239 
240 	return ops ? 0 : -ENOENT;
241 }
242 
243 #ifdef CONFIG_NET_SCH_DEFAULT
244 /* Set default value from kernel config */
245 static int __init sch_default_qdisc(void)
246 {
247 	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
248 }
249 late_initcall(sch_default_qdisc);
250 #endif
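/* Usage note: the net.core.default_qdisc sysctl handler resolves to
 * qdisc_set_default(), so e.g.
 *
 *	echo fq_codel > /proc/sys/net/core/default_qdisc
 *
 * loads sch_fq_codel.ko on demand via the request_module() call above
 * and installs it as default_qdisc_ops for newly attached devices.
 */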
251 
252 /* We know the handle. Find the qdisc among all qdiscs attached to the
253  * device (the root qdisc, all its children, children of children, etc.)
254  * Note: caller either uses rtnl or rcu_read_lock()
255  */
256 
257 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
258 {
259 	struct Qdisc *q;
260 
261 	if (!qdisc_dev(root))
262 		return (root->handle == handle ? root : NULL);
263 
264 	if (!(root->flags & TCQ_F_BUILTIN) &&
265 	    root->handle == handle)
266 		return root;
267 
268 	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
269 		if (q->handle == handle)
270 			return q;
271 	}
272 	return NULL;
273 }
274 
275 void qdisc_hash_add(struct Qdisc *q, bool invisible)
276 {
277 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
278 		ASSERT_RTNL();
279 		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
280 		if (invisible)
281 			q->flags |= TCQ_F_INVISIBLE;
282 	}
283 }
284 EXPORT_SYMBOL(qdisc_hash_add);
285 
286 void qdisc_hash_del(struct Qdisc *q)
287 {
288 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
289 		ASSERT_RTNL();
290 		hash_del_rcu(&q->hash);
291 	}
292 }
293 EXPORT_SYMBOL(qdisc_hash_del);
294 
295 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
296 {
297 	struct Qdisc *q;
298 
299 	if (!handle)
300 		return NULL;
301 	q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
302 	if (q)
303 		goto out;
304 
305 	if (dev_ingress_queue(dev))
306 		q = qdisc_match_from_root(
307 			dev_ingress_queue(dev)->qdisc_sleeping,
308 			handle);
309 out:
310 	return q;
311 }
312 
313 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
314 {
315 	struct netdev_queue *nq;
316 	struct Qdisc *q;
317 
318 	if (!handle)
319 		return NULL;
320 	q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
321 	if (q)
322 		goto out;
323 
324 	nq = dev_ingress_queue_rcu(dev);
325 	if (nq)
326 		q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
327 out:
328 	return q;
329 }
330 
331 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
332 {
333 	unsigned long cl;
334 	const struct Qdisc_class_ops *cops = p->ops->cl_ops;
335 
336 	if (cops == NULL)
337 		return NULL;
338 	cl = cops->find(p, classid);
339 
340 	if (cl == 0)
341 		return NULL;
342 	return cops->leaf(p, cl);
343 }
344 
345 /* Find queueing discipline by name */
346 
347 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
348 {
349 	struct Qdisc_ops *q = NULL;
350 
351 	if (kind) {
352 		read_lock(&qdisc_mod_lock);
353 		for (q = qdisc_base; q; q = q->next) {
354 			if (nla_strcmp(kind, q->id) == 0) {
355 				if (!try_module_get(q->owner))
356 					q = NULL;
357 				break;
358 			}
359 		}
360 		read_unlock(&qdisc_mod_lock);
361 	}
362 	return q;
363 }
364 
365 /* The linklayer setting was not transferred from iproute2 in older
366  * versions, and the rate table lookup system has been dropped from
367  * the kernel. To stay backward compatible with older iproute2 tc
368  * utils, we detect the linklayer setting by checking whether the
369  * rate table was modified.
370  *
371  * For linklayer ATM table entries, the rate table will be aligned
372  * to 48 bytes, thus some table entries will contain the same value.
373  * The mpu (min packet unit) is also encoded into the old rate table,
374  * thus starting from the mpu, we find the low and high table entries
375  * for mapping this cell.  If these entries contain the same value,
376  * then the rate table has been modified for linklayer ATM.
377  *
378  * This is done by rounding the mpu up to the nearest 48 byte
379  * cell/entry, then rounding up to the next cell, calculating the
380  * table entry one below, and comparing the two.
381  */
382 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
383 {
384 	int low       = roundup(r->mpu, 48);
385 	int high      = roundup(low+1, 48);
386 	int cell_low  = low >> r->cell_log;
387 	int cell_high = (high >> r->cell_log) - 1;
388 
389 	/* rtab is too inaccurate at rates > 100Mbit/s */
390 	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
391 		pr_debug("TC linklayer: Giving up ATM detection\n");
392 		return TC_LINKLAYER_ETHERNET;
393 	}
394 
395 	if ((cell_high > cell_low) && (cell_high < 256)
396 	    && (rtab[cell_low] == rtab[cell_high])) {
397 		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
398 			 cell_low, cell_high, rtab[cell_high]);
399 		return TC_LINKLAYER_ATM;
400 	}
401 	return TC_LINKLAYER_ETHERNET;
402 }
403 
404 static struct qdisc_rate_table *qdisc_rtab_list;
405 
406 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
407 					struct nlattr *tab,
408 					struct netlink_ext_ack *extack)
409 {
410 	struct qdisc_rate_table *rtab;
411 
412 	if (tab == NULL || r->rate == 0 ||
413 	    r->cell_log == 0 || r->cell_log >= 32 ||
414 	    nla_len(tab) != TC_RTAB_SIZE) {
415 		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
416 		return NULL;
417 	}
418 
419 	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
420 		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
421 		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
422 			rtab->refcnt++;
423 			return rtab;
424 		}
425 	}
426 
427 	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
428 	if (rtab) {
429 		rtab->rate = *r;
430 		rtab->refcnt = 1;
431 		memcpy(rtab->data, nla_data(tab), 1024);
432 		if (r->linklayer == TC_LINKLAYER_UNAWARE)
433 			r->linklayer = __detect_linklayer(r, rtab->data);
434 		rtab->next = qdisc_rtab_list;
435 		qdisc_rtab_list = rtab;
436 	} else {
437 		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
438 	}
439 	return rtab;
440 }
441 EXPORT_SYMBOL(qdisc_get_rtab);
442 
443 void qdisc_put_rtab(struct qdisc_rate_table *tab)
444 {
445 	struct qdisc_rate_table *rtab, **rtabp;
446 
447 	if (!tab || --tab->refcnt)
448 		return;
449 
450 	for (rtabp = &qdisc_rtab_list;
451 	     (rtab = *rtabp) != NULL;
452 	     rtabp = &rtab->next) {
453 		if (rtab == tab) {
454 			*rtabp = rtab->next;
455 			kfree(rtab);
456 			return;
457 		}
458 	}
459 }
460 EXPORT_SYMBOL(qdisc_put_rtab);
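/* Illustrative sketch (hypothetical example_* names): the usual pairing
 * of the two rate-table helpers above in a shaper, as e.g. sch_tbf.c
 * does.  A real qdisc keeps the pointer in its qdisc_priv() rather than
 * in a static variable.
 */
static struct qdisc_rate_table *example_rtab;

static int example_init_rtab(struct tc_ratespec *rate, struct nlattr *tab,
			     struct netlink_ext_ack *extack)
{
	example_rtab = qdisc_get_rtab(rate, tab, extack);	/* takes a ref */
	return example_rtab ? 0 : -EINVAL;
}

static void example_destroy_rtab(void)
{
	qdisc_put_rtab(example_rtab);	/* NULL-safe, drops the ref */
}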
461 
462 static LIST_HEAD(qdisc_stab_list);
463 
464 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
465 	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
466 	[TCA_STAB_DATA] = { .type = NLA_BINARY },
467 };
468 
469 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
470 					       struct netlink_ext_ack *extack)
471 {
472 	struct nlattr *tb[TCA_STAB_MAX + 1];
473 	struct qdisc_size_table *stab;
474 	struct tc_sizespec *s;
475 	unsigned int tsize = 0;
476 	u16 *tab = NULL;
477 	int err;
478 
479 	err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
480 					  extack);
481 	if (err < 0)
482 		return ERR_PTR(err);
483 	if (!tb[TCA_STAB_BASE]) {
484 		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
485 		return ERR_PTR(-EINVAL);
486 	}
487 
488 	s = nla_data(tb[TCA_STAB_BASE]);
489 
490 	if (s->tsize > 0) {
491 		if (!tb[TCA_STAB_DATA]) {
492 			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
493 			return ERR_PTR(-EINVAL);
494 		}
495 		tab = nla_data(tb[TCA_STAB_DATA]);
496 		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
497 	}
498 
499 	if (tsize != s->tsize || (!tab && tsize > 0)) {
500 		NL_SET_ERR_MSG(extack, "Invalid size of size table");
501 		return ERR_PTR(-EINVAL);
502 	}
503 
504 	list_for_each_entry(stab, &qdisc_stab_list, list) {
505 		if (memcmp(&stab->szopts, s, sizeof(*s)))
506 			continue;
507 		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
508 			continue;
509 		stab->refcnt++;
510 		return stab;
511 	}
512 
513 	if (s->size_log > STAB_SIZE_LOG_MAX ||
514 	    s->cell_log > STAB_SIZE_LOG_MAX) {
515 		NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
516 		return ERR_PTR(-EINVAL);
517 	}
518 
519 	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
520 	if (!stab)
521 		return ERR_PTR(-ENOMEM);
522 
523 	stab->refcnt = 1;
524 	stab->szopts = *s;
525 	if (tsize > 0)
526 		memcpy(stab->data, tab, tsize * sizeof(u16));
527 
528 	list_add_tail(&stab->list, &qdisc_stab_list);
529 
530 	return stab;
531 }
532 
533 void qdisc_put_stab(struct qdisc_size_table *tab)
534 {
535 	if (!tab)
536 		return;
537 
538 	if (--tab->refcnt == 0) {
539 		list_del(&tab->list);
540 		kfree_rcu(tab, rcu);
541 	}
542 }
543 EXPORT_SYMBOL(qdisc_put_stab);
544 
545 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
546 {
547 	struct nlattr *nest;
548 
549 	nest = nla_nest_start_noflag(skb, TCA_STAB);
550 	if (nest == NULL)
551 		goto nla_put_failure;
552 	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
553 		goto nla_put_failure;
554 	nla_nest_end(skb, nest);
555 
556 	return skb->len;
557 
558 nla_put_failure:
559 	return -1;
560 }
561 
562 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
563 			       const struct qdisc_size_table *stab)
564 {
565 	int pkt_len, slot;
566 
567 	pkt_len = skb->len + stab->szopts.overhead;
568 	if (unlikely(!stab->szopts.tsize))
569 		goto out;
570 
571 	slot = pkt_len + stab->szopts.cell_align;
572 	if (unlikely(slot < 0))
573 		slot = 0;
574 
575 	slot >>= stab->szopts.cell_log;
576 	if (likely(slot < stab->szopts.tsize))
577 		pkt_len = stab->data[slot];
578 	else
579 		pkt_len = stab->data[stab->szopts.tsize - 1] *
580 				(slot / stab->szopts.tsize) +
581 				stab->data[slot % stab->szopts.tsize];
582 
583 	pkt_len <<= stab->szopts.size_log;
584 out:
585 	if (unlikely(pkt_len < 1))
586 		pkt_len = 1;
587 	qdisc_skb_cb(skb)->pkt_len = pkt_len;
588 }
589 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
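/* Worked example of the lookup above: with overhead = 0, cell_align = 0,
 * cell_log = 6, size_log = 0 and tsize = 512, a 1500 byte packet maps to
 * slot 1500 >> 6 = 23, so pkt_len becomes stab->data[23]; packets past
 * the last slot are extrapolated from the final table entry.
 */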
590 
591 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
592 {
593 	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
594 		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
595 			txt, qdisc->ops->id, qdisc->handle >> 16);
596 		qdisc->flags |= TCQ_F_WARN_NONWC;
597 	}
598 }
599 EXPORT_SYMBOL(qdisc_warn_nonwc);
600 
601 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
602 {
603 	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
604 						 timer);
605 
606 	rcu_read_lock();
607 	__netif_schedule(qdisc_root(wd->qdisc));
608 	rcu_read_unlock();
609 
610 	return HRTIMER_NORESTART;
611 }
612 
613 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
614 				 clockid_t clockid)
615 {
616 	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
617 	wd->timer.function = qdisc_watchdog;
618 	wd->qdisc = qdisc;
619 }
620 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
621 
622 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
623 {
624 	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
625 }
626 EXPORT_SYMBOL(qdisc_watchdog_init);
627 
628 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
629 {
630 	if (test_bit(__QDISC_STATE_DEACTIVATED,
631 		     &qdisc_root_sleeping(wd->qdisc)->state))
632 		return;
633 
634 	if (wd->last_expires == expires)
635 		return;
636 
637 	wd->last_expires = expires;
638 	hrtimer_start(&wd->timer,
639 		      ns_to_ktime(expires),
640 		      HRTIMER_MODE_ABS_PINNED);
641 }
642 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
643 
644 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
645 {
646 	hrtimer_cancel(&wd->timer);
647 }
648 EXPORT_SYMBOL(qdisc_watchdog_cancel);
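/* Illustrative sketch (hypothetical example_* names): how a shaper's
 * ->dequeue() uses the watchdog above to get rescheduled once the next
 * packet becomes eligible; compare sch_tbf.c and sch_netem.c.
 */
static struct qdisc_watchdog example_watchdog;	/* qdisc_watchdog_init()
						 * was called in ->init()
						 */
static struct sk_buff *example_shaper_dequeue(struct Qdisc *sch)
{
	u64 next_tx_ns = 0;	/* computed from the shaper's state */

	if (ktime_get_ns() < next_tx_ns) {
		/* Nothing may be sent yet: arm the hrtimer so that
		 * qdisc_watchdog() above calls __netif_schedule() at
		 * the right time, then return NULL with q.qlen != 0.
		 */
		qdisc_watchdog_schedule_ns(&example_watchdog, next_tx_ns);
		return NULL;
	}
	return qdisc_dequeue_head(sch);
}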
649 
650 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
651 {
652 	struct hlist_head *h;
653 	unsigned int i;
654 
655 	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
656 
657 	if (h != NULL) {
658 		for (i = 0; i < n; i++)
659 			INIT_HLIST_HEAD(&h[i]);
660 	}
661 	return h;
662 }
663 
664 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
665 {
666 	struct Qdisc_class_common *cl;
667 	struct hlist_node *next;
668 	struct hlist_head *nhash, *ohash;
669 	unsigned int nsize, nmask, osize;
670 	unsigned int i, h;
671 
672 	/* Rehash when load factor exceeds 0.75 */
673 	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
674 		return;
675 	nsize = clhash->hashsize * 2;
676 	nmask = nsize - 1;
677 	nhash = qdisc_class_hash_alloc(nsize);
678 	if (nhash == NULL)
679 		return;
680 
681 	ohash = clhash->hash;
682 	osize = clhash->hashsize;
683 
684 	sch_tree_lock(sch);
685 	for (i = 0; i < osize; i++) {
686 		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
687 			h = qdisc_class_hash(cl->classid, nmask);
688 			hlist_add_head(&cl->hnode, &nhash[h]);
689 		}
690 	}
691 	clhash->hash     = nhash;
692 	clhash->hashsize = nsize;
693 	clhash->hashmask = nmask;
694 	sch_tree_unlock(sch);
695 
696 	kvfree(ohash);
697 }
698 EXPORT_SYMBOL(qdisc_class_hash_grow);
699 
700 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
701 {
702 	unsigned int size = 4;
703 
704 	clhash->hash = qdisc_class_hash_alloc(size);
705 	if (!clhash->hash)
706 		return -ENOMEM;
707 	clhash->hashsize  = size;
708 	clhash->hashmask  = size - 1;
709 	clhash->hashelems = 0;
710 	return 0;
711 }
712 EXPORT_SYMBOL(qdisc_class_hash_init);
713 
714 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
715 {
716 	kvfree(clhash->hash);
717 }
718 EXPORT_SYMBOL(qdisc_class_hash_destroy);
719 
720 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
721 			     struct Qdisc_class_common *cl)
722 {
723 	unsigned int h;
724 
725 	INIT_HLIST_NODE(&cl->hnode);
726 	h = qdisc_class_hash(cl->classid, clhash->hashmask);
727 	hlist_add_head(&cl->hnode, &clhash->hash[h]);
728 	clhash->hashelems++;
729 }
730 EXPORT_SYMBOL(qdisc_class_hash_insert);
731 
732 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
733 			     struct Qdisc_class_common *cl)
734 {
735 	hlist_del(&cl->hnode);
736 	clhash->hashelems--;
737 }
738 EXPORT_SYMBOL(qdisc_class_hash_remove);
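/* Illustrative sketch (hypothetical example_* names): the life cycle of
 * the class-hash helpers above inside a classful qdisc such as sch_htb.c.
 * Each class embeds a struct Qdisc_class_common carrying its classid and
 * hash node.
 */
struct example_class {
	struct Qdisc_class_common common;
	/* ... scheduler-specific per-class state ... */
};

static void example_class_add(struct Qdisc *sch,
			      struct Qdisc_class_hash *clhash,
			      struct example_class *cl, u32 classid)
{
	/* clhash was set up with qdisc_class_hash_init() in ->init() */
	cl->common.classid = classid;
	qdisc_class_hash_insert(clhash, &cl->common);
	qdisc_class_hash_grow(sch, clhash);	/* rehash above 75% load */
}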
739 
740 /* Allocate a unique handle from the space managed by the kernel.
741  * The possible range is [8000-FFFF]:0000 (0x8000 values)
742  */
743 static u32 qdisc_alloc_handle(struct net_device *dev)
744 {
745 	int i = 0x8000;
746 	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
747 
748 	do {
749 		autohandle += TC_H_MAKE(0x10000U, 0);
750 		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
751 			autohandle = TC_H_MAKE(0x80000000U, 0);
752 		if (!qdisc_lookup(dev, autohandle))
753 			return autohandle;
754 		cond_resched();
755 	} while	(--i > 0);
756 
757 	return 0;
758 }
759 
760 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
761 {
762 	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
763 	const struct Qdisc_class_ops *cops;
764 	unsigned long cl;
765 	u32 parentid;
766 	bool notify;
767 	int drops;
768 
769 	if (n == 0 && len == 0)
770 		return;
771 	drops = max_t(int, n, 0);
772 	rcu_read_lock();
773 	while ((parentid = sch->parent)) {
774 		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
775 			break;
776 
777 		if (sch->flags & TCQ_F_NOPARENT)
778 			break;
779 		/* Notify the parent qdisc only if the child qdisc becomes empty.
780 		 *
781 		 * If the child was empty even before the update then the backlog
782 		 * counter is broken and we skip the notification because the
783 		 * parent class is already passive.
784 		 *
785 		 * If the original child was offloaded then it is allowed
786 		 * to be seen as empty, so the parent is notified anyway.
787 		 */
788 		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
789 						       !qdisc_is_offloaded);
790 		/* TODO: perform the search on a per txq basis */
791 		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
792 		if (sch == NULL) {
793 			WARN_ON_ONCE(parentid != TC_H_ROOT);
794 			break;
795 		}
796 		cops = sch->ops->cl_ops;
797 		if (notify && cops->qlen_notify) {
798 			cl = cops->find(sch, parentid);
799 			cops->qlen_notify(sch, cl);
800 		}
801 		sch->q.qlen -= n;
802 		sch->qstats.backlog -= len;
803 		__qdisc_qstats_drop(sch, drops);
804 	}
805 	rcu_read_unlock();
806 }
807 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
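/* Illustrative sketch (hypothetical example_* name): a qdisc dropping a
 * queued packet outside the normal dequeue path (from a timer, or from
 * ->change() shrinking a limit) fixes its own counters and then calls
 * qdisc_tree_reduce_backlog() so ancestor qlen/backlog stay consistent.
 */
static void example_drop_one(struct Qdisc *sch, struct sk_buff *skb)
{
	unsigned int len = qdisc_pkt_len(skb);

	qdisc_qstats_backlog_dec(sch, skb);
	sch->q.qlen--;
	kfree_skb(skb);
	qdisc_tree_reduce_backlog(sch, 1, len);	/* one packet, len bytes */
}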
808 
809 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
810 			      void *type_data)
811 {
812 	struct net_device *dev = qdisc_dev(sch);
813 	int err;
814 
815 	sch->flags &= ~TCQ_F_OFFLOADED;
816 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
817 		return 0;
818 
819 	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
820 	if (err == -EOPNOTSUPP)
821 		return 0;
822 
823 	if (!err)
824 		sch->flags |= TCQ_F_OFFLOADED;
825 
826 	return err;
827 }
828 EXPORT_SYMBOL(qdisc_offload_dump_helper);
829 
830 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
831 				struct Qdisc *new, struct Qdisc *old,
832 				enum tc_setup_type type, void *type_data,
833 				struct netlink_ext_ack *extack)
834 {
835 	bool any_qdisc_is_offloaded;
836 	int err;
837 
838 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
839 		return;
840 
841 	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
842 
843 	/* Don't report error if the graft is part of a destroy operation. */
844 	if (!err || !new || new == &noop_qdisc)
845 		return;
846 
847 	/* Don't report error if the parent, the old child and the new
848 	 * one are not offloaded.
849 	 */
850 	any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
851 	any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
852 	any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
853 
854 	if (any_qdisc_is_offloaded)
855 		NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
856 }
857 EXPORT_SYMBOL(qdisc_offload_graft_helper);
858 
859 static void qdisc_offload_graft_root(struct net_device *dev,
860 				     struct Qdisc *new, struct Qdisc *old,
861 				     struct netlink_ext_ack *extack)
862 {
863 	struct tc_root_qopt_offload graft_offload = {
864 		.command	= TC_ROOT_GRAFT,
865 		.handle		= new ? new->handle : 0,
866 		.ingress	= (new && new->flags & TCQ_F_INGRESS) ||
867 				  (old && old->flags & TCQ_F_INGRESS),
868 	};
869 
870 	qdisc_offload_graft_helper(dev, NULL, new, old,
871 				   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
872 }
873 
874 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
875 			 u32 portid, u32 seq, u16 flags, int event)
876 {
877 	struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
878 	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
879 	struct tcmsg *tcm;
880 	struct nlmsghdr  *nlh;
881 	unsigned char *b = skb_tail_pointer(skb);
882 	struct gnet_dump d;
883 	struct qdisc_size_table *stab;
884 	u32 block_index;
885 	__u32 qlen;
886 
887 	cond_resched();
888 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
889 	if (!nlh)
890 		goto out_nlmsg_trim;
891 	tcm = nlmsg_data(nlh);
892 	tcm->tcm_family = AF_UNSPEC;
893 	tcm->tcm__pad1 = 0;
894 	tcm->tcm__pad2 = 0;
895 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
896 	tcm->tcm_parent = clid;
897 	tcm->tcm_handle = q->handle;
898 	tcm->tcm_info = refcount_read(&q->refcnt);
899 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
900 		goto nla_put_failure;
901 	if (q->ops->ingress_block_get) {
902 		block_index = q->ops->ingress_block_get(q);
903 		if (block_index &&
904 		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
905 			goto nla_put_failure;
906 	}
907 	if (q->ops->egress_block_get) {
908 		block_index = q->ops->egress_block_get(q);
909 		if (block_index &&
910 		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
911 			goto nla_put_failure;
912 	}
913 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
914 		goto nla_put_failure;
915 	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
916 		goto nla_put_failure;
917 	qlen = qdisc_qlen_sum(q);
918 
919 	stab = rtnl_dereference(q->stab);
920 	if (stab && qdisc_dump_stab(skb, stab) < 0)
921 		goto nla_put_failure;
922 
923 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
924 					 NULL, &d, TCA_PAD) < 0)
925 		goto nla_put_failure;
926 
927 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
928 		goto nla_put_failure;
929 
930 	if (qdisc_is_percpu_stats(q)) {
931 		cpu_bstats = q->cpu_bstats;
932 		cpu_qstats = q->cpu_qstats;
933 	}
934 
935 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
936 				  &d, cpu_bstats, &q->bstats) < 0 ||
937 	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
938 	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
939 		goto nla_put_failure;
940 
941 	if (gnet_stats_finish_copy(&d) < 0)
942 		goto nla_put_failure;
943 
944 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
945 	return skb->len;
946 
947 out_nlmsg_trim:
948 nla_put_failure:
949 	nlmsg_trim(skb, b);
950 	return -1;
951 }
952 
953 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
954 {
955 	if (q->flags & TCQ_F_BUILTIN)
956 		return true;
957 	if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
958 		return true;
959 
960 	return false;
961 }
962 
963 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
964 			struct nlmsghdr *n, u32 clid,
965 			struct Qdisc *old, struct Qdisc *new)
966 {
967 	struct sk_buff *skb;
968 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
969 
970 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
971 	if (!skb)
972 		return -ENOBUFS;
973 
974 	if (old && !tc_qdisc_dump_ignore(old, false)) {
975 		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
976 				  0, RTM_DELQDISC) < 0)
977 			goto err_out;
978 	}
979 	if (new && !tc_qdisc_dump_ignore(new, false)) {
980 		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
981 				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
982 			goto err_out;
983 	}
984 
985 	if (skb->len)
986 		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
987 				      n->nlmsg_flags & NLM_F_ECHO);
988 
989 err_out:
990 	kfree_skb(skb);
991 	return -EINVAL;
992 }
993 
994 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
995 			       struct nlmsghdr *n, u32 clid,
996 			       struct Qdisc *old, struct Qdisc *new)
997 {
998 	if (new || old)
999 		qdisc_notify(net, skb, n, clid, old, new);
1000 
1001 	if (old)
1002 		qdisc_put(old);
1003 }
1004 
1005 static void qdisc_clear_nolock(struct Qdisc *sch)
1006 {
1007 	sch->flags &= ~TCQ_F_NOLOCK;
1008 	if (!(sch->flags & TCQ_F_CPUSTATS))
1009 		return;
1010 
1011 	free_percpu(sch->cpu_bstats);
1012 	free_percpu(sch->cpu_qstats);
1013 	sch->cpu_bstats = NULL;
1014 	sch->cpu_qstats = NULL;
1015 	sch->flags &= ~TCQ_F_CPUSTATS;
1016 }
1017 
1018 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1019  * to device "dev".
1020  *
1021  * When appropriate, send a netlink notification using "skb"
1022  * and "n".
1023  *
1024  * On success, destroy the old qdisc.
1025  */
1026 
1027 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1028 		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1029 		       struct Qdisc *new, struct Qdisc *old,
1030 		       struct netlink_ext_ack *extack)
1031 {
1032 	struct Qdisc *q = old;
1033 	struct net *net = dev_net(dev);
1034 
1035 	if (parent == NULL) {
1036 		unsigned int i, num_q, ingress;
1037 
1038 		ingress = 0;
1039 		num_q = dev->num_tx_queues;
1040 		if ((q && q->flags & TCQ_F_INGRESS) ||
1041 		    (new && new->flags & TCQ_F_INGRESS)) {
1042 			num_q = 1;
1043 			ingress = 1;
1044 			if (!dev_ingress_queue(dev)) {
1045 				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1046 				return -ENOENT;
1047 			}
1048 		}
1049 
1050 		if (dev->flags & IFF_UP)
1051 			dev_deactivate(dev);
1052 
1053 		qdisc_offload_graft_root(dev, new, old, extack);
1054 
1055 		if (new && new->ops->attach)
1056 			goto skip;
1057 
1058 		for (i = 0; i < num_q; i++) {
1059 			struct netdev_queue *dev_queue = dev_ingress_queue(dev);
1060 
1061 			if (!ingress)
1062 				dev_queue = netdev_get_tx_queue(dev, i);
1063 
1064 			old = dev_graft_qdisc(dev_queue, new);
1065 			if (new && i > 0)
1066 				qdisc_refcount_inc(new);
1067 
1068 			if (!ingress)
1069 				qdisc_put(old);
1070 		}
1071 
1072 skip:
1073 		if (!ingress) {
1074 			old = rtnl_dereference(dev->qdisc);
1075 			if (new && !new->ops->attach)
1076 				qdisc_refcount_inc(new);
1077 			rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);
1078 
1079 			notify_and_destroy(net, skb, n, classid, old, new);
1080 
1081 			if (new && new->ops->attach)
1082 				new->ops->attach(new);
1083 		} else {
1084 			notify_and_destroy(net, skb, n, classid, old, new);
1085 		}
1086 
1087 		if (dev->flags & IFF_UP)
1088 			dev_activate(dev);
1089 	} else {
1090 		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1091 		unsigned long cl;
1092 		int err;
1093 
1094 		/* Only support running class lockless if parent is lockless */
1095 		if (new && (new->flags & TCQ_F_NOLOCK) &&
1096 		    parent && !(parent->flags & TCQ_F_NOLOCK))
1097 			qdisc_clear_nolock(new);
1098 
1099 		if (!cops || !cops->graft)
1100 			return -EOPNOTSUPP;
1101 
1102 		cl = cops->find(parent, classid);
1103 		if (!cl) {
1104 			NL_SET_ERR_MSG(extack, "Specified class not found");
1105 			return -ENOENT;
1106 		}
1107 
1108 		if (new && new->ops == &noqueue_qdisc_ops) {
1109 			NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
1110 			return -EINVAL;
1111 		}
1112 
1113 		err = cops->graft(parent, cl, new, &old, extack);
1114 		if (err)
1115 			return err;
1116 		notify_and_destroy(net, skb, n, classid, old, new);
1117 	}
1118 	return 0;
1119 }
1120 
1121 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1122 				   struct netlink_ext_ack *extack)
1123 {
1124 	u32 block_index;
1125 
1126 	if (tca[TCA_INGRESS_BLOCK]) {
1127 		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1128 
1129 		if (!block_index) {
1130 			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1131 			return -EINVAL;
1132 		}
1133 		if (!sch->ops->ingress_block_set) {
1134 			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1135 			return -EOPNOTSUPP;
1136 		}
1137 		sch->ops->ingress_block_set(sch, block_index);
1138 	}
1139 	if (tca[TCA_EGRESS_BLOCK]) {
1140 		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1141 
1142 		if (!block_index) {
1143 			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1144 			return -EINVAL;
1145 		}
1146 		if (!sch->ops->egress_block_set) {
1147 			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1148 			return -EOPNOTSUPP;
1149 		}
1150 		sch->ops->egress_block_set(sch, block_index);
1151 	}
1152 	return 0;
1153 }
1154 
1155 /*
1156    Allocate and initialize a new qdisc.
1157 
1158    Parameters are passed via opt.
1159  */
1160 
1161 static struct Qdisc *qdisc_create(struct net_device *dev,
1162 				  struct netdev_queue *dev_queue,
1163 				  struct Qdisc *p, u32 parent, u32 handle,
1164 				  struct nlattr **tca, int *errp,
1165 				  struct netlink_ext_ack *extack)
1166 {
1167 	int err;
1168 	struct nlattr *kind = tca[TCA_KIND];
1169 	struct Qdisc *sch;
1170 	struct Qdisc_ops *ops;
1171 	struct qdisc_size_table *stab;
1172 
1173 	ops = qdisc_lookup_ops(kind);
1174 #ifdef CONFIG_MODULES
1175 	if (ops == NULL && kind != NULL) {
1176 		char name[IFNAMSIZ];
1177 		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1178 			/* We dropped the RTNL semaphore in order to
1179 			 * perform the module load.  So, even if we
1180 			 * succeeded in loading the module we have to
1181 			 * tell the caller to replay the request.  We
1182 			 * indicate this using -EAGAIN.
1183 			 * We replay the request because the device may
1184 			 * go away in the meantime.
1185 			 */
1186 			rtnl_unlock();
1187 			request_module("sch_%s", name);
1188 			rtnl_lock();
1189 			ops = qdisc_lookup_ops(kind);
1190 			if (ops != NULL) {
1191 				/* We will try qdisc_lookup_ops again,
1192 				 * so don't keep a reference.
1193 				 */
1194 				module_put(ops->owner);
1195 				err = -EAGAIN;
1196 				goto err_out;
1197 			}
1198 		}
1199 	}
1200 #endif
1201 
1202 	err = -ENOENT;
1203 	if (!ops) {
1204 		NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
1205 		goto err_out;
1206 	}
1207 
1208 	sch = qdisc_alloc(dev_queue, ops, extack);
1209 	if (IS_ERR(sch)) {
1210 		err = PTR_ERR(sch);
1211 		goto err_out2;
1212 	}
1213 
1214 	sch->parent = parent;
1215 
1216 	if (handle == TC_H_INGRESS) {
1217 		if (!(sch->flags & TCQ_F_INGRESS)) {
1218 			NL_SET_ERR_MSG(extack,
1219 				       "Specified parent ID is reserved for ingress and clsact Qdiscs");
1220 			err = -EINVAL;
1221 			goto err_out3;
1222 		}
1223 		handle = TC_H_MAKE(TC_H_INGRESS, 0);
1224 	} else {
1225 		if (handle == 0) {
1226 			handle = qdisc_alloc_handle(dev);
1227 			if (handle == 0) {
1228 				NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1229 				err = -ENOSPC;
1230 				goto err_out3;
1231 			}
1232 		}
1233 		if (!netif_is_multiqueue(dev))
1234 			sch->flags |= TCQ_F_ONETXQUEUE;
1235 	}
1236 
1237 	sch->handle = handle;
1238 
1239 	/* This exists to keep backward compatibility with a userspace
1240 	 * loophole that allowed userspace to get the IFF_NO_QUEUE
1241 	 * facility on older kernels by setting tx_queue_len=0 (prior
1242 	 * to qdisc init), and then forgetting to reinit tx_queue_len
1243 	 * before attaching a qdisc again.
1244 	 */
1245 	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1246 		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1247 		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1248 	}
1249 
1250 	err = qdisc_block_indexes_set(sch, tca, extack);
1251 	if (err)
1252 		goto err_out3;
1253 
1254 	if (ops->init) {
1255 		err = ops->init(sch, tca[TCA_OPTIONS], extack);
1256 		if (err != 0)
1257 			goto err_out5;
1258 	}
1259 
1260 	if (tca[TCA_STAB]) {
1261 		stab = qdisc_get_stab(tca[TCA_STAB], extack);
1262 		if (IS_ERR(stab)) {
1263 			err = PTR_ERR(stab);
1264 			goto err_out4;
1265 		}
1266 		rcu_assign_pointer(sch->stab, stab);
1267 	}
1268 	if (tca[TCA_RATE]) {
1269 		seqcount_t *running;
1270 
1271 		err = -EOPNOTSUPP;
1272 		if (sch->flags & TCQ_F_MQROOT) {
1273 			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1274 			goto err_out4;
1275 		}
1276 
1277 		if (sch->parent != TC_H_ROOT &&
1278 		    !(sch->flags & TCQ_F_INGRESS) &&
1279 		    (!p || !(p->flags & TCQ_F_MQROOT)))
1280 			running = qdisc_root_sleeping_running(sch);
1281 		else
1282 			running = &sch->running;
1283 
1284 		err = gen_new_estimator(&sch->bstats,
1285 					sch->cpu_bstats,
1286 					&sch->rate_est,
1287 					NULL,
1288 					running,
1289 					tca[TCA_RATE]);
1290 		if (err) {
1291 			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1292 			goto err_out4;
1293 		}
1294 	}
1295 
1296 	qdisc_hash_add(sch, false);
1297 
1298 	return sch;
1299 
1300 err_out5:
1301 	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1302 	if (ops->destroy)
1303 		ops->destroy(sch);
1304 err_out3:
1305 	dev_put(dev);
1306 	qdisc_free(sch);
1307 err_out2:
1308 	module_put(ops->owner);
1309 err_out:
1310 	*errp = err;
1311 	return NULL;
1312 
1313 err_out4:
1314 	/*
1315 	 * Any broken qdiscs that would require an ops->reset() here?
1316 	 * The qdisc was never in action so it shouldn't be necessary.
1317 	 */
1318 	qdisc_put_stab(rtnl_dereference(sch->stab));
1319 	if (ops->destroy)
1320 		ops->destroy(sch);
1321 	goto err_out3;
1322 }
1323 
1324 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1325 			struct netlink_ext_ack *extack)
1326 {
1327 	struct qdisc_size_table *ostab, *stab = NULL;
1328 	int err = 0;
1329 
1330 	if (tca[TCA_OPTIONS]) {
1331 		if (!sch->ops->change) {
1332 			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1333 			return -EINVAL;
1334 		}
1335 		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1336 			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1337 			return -EOPNOTSUPP;
1338 		}
1339 		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1340 		if (err)
1341 			return err;
1342 	}
1343 
1344 	if (tca[TCA_STAB]) {
1345 		stab = qdisc_get_stab(tca[TCA_STAB], extack);
1346 		if (IS_ERR(stab))
1347 			return PTR_ERR(stab);
1348 	}
1349 
1350 	ostab = rtnl_dereference(sch->stab);
1351 	rcu_assign_pointer(sch->stab, stab);
1352 	qdisc_put_stab(ostab);
1353 
1354 	if (tca[TCA_RATE]) {
1355 		/* NB: ignores errors from replace_estimator
1356 		   because change can't be undone. */
1357 		if (sch->flags & TCQ_F_MQROOT)
1358 			goto out;
1359 		gen_replace_estimator(&sch->bstats,
1360 				      sch->cpu_bstats,
1361 				      &sch->rate_est,
1362 				      NULL,
1363 				      qdisc_root_sleeping_running(sch),
1364 				      tca[TCA_RATE]);
1365 	}
1366 out:
1367 	return 0;
1368 }
1369 
1370 struct check_loop_arg {
1371 	struct qdisc_walker	w;
1372 	struct Qdisc		*p;
1373 	int			depth;
1374 };
1375 
1376 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1377 			 struct qdisc_walker *w);
1378 
1379 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1380 {
1381 	struct check_loop_arg	arg;
1382 
1383 	if (q->ops->cl_ops == NULL)
1384 		return 0;
1385 
1386 	arg.w.stop = arg.w.skip = arg.w.count = 0;
1387 	arg.w.fn = check_loop_fn;
1388 	arg.depth = depth;
1389 	arg.p = p;
1390 	q->ops->cl_ops->walk(q, &arg.w);
1391 	return arg.w.stop ? -ELOOP : 0;
1392 }
1393 
1394 static int
1395 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1396 {
1397 	struct Qdisc *leaf;
1398 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1399 	struct check_loop_arg *arg = (struct check_loop_arg *)w;
1400 
1401 	leaf = cops->leaf(q, cl);
1402 	if (leaf) {
1403 		if (leaf == arg->p || arg->depth > 7)
1404 			return -ELOOP;
1405 		return check_loop(leaf, arg->p, arg->depth + 1);
1406 	}
1407 	return 0;
1408 }
1409 
1410 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1411 	[TCA_KIND]		= { .type = NLA_STRING },
1412 	[TCA_RATE]		= { .type = NLA_BINARY,
1413 				    .len = sizeof(struct tc_estimator) },
1414 	[TCA_STAB]		= { .type = NLA_NESTED },
1415 	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
1416 	[TCA_CHAIN]		= { .type = NLA_U32 },
1417 	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
1418 	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
1419 };
1420 
1421 /*
1422  * Delete/get qdisc.
1423  */
1424 
1425 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1426 			struct netlink_ext_ack *extack)
1427 {
1428 	struct net *net = sock_net(skb->sk);
1429 	struct tcmsg *tcm = nlmsg_data(n);
1430 	struct nlattr *tca[TCA_MAX + 1];
1431 	struct net_device *dev;
1432 	u32 clid;
1433 	struct Qdisc *q = NULL;
1434 	struct Qdisc *p = NULL;
1435 	int err;
1436 
1437 	if ((n->nlmsg_type != RTM_GETQDISC) &&
1438 	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1439 		return -EPERM;
1440 
1441 	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1442 				     rtm_tca_policy, extack);
1443 	if (err < 0)
1444 		return err;
1445 
1446 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1447 	if (!dev)
1448 		return -ENODEV;
1449 
1450 	clid = tcm->tcm_parent;
1451 	if (clid) {
1452 		if (clid != TC_H_ROOT) {
1453 			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1454 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1455 				if (!p) {
1456 					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1457 					return -ENOENT;
1458 				}
1459 				q = qdisc_leaf(p, clid);
1460 			} else if (dev_ingress_queue(dev)) {
1461 				q = dev_ingress_queue(dev)->qdisc_sleeping;
1462 			}
1463 		} else {
1464 			q = rtnl_dereference(dev->qdisc);
1465 		}
1466 		if (!q) {
1467 			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1468 			return -ENOENT;
1469 		}
1470 
1471 		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1472 			NL_SET_ERR_MSG(extack, "Invalid handle");
1473 			return -EINVAL;
1474 		}
1475 	} else {
1476 		q = qdisc_lookup(dev, tcm->tcm_handle);
1477 		if (!q) {
1478 			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1479 			return -ENOENT;
1480 		}
1481 	}
1482 
1483 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1484 		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1485 		return -EINVAL;
1486 	}
1487 
1488 	if (n->nlmsg_type == RTM_DELQDISC) {
1489 		if (!clid) {
1490 			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1491 			return -EINVAL;
1492 		}
1493 		if (q->handle == 0) {
1494 			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1495 			return -ENOENT;
1496 		}
1497 		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1498 		if (err != 0)
1499 			return err;
1500 	} else {
1501 		qdisc_notify(net, skb, n, clid, NULL, q);
1502 	}
1503 	return 0;
1504 }
1505 
1506 static bool req_create_or_replace(struct nlmsghdr *n)
1507 {
1508 	return (n->nlmsg_flags & NLM_F_CREATE &&
1509 		n->nlmsg_flags & NLM_F_REPLACE);
1510 }
1511 
1512 static bool req_create_exclusive(struct nlmsghdr *n)
1513 {
1514 	return (n->nlmsg_flags & NLM_F_CREATE &&
1515 		n->nlmsg_flags & NLM_F_EXCL);
1516 }
1517 
1518 static bool req_change(struct nlmsghdr *n)
1519 {
1520 	return (!(n->nlmsg_flags & NLM_F_CREATE) &&
1521 		!(n->nlmsg_flags & NLM_F_REPLACE) &&
1522 		!(n->nlmsg_flags & NLM_F_EXCL));
1523 }
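/* For orientation: iproute2 maps its tc subcommands onto these netlink
 * flag tests roughly as follows:
 *
 *	tc qdisc add	 -> NLM_F_CREATE | NLM_F_EXCL	 (req_create_exclusive)
 *	tc qdisc replace -> NLM_F_CREATE | NLM_F_REPLACE (req_create_or_replace)
 *	tc qdisc change	 -> none of the three flags	 (req_change)
 */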
1524 
1525 /*
1526  * Create/change qdisc.
1527  */
1528 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1529 			   struct netlink_ext_ack *extack)
1530 {
1531 	struct net *net = sock_net(skb->sk);
1532 	struct tcmsg *tcm;
1533 	struct nlattr *tca[TCA_MAX + 1];
1534 	struct net_device *dev;
1535 	u32 clid;
1536 	struct Qdisc *q, *p;
1537 	int err;
1538 
1539 	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1540 		return -EPERM;
1541 
1542 replay:
1543 	/* Reinit, just in case something touches this. */
1544 	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1545 				     rtm_tca_policy, extack);
1546 	if (err < 0)
1547 		return err;
1548 
1549 	tcm = nlmsg_data(n);
1550 	clid = tcm->tcm_parent;
1551 	q = p = NULL;
1552 
1553 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1554 	if (!dev)
1555 		return -ENODEV;
1556 
1557 
1558 	if (clid) {
1559 		if (clid != TC_H_ROOT) {
1560 			if (clid != TC_H_INGRESS) {
1561 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1562 				if (!p) {
1563 					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1564 					return -ENOENT;
1565 				}
1566 				q = qdisc_leaf(p, clid);
1567 			} else if (dev_ingress_queue_create(dev)) {
1568 				q = dev_ingress_queue(dev)->qdisc_sleeping;
1569 			}
1570 		} else {
1571 			q = rtnl_dereference(dev->qdisc);
1572 		}
1573 
1574 		/* It may be the default qdisc; ignore it */
1575 		if (q && q->handle == 0)
1576 			q = NULL;
1577 
1578 		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1579 			if (tcm->tcm_handle) {
1580 				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1581 					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1582 					return -EEXIST;
1583 				}
1584 				if (TC_H_MIN(tcm->tcm_handle)) {
1585 					NL_SET_ERR_MSG(extack, "Invalid minor handle");
1586 					return -EINVAL;
1587 				}
1588 				q = qdisc_lookup(dev, tcm->tcm_handle);
1589 				if (!q)
1590 					goto create_n_graft;
1591 				if (n->nlmsg_flags & NLM_F_EXCL) {
1592 					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1593 					return -EEXIST;
1594 				}
1595 				if (tca[TCA_KIND] &&
1596 				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1597 					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1598 					return -EINVAL;
1599 				}
1600 				if (q->flags & TCQ_F_INGRESS) {
1601 					NL_SET_ERR_MSG(extack,
1602 						       "Cannot regraft ingress or clsact Qdiscs");
1603 					return -EINVAL;
1604 				}
1605 				if (q == p ||
1606 				    (p && check_loop(q, p, 0))) {
1607 					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1608 					return -ELOOP;
1609 				}
1610 				if (clid == TC_H_INGRESS) {
1611 					NL_SET_ERR_MSG(extack, "Ingress cannot graft directly");
1612 					return -EINVAL;
1613 				}
1614 				qdisc_refcount_inc(q);
1615 				goto graft;
1616 			} else {
1617 				if (!q)
1618 					goto create_n_graft;
1619 
1620 				/* This magic test requires explanation.
1621 				 *
1622 				 *   We know that some child q is already
1623 				 *   attached to this parent and we have a choice:
1624 				 *   1) change it or 2) create/graft a new one.
1625 				 *   If the requested qdisc kind is different
1626 				 *   from the existing one, then we choose graft.
1627 				 *   If they are the same then this is a "change"
1628 				 *   operation - just let it fall through..
1629 				 *
1630 				 *   1. We are allowed to create/graft only
1631 				 *   if the request explicitly states
1632 				 *   "please create if it doesn't exist".
1633 				 *
1634 				 *   2. If the request is an exclusive create
1635 				 *   then the qdisc tcm_handle is not expected
1636 				 *   to exist, so we choose create/graft too.
1637 				 *
1638 				 *   3. The last case is when no flags are set.
1639 				 *   This will happen when, for example, the tc
1640 				 *   utility issues a "change" command.
1641 				 *   Alas, it is a sort of hole in the API; we
1642 				 *   cannot decide what to do unambiguously.
1643 				 *   For now we select create/graft.
1644 				 */
1645 				if (tca[TCA_KIND] &&
1646 				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1647 					if (req_create_or_replace(n) ||
1648 					    req_create_exclusive(n))
1649 						goto create_n_graft;
1650 					else if (req_change(n))
1651 						goto create_n_graft2;
1652 				}
1653 			}
1654 		}
1655 	} else {
1656 		if (!tcm->tcm_handle) {
1657 			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1658 			return -EINVAL;
1659 		}
1660 		q = qdisc_lookup(dev, tcm->tcm_handle);
1661 	}
1662 
1663 	/* Change qdisc parameters */
1664 	if (!q) {
1665 		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1666 		return -ENOENT;
1667 	}
1668 	if (n->nlmsg_flags & NLM_F_EXCL) {
1669 		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1670 		return -EEXIST;
1671 	}
1672 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1673 		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1674 		return -EINVAL;
1675 	}
1676 	err = qdisc_change(q, tca, extack);
1677 	if (err == 0)
1678 		qdisc_notify(net, skb, n, clid, NULL, q);
1679 	return err;
1680 
1681 create_n_graft:
1682 	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1683 		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1684 		return -ENOENT;
1685 	}
1686 create_n_graft2:
1687 	if (clid == TC_H_INGRESS) {
1688 		if (dev_ingress_queue(dev)) {
1689 			q = qdisc_create(dev, dev_ingress_queue(dev), p,
1690 					 tcm->tcm_parent, tcm->tcm_parent,
1691 					 tca, &err, extack);
1692 		} else {
1693 			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1694 			err = -ENOENT;
1695 		}
1696 	} else {
1697 		struct netdev_queue *dev_queue;
1698 
1699 		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1700 			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1701 		else if (p)
1702 			dev_queue = p->dev_queue;
1703 		else
1704 			dev_queue = netdev_get_tx_queue(dev, 0);
1705 
1706 		q = qdisc_create(dev, dev_queue, p,
1707 				 tcm->tcm_parent, tcm->tcm_handle,
1708 				 tca, &err, extack);
1709 	}
1710 	if (q == NULL) {
1711 		if (err == -EAGAIN)
1712 			goto replay;
1713 		return err;
1714 	}
1715 
1716 graft:
1717 	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1718 	if (err) {
1719 		if (q)
1720 			qdisc_put(q);
1721 		return err;
1722 	}
1723 
1724 	return 0;
1725 }
1726 
1727 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1728 			      struct netlink_callback *cb,
1729 			      int *q_idx_p, int s_q_idx, bool recur,
1730 			      bool dump_invisible)
1731 {
1732 	int ret = 0, q_idx = *q_idx_p;
1733 	struct Qdisc *q;
1734 	int b;
1735 
1736 	if (!root)
1737 		return 0;
1738 
1739 	q = root;
1740 	if (q_idx < s_q_idx) {
1741 		q_idx++;
1742 	} else {
1743 		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1744 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1745 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1746 				  RTM_NEWQDISC) <= 0)
1747 			goto done;
1748 		q_idx++;
1749 	}
1750 
1751 	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
1752 	 * itself has already been dumped.
1753 	 *
1754 	 * If we've already dumped the top-level (ingress) qdisc above and the global
1755 	 * qdisc hashtable, we don't want to hit it again
1756 	 */
1757 	if (!qdisc_dev(root) || !recur)
1758 		goto out;
1759 
1760 	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1761 		if (q_idx < s_q_idx) {
1762 			q_idx++;
1763 			continue;
1764 		}
1765 		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1766 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1767 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1768 				  RTM_NEWQDISC) <= 0)
1769 			goto done;
1770 		q_idx++;
1771 	}
1772 
1773 out:
1774 	*q_idx_p = q_idx;
1775 	return ret;
1776 done:
1777 	ret = -1;
1778 	goto out;
1779 }
1780 
1781 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1782 {
1783 	struct net *net = sock_net(skb->sk);
1784 	int idx, q_idx;
1785 	int s_idx, s_q_idx;
1786 	struct net_device *dev;
1787 	const struct nlmsghdr *nlh = cb->nlh;
1788 	struct nlattr *tca[TCA_MAX + 1];
1789 	int err;
1790 
1791 	s_idx = cb->args[0];
1792 	s_q_idx = q_idx = cb->args[1];
1793 
1794 	idx = 0;
1795 	ASSERT_RTNL();
1796 
1797 	err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1798 				     rtm_tca_policy, cb->extack);
1799 	if (err < 0)
1800 		return err;
1801 
1802 	for_each_netdev(net, dev) {
1803 		struct netdev_queue *dev_queue;
1804 
1805 		if (idx < s_idx)
1806 			goto cont;
1807 		if (idx > s_idx)
1808 			s_q_idx = 0;
1809 		q_idx = 0;
1810 
1811 		if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
1812 				       skb, cb, &q_idx, s_q_idx,
1813 				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
1814 			goto done;
1815 
1816 		dev_queue = dev_ingress_queue(dev);
1817 		if (dev_queue &&
1818 		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1819 				       &q_idx, s_q_idx, false,
1820 				       tca[TCA_DUMP_INVISIBLE]) < 0)
1821 			goto done;
1822 
1823 cont:
1824 		idx++;
1825 	}
1826 
1827 done:
1828 	cb->args[0] = idx;
1829 	cb->args[1] = q_idx;
1830 
1831 	return skb->len;
1832 }
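
/*
 * Resume protocol, sketched for illustration: cb->args[0] counts devices
 * fully dumped, cb->args[1] counts qdiscs dumped on the current device.
 * If the skb fills up mid-walk, e.g. at device 7, qdisc 3, the dump stops
 * with args = {7, 3} and the next netlink callback skips straight back to
 * that point, so "tc qdisc show" may need several recvmsg() round trips
 * on a host with many devices.
 */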


/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/

static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
	int err = 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			     n->nlmsg_flags & NLM_F_ECHO);
	if (err > 0)
		err = 0;
	return err;
}
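
/*
 * Aside (illustrative): besides echoing to the requesting socket when
 * NLM_F_ECHO is set, the message is multicast to RTNLGRP_TC, the group a
 * command such as "tc monitor" subscribes to in order to watch class and
 * qdisc changes as they happen.
 */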

static int tclass_del_notify(struct net *net,
			     const struct Qdisc_class_ops *cops,
			     struct sk_buff *oskb, struct nlmsghdr *n,
			     struct Qdisc *q, unsigned long cl)
{
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
	struct sk_buff *skb;
	int err = 0;

	if (!cops->delete)
		return -EOPNOTSUPP;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
			   RTM_DELTCLASS) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	err = cops->delete(q, cl);
	if (err) {
		kfree_skb(skb);
		return err;
	}

	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			     n->nlmsg_flags & NLM_F_ECHO);
	if (err > 0)
		err = 0;
	return err;
}

#ifdef CONFIG_NET_CLS

struct tcf_bind_args {
	struct tcf_walker w;
	unsigned long base;
	unsigned long cl;
	u32 classid;
};

static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
{
	struct tcf_bind_args *a = (void *)arg;

	if (tp->ops->bind_class) {
		struct Qdisc *q = tcf_block_q(tp->chain->block);

		sch_tree_lock(q);
		tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
		sch_tree_unlock(q);
	}
	return 0;
}

struct tc_bind_class_args {
	struct qdisc_walker w;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
};

static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
				struct qdisc_walker *w)
{
	struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tcf_block *block;
	struct tcf_chain *chain;

	block = cops->tcf_block(q, cl, NULL);
	if (!block)
		return 0;
	for (chain = tcf_get_next_chain(block, NULL);
	     chain;
	     chain = tcf_get_next_chain(block, chain)) {
		struct tcf_proto *tp;

		for (tp = tcf_get_next_proto(chain, NULL, true);
		     tp; tp = tcf_get_next_proto(chain, tp, true)) {
			struct tcf_bind_args arg = {};

			arg.w.fn = tcf_node_bind;
			arg.classid = a->clid;
			arg.base = cl;
			arg.cl = a->new_cl;
			tp->ops->walk(tp, &arg.w, true);
		}
	}

	return 0;
}

static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tc_bind_class_args args = {};

	if (!cops->tcf_block)
		return;
	args.portid = portid;
	args.clid = clid;
	args.new_cl = new_cl;
	args.w.fn = tc_bind_class_walker;
	q->ops->cl_ops->walk(q, &args.w);
}

#else

static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}

#endif
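
/*
 * Illustrative example (assumed userspace state, not from this file): if a
 * filter was installed with "tc filter add ... flowid 1:10" and class 1:10
 * is later deleted, tc_bind_tclass(q, portid, clid, 0) walks every filter
 * on the qdisc and rebinds those pointing at 1:10 to classid 0, so no
 * filter keeps a reference to the dead class.
 */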

static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	if ((n->nlmsg_type != RTM_GETTCLASS) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - class is X:Y, fully specified.
	   handle == X:0	 - root class.
	 */
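
	/* Worked example (illustrative, not from the original source): a
	 * handle packs a 16-bit major and a 16-bit minor into a u32, and the
	 * TC_H_* macros from <linux/pkt_sched.h> manipulate them:
	 *
	 *	u32 h = TC_H_MAKE(0x00010000U, 0x2);	// "1:2" -> 0x00010002
	 *	TC_H_MAJ(h);				// 0x00010000 ("1:0")
	 *	TC_H_MIN(h);				// 0x00000002
	 *
	 * The steps below use exactly this arithmetic to complete partially
	 * specified parent/handle values.
	 */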

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = rtnl_dereference(dev->qdisc)->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = rtnl_dereference(dev->qdisc)->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl);
			/* Unbind the class from its filters by rebinding to 0 */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
		/* We just created a new class; do the reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}
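
/*
 * For orientation (assumed iproute2 behaviour, not part of this file): the
 * flag combinations handled above map onto the tc(8) verbs roughly as
 *
 *	tc class add     -> RTM_NEWTCLASS, NLM_F_CREATE | NLM_F_EXCL
 *	tc class replace -> RTM_NEWTCLASS, NLM_F_CREATE
 *	tc class change  -> RTM_NEWTCLASS, no create flags
 *	tc class del     -> RTM_DELTCLASS
 *
 * which is why "add" fails with -EEXIST on an existing class while
 * "change" fails with -ENOENT on a missing one.
 */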

struct qdisc_dump_args {
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
			    struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
			      RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q, false) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}
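
/*
 * The skip/count/stop triple is the generic resumable-walk contract; a
 * class walker is expected to behave roughly like this sketch
 * (illustrative, not a quote of any particular qdisc):
 *
 *	if (arg->count >= arg->skip &&
 *	    arg->fn(sch, (unsigned long)cl, arg) < 0) {
 *		arg->stop = 1;
 *		return;
 *	}
 *	arg->count++;
 *
 * cb->args[1] above records how far the walk got, and is fed back in as
 * the skip value when the dump resumes.
 */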

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t, bool recur)
{
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	if (!qdisc_dev(root) || !recur)
		return 0;

	if (tcm->tcm_parent) {
		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
		if (q && q != root &&
		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
		return 0;
	}
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
				skb, tcm, cb, &t, s_t, true) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t, false) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC / hrtimer_resolution);

	return 0;
}
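
/*
 * Sample output (values are an assumption for a typical build with
 * high-resolution timers; the last field is NSEC_PER_SEC / hrtimer_resolution
 * and therefore varies):
 *
 *	$ cat /proc/net/psched
 *	000003e8 00000040 000f4240 3b9aca00
 *
 * i.e. 1000 ns per pseudo-microsecond, 64 ns per psched tick
 * (PSCHED_SHIFT = 6), the legacy constant 1000000, and 10^9 timer ticks
 * per second (1 ns hrtimer resolution).
 */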

static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_create_single("psched", 0, net->proc_net, psched_show);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
#else
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif

static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};

static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
		      0);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
		      0);

	return 0;
}

subsys_initcall(pktsched_init);
2350