// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/slab.h>
#include <linux/hashtable.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>

#include <trace/events/qdisc.h>

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

	1. Queueing disciplines manager frontend.
	2. Traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box that is
   able to enqueue packets and to dequeue them (when the device is
   ready to send something) in an order and at times determined by the
   algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from the outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into a
   form more intelligible to the kernel, to perform some sanity
   checks and the part of the work that is common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it only means that the
   discipline does not want to send anything at this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not a
   real packet queue, but q->q.qlen must still be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If the packet (this one or another one) was dropped, it returns
   a non-zero error code.
   NET_XMIT_DROP	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- probably this packet was enqueued, but another one was dropped.
     Expected action: back off or ignore

   Auxiliary routines:

   ---peek

   like dequeue but without removing a packet from the queue

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers, counters (except for statistics) etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
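
/* A minimal sketch of how a discipline module plugs into this API
 * (illustrative only; the example_* names are hypothetical, while
 * qdisc_peek_dequeued() and qdisc_reset_queue() are the generic
 * helpers from sch_generic.h):
 *
 *	static struct Qdisc_ops example_qdisc_ops __read_mostly = {
 *		.id		= "example",
 *		.enqueue	= example_enqueue,
 *		.dequeue	= example_dequeue,
 *		.peek		= qdisc_peek_dequeued,
 *		.init		= example_init,
 *		.reset		= qdisc_reset_queue,
 *		.owner		= THIS_MODULE,
 *	};
 *
 * Its module_init() would call register_qdisc(&example_qdisc_ops) and
 * its module_exit() would call unregister_qdisc().
 */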
114
115 /* Protects list of registered TC modules. It is pure SMP lock. */
116 static DEFINE_RWLOCK(qdisc_mod_lock);
117
118
119 /************************************************
120 * Queueing disciplines manipulation. *
121 ************************************************/
122
123
124 /* The list of all installed queueing disciplines. */
125
126 static struct Qdisc_ops *qdisc_base;
127
128 /* Register/unregister queueing discipline */
129
register_qdisc(struct Qdisc_ops * qops)130 int register_qdisc(struct Qdisc_ops *qops)
131 {
132 struct Qdisc_ops *q, **qp;
133 int rc = -EEXIST;
134
135 write_lock(&qdisc_mod_lock);
136 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
137 if (!strcmp(qops->id, q->id))
138 goto out;
139
140 if (qops->enqueue == NULL)
141 qops->enqueue = noop_qdisc_ops.enqueue;
142 if (qops->peek == NULL) {
143 if (qops->dequeue == NULL)
144 qops->peek = noop_qdisc_ops.peek;
145 else
146 goto out_einval;
147 }
148 if (qops->dequeue == NULL)
149 qops->dequeue = noop_qdisc_ops.dequeue;
150
151 if (qops->cl_ops) {
152 const struct Qdisc_class_ops *cops = qops->cl_ops;
153
154 if (!(cops->find && cops->walk && cops->leaf))
155 goto out_einval;
156
157 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
158 goto out_einval;
159 }
160
161 qops->next = NULL;
162 *qp = qops;
163 rc = 0;
164 out:
165 write_unlock(&qdisc_mod_lock);
166 return rc;
167
168 out_einval:
169 rc = -EINVAL;
170 goto out;
171 }
172 EXPORT_SYMBOL(register_qdisc);
173
unregister_qdisc(struct Qdisc_ops * qops)174 void unregister_qdisc(struct Qdisc_ops *qops)
175 {
176 struct Qdisc_ops *q, **qp;
177 int err = -ENOENT;
178
179 write_lock(&qdisc_mod_lock);
180 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
181 if (q == qops)
182 break;
183 if (q) {
184 *qp = q->next;
185 q->next = NULL;
186 err = 0;
187 }
188 write_unlock(&qdisc_mod_lock);
189
190 WARN(err, "unregister qdisc(%s) failed\n", qops->id);
191 }
192 EXPORT_SYMBOL(unregister_qdisc);
193
194 /* Get default qdisc if not otherwise specified */
qdisc_get_default(char * name,size_t len)195 void qdisc_get_default(char *name, size_t len)
196 {
197 read_lock(&qdisc_mod_lock);
198 strscpy(name, default_qdisc_ops->id, len);
199 read_unlock(&qdisc_mod_lock);
200 }
201
qdisc_lookup_default(const char * name)202 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
203 {
204 struct Qdisc_ops *q = NULL;
205
206 for (q = qdisc_base; q; q = q->next) {
207 if (!strcmp(name, q->id)) {
208 if (!try_module_get(q->owner))
209 q = NULL;
210 break;
211 }
212 }
213
214 return q;
215 }
216
217 /* Set new default qdisc to use */
qdisc_set_default(const char * name)218 int qdisc_set_default(const char *name)
219 {
220 const struct Qdisc_ops *ops;
221
222 if (!capable(CAP_NET_ADMIN))
223 return -EPERM;
224
225 write_lock(&qdisc_mod_lock);
226 ops = qdisc_lookup_default(name);
227 if (!ops) {
228 /* Not found, drop lock and try to load module */
229 write_unlock(&qdisc_mod_lock);
230 request_module("sch_%s", name);
231 write_lock(&qdisc_mod_lock);
232
233 ops = qdisc_lookup_default(name);
234 }
235
236 if (ops) {
237 /* Set new default */
238 module_put(default_qdisc_ops->owner);
239 default_qdisc_ops = ops;
240 }
241 write_unlock(&qdisc_mod_lock);
242
243 return ops ? 0 : -ENOENT;
244 }
245
246 #ifdef CONFIG_NET_SCH_DEFAULT
247 /* Set default value from kernel config */
sch_default_qdisc(void)248 static int __init sch_default_qdisc(void)
249 {
250 return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
251 }
252 late_initcall(sch_default_qdisc);
253 #endif
254
255 /* We know handle. Find qdisc among all qdisc's attached to device
256 * (root qdisc, all its children, children of children etc.)
257 * Note: caller either uses rtnl or rcu_read_lock()
258 */
259
qdisc_match_from_root(struct Qdisc * root,u32 handle)260 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
261 {
262 struct Qdisc *q;
263
264 if (!qdisc_dev(root))
265 return (root->handle == handle ? root : NULL);
266
267 if (!(root->flags & TCQ_F_BUILTIN) &&
268 root->handle == handle)
269 return root;
270
271 hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
272 lockdep_rtnl_is_held()) {
273 if (q->handle == handle)
274 return q;
275 }
276 return NULL;
277 }
278
qdisc_hash_add(struct Qdisc * q,bool invisible)279 void qdisc_hash_add(struct Qdisc *q, bool invisible)
280 {
281 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
282 ASSERT_RTNL();
283 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
284 if (invisible)
285 q->flags |= TCQ_F_INVISIBLE;
286 }
287 }
288 EXPORT_SYMBOL(qdisc_hash_add);
289
qdisc_hash_del(struct Qdisc * q)290 void qdisc_hash_del(struct Qdisc *q)
291 {
292 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
293 ASSERT_RTNL();
294 hash_del_rcu(&q->hash);
295 }
296 }
297 EXPORT_SYMBOL(qdisc_hash_del);
298
qdisc_lookup(struct net_device * dev,u32 handle)299 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
300 {
301 struct Qdisc *q;
302
303 if (!handle)
304 return NULL;
305 q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
306 if (q)
307 goto out;
308
309 if (dev_ingress_queue(dev))
310 q = qdisc_match_from_root(
311 rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping),
312 handle);
313 out:
314 return q;
315 }
316
qdisc_lookup_rcu(struct net_device * dev,u32 handle)317 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
318 {
319 struct netdev_queue *nq;
320 struct Qdisc *q;
321
322 if (!handle)
323 return NULL;
324 q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
325 if (q)
326 goto out;
327
328 nq = dev_ingress_queue_rcu(dev);
329 if (nq)
330 q = qdisc_match_from_root(rcu_dereference(nq->qdisc_sleeping),
331 handle);
332 out:
333 return q;
334 }
335
qdisc_leaf(struct Qdisc * p,u32 classid)336 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
337 {
338 unsigned long cl;
339 const struct Qdisc_class_ops *cops = p->ops->cl_ops;
340
341 if (cops == NULL)
342 return NULL;
343 cl = cops->find(p, classid);
344
345 if (cl == 0)
346 return NULL;
347 return cops->leaf(p, cl);
348 }
349
350 /* Find queueing discipline by name */
351
qdisc_lookup_ops(struct nlattr * kind)352 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
353 {
354 struct Qdisc_ops *q = NULL;
355
356 if (kind) {
357 read_lock(&qdisc_mod_lock);
358 for (q = qdisc_base; q; q = q->next) {
359 if (nla_strcmp(kind, q->id) == 0) {
360 if (!try_module_get(q->owner))
361 q = NULL;
362 break;
363 }
364 }
365 read_unlock(&qdisc_mod_lock);
366 }
367 return q;
368 }
369
370 /* The linklayer setting were not transferred from iproute2, in older
371 * versions, and the rate tables lookup systems have been dropped in
372 * the kernel. To keep backward compatible with older iproute2 tc
373 * utils, we detect the linklayer setting by detecting if the rate
374 * table were modified.
375 *
376 * For linklayer ATM table entries, the rate table will be aligned to
377 * 48 bytes, thus some table entries will contain the same value. The
378 * mpu (min packet unit) is also encoded into the old rate table, thus
379 * starting from the mpu, we find low and high table entries for
380 * mapping this cell. If these entries contain the same value, when
381 * the rate tables have been modified for linklayer ATM.
382 *
383 * This is done by rounding mpu to the nearest 48 bytes cell/entry,
384 * and then roundup to the next cell, calc the table entry one below,
385 * and compare.
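 *
 * Worked example (illustrative numbers): with mpu = 96 and cell_log = 3,
 * low = 96 and high = roundup(97, 48) = 144, giving cell_low = 12 and
 * cell_high = 17; if rtab[12] == rtab[17] the table was ATM-aligned.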
 */
static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
{
	int low       = roundup(r->mpu, 48);
	int high      = roundup(low+1, 48);
	int cell_low  = low >> r->cell_log;
	int cell_high = (high >> r->cell_log) - 1;

	/* rtab is too inaccurate at rates > 100Mbit/s */
	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
		pr_debug("TC linklayer: Giving up ATM detection\n");
		return TC_LINKLAYER_ETHERNET;
	}

	if ((cell_high > cell_low) && (cell_high < 256)
	    && (rtab[cell_low] == rtab[cell_high])) {
		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
			 cell_low, cell_high, rtab[cell_high]);
		return TC_LINKLAYER_ATM;
	}
	return TC_LINKLAYER_ETHERNET;
}

static struct qdisc_rate_table *qdisc_rtab_list;

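/* Rate tables are shared and reference-counted: a lookup that matches an
 * existing table's ratespec and data returns that table instead of
 * allocating a new one.
 */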
struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
					struct nlattr *tab,
					struct netlink_ext_ack *extack)
{
	struct qdisc_rate_table *rtab;

	if (tab == NULL || r->rate == 0 ||
	    r->cell_log == 0 || r->cell_log >= 32 ||
	    nla_len(tab) != TC_RTAB_SIZE) {
		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
		return NULL;
	}

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
			rtab->refcnt++;
			return rtab;
		}
	}

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		if (r->linklayer == TC_LINKLAYER_UNAWARE)
			r->linklayer = __detect_linklayer(r, rtab->data);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	} else {
		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list;
	     (rtab = *rtabp) != NULL;
	     rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);

static LIST_HEAD(qdisc_stab_list);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
					       struct netlink_ext_ack *extack)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
					  extack);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE]) {
		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
		return ERR_PTR(-EINVAL);
	}

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA]) {
			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
			return ERR_PTR(-EINVAL);
		}
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (tsize != s->tsize || (!tab && tsize > 0)) {
		NL_SET_ERR_MSG(extack, "Invalid size of size table");
		return ERR_PTR(-EINVAL);
	}

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 &&
		    memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
			continue;
		stab->refcnt++;
		return stab;
	}

	if (s->size_log > STAB_SIZE_LOG_MAX ||
	    s->cell_log > STAB_SIZE_LOG_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
		return ERR_PTR(-EINVAL);
	}

	stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, flex_array_size(stab, data, tsize));

	list_add_tail(&stab->list, &qdisc_stab_list);

	return stab;
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		kfree_rcu(tab, rcu);
	}
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
		goto nla_put_failure;
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

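/* Compute the accounted packet length from a size table: add the
 * configured per-packet overhead, map the cell-aligned length through
 * stab->data[], and scale by size_log; lengths beyond the table are
 * extrapolated from its last slot.
 */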
void __qdisc_calculate_pkt_len(struct sk_buff *skb,
			       const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);

void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
			txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	rcu_read_lock();
	__netif_schedule(qdisc_root(wd->qdisc));
	rcu_read_unlock();

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
				 clockid_t clockid)
{
	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init_clockid);

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);

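/* Typical usage (illustrative): a shaping qdisc whose ->dequeue() has
 * nothing eligible to send yet calls qdisc_watchdog_schedule_range_ns()
 * (or a wrapper around it) with the time the next packet becomes
 * eligible; when the hrtimer fires, qdisc_watchdog() above reschedules
 * the root qdisc via __netif_schedule().
 */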
void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
				      u64 delta_ns)
{
	bool deactivated;

	rcu_read_lock();
	deactivated = test_bit(__QDISC_STATE_DEACTIVATED,
			       &qdisc_root_sleeping(wd->qdisc)->state);
	rcu_read_unlock();
	if (deactivated)
		return;

	if (hrtimer_is_queued(&wd->timer)) {
		/* If timer is already set in [expires, expires + delta_ns],
		 * do not reprogram it.
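		 * Note: the subtraction below is unsigned, so if the timer
		 * is currently programmed outside that window in either
		 * direction, the difference exceeds delta_ns and the timer
		 * is reprogrammed.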
		 */
		if (wd->last_expires - expires <= delta_ns)
			return;
	}

	wd->last_expires = expires;
	hrtimer_start_range_ns(&wd->timer,
			       ns_to_ktime(expires),
			       delta_ns,
			       HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	struct hlist_head *h;
	unsigned int i;

	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	kvfree(ohash);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (!clhash->hash)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);

/* Allocate a unique handle from the space managed by the kernel
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
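 * (A TC handle is a major:minor pair; qdisc handles always have minor 0,
 * and this function hands out majors in the 0x8000-0xffff range.)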
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x8000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while (--i > 0);

	return 0;
}

void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
{
	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	bool notify;
	int drops;

	if (n == 0 && len == 0)
		return;
	drops = max_t(int, n, 0);
	rcu_read_lock();
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			break;

		if (sch->flags & TCQ_F_NOPARENT)
			break;
		/* Notify parent qdisc only if child qdisc becomes empty.
		 *
		 * If child was empty even before update then backlog
		 * counter is screwed and we skip notification because
		 * parent class is already passive.
		 *
		 * If the original child was offloaded then it is allowed
		 * to be seen as empty, so the parent is notified anyway.
		 */
		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
						       !qdisc_is_offloaded);
		/* TODO: perform the search on a per txq basis */
		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON_ONCE(parentid != TC_H_ROOT);
			break;
		}
		cops = sch->ops->cl_ops;
		if (notify && cops->qlen_notify) {
			cl = cops->find(sch, parentid);
			cops->qlen_notify(sch, cl);
		}
		sch->q.qlen -= n;
		sch->qstats.backlog -= len;
		__qdisc_qstats_drop(sch, drops);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);

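/* Keep TCQ_F_OFFLOADED in sync with the driver's view of the qdisc:
 * query the device through ndo_setup_tc() and treat -EOPNOTSUPP as
 * "not offloaded" rather than as an error.
 */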
int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
			      void *type_data)
{
	struct net_device *dev = qdisc_dev(sch);
	int err;

	sch->flags &= ~TCQ_F_OFFLOADED;
	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return 0;

	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
	if (err == -EOPNOTSUPP)
		return 0;

	if (!err)
		sch->flags |= TCQ_F_OFFLOADED;

	return err;
}
EXPORT_SYMBOL(qdisc_offload_dump_helper);

void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
				struct Qdisc *new, struct Qdisc *old,
				enum tc_setup_type type, void *type_data,
				struct netlink_ext_ack *extack)
{
	bool any_qdisc_is_offloaded;
	int err;

	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return;

	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);

	/* Don't report error if the graft is part of destroy operation. */
	if (!err || !new || new == &noop_qdisc)
		return;

	/* Don't report error if the parent, the old child and the new
	 * one are not offloaded.
	 */
	any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;

	if (any_qdisc_is_offloaded)
		NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
}
EXPORT_SYMBOL(qdisc_offload_graft_helper);

void qdisc_offload_query_caps(struct net_device *dev,
			      enum tc_setup_type type,
			      void *caps, size_t caps_len)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	struct tc_query_caps_base base = {
		.type = type,
		.caps = caps,
	};

	memset(caps, 0, caps_len);

	if (ops->ndo_setup_tc)
		ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base);
}
EXPORT_SYMBOL(qdisc_offload_query_caps);

static void qdisc_offload_graft_root(struct net_device *dev,
				     struct Qdisc *new, struct Qdisc *old,
				     struct netlink_ext_ack *extack)
{
	struct tc_root_qopt_offload graft_offload = {
		.command	= TC_ROOT_GRAFT,
		.handle		= new ? new->handle : 0,
		.ingress	= (new && new->flags & TCQ_F_INGRESS) ||
				  (old && old->flags & TCQ_F_INGRESS),
	};

	qdisc_offload_graft_helper(dev, NULL, new, old,
				   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event,
			 struct netlink_ext_ack *extack)
{
	struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;
	u32 block_index;
	__u32 qlen;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = refcount_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (q->ops->ingress_block_get) {
		block_index = q->ops->ingress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->egress_block_get) {
		block_index = q->ops->egress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
		goto nla_put_failure;
	qlen = qdisc_qlen_sum(q);

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (qdisc_is_percpu_stats(q)) {
		cpu_bstats = q->cpu_bstats;
		cpu_qstats = q->cpu_qstats;
	}

	if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	if (extack && extack->_msg &&
	    nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
		goto out_nlmsg_trim;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;

	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
{
	if (q->flags & TCQ_F_BUILTIN)
		return true;
	if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
		return true;

	return false;
}

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new,
			struct netlink_ext_ack *extack)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old, false)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC, extack) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new, false)) {
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new,
			       struct netlink_ext_ack *extack)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new, extack);

	if (old)
		qdisc_put(old);
}

static void qdisc_clear_nolock(struct Qdisc *sch)
{
	sch->flags &= ~TCQ_F_NOLOCK;
	if (!(sch->flags & TCQ_F_CPUSTATS))
		return;

	free_percpu(sch->cpu_bstats);
	free_percpu(sch->cpu_qstats);
	sch->cpu_bstats = NULL;
	sch->cpu_qstats = NULL;
	sch->flags &= ~TCQ_F_CPUSTATS;
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * Where appropriate, send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy the old qdisc.
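 *
 * (For example, "tc qdisc replace dev eth0 root ..." ends up here with
 * parent == NULL, while grafting under a class of a classful qdisc,
 * e.g. "parent 1:1", takes the cops->graft() branch below.)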
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);

	if (parent == NULL) {
		unsigned int i, num_q, ingress;
		struct netdev_queue *dev_queue;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			ingress = 1;
			dev_queue = dev_ingress_queue(dev);
			if (!dev_queue) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}

			q = rtnl_dereference(dev_queue->qdisc_sleeping);

			/* This is the counterpart of that qdisc_refcount_inc_nz() call in
			 * __tcf_qdisc_find() for filter requests.
			 */
			if (!qdisc_refcount_dec_if_one(q)) {
				NL_SET_ERR_MSG(extack,
					       "Current ingress or clsact Qdisc has ongoing filter requests");
				return -EBUSY;
			}
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		qdisc_offload_graft_root(dev, new, old, extack);

		if (new && new->ops->attach && !ingress)
			goto skip;

		if (!ingress) {
			for (i = 0; i < num_q; i++) {
				dev_queue = netdev_get_tx_queue(dev, i);
				old = dev_graft_qdisc(dev_queue, new);

				if (new && i > 0)
					qdisc_refcount_inc(new);
				qdisc_put(old);
			}
		} else {
			old = dev_graft_qdisc(dev_queue, NULL);

			/* {ingress,clsact}_destroy() @old before grafting @new to avoid
			 * unprotected concurrent accesses to net_device::miniq_{in,e}gress
			 * pointer(s) in mini_qdisc_pair_swap().
			 */
			qdisc_notify(net, skb, n, classid, old, new, extack);
			qdisc_destroy(old);

			dev_graft_qdisc(dev_queue, new);
		}

skip:
		if (!ingress) {
			old = rtnl_dereference(dev->qdisc);
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);

			notify_and_destroy(net, skb, n, classid, old, new, extack);

			if (new && new->ops->attach)
				new->ops->attach(new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
		unsigned long cl;
		int err;

		/* Only support running class lockless if parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
			qdisc_clear_nolock(new);

		if (!cops || !cops->graft)
			return -EOPNOTSUPP;

		cl = cops->find(parent, classid);
		if (!cl) {
			NL_SET_ERR_MSG(extack, "Specified class not found");
			return -ENOENT;
		}

		if (new && new->ops == &noqueue_qdisc_ops) {
			NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
			return -EINVAL;
		}

		err = cops->graft(parent, cl, new, &old, extack);
		if (err)
			return err;
		notify_and_destroy(net, skb, n, classid, old, new, extack);
	}
	return 0;
}

static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
				   struct netlink_ext_ack *extack)
{
	u32 block_index;

	if (tca[TCA_INGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->ingress_block_set) {
			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->ingress_block_set(sch, block_index);
	}
	if (tca[TCA_EGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->egress_block_set) {
			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->egress_block_set(sch, block_index);
	}
	return 0;
}

/*
   Allocate and initialize new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load. So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request. We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
		goto err_out;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		if (!(sch->flags & TCQ_F_INGRESS)) {
			NL_SET_ERR_MSG(extack,
				       "Specified parent ID is reserved for ingress and clsact Qdiscs");
			err = -EINVAL;
			goto err_out3;
		}
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			if (handle == 0) {
				NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
				err = -ENOSPC;
				goto err_out3;
			}
		}
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* This exists to keep backward compatibility with a userspace
	 * loophole that allowed userspace to get the IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init) and then forgetting to reinit tx_queue_len
	 * before again attaching a qdisc.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out5;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out4;
		}
		rcu_assign_pointer(sch->stab, stab);
	}
	if (tca[TCA_RATE]) {
		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					true,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);
	trace_qdisc_create(ops, dev, parent);

	return sch;

err_out5:
	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
	if (ops->destroy)
		ops->destroy(sch);
err_out3:
	netdev_put(dev, &sch->dev_tracker);
	qdisc_free(sch);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
			struct netlink_ext_ack *extack)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (!sch->ops->change) {
			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
			return -EINVAL;
		}
		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
			return -EOPNOTSUPP;
		}
		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
				      NULL,
				      true,
				      tca[TCA_RATE]);
	}
out:
	return 0;
}

struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl,
			 struct qdisc_walker *w);

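/* Recursively walk the class tree under @q looking for @p; used to refuse
 * grafts that would make a qdisc its own ancestor. Trees deeper than
 * seven levels are treated as loops.
 */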
static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
	[TCA_KIND]		= { .type = NLA_STRING },
	[TCA_RATE]		= { .type = NLA_BINARY,
				    .len = sizeof(struct tc_estimator) },
	[TCA_STAB]		= { .type = NLA_NESTED },
	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
	[TCA_CHAIN]		= { .type = NLA_U32 },
	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
};

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
			}
		} else {
			q = rtnl_dereference(dev->qdisc);
		}
		if (!q) {
			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
			return -ENOENT;
		}

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Invalid handle");
			return -EINVAL;
		}
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q) {
			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
			return -ENOENT;
		}
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid) {
			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
			return -EINVAL;
		}
		if (q->handle == 0) {
			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
			return -ENOENT;
		}
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q, NULL);
	}
	return 0;
}

static bool req_create_or_replace(struct nlmsghdr *n)
{
	return (n->nlmsg_flags & NLM_F_CREATE &&
		n->nlmsg_flags & NLM_F_REPLACE);
}

static bool req_create_exclusive(struct nlmsghdr *n)
{
	return (n->nlmsg_flags & NLM_F_CREATE &&
		n->nlmsg_flags & NLM_F_EXCL);
}

static bool req_change(struct nlmsghdr *n)
{
	return (!(n->nlmsg_flags & NLM_F_CREATE) &&
		!(n->nlmsg_flags & NLM_F_REPLACE) &&
		!(n->nlmsg_flags & NLM_F_EXCL));
}

/*
 * Create/change qdisc.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
			}
		} else {
			q = rtnl_dereference(dev->qdisc);
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
					return -EEXIST;
				}
				if (TC_H_MIN(tcm->tcm_handle)) {
					NL_SET_ERR_MSG(extack, "Invalid minor handle");
					return -EINVAL;
				}
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL) {
					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
					return -EEXIST;
				}
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
					return -EINVAL;
				}
				if (q->flags & TCQ_F_INGRESS) {
					NL_SET_ERR_MSG(extack,
						       "Cannot regraft ingress or clsact Qdiscs");
					return -EINVAL;
				}
				if (q == p ||
				    (p && check_loop(q, p, 0))) {
					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
					return -ELOOP;
				}
				if (clid == TC_H_INGRESS) {
					NL_SET_ERR_MSG(extack, "Ingress cannot graft directly");
					return -EINVAL;
				}
				qdisc_refcount_inc(q);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 * We know that some child q is already
				 * attached to this parent and have a choice:
				 * 1) change it or 2) create/graft a new one.
				 * If the requested qdisc kind is different
				 * from the existing one, then we choose graft.
				 * If they are the same then this is a "change"
				 * operation - just let it fall through.
				 *
				 * 1. We are allowed to create/graft only
				 * if the request explicitly states
				 * "please create if it doesn't exist".
				 *
				 * 2. If the request is for an exclusive create
				 * then the qdisc tcm_handle is not expected
				 * to exist, so we choose create/graft too.
				 *
				 * 3. The last case is when no flags are set.
				 * This will happen when, for example, the tc
				 * utility issues a "change" command.
				 * Alas, it is a sort of hole in the API; we
				 * cannot decide what to do unambiguously.
				 * For now we select create/graft.
				 */
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					if (req_create_or_replace(n) ||
					    req_create_exclusive(n))
						goto create_n_graft;
					else if (req_change(n))
						goto create_n_graft2;
				}
			}
		}
	} else {
		if (!tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
			return -EINVAL;
		}
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (!q) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		return -ENOENT;
	}
	if (n->nlmsg_flags & NLM_F_EXCL) {
		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
		return -EEXIST;
	}
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}
	err = qdisc_change(q, tca, extack);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q, extack);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
		return -ENOENT;
	}
create_n_graft2:
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev)) {
			q = qdisc_create(dev, dev_ingress_queue(dev),
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err, extack);
		} else {
			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
			err = -ENOENT;
		}
	} else {
		struct netdev_queue *dev_queue;

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err, extack);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
	if (err) {
		if (q)
			qdisc_put(q);
		return err;
	}

	return 0;
}
1770
tc_dump_qdisc_root(struct Qdisc * root,struct sk_buff * skb,struct netlink_callback * cb,int * q_idx_p,int s_q_idx,bool recur,bool dump_invisible)1771 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1772 struct netlink_callback *cb,
1773 int *q_idx_p, int s_q_idx, bool recur,
1774 bool dump_invisible)
1775 {
1776 int ret = 0, q_idx = *q_idx_p;
1777 struct Qdisc *q;
1778 int b;
1779
1780 if (!root)
1781 return 0;
1782
1783 q = root;
1784 if (q_idx < s_q_idx) {
1785 q_idx++;
1786 } else {
1787 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1788 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1789 cb->nlh->nlmsg_seq, NLM_F_MULTI,
1790 RTM_NEWQDISC, NULL) <= 0)
1791 goto done;
1792 q_idx++;
1793 }
1794
1795 /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1796 * itself has already been dumped.
1797 *
1798 * If we've already dumped the top-level (ingress) qdisc above and the global
1799 * qdisc hashtable, we don't want to hit it again
1800 */
1801 if (!qdisc_dev(root) || !recur)
1802 goto out;
1803
1804 hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1805 if (q_idx < s_q_idx) {
1806 q_idx++;
1807 continue;
1808 }
1809 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1810 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1811 cb->nlh->nlmsg_seq, NLM_F_MULTI,
1812 RTM_NEWQDISC, NULL) <= 0)
1813 goto done;
1814 q_idx++;
1815 }
1816
1817 out:
1818 *q_idx_p = q_idx;
1819 return ret;
1820 done:
1821 ret = -1;
1822 goto out;
1823 }
1824
tc_dump_qdisc(struct sk_buff * skb,struct netlink_callback * cb)1825 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1826 {
1827 struct net *net = sock_net(skb->sk);
1828 int idx, q_idx;
1829 int s_idx, s_q_idx;
1830 struct net_device *dev;
1831 const struct nlmsghdr *nlh = cb->nlh;
1832 struct nlattr *tca[TCA_MAX + 1];
1833 int err;
1834
1835 s_idx = cb->args[0];
1836 s_q_idx = q_idx = cb->args[1];
1837
1838 idx = 0;
1839 ASSERT_RTNL();
1840
1841 err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1842 rtm_tca_policy, cb->extack);
1843 if (err < 0)
1844 return err;
1845
1846 for_each_netdev(net, dev) {
1847 struct netdev_queue *dev_queue;
1848
1849 if (idx < s_idx)
1850 goto cont;
1851 if (idx > s_idx)
1852 s_q_idx = 0;
1853 q_idx = 0;
1854
1855 if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
1856 skb, cb, &q_idx, s_q_idx,
1857 true, tca[TCA_DUMP_INVISIBLE]) < 0)
1858 goto done;
1859
1860 dev_queue = dev_ingress_queue(dev);
1861 if (dev_queue &&
1862 tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping),
1863 skb, cb, &q_idx, s_q_idx, false,
1864 tca[TCA_DUMP_INVISIBLE]) < 0)
1865 goto done;
1866
1867 cont:
1868 idx++;
1869 }
1870
1871 done:
1872 cb->args[0] = idx;
1873 cb->args[1] = q_idx;
1874
1875 return skb->len;
1876 }
1877
1878
1879
1880 /************************************************
1881 * Traffic classes manipulation. *
1882 ************************************************/
1883
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl, u32 portid, u32 seq, u16 flags,
			  int event, struct netlink_ext_ack *extack)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	if (extack && extack->_msg &&
	    nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
		goto out_nlmsg_trim;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;

	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event, struct netlink_ext_ack *extack)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

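/* Delete a class and notify userspace.  Note the ordering: the
 * notification skb is filled in *before* ->delete() runs, because once
 * the class is gone there is nothing left to dump.  If ->delete() then
 * fails, the prepared skb is freed and the error returned.
 */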
static int tclass_del_notify(struct net *net,
			     const struct Qdisc_class_ops *cops,
			     struct sk_buff *oskb, struct nlmsghdr *n,
			     struct Qdisc *q, unsigned long cl,
			     struct netlink_ext_ack *extack)
{
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
	struct sk_buff *skb;
	int err = 0;

	if (!cops->delete)
		return -EOPNOTSUPP;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
			   RTM_DELTCLASS, extack) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	err = cops->delete(q, cl, extack);
	if (err) {
		kfree_skb(skb);
		return err;
	}

	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			     n->nlmsg_flags & NLM_F_ECHO);
	return err;
}

#ifdef CONFIG_NET_CLS

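/* Filters reference classes by classid.  The walkers below visit every
 * filter chain attached to a qdisc and use ->bind_class to re-point
 * filter nodes bound to a given classid at a new internal class
 * identifier (0 when the class is being deleted).
 */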
struct tcf_bind_args {
	struct tcf_walker w;
	unsigned long base;
	unsigned long cl;
	u32 classid;
};

static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
{
	struct tcf_bind_args *a = (void *)arg;

	if (n && tp->ops->bind_class) {
		struct Qdisc *q = tcf_block_q(tp->chain->block);

		sch_tree_lock(q);
		tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
		sch_tree_unlock(q);
	}
	return 0;
}

struct tc_bind_class_args {
	struct qdisc_walker w;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
};

static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
				struct qdisc_walker *w)
{
	struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tcf_block *block;
	struct tcf_chain *chain;

	block = cops->tcf_block(q, cl, NULL);
	if (!block)
		return 0;
	for (chain = tcf_get_next_chain(block, NULL);
	     chain;
	     chain = tcf_get_next_chain(block, chain)) {
		struct tcf_proto *tp;

		for (tp = tcf_get_next_proto(chain, NULL);
		     tp; tp = tcf_get_next_proto(chain, tp)) {
			struct tcf_bind_args arg = {};

			arg.w.fn = tcf_node_bind;
			arg.classid = a->clid;
			arg.base = cl;
			arg.cl = a->new_cl;
			tp->ops->walk(tp, &arg.w, true);
		}
	}

	return 0;
}

static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tc_bind_class_args args = {};

	if (!cops->tcf_block)
		return;
	args.portid = portid;
	args.clid = clid;
	args.new_cl = new_cl;
	args.w.fn = tc_bind_class_walker;
	q->ops->cl_ops->walk(q, &args.w);
}

#else

static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}

#endif

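/* Handle RTM_NEWTCLASS, RTM_DELTCLASS and RTM_GETTCLASS: resolve which
 * qdisc the request targets from tcm_parent/tcm_handle (see the table
 * below), look the class up and dispatch to the qdisc's class ops.
 */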
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - fully specified, nothing to resolve.
	   handle == X:0	 - root class.
	 */
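	/* For illustration: "tc class add dev eth0 parent 1:1 classid 1:10"
	 * arrives with tcm_parent == 0x10001 and tcm_handle == 0x10010.
	 * TC_H_MAJ() of both is 0x10000, the majors agree, so qid resolves
	 * to 1:0 and the class is created as 1:10 under parent 1:1.  With
	 * tcm_parent == TC_H_ROOT and tcm_handle == 0, clid falls back to
	 * the qdisc handle itself (the "root class").
	 */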

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = rtnl_dereference(dev->qdisc)->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = rtnl_dereference(dev->qdisc)->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
			/* Unbind filters from the deleted class by rebinding them to 0 */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS, extack);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack);
		/* We just created a new class, need to do the reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}

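/* Adapter between the qdisc_walker callback signature and
 * tc_fill_tclass(): the dump skb and netlink callback ride along in the
 * enclosing qdisc_dump_args.
 */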
struct qdisc_dump_args {
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
			    struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
			      RTM_NEWTCLASS, NULL);
}

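/* Dump the classes of a single qdisc.  *t_p counts qdiscs visited and
 * s_t is where the previous partial dump stopped; cb->args[1] holds the
 * class-walker position within the resume qdisc and is zeroed once the
 * walk has moved past that point.
 */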
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q, false) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t, bool recur)
{
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	if (!qdisc_dev(root) || !recur)
		return 0;

	if (tcm->tcm_parent) {
		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
		if (q && q != root &&
		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
		return 0;
	}
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
				skb, tcm, cb, &t, s_t, true) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(rtnl_dereference(dev_queue->qdisc_sleeping),
				skb, tcm, cb, &t, s_t, false) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

#ifdef CONFIG_PROC_FS
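/* /proc/net/psched exports four hex words.  From the expression below:
 * nanoseconds per microsecond, nanoseconds per psched tick, a fixed
 * 1000000, and the hrtimer clock frequency (NSEC_PER_SEC divided by its
 * resolution).  The fixed field is presumably retained only so the
 * long-standing four-field format that tc(8) parses stays intact.
 */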
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC / hrtimer_resolution);

	return 0;
}

static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_create_single("psched", 0, net->proc_net, psched_show);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
#else
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif

static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};

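/* Boot-time setup: register the per-netns /proc/net/psched entry, the
 * built-in qdiscs that must be available without module loading, and
 * the rtnetlink handlers for qdisc and class messages.
 */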
static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
		      0);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
		      0);

	return 0;
}

subsys_initcall(pktsched_init);