/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/lockdep.h>
#include <linux/slab.h>
#include <linux/hashtable.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, a queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when the
   device is ready to send something) in the order and at the times
   determined by the algorithm hidden inside it.

   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to make some sanity
   checks and do the part of the work that is common to all qdiscs,
   and to provide rtnetlink notifications.

   All real intelligent work is done inside qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but it does not mean that the queue is empty, it just means that the
   discipline does not want to send anything this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not a
   real packet queue, but q->q.qlen must still be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
   NET_XMIT_DROP	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN		- probably this packet was enqueued, but another one was dropped.
     Expected action: back off or ignore

   Auxiliary routines:

   ---peek

   like dequeue but without removing a packet from the queue

   ---reset

   returns the qdisc to its initial state: purge all buffers, clear all
   timers, counters (except for statistics) etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
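
/* A minimal sketch (illustrative only, not part of this file) of the
 * enqueue/dequeue contract described above. The "example_*" names are
 * hypothetical; a real qdisc must also keep q->q.qlen and the backlog
 * statistics consistent, which the pfifo-style helpers used here do.
 *
 *	static int example_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 *				   struct sk_buff **to_free)
 *	{
 *		if (likely(sch->q.qlen < sch->limit))
 *			return qdisc_enqueue_tail(skb, sch); // NET_XMIT_SUCCESS
 *		return qdisc_drop(skb, sch, to_free);	     // NET_XMIT_DROP
 *	}
 *
 *	static struct sk_buff *example_dequeue(struct Qdisc *sch)
 *	{
 *		return qdisc_dequeue_head(sch);	// NULL allowed even if qlen != 0
 *	}
 */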

/* Protects list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		if (!(cops->find && cops->walk && cops->leaf))
			goto out_einval;

		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);
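
/* Typical usage (sketch): a scheduler module registers its ops at module
 * init and unregisters them at exit. "example_qdisc_ops" and the
 * example_* callbacks are hypothetical; see sch_fifo.c or sch_prio.c
 * for real instances of this pattern.
 *
 *	static struct Qdisc_ops example_qdisc_ops __read_mostly = {
 *		.id		= "example",
 *		.priv_size	= 0,
 *		.enqueue	= example_enqueue,
 *		.dequeue	= example_dequeue,
 *		.peek		= qdisc_peek_head,
 *		.owner		= THIS_MODULE,
 *	};
 *
 *	static int __init example_module_init(void)
 *	{
 *		return register_qdisc(&example_qdisc_ops);
 *	}
 *
 *	static void __exit example_module_exit(void)
 *	{
 *		unregister_qdisc(&example_qdisc_ops);
 *	}
 *	module_init(example_module_init);
 *	module_exit(example_module_exit);
 */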

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);

/* Get default qdisc if not otherwise specified */
void qdisc_get_default(char *name, size_t len)
{
	read_lock(&qdisc_mod_lock);
	strlcpy(name, default_qdisc_ops->id, len);
	read_unlock(&qdisc_mod_lock);
}

static struct Qdisc_ops *qdisc_lookup_default(const char *name)
{
	struct Qdisc_ops *q = NULL;

	for (q = qdisc_base; q; q = q->next) {
		if (!strcmp(name, q->id)) {
			if (!try_module_get(q->owner))
				q = NULL;
			break;
		}
	}

	return q;
}

/* Set new default qdisc to use */
int qdisc_set_default(const char *name)
{
	const struct Qdisc_ops *ops;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	write_lock(&qdisc_mod_lock);
	ops = qdisc_lookup_default(name);
	if (!ops) {
		/* Not found, drop lock and try to load module */
		write_unlock(&qdisc_mod_lock);
		request_module("sch_%s", name);
		write_lock(&qdisc_mod_lock);

		ops = qdisc_lookup_default(name);
	}

	if (ops) {
		/* Set new default */
		module_put(default_qdisc_ops->owner);
		default_qdisc_ops = ops;
	}
	write_unlock(&qdisc_mod_lock);

	return ops ? 0 : -ENOENT;
}

#ifdef CONFIG_NET_SCH_DEFAULT
/* Set default value from kernel config */
static int __init sch_default_qdisc(void)
{
	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif
256
257 /* We know handle. Find qdisc among all qdisc's attached to device
258 * (root qdisc, all its children, children of children etc.)
259 * Note: caller either uses rtnl or rcu_read_lock()
260 */
261
qdisc_match_from_root(struct Qdisc * root,u32 handle)262 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
263 {
264 struct Qdisc *q;
265
266 if (!qdisc_dev(root))
267 return (root->handle == handle ? root : NULL);
268
269 if (!(root->flags & TCQ_F_BUILTIN) &&
270 root->handle == handle)
271 return root;
272
273 hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
274 if (q->handle == handle)
275 return q;
276 }
277 return NULL;
278 }
279
qdisc_hash_add(struct Qdisc * q,bool invisible)280 void qdisc_hash_add(struct Qdisc *q, bool invisible)
281 {
282 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
283 ASSERT_RTNL();
284 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
285 if (invisible)
286 q->flags |= TCQ_F_INVISIBLE;
287 }
288 }
289 EXPORT_SYMBOL(qdisc_hash_add);
290
qdisc_hash_del(struct Qdisc * q)291 void qdisc_hash_del(struct Qdisc *q)
292 {
293 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
294 ASSERT_RTNL();
295 hash_del_rcu(&q->hash);
296 }
297 }
298 EXPORT_SYMBOL(qdisc_hash_del);
299
qdisc_lookup(struct net_device * dev,u32 handle)300 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
301 {
302 struct Qdisc *q;
303
304 if (!handle)
305 return NULL;
306 q = qdisc_match_from_root(dev->qdisc, handle);
307 if (q)
308 goto out;
309
310 if (dev_ingress_queue(dev))
311 q = qdisc_match_from_root(
312 dev_ingress_queue(dev)->qdisc_sleeping,
313 handle);
314 out:
315 return q;
316 }
317
qdisc_leaf(struct Qdisc * p,u32 classid)318 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
319 {
320 unsigned long cl;
321 struct Qdisc *leaf;
322 const struct Qdisc_class_ops *cops = p->ops->cl_ops;
323
324 if (cops == NULL)
325 return NULL;
326 cl = cops->find(p, classid);
327
328 if (cl == 0)
329 return NULL;
330 leaf = cops->leaf(p, cl);
331 return leaf;
332 }
333
334 /* Find queueing discipline by name */
335
qdisc_lookup_ops(struct nlattr * kind)336 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
337 {
338 struct Qdisc_ops *q = NULL;
339
340 if (kind) {
341 read_lock(&qdisc_mod_lock);
342 for (q = qdisc_base; q; q = q->next) {
343 if (nla_strcmp(kind, q->id) == 0) {
344 if (!try_module_get(q->owner))
345 q = NULL;
346 break;
347 }
348 }
349 read_unlock(&qdisc_mod_lock);
350 }
351 return q;
352 }

/* The linklayer setting was not transferred from iproute2, in older
 * versions, and the rate table lookup system has been dropped from
 * the kernel. To stay backward compatible with older iproute2 tc
 * utils, we detect the linklayer setting by detecting if the rate
 * table was modified.
 *
 * For linklayer ATM table entries, the rate table will be aligned to
 * 48 bytes, thus some table entries will contain the same value. The
 * mpu (min packet unit) is also encoded into the old rate table, thus
 * starting from the mpu, we find the low and high table entries for
 * mapping this cell. If these entries contain the same value, then
 * the rate table has been modified for linklayer ATM.
 *
 * This is done by rounding mpu up to the nearest 48-byte cell/entry,
 * then rounding up to the next cell, calculating the table entry one
 * below, and comparing.
 */
static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
{
	int low       = roundup(r->mpu, 48);
	int high      = roundup(low+1, 48);
	int cell_low  = low >> r->cell_log;
	int cell_high = (high >> r->cell_log) - 1;

	/* rtab is too inaccurate at rates > 100Mbit/s */
	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
		pr_debug("TC linklayer: Giving up ATM detection\n");
		return TC_LINKLAYER_ETHERNET;
	}

	if ((cell_high > cell_low) && (cell_high < 256)
	    && (rtab[cell_low] == rtab[cell_high])) {
		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
			 cell_low, cell_high, rtab[cell_high]);
		return TC_LINKLAYER_ATM;
	}
	return TC_LINKLAYER_ETHERNET;
}
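
/* Worked example (illustrative): with mpu = 0 and cell_log = 3,
 * low = roundup(0, 48) = 0 and high = roundup(1, 48) = 48, so
 * cell_low = 0 >> 3 = 0 and cell_high = (48 >> 3) - 1 = 5. Packet
 * sizes 0..47 all fit in one 48-byte ATM cell, so an ATM-shaped table
 * has rtab[0] == rtab[5], while an unmodified Ethernet table scales
 * per 8-byte group and the two entries differ.
 */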

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
					struct nlattr *tab,
					struct netlink_ext_ack *extack)
{
	struct qdisc_rate_table *rtab;

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE) {
		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
		return NULL;
	}

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
			rtab->refcnt++;
			return rtab;
		}
	}

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		if (r->linklayer == TC_LINKLAYER_UNAWARE)
			r->linklayer = __detect_linklayer(r, rtab->data);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	} else {
		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list;
	     (rtab = *rtabp) != NULL;
	     rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);

static LIST_HEAD(qdisc_stab_list);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
					       struct netlink_ext_ack *extack)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE]) {
		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
		return ERR_PTR(-EINVAL);
	}

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA]) {
			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
			return ERR_PTR(-EINVAL);
		}
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (tsize != s->tsize || (!tab && tsize > 0)) {
		NL_SET_ERR_MSG(extack, "Invalid size of size table");
		return ERR_PTR(-EINVAL);
	}

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		return stab;
	}

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	list_add_tail(&stab->list, &qdisc_stab_list);

	return stab;
}

static void stab_kfree_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct qdisc_size_table, rcu));
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
	}
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
		goto nla_put_failure;
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void __qdisc_calculate_pkt_len(struct sk_buff *skb,
			       const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
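
/* Worked example (illustrative): with overhead = 0, cell_align = 0,
 * cell_log = 6, size_log = 6 and tsize = 8, a 100-byte skb gives
 * slot = 100 >> 6 = 1, so pkt_len = stab->data[1] << 6. If data[]
 * holds {1, 2, 3, ...}, the packet is accounted as 128 bytes, i.e.
 * the size table lets userspace model per-cell link layers (such as
 * ATM) without a rate table.
 */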

void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
			txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	rcu_read_lock();
	__netif_schedule(qdisc_root(wd->qdisc));
	rcu_read_unlock();

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
				 clockid_t clockid)
{
	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init_clockid);

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
{
	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	if (wd->last_expires == expires)
		return;

	wd->last_expires = expires;
	hrtimer_start(&wd->timer,
		      ns_to_ktime(expires),
		      HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
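
/* Typical watchdog usage (sketch): a shaping qdisc arms the watchdog from
 * ->dequeue() when the head packet is not yet eligible to be sent, and
 * cancels it on reset/destroy. "q->watchdog" and "next_tx_ns" are
 * hypothetical fields; sch_tbf.c follows this pattern for real.
 *
 *	qdisc_watchdog_init(&q->watchdog, sch);		// in ->init()
 *	...
 *	if (next_tx_ns > now) {				// in ->dequeue()
 *		qdisc_watchdog_schedule_ns(&q->watchdog, next_tx_ns);
 *		return NULL;
 *	}
 *	...
 *	qdisc_watchdog_cancel(&q->watchdog);		// in ->reset()/->destroy()
 */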

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	struct hlist_head *h;
	unsigned int i;

	h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	kvfree(ohash);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (!clhash->hash)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
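
/* Usage pattern (sketch): a classful qdisc calls qdisc_class_hash_init()
 * from ->init(), embeds a struct Qdisc_class_common in each class, and
 * pairs every insert with a grow so the load factor stays below 0.75.
 * "q->clhash" and "cl->common" are hypothetical here; sch_htb.c is a
 * real user of this pattern.
 *
 *	cl->common.classid = classid;
 *	qdisc_class_hash_insert(&q->clhash, &cl->common);
 *	qdisc_class_hash_grow(sch, &q->clhash);
 */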

/* Allocate a unique handle from the space managed by the kernel.
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x8000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while (--i > 0);

	return 0;
}
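
/* A qdisc handle is a 32-bit major:minor pair: 0x80010000 prints as
 * "8001:" in tc(8) notation, and TC_H_MAJ()/TC_H_MIN() extract the two
 * halves. The loop above hands out majors in the 8000-ffff range,
 * wrapping around and skipping "ffff:" since that major collides with
 * TC_H_ROOT.
 */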

void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
			       unsigned int len)
{
	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	bool notify;
	int drops;

	if (n == 0 && len == 0)
		return;
	drops = max_t(int, n, 0);
	rcu_read_lock();
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			break;

		if (sch->flags & TCQ_F_NOPARENT)
			break;
		/* Notify parent qdisc only if child qdisc becomes empty.
		 *
		 * If child was empty even before update then backlog
		 * counter is screwed and we skip notification because
		 * parent class is already passive.
		 *
		 * If the original child was offloaded then it is allowed
		 * to be seen as empty, so the parent is notified anyway.
		 */
		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
						       !qdisc_is_offloaded);
		/* TODO: perform the search on a per txq basis */
		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON_ONCE(parentid != TC_H_ROOT);
			break;
		}
		cops = sch->ops->cl_ops;
		if (notify && cops->qlen_notify) {
			cl = cops->find(sch, parentid);
			cops->qlen_notify(sch, cl);
		}
		sch->q.qlen -= n;
		sch->qstats.backlog -= len;
		__qdisc_qstats_drop(sch, drops);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
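
/* Typical caller (sketch): a classful qdisc that drops a queued packet
 * propagates the qlen/backlog change up the tree so ancestor counters
 * and passive/active class state stay coherent. "child_qdisc" is a
 * hypothetical child of the calling qdisc:
 *
 *	unsigned int len = qdisc_pkt_len(skb);
 *	...drop skb from child_qdisc...
 *	qdisc_tree_reduce_backlog(child_qdisc, 1, len);
 */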

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;
	u32 block_index;
	__u32 qlen;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = refcount_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (q->ops->ingress_block_get) {
		block_index = q->ops->ingress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->egress_block_get) {
		block_index = q->ops->egress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
		goto nla_put_failure;
	qlen = qdisc_qlen_sum(q);

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (qdisc_is_percpu_stats(q)) {
		cpu_bstats = q->cpu_bstats;
		cpu_qstats = q->cpu_qstats;
	}

	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
				  &d, cpu_bstats, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
{
	if (q->flags & TCQ_F_BUILTIN)
		return true;
	if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
		return true;

	return false;
}

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old, false)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new, false)) {
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev)) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		if (new && new->ops->attach)
			goto skip;

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				qdisc_refcount_inc(new);

			if (!ingress)
				qdisc_destroy(old);
		}

skip:
		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			dev->qdisc = new ? : &noop_qdisc;

			if (new && new->ops->attach)
				new->ops->attach(new);
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		/* Only support running class lockless if parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) &&
		    parent && !(parent->flags & TCQ_F_NOLOCK))
			new->flags &= ~TCQ_F_NOLOCK;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->find(parent, classid);

			if (cl) {
				if (new && new->ops == &noqueue_qdisc_ops) {
					NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
					err = -EINVAL;
				} else {
					err = cops->graft(parent, cl, new, &old, extack);
				}
			} else {
				NL_SET_ERR_MSG(extack, "Specified class not found");
				err = -ENOENT;
			}
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}

static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
				   struct netlink_ext_ack *extack)
{
	u32 block_index;

	if (tca[TCA_INGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->ingress_block_set) {
			NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->ingress_block_set(sch, block_index);
	}
	if (tca[TCA_EGRESS_BLOCK]) {
		block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);

		if (!block_index) {
			NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
			return -EINVAL;
		}
		if (!sch->ops->egress_block_set) {
			NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
			return -EOPNOTSUPP;
		}
		sch->ops->egress_block_set(sch, block_index);
	}
	return 0;
}

/* lockdep annotation is needed for ingress; egress gets it only for name */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  struct Qdisc *p, u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load. So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request. We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		goto err_out;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* This exists to keep backward compatibility with a userspace
	 * loophole, which allowed userspace to get the IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init) and then forgetting to reinit tx_queue_len
	 * before again attaching a qdisc.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out5;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out4;
		}
		rcu_assign_pointer(sch->stab, stab);
	}
	if (tca[TCA_RATE]) {
		seqcount_t *running;

		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		if (sch->parent != TC_H_ROOT &&
		    !(sch->flags & TCQ_F_INGRESS) &&
		    (!p || !(p->flags & TCQ_F_MQROOT)))
			running = qdisc_root_sleeping_running(sch);
		else
			running = &sch->running;

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					running,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);

	return sch;

err_out5:
	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
	if (ops->destroy)
		ops->destroy(sch);
err_out3:
	dev_put(dev);
	qdisc_free(sch);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
			struct netlink_ext_ack *extack)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (!sch->ops->change) {
			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
			return -EINVAL;
		}
		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
			return -EOPNOTSUPP;
		}
		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
				      NULL,
				      qdisc_root_sleeping_running(sch),
				      tca[TCA_RATE]);
	}
out:
	return 0;
}

struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl,
			 struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
	[TCA_KIND]		= { .type = NLA_NUL_STRING,
				    .len = IFNAMSIZ - 1 },
	[TCA_RATE]		= { .type = NLA_BINARY,
				    .len = sizeof(struct tc_estimator) },
	[TCA_STAB]		= { .type = NLA_NESTED },
	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
	[TCA_CHAIN]		= { .type = NLA_U32 },
	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
};

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((n->nlmsg_type != RTM_GETQDISC) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
			  extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q) {
			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
			return -ENOENT;
		}

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Invalid handle");
			return -EINVAL;
		}
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q) {
			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
			return -ENOENT;
		}
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid) {
			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
			return -EINVAL;
		}
		if (q->handle == 0) {
			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
			return -ENOENT;
		}
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}

/*
 * Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
			  extack);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
					return -EEXIST;
				}
				if (TC_H_MIN(tcm->tcm_handle)) {
					NL_SET_ERR_MSG(extack, "Invalid minor handle");
					return -EINVAL;
				}
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL) {
					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
					return -EEXIST;
				}
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
					return -EINVAL;
				}
				if (q == p ||
				    (p && check_loop(q, p, 0))) {
					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
					return -ELOOP;
				}
				qdisc_refcount_inc(q);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 * We know that some child q is already
				 * attached to this parent and have a choice:
				 * either to change it or to create/graft a new one.
				 *
				 * 1. We are allowed to create/graft only
				 * if both CREATE and REPLACE flags are set.
				 *
				 * 2. If EXCL is set, the requestor wanted to say
				 * that qdisc tcm_handle is not expected
				 * to exist, so we choose create/graft too.
				 *
				 * 3. The last case is when no flags are set.
				 * Alas, it is a sort of hole in the API, we
				 * cannot decide what to do unambiguously.
				 * For now we select create/graft if the
				 * user gave a KIND which does not match the
				 * existing one.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
			return -EINVAL;
		}
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (!q) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		return -ENOENT;
	}
	if (n->nlmsg_flags & NLM_F_EXCL) {
		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
		return -EEXIST;
	}
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}
	err = qdisc_change(q, tca, extack);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
		return -ENOENT;
	}
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev)) {
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err, extack);
		} else {
			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
			err = -ENOENT;
		}
	} else {
		struct netdev_queue *dev_queue;

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err, extack);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx, bool recur,
			      bool dump_invisible)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
	 * itself has already been dumped.
	 *
	 * If we've already dumped the top-level (ingress) qdisc above and the global
	 * qdisc hashtable, we don't want to hit it again
	 */
	if (!qdisc_dev(root) || !recur)
		goto out;

	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	const struct nlmsghdr *nlh = cb->nlh;
	struct nlattr *tca[TCA_MAX + 1];
	int err;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	idx = 0;
	ASSERT_RTNL();

	err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
			  rtm_tca_policy, NULL);
	if (err < 0)
		return err;

	for_each_netdev(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx, false,
				       tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

cont:
		idx++;
	}

done:
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}


/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/

static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

static int tclass_del_notify(struct net *net,
			     const struct Qdisc_class_ops *cops,
			     struct sk_buff *oskb, struct nlmsghdr *n,
			     struct Qdisc *q, unsigned long cl)
{
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
	struct sk_buff *skb;
	int err = 0;

	if (!cops->delete)
		return -EOPNOTSUPP;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
			   RTM_DELTCLASS) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	err = cops->delete(q, cl);
	if (err) {
		kfree_skb(skb);
		return err;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}

#ifdef CONFIG_NET_CLS

struct tcf_bind_args {
	struct tcf_walker w;
	unsigned long base;
	unsigned long cl;
	u32 classid;
};

static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
{
	struct tcf_bind_args *a = (void *)arg;

	if (tp->ops->bind_class) {
		struct Qdisc *q = tcf_block_q(tp->chain->block);

		sch_tree_lock(q);
		tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
		sch_tree_unlock(q);
	}
	return 0;
}

static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tcf_block *block;
	struct tcf_chain *chain;
	unsigned long cl;

	cl = cops->find(q, portid);
	if (!cl)
		return;
	if (!cops->tcf_block)
		return;
	block = cops->tcf_block(q, cl, NULL);
	if (!block)
		return;
	list_for_each_entry(chain, &block->chain_list, list) {
		struct tcf_proto *tp;

		for (tp = rtnl_dereference(chain->filter_chain);
		     tp; tp = rtnl_dereference(tp->next)) {
			struct tcf_bind_args arg = {};

			arg.w.fn = tcf_node_bind;
			arg.classid = clid;
			arg.base = cl;
			arg.cl = new_cl;
			tp->ops->walk(tp, &arg.w);
		}
	}
}

#else

static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}

#endif

static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	if ((n->nlmsg_type != RTM_GETTCLASS) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
			  extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl);
			/* Unbind the class's filters by rebinding them to class 0 */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
		/* We just created a new class, need to do the reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}

struct qdisc_dump_args {
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback	*cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
			    struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
			      RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q, false) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	if (!qdisc_dev(root))
		return 0;

	if (tcm->tcm_parent) {
		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
		if (q && q != root &&
		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
		return 0;
	}
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC / hrtimer_resolution);

	return 0;
}
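
/* Example output (illustrative, assuming 1 ns hrtimer resolution):
 * "000003e8 00000040 000f4240 3b9aca00", i.e. 1000 ns per usec, 64 ns
 * per psched tick, the hardcoded 1000000, and 10^9 clock ticks per
 * second. Userspace tc reads these words to derive its time units.
 */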

static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_create_single("psched", 0, net->proc_net, psched_show);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
#else
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif

static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};

static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
		      0);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
		      0);

	return 0;
}

subsys_initcall(pktsched_init);