1 /*
2  * net/sched/sch_api.c	Packet scheduler API.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17 
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32 #include <linux/hashtable.h>
33 
34 #include <net/net_namespace.h>
35 #include <net/sock.h>
36 #include <net/netlink.h>
37 #include <net/pkt_sched.h>
38 
39 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
40 			struct nlmsghdr *n, u32 clid,
41 			struct Qdisc *old, struct Qdisc *new);
42 static int tclass_notify(struct net *net, struct sk_buff *oskb,
43 			 struct nlmsghdr *n, struct Qdisc *q,
44 			 unsigned long cl, int event);
45 
46 /*
47 
48    Short review.
49    -------------
50 
51    This file consists of two interrelated parts:
52 
53    1. queueing disciplines manager frontend.
54    2. traffic classes manager frontend.
55 
56    Generally, a queueing discipline ("qdisc") is a black box
57    that can enqueue packets and dequeue them (when the
58    device is ready to send something) in an order and at times
59    determined by the algorithm hidden inside it.
60 
61    qdiscs are divided into two categories:
62    - "queues", which have no internal structure visible from outside.
63    - "schedulers", which split all the packets to "traffic classes",
64      using "packet classifiers" (look at cls_api.c)
65 
66    In turn, classes may have child qdiscs (as a rule, queues)
67    attached to them, and so on.
68 
69    The goal of the routines in this file is to translate
70    the handle-based information supplied by the user into
71    a form the kernel can use, to perform sanity checks and
72    the parts of the work that are common to all qdiscs,
73    and to provide rtnetlink notifications.
74 
75    All real intelligent work is done inside qdisc modules.
76 
77 
78 
79    Every discipline has two major routines: enqueue and dequeue.
80 
81    ---dequeue
82 
83    dequeue usually returns an skb to send. It is allowed to return NULL,
84    but that does not mean the queue is empty; it just means the
85    discipline does not want to send anything right now.
86    The queue is really empty only if q->q.qlen == 0.
87    For complicated disciplines with multiple queues, q->q is not the
88    real packet queue, but q->q.qlen must nevertheless be valid.
89 
90    ---enqueue
91 
92    enqueue returns 0 if the packet was enqueued successfully.
93    If a packet (this one or another one) was dropped, it returns
94    a non-zero error code.
95    NET_XMIT_DROP 	- this packet was dropped
96      Expected action: do not back off, but wait until the queue clears.
97    NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
98      Expected action: back off or ignore
99 
100    Auxiliary routines:
101 
102    ---peek
103 
104    like dequeue but without removing a packet from the queue
105 
106    ---reset
107 
108    returns the qdisc to its initial state: purges all buffers, clears all
109    timers and counters (except statistics), etc.
110 
111    ---init
112 
113    initializes a newly created qdisc.
114 
115    ---destroy
116 
117    destroys resources allocated by init and during the lifetime of the qdisc.
118 
119    ---change
120 
121    changes qdisc parameters.
122  */
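/* To make the enqueue/dequeue contract above concrete, here is a minimal
 * sketch of a work-conserving FIFO qdisc as a module would define it.
 * "sketch_fifo" is hypothetical and not part of this file; it only
 * illustrates the ops that register_qdisc() below expects.
 *
 *	static int sketch_fifo_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 *				       struct sk_buff **to_free)
 *	{
 *		if (likely(sch->q.qlen < qdisc_dev(sch)->tx_queue_len))
 *			return qdisc_enqueue_tail(skb, sch);
 *		return qdisc_drop(skb, sch, to_free);
 *	}
 *
 *	static struct sk_buff *sketch_fifo_dequeue(struct Qdisc *sch)
 *	{
 *		return qdisc_dequeue_head(sch);
 *	}
 *
 *	static struct Qdisc_ops sketch_fifo_ops __read_mostly = {
 *		.id		= "sketch_fifo",
 *		.enqueue	= sketch_fifo_enqueue,
 *		.dequeue	= sketch_fifo_dequeue,
 *		.peek		= qdisc_peek_head,
 *		.owner		= THIS_MODULE,
 *	};
 *
 * Note that .peek is supplied: register_qdisc() rejects an ops that has a
 * dequeue but no peek.
 */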
123 
124 /* Protects the list of registered TC modules. It is a pure SMP lock. */
125 static DEFINE_RWLOCK(qdisc_mod_lock);
126 
127 
128 /************************************************
129  *	Queueing disciplines manipulation.	*
130  ************************************************/
131 
132 
133 /* The list of all installed queueing disciplines. */
134 
135 static struct Qdisc_ops *qdisc_base;
136 
137 /* Register/unregister queueing discipline */
138 
139 int register_qdisc(struct Qdisc_ops *qops)
140 {
141 	struct Qdisc_ops *q, **qp;
142 	int rc = -EEXIST;
143 
144 	write_lock(&qdisc_mod_lock);
145 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
146 		if (!strcmp(qops->id, q->id))
147 			goto out;
148 
149 	if (qops->enqueue == NULL)
150 		qops->enqueue = noop_qdisc_ops.enqueue;
151 	if (qops->peek == NULL) {
152 		if (qops->dequeue == NULL)
153 			qops->peek = noop_qdisc_ops.peek;
154 		else
155 			goto out_einval;
156 	}
157 	if (qops->dequeue == NULL)
158 		qops->dequeue = noop_qdisc_ops.dequeue;
159 
160 	if (qops->cl_ops) {
161 		const struct Qdisc_class_ops *cops = qops->cl_ops;
162 
163 		if (!(cops->get && cops->put && cops->walk && cops->leaf))
164 			goto out_einval;
165 
166 		if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
167 			goto out_einval;
168 	}
169 
170 	qops->next = NULL;
171 	*qp = qops;
172 	rc = 0;
173 out:
174 	write_unlock(&qdisc_mod_lock);
175 	return rc;
176 
177 out_einval:
178 	rc = -EINVAL;
179 	goto out;
180 }
181 EXPORT_SYMBOL(register_qdisc);
182 
183 int unregister_qdisc(struct Qdisc_ops *qops)
184 {
185 	struct Qdisc_ops *q, **qp;
186 	int err = -ENOENT;
187 
188 	write_lock(&qdisc_mod_lock);
189 	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
190 		if (q == qops)
191 			break;
192 	if (q) {
193 		*qp = q->next;
194 		q->next = NULL;
195 		err = 0;
196 	}
197 	write_unlock(&qdisc_mod_lock);
198 	return err;
199 }
200 EXPORT_SYMBOL(unregister_qdisc);
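/* Typical usage from a module's init/exit hooks (a sketch, reusing the
 * hypothetical sketch_fifo_ops above):
 *
 *	static int __init sketch_fifo_init(void)
 *	{
 *		return register_qdisc(&sketch_fifo_ops);
 *	}
 *
 *	static void __exit sketch_fifo_exit(void)
 *	{
 *		unregister_qdisc(&sketch_fifo_ops);
 *	}
 *
 *	module_init(sketch_fifo_init);
 *	module_exit(sketch_fifo_exit);
 */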
201 
202 /* Get default qdisc if not otherwise specified */
203 void qdisc_get_default(char *name, size_t len)
204 {
205 	read_lock(&qdisc_mod_lock);
206 	strlcpy(name, default_qdisc_ops->id, len);
207 	read_unlock(&qdisc_mod_lock);
208 }
209 
210 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
211 {
212 	struct Qdisc_ops *q = NULL;
213 
214 	for (q = qdisc_base; q; q = q->next) {
215 		if (!strcmp(name, q->id)) {
216 			if (!try_module_get(q->owner))
217 				q = NULL;
218 			break;
219 		}
220 	}
221 
222 	return q;
223 }
224 
225 /* Set new default qdisc to use */
226 int qdisc_set_default(const char *name)
227 {
228 	const struct Qdisc_ops *ops;
229 
230 	if (!capable(CAP_NET_ADMIN))
231 		return -EPERM;
232 
233 	write_lock(&qdisc_mod_lock);
234 	ops = qdisc_lookup_default(name);
235 	if (!ops) {
236 		/* Not found, drop lock and try to load module */
237 		write_unlock(&qdisc_mod_lock);
238 		request_module("sch_%s", name);
239 		write_lock(&qdisc_mod_lock);
240 
241 		ops = qdisc_lookup_default(name);
242 	}
243 
244 	if (ops) {
245 		/* Set new default */
246 		module_put(default_qdisc_ops->owner);
247 		default_qdisc_ops = ops;
248 	}
249 	write_unlock(&qdisc_mod_lock);
250 
251 	return ops ? 0 : -ENOENT;
252 }
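/* This path is reached via the net.core.default_qdisc sysctl, e.g.
 * "sysctl -w net.core.default_qdisc=fq_codel"; queues created afterwards
 * then use that qdisc instead of the built-in default (pfifo_fast).
 */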
253 
254 /* We know the handle. Find the qdisc among all qdiscs attached to the device
255  * (the root qdisc, all its children, children of children, etc.)
256  * Note: the caller either holds rtnl or rcu_read_lock()
257  */
258 
259 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
260 {
261 	struct Qdisc *q;
262 
263 	if (!qdisc_dev(root))
264 		return (root->handle == handle ? root : NULL);
265 
266 	if (!(root->flags & TCQ_F_BUILTIN) &&
267 	    root->handle == handle)
268 		return root;
269 
270 	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
271 		if (q->handle == handle)
272 			return q;
273 	}
274 	return NULL;
275 }
276 
277 void qdisc_hash_add(struct Qdisc *q)
278 {
279 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
280 		ASSERT_RTNL();
281 		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
282 	}
283 }
284 EXPORT_SYMBOL(qdisc_hash_add);
285 
286 void qdisc_hash_del(struct Qdisc *q)
287 {
288 	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
289 		ASSERT_RTNL();
290 		hash_del_rcu(&q->hash);
291 	}
292 }
293 EXPORT_SYMBOL(qdisc_hash_del);
294 
295 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
296 {
297 	struct Qdisc *q;
298 
299 	if (!handle)
300 		return NULL;
301 	q = qdisc_match_from_root(dev->qdisc, handle);
302 	if (q)
303 		goto out;
304 
305 	if (dev_ingress_queue(dev))
306 		q = qdisc_match_from_root(
307 			dev_ingress_queue(dev)->qdisc_sleeping,
308 			handle);
309 out:
310 	return q;
311 }
312 
313 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
314 {
315 	unsigned long cl;
316 	struct Qdisc *leaf;
317 	const struct Qdisc_class_ops *cops = p->ops->cl_ops;
318 
319 	if (cops == NULL)
320 		return NULL;
321 	cl = cops->get(p, classid);
322 
323 	if (cl == 0)
324 		return NULL;
325 	leaf = cops->leaf(p, cl);
326 	cops->put(p, cl);
327 	return leaf;
328 }
329 
330 /* Find queueing discipline by name */
331 
332 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
333 {
334 	struct Qdisc_ops *q = NULL;
335 
336 	if (kind) {
337 		read_lock(&qdisc_mod_lock);
338 		for (q = qdisc_base; q; q = q->next) {
339 			if (nla_strcmp(kind, q->id) == 0) {
340 				if (!try_module_get(q->owner))
341 					q = NULL;
342 				break;
343 			}
344 		}
345 		read_unlock(&qdisc_mod_lock);
346 	}
347 	return q;
348 }
349 
350 /* The linklayer setting was not transferred from older iproute2
351  * versions, and the rate table lookup system has been dropped from
352  * the kernel. To stay backward compatible with older iproute2 tc
353  * utils, we detect the linklayer setting by checking whether the rate
354  * table was modified.
355  *
356  * For linklayer ATM, the rate table entries are aligned to 48-byte
357  * cells, so some table entries will contain the same value.  The
358  * mpu (min packet unit) is also encoded into the old rate table, so
359  * starting from the mpu we find the low and high table entries
360  * mapping this cell.  If these entries contain the same value, then
361  * the rate table has been modified for linklayer ATM.
362  *
363  * This is done by rounding the mpu up to the nearest 48-byte cell/entry,
364  * then rounding up to the next cell, computing the table entry one below,
365  * and comparing the two.
366  */
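/* Worked example with hypothetical numbers: for mpu = 0 and cell_log = 3,
 * low = roundup(0, 48) = 0 and high = roundup(1, 48) = 48, so
 * cell_low = 0 and cell_high = (48 >> 3) - 1 = 5.  On an ATM-aligned
 * table, bytes 0..47 all fit in one 48-byte cell and rtab[0] == rtab[5],
 * so TC_LINKLAYER_ATM is detected; on a plain Ethernet table the entries
 * differ and TC_LINKLAYER_ETHERNET is returned.
 */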
367 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
368 {
369 	int low       = roundup(r->mpu, 48);
370 	int high      = roundup(low+1, 48);
371 	int cell_low  = low >> r->cell_log;
372 	int cell_high = (high >> r->cell_log) - 1;
373 
374 	/* rtab is too inaccurate at rates > 100Mbit/s */
375 	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
376 		pr_debug("TC linklayer: Giving up ATM detection\n");
377 		return TC_LINKLAYER_ETHERNET;
378 	}
379 
380 	if ((cell_high > cell_low) && (cell_high < 256)
381 	    && (rtab[cell_low] == rtab[cell_high])) {
382 		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
383 			 cell_low, cell_high, rtab[cell_high]);
384 		return TC_LINKLAYER_ATM;
385 	}
386 	return TC_LINKLAYER_ETHERNET;
387 }
388 
389 static struct qdisc_rate_table *qdisc_rtab_list;
390 
391 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
392 					struct nlattr *tab)
393 {
394 	struct qdisc_rate_table *rtab;
395 
396 	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
397 	    nla_len(tab) != TC_RTAB_SIZE)
398 		return NULL;
399 
400 	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
401 		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
402 		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
403 			rtab->refcnt++;
404 			return rtab;
405 		}
406 	}
407 
408 	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
409 	if (rtab) {
410 		rtab->rate = *r;
411 		rtab->refcnt = 1;
412 		memcpy(rtab->data, nla_data(tab), 1024);
413 		if (r->linklayer == TC_LINKLAYER_UNAWARE)
414 			r->linklayer = __detect_linklayer(r, rtab->data);
415 		rtab->next = qdisc_rtab_list;
416 		qdisc_rtab_list = rtab;
417 	}
418 	return rtab;
419 }
420 EXPORT_SYMBOL(qdisc_get_rtab);
421 
422 void qdisc_put_rtab(struct qdisc_rate_table *tab)
423 {
424 	struct qdisc_rate_table *rtab, **rtabp;
425 
426 	if (!tab || --tab->refcnt)
427 		return;
428 
429 	for (rtabp = &qdisc_rtab_list;
430 	     (rtab = *rtabp) != NULL;
431 	     rtabp = &rtab->next) {
432 		if (rtab == tab) {
433 			*rtabp = rtab->next;
434 			kfree(rtab);
435 			return;
436 		}
437 	}
438 }
439 EXPORT_SYMBOL(qdisc_put_rtab);
440 
441 static LIST_HEAD(qdisc_stab_list);
442 static DEFINE_SPINLOCK(qdisc_stab_lock);
443 
444 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
445 	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
446 	[TCA_STAB_DATA] = { .type = NLA_BINARY },
447 };
448 
449 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
450 {
451 	struct nlattr *tb[TCA_STAB_MAX + 1];
452 	struct qdisc_size_table *stab;
453 	struct tc_sizespec *s;
454 	unsigned int tsize = 0;
455 	u16 *tab = NULL;
456 	int err;
457 
458 	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
459 	if (err < 0)
460 		return ERR_PTR(err);
461 	if (!tb[TCA_STAB_BASE])
462 		return ERR_PTR(-EINVAL);
463 
464 	s = nla_data(tb[TCA_STAB_BASE]);
465 
466 	if (s->tsize > 0) {
467 		if (!tb[TCA_STAB_DATA])
468 			return ERR_PTR(-EINVAL);
469 		tab = nla_data(tb[TCA_STAB_DATA]);
470 		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
471 	}
472 
473 	if (tsize != s->tsize || (!tab && tsize > 0))
474 		return ERR_PTR(-EINVAL);
475 
476 	spin_lock(&qdisc_stab_lock);
477 
478 	list_for_each_entry(stab, &qdisc_stab_list, list) {
479 		if (memcmp(&stab->szopts, s, sizeof(*s)))
480 			continue;
481 		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
482 			continue;
483 		stab->refcnt++;
484 		spin_unlock(&qdisc_stab_lock);
485 		return stab;
486 	}
487 
488 	spin_unlock(&qdisc_stab_lock);
489 
490 	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
491 	if (!stab)
492 		return ERR_PTR(-ENOMEM);
493 
494 	stab->refcnt = 1;
495 	stab->szopts = *s;
496 	if (tsize > 0)
497 		memcpy(stab->data, tab, tsize * sizeof(u16));
498 
499 	spin_lock(&qdisc_stab_lock);
500 	list_add_tail(&stab->list, &qdisc_stab_list);
501 	spin_unlock(&qdisc_stab_lock);
502 
503 	return stab;
504 }
505 
506 static void stab_kfree_rcu(struct rcu_head *head)
507 {
508 	kfree(container_of(head, struct qdisc_size_table, rcu));
509 }
510 
511 void qdisc_put_stab(struct qdisc_size_table *tab)
512 {
513 	if (!tab)
514 		return;
515 
516 	spin_lock(&qdisc_stab_lock);
517 
518 	if (--tab->refcnt == 0) {
519 		list_del(&tab->list);
520 		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
521 	}
522 
523 	spin_unlock(&qdisc_stab_lock);
524 }
525 EXPORT_SYMBOL(qdisc_put_stab);
526 
527 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
528 {
529 	struct nlattr *nest;
530 
531 	nest = nla_nest_start(skb, TCA_STAB);
532 	if (nest == NULL)
533 		goto nla_put_failure;
534 	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
535 		goto nla_put_failure;
536 	nla_nest_end(skb, nest);
537 
538 	return skb->len;
539 
540 nla_put_failure:
541 	return -1;
542 }
543 
544 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
545 			       const struct qdisc_size_table *stab)
546 {
547 	int pkt_len, slot;
548 
549 	pkt_len = skb->len + stab->szopts.overhead;
550 	if (unlikely(!stab->szopts.tsize))
551 		goto out;
552 
553 	slot = pkt_len + stab->szopts.cell_align;
554 	if (unlikely(slot < 0))
555 		slot = 0;
556 
557 	slot >>= stab->szopts.cell_log;
558 	if (likely(slot < stab->szopts.tsize))
559 		pkt_len = stab->data[slot];
560 	else
561 		pkt_len = stab->data[stab->szopts.tsize - 1] *
562 				(slot / stab->szopts.tsize) +
563 				stab->data[slot % stab->szopts.tsize];
564 
565 	pkt_len <<= stab->szopts.size_log;
566 out:
567 	if (unlikely(pkt_len < 1))
568 		pkt_len = 1;
569 	qdisc_skb_cb(skb)->pkt_len = pkt_len;
570 }
571 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
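/* Worked example with hypothetical size-table options: with overhead = 0,
 * cell_align = 0, cell_log = 6, size_log = 6 and tsize = 512, a 1000-byte
 * skb selects slot 1000 >> 6 = 15 and gets pkt_len = stab->data[15] << 6;
 * lengths beyond the table are extrapolated from the last entry as above.
 */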
572 
573 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
574 {
575 	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
576 		pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
577 			txt, qdisc->ops->id, qdisc->handle >> 16);
578 		qdisc->flags |= TCQ_F_WARN_NONWC;
579 	}
580 }
581 EXPORT_SYMBOL(qdisc_warn_nonwc);
582 
583 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
584 {
585 	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
586 						 timer);
587 
588 	rcu_read_lock();
589 	__netif_schedule(qdisc_root(wd->qdisc));
590 	rcu_read_unlock();
591 
592 	return HRTIMER_NORESTART;
593 }
594 
595 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
596 {
597 	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
598 	wd->timer.function = qdisc_watchdog;
599 	wd->qdisc = qdisc;
600 }
601 EXPORT_SYMBOL(qdisc_watchdog_init);
602 
603 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
604 {
605 	if (test_bit(__QDISC_STATE_DEACTIVATED,
606 		     &qdisc_root_sleeping(wd->qdisc)->state))
607 		return;
608 
609 	if (wd->last_expires == expires)
610 		return;
611 
612 	wd->last_expires = expires;
613 	hrtimer_start(&wd->timer,
614 		      ns_to_ktime(expires),
615 		      HRTIMER_MODE_ABS_PINNED);
616 }
617 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
618 
619 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
620 {
621 	hrtimer_cancel(&wd->timer);
622 }
623 EXPORT_SYMBOL(qdisc_watchdog_cancel);
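/* Typical use by a shaping qdisc's dequeue (a sketch; q->watchdog and
 * q->t_next_send are hypothetical fields of the qdisc's private data):
 *
 *	if (now < q->t_next_send) {
 *		qdisc_watchdog_schedule_ns(&q->watchdog, q->t_next_send);
 *		return NULL;	(nothing to send yet, see ---dequeue above)
 *	}
 *
 * The timer fires qdisc_watchdog(), which reschedules the root qdisc so
 * that dequeue runs again at the requested time.
 */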
624 
625 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
626 {
627 	unsigned int size = n * sizeof(struct hlist_head), i;
628 	struct hlist_head *h;
629 
630 	if (size <= PAGE_SIZE)
631 		h = kmalloc(size, GFP_KERNEL);
632 	else
633 		h = (struct hlist_head *)
634 			__get_free_pages(GFP_KERNEL, get_order(size));
635 
636 	if (h != NULL) {
637 		for (i = 0; i < n; i++)
638 			INIT_HLIST_HEAD(&h[i]);
639 	}
640 	return h;
641 }
642 
643 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
644 {
645 	unsigned int size = n * sizeof(struct hlist_head);
646 
647 	if (size <= PAGE_SIZE)
648 		kfree(h);
649 	else
650 		free_pages((unsigned long)h, get_order(size));
651 }
652 
653 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
654 {
655 	struct Qdisc_class_common *cl;
656 	struct hlist_node *next;
657 	struct hlist_head *nhash, *ohash;
658 	unsigned int nsize, nmask, osize;
659 	unsigned int i, h;
660 
661 	/* Rehash when load factor exceeds 0.75 */
662 	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
663 		return;
664 	nsize = clhash->hashsize * 2;
665 	nmask = nsize - 1;
666 	nhash = qdisc_class_hash_alloc(nsize);
667 	if (nhash == NULL)
668 		return;
669 
670 	ohash = clhash->hash;
671 	osize = clhash->hashsize;
672 
673 	sch_tree_lock(sch);
674 	for (i = 0; i < osize; i++) {
675 		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
676 			h = qdisc_class_hash(cl->classid, nmask);
677 			hlist_add_head(&cl->hnode, &nhash[h]);
678 		}
679 	}
680 	clhash->hash     = nhash;
681 	clhash->hashsize = nsize;
682 	clhash->hashmask = nmask;
683 	sch_tree_unlock(sch);
684 
685 	qdisc_class_hash_free(ohash, osize);
686 }
687 EXPORT_SYMBOL(qdisc_class_hash_grow);
688 
689 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
690 {
691 	unsigned int size = 4;
692 
693 	clhash->hash = qdisc_class_hash_alloc(size);
694 	if (clhash->hash == NULL)
695 		return -ENOMEM;
696 	clhash->hashsize  = size;
697 	clhash->hashmask  = size - 1;
698 	clhash->hashelems = 0;
699 	return 0;
700 }
701 EXPORT_SYMBOL(qdisc_class_hash_init);
702 
703 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
704 {
705 	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
706 }
707 EXPORT_SYMBOL(qdisc_class_hash_destroy);
708 
709 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
710 			     struct Qdisc_class_common *cl)
711 {
712 	unsigned int h;
713 
714 	INIT_HLIST_NODE(&cl->hnode);
715 	h = qdisc_class_hash(cl->classid, clhash->hashmask);
716 	hlist_add_head(&cl->hnode, &clhash->hash[h]);
717 	clhash->hashelems++;
718 }
719 EXPORT_SYMBOL(qdisc_class_hash_insert);
720 
721 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
722 			     struct Qdisc_class_common *cl)
723 {
724 	hlist_del(&cl->hnode);
725 	clhash->hashelems--;
726 }
727 EXPORT_SYMBOL(qdisc_class_hash_remove);
728 
729 /* Allocate a unique handle from the space managed by the kernel.
730  * Possible range is [8000-FFFF]:0000 (0x8000 values)
731  */
732 static u32 qdisc_alloc_handle(struct net_device *dev)
733 {
734 	int i = 0x8000;
735 	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
736 
737 	do {
738 		autohandle += TC_H_MAKE(0x10000U, 0);
739 		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
740 			autohandle = TC_H_MAKE(0x80000000U, 0);
741 		if (!qdisc_lookup(dev, autohandle))
742 			return autohandle;
743 		cond_resched();
744 	} while	(--i > 0);
745 
746 	return 0;
747 }
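/* Example: the first automatic allocation after boot yields handle 8001:,
 * the next 8002:, and so on; the counter is global, and a handle already
 * in use on the device is simply skipped.
 */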
748 
749 void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
750 			       unsigned int len)
751 {
752 	const struct Qdisc_class_ops *cops;
753 	unsigned long cl;
754 	u32 parentid;
755 	int drops;
756 
757 	if (n == 0 && len == 0)
758 		return;
759 	drops = max_t(int, n, 0);
760 	rcu_read_lock();
761 	while ((parentid = sch->parent)) {
762 		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
763 			break;
764 
765 		if (sch->flags & TCQ_F_NOPARENT)
766 			break;
767 		/* TODO: perform the search on a per txq basis */
768 		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
769 		if (sch == NULL) {
770 			WARN_ON_ONCE(parentid != TC_H_ROOT);
771 			break;
772 		}
773 		cops = sch->ops->cl_ops;
774 		if (cops->qlen_notify) {
775 			cl = cops->get(sch, parentid);
776 			cops->qlen_notify(sch, cl);
777 			cops->put(sch, cl);
778 		}
779 		sch->q.qlen -= n;
780 		sch->qstats.backlog -= len;
781 		__qdisc_qstats_drop(sch, drops);
782 	}
783 	rcu_read_unlock();
784 }
785 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
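/* For instance, when a qdisc drops packets from inside its own dequeue
 * (as codel-style qdiscs do), it calls
 * qdisc_tree_reduce_backlog(sch, dropped, dropped_bytes) so that the
 * qlen and backlog counters of all ancestor qdiscs stay consistent.
 */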
786 
787 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
788 			       struct nlmsghdr *n, u32 clid,
789 			       struct Qdisc *old, struct Qdisc *new)
790 {
791 	if (new || old)
792 		qdisc_notify(net, skb, n, clid, old, new);
793 
794 	if (old)
795 		qdisc_destroy(old);
796 }
797 
798 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
799  * to device "dev".
800  *
801  * When appropriate, send a netlink notification using "skb"
802  * and "n".
803  *
804  * On success, destroy the old qdisc.
805  */
806 
807 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
808 		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
809 		       struct Qdisc *new, struct Qdisc *old)
810 {
811 	struct Qdisc *q = old;
812 	struct net *net = dev_net(dev);
813 	int err = 0;
814 
815 	if (parent == NULL) {
816 		unsigned int i, num_q, ingress;
817 
818 		ingress = 0;
819 		num_q = dev->num_tx_queues;
820 		if ((q && q->flags & TCQ_F_INGRESS) ||
821 		    (new && new->flags & TCQ_F_INGRESS)) {
822 			num_q = 1;
823 			ingress = 1;
824 			if (!dev_ingress_queue(dev))
825 				return -ENOENT;
826 		}
827 
828 		if (dev->flags & IFF_UP)
829 			dev_deactivate(dev);
830 
831 		if (new && new->ops->attach)
832 			goto skip;
833 
834 		for (i = 0; i < num_q; i++) {
835 			struct netdev_queue *dev_queue = dev_ingress_queue(dev);
836 
837 			if (!ingress)
838 				dev_queue = netdev_get_tx_queue(dev, i);
839 
840 			old = dev_graft_qdisc(dev_queue, new);
841 			if (new && i > 0)
842 				atomic_inc(&new->refcnt);
843 
844 			if (!ingress)
845 				qdisc_destroy(old);
846 		}
847 
848 skip:
849 		if (!ingress) {
850 			notify_and_destroy(net, skb, n, classid,
851 					   dev->qdisc, new);
852 			if (new && !new->ops->attach)
853 				atomic_inc(&new->refcnt);
854 			dev->qdisc = new ? : &noop_qdisc;
855 
856 			if (new && new->ops->attach)
857 				new->ops->attach(new);
858 		} else {
859 			notify_and_destroy(net, skb, n, classid, old, new);
860 		}
861 
862 		if (dev->flags & IFF_UP)
863 			dev_activate(dev);
864 	} else {
865 		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
866 
867 		err = -EOPNOTSUPP;
868 		if (cops && cops->graft) {
869 			unsigned long cl = cops->get(parent, classid);
870 			if (cl) {
871 				err = cops->graft(parent, cl, new, &old);
872 				cops->put(parent, cl);
873 			} else
874 				err = -ENOENT;
875 		}
876 		if (!err)
877 			notify_and_destroy(net, skb, n, classid, old, new);
878 	}
879 	return err;
880 }
881 
882 /* lockdep annotation is needed for ingress; egress gets it only for name */
883 static struct lock_class_key qdisc_tx_lock;
884 static struct lock_class_key qdisc_rx_lock;
885 
886 /*
887    Allocate and initialize a new qdisc.
888 
889    Parameters are passed via tca.
890  */
891 
892 static struct Qdisc *qdisc_create(struct net_device *dev,
893 				  struct netdev_queue *dev_queue,
894 				  struct Qdisc *p, u32 parent, u32 handle,
895 				  struct nlattr **tca, int *errp)
896 {
897 	int err;
898 	struct nlattr *kind = tca[TCA_KIND];
899 	struct Qdisc *sch;
900 	struct Qdisc_ops *ops;
901 	struct qdisc_size_table *stab;
902 
903 	ops = qdisc_lookup_ops(kind);
904 #ifdef CONFIG_MODULES
905 	if (ops == NULL && kind != NULL) {
906 		char name[IFNAMSIZ];
907 		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
908 			/* We dropped the RTNL semaphore in order to
909 			 * perform the module load.  So, even if we
910 			 * succeeded in loading the module we have to
911 			 * tell the caller to replay the request.  We
912 			 * indicate this using -EAGAIN.
913 			 * We replay the request because the device may
914 			 * go away in the mean time.
915 			 */
916 			rtnl_unlock();
917 			request_module("sch_%s", name);
918 			rtnl_lock();
919 			ops = qdisc_lookup_ops(kind);
920 			if (ops != NULL) {
921 				/* We will try again qdisc_lookup_ops,
922 				 * so don't keep a reference.
923 				 */
924 				module_put(ops->owner);
925 				err = -EAGAIN;
926 				goto err_out;
927 			}
928 		}
929 	}
930 #endif
931 
932 	err = -ENOENT;
933 	if (ops == NULL)
934 		goto err_out;
935 
936 	sch = qdisc_alloc(dev_queue, ops);
937 	if (IS_ERR(sch)) {
938 		err = PTR_ERR(sch);
939 		goto err_out2;
940 	}
941 
942 	sch->parent = parent;
943 
944 	if (handle == TC_H_INGRESS) {
945 		sch->flags |= TCQ_F_INGRESS;
946 		handle = TC_H_MAKE(TC_H_INGRESS, 0);
947 		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
948 	} else {
949 		if (handle == 0) {
950 			handle = qdisc_alloc_handle(dev);
951 			err = -ENOMEM;
952 			if (handle == 0)
953 				goto err_out3;
954 		}
955 		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
956 		if (!netif_is_multiqueue(dev))
957 			sch->flags |= TCQ_F_ONETXQUEUE;
958 	}
959 
960 	sch->handle = handle;
961 
962 	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
963 		if (qdisc_is_percpu_stats(sch)) {
964 			sch->cpu_bstats =
965 				netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
966 			if (!sch->cpu_bstats)
967 				goto err_out4;
968 
969 			sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
970 			if (!sch->cpu_qstats)
971 				goto err_out4;
972 		}
973 
974 		if (tca[TCA_STAB]) {
975 			stab = qdisc_get_stab(tca[TCA_STAB]);
976 			if (IS_ERR(stab)) {
977 				err = PTR_ERR(stab);
978 				goto err_out4;
979 			}
980 			rcu_assign_pointer(sch->stab, stab);
981 		}
982 		if (tca[TCA_RATE]) {
983 			seqcount_t *running;
984 
985 			err = -EOPNOTSUPP;
986 			if (sch->flags & TCQ_F_MQROOT)
987 				goto err_out4;
988 
989 			if ((sch->parent != TC_H_ROOT) &&
990 			    !(sch->flags & TCQ_F_INGRESS) &&
991 			    (!p || !(p->flags & TCQ_F_MQROOT)))
992 				running = qdisc_root_sleeping_running(sch);
993 			else
994 				running = &sch->running;
995 
996 			err = gen_new_estimator(&sch->bstats,
997 						sch->cpu_bstats,
998 						&sch->rate_est,
999 						NULL,
1000 						running,
1001 						tca[TCA_RATE]);
1002 			if (err)
1003 				goto err_out4;
1004 		}
1005 
1006 		qdisc_hash_add(sch);
1007 
1008 		return sch;
1009 	}
1010 	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1011 	if (ops->destroy)
1012 		ops->destroy(sch);
1013 err_out3:
1014 	dev_put(dev);
1015 	kfree((char *) sch - sch->padded);
1016 err_out2:
1017 	module_put(ops->owner);
1018 err_out:
1019 	*errp = err;
1020 	return NULL;
1021 
1022 err_out4:
1023 	free_percpu(sch->cpu_bstats);
1024 	free_percpu(sch->cpu_qstats);
1025 	/*
1026 	 * Any broken qdiscs that would require a ops->reset() here?
1027 	 * The qdisc was never in action so it shouldn't be necessary.
1028 	 */
1029 	qdisc_put_stab(rtnl_dereference(sch->stab));
1030 	if (ops->destroy)
1031 		ops->destroy(sch);
1032 	goto err_out3;
1033 }
1034 
1035 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
1036 {
1037 	struct qdisc_size_table *ostab, *stab = NULL;
1038 	int err = 0;
1039 
1040 	if (tca[TCA_OPTIONS]) {
1041 		if (sch->ops->change == NULL)
1042 			return -EINVAL;
1043 		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
1044 		if (err)
1045 			return err;
1046 	}
1047 
1048 	if (tca[TCA_STAB]) {
1049 		stab = qdisc_get_stab(tca[TCA_STAB]);
1050 		if (IS_ERR(stab))
1051 			return PTR_ERR(stab);
1052 	}
1053 
1054 	ostab = rtnl_dereference(sch->stab);
1055 	rcu_assign_pointer(sch->stab, stab);
1056 	qdisc_put_stab(ostab);
1057 
1058 	if (tca[TCA_RATE]) {
1059 		/* NB: ignores errors from replace_estimator
1060 		   because change can't be undone. */
1061 		if (sch->flags & TCQ_F_MQROOT)
1062 			goto out;
1063 		gen_replace_estimator(&sch->bstats,
1064 				      sch->cpu_bstats,
1065 				      &sch->rate_est,
1066 				      NULL,
1067 				      qdisc_root_sleeping_running(sch),
1068 				      tca[TCA_RATE]);
1069 	}
1070 out:
1071 	return 0;
1072 }
1073 
1074 struct check_loop_arg {
1075 	struct qdisc_walker	w;
1076 	struct Qdisc		*p;
1077 	int			depth;
1078 };
1079 
1080 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1081 			 struct qdisc_walker *w);
1082 
1083 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1084 {
1085 	struct check_loop_arg	arg;
1086 
1087 	if (q->ops->cl_ops == NULL)
1088 		return 0;
1089 
1090 	arg.w.stop = arg.w.skip = arg.w.count = 0;
1091 	arg.w.fn = check_loop_fn;
1092 	arg.depth = depth;
1093 	arg.p = p;
1094 	q->ops->cl_ops->walk(q, &arg.w);
1095 	return arg.w.stop ? -ELOOP : 0;
1096 }
1097 
1098 static int
1099 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1100 {
1101 	struct Qdisc *leaf;
1102 	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1103 	struct check_loop_arg *arg = (struct check_loop_arg *)w;
1104 
1105 	leaf = cops->leaf(q, cl);
1106 	if (leaf) {
1107 		if (leaf == arg->p || arg->depth > 7)
1108 			return -ELOOP;
1109 		return check_loop(leaf, arg->p, arg->depth + 1);
1110 	}
1111 	return 0;
1112 }
1113 
1114 /*
1115  * Delete/get qdisc.
1116  */
1117 
1118 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
1119 {
1120 	struct net *net = sock_net(skb->sk);
1121 	struct tcmsg *tcm = nlmsg_data(n);
1122 	struct nlattr *tca[TCA_MAX + 1];
1123 	struct net_device *dev;
1124 	u32 clid;
1125 	struct Qdisc *q = NULL;
1126 	struct Qdisc *p = NULL;
1127 	int err;
1128 
1129 	if ((n->nlmsg_type != RTM_GETQDISC) &&
1130 	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1131 		return -EPERM;
1132 
1133 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1134 	if (err < 0)
1135 		return err;
1136 
1137 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1138 	if (!dev)
1139 		return -ENODEV;
1140 
1141 	clid = tcm->tcm_parent;
1142 	if (clid) {
1143 		if (clid != TC_H_ROOT) {
1144 			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1145 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1146 				if (!p)
1147 					return -ENOENT;
1148 				q = qdisc_leaf(p, clid);
1149 			} else if (dev_ingress_queue(dev)) {
1150 				q = dev_ingress_queue(dev)->qdisc_sleeping;
1151 			}
1152 		} else {
1153 			q = dev->qdisc;
1154 		}
1155 		if (!q)
1156 			return -ENOENT;
1157 
1158 		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1159 			return -EINVAL;
1160 	} else {
1161 		q = qdisc_lookup(dev, tcm->tcm_handle);
1162 		if (!q)
1163 			return -ENOENT;
1164 	}
1165 
1166 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1167 		return -EINVAL;
1168 
1169 	if (n->nlmsg_type == RTM_DELQDISC) {
1170 		if (!clid)
1171 			return -EINVAL;
1172 		if (q->handle == 0)
1173 			return -ENOENT;
1174 		err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1175 		if (err != 0)
1176 			return err;
1177 	} else {
1178 		qdisc_notify(net, skb, n, clid, NULL, q);
1179 	}
1180 	return 0;
1181 }
1182 
1183 /*
1184  * Create/change qdisc.
1185  */
1186 
1187 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
1188 {
1189 	struct net *net = sock_net(skb->sk);
1190 	struct tcmsg *tcm;
1191 	struct nlattr *tca[TCA_MAX + 1];
1192 	struct net_device *dev;
1193 	u32 clid;
1194 	struct Qdisc *q, *p;
1195 	int err;
1196 
1197 	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1198 		return -EPERM;
1199 
1200 replay:
1201 	/* Reinit, just in case something touches this. */
1202 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1203 	if (err < 0)
1204 		return err;
1205 
1206 	tcm = nlmsg_data(n);
1207 	clid = tcm->tcm_parent;
1208 	q = p = NULL;
1209 
1210 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1211 	if (!dev)
1212 		return -ENODEV;
1213 
1214 
1215 	if (clid) {
1216 		if (clid != TC_H_ROOT) {
1217 			if (clid != TC_H_INGRESS) {
1218 				p = qdisc_lookup(dev, TC_H_MAJ(clid));
1219 				if (!p)
1220 					return -ENOENT;
1221 				q = qdisc_leaf(p, clid);
1222 			} else if (dev_ingress_queue_create(dev)) {
1223 				q = dev_ingress_queue(dev)->qdisc_sleeping;
1224 			}
1225 		} else {
1226 			q = dev->qdisc;
1227 		}
1228 
1229 		/* It may be default qdisc, ignore it */
1230 		if (q && q->handle == 0)
1231 			q = NULL;
1232 
1233 		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1234 			if (tcm->tcm_handle) {
1235 				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
1236 					return -EEXIST;
1237 				if (TC_H_MIN(tcm->tcm_handle))
1238 					return -EINVAL;
1239 				q = qdisc_lookup(dev, tcm->tcm_handle);
1240 				if (!q)
1241 					goto create_n_graft;
1242 				if (n->nlmsg_flags & NLM_F_EXCL)
1243 					return -EEXIST;
1244 				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1245 					return -EINVAL;
1246 				if (q == p ||
1247 				    (p && check_loop(q, p, 0)))
1248 					return -ELOOP;
1249 				atomic_inc(&q->refcnt);
1250 				goto graft;
1251 			} else {
1252 				if (!q)
1253 					goto create_n_graft;
1254 
1255				/* This magic test requires explanation.
1256				 *
1257				 *   We know that some child q is already
1258				 *   attached to this parent and we have a choice:
1259				 *   either to change it or to create/graft a new one.
1260				 *
1261				 *   1. We are allowed to create/graft only
1262				 *   if the CREATE and REPLACE flags are set.
1263				 *
1264				 *   2. If EXCL is set, the requestor is saying
1265				 *   that qdisc tcm_handle is not expected
1266				 *   to exist, so we choose create/graft too.
1267				 *
1268				 *   3. The last case is when no flags are set.
1269				 *   Alas, this is a sort of hole in the API; we
1270				 *   cannot decide unambiguously what to do.
1271				 *   For now we select create/graft if the
1272				 *   user gave a KIND that does not match the existing one.
1273				 */
1274 				if ((n->nlmsg_flags & NLM_F_CREATE) &&
1275 				    (n->nlmsg_flags & NLM_F_REPLACE) &&
1276 				    ((n->nlmsg_flags & NLM_F_EXCL) ||
1277 				     (tca[TCA_KIND] &&
1278 				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
1279 					goto create_n_graft;
1280 			}
1281 		}
1282 	} else {
1283 		if (!tcm->tcm_handle)
1284 			return -EINVAL;
1285 		q = qdisc_lookup(dev, tcm->tcm_handle);
1286 	}
1287 
1288 	/* Change qdisc parameters */
1289 	if (q == NULL)
1290 		return -ENOENT;
1291 	if (n->nlmsg_flags & NLM_F_EXCL)
1292 		return -EEXIST;
1293 	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1294 		return -EINVAL;
1295 	err = qdisc_change(q, tca);
1296 	if (err == 0)
1297 		qdisc_notify(net, skb, n, clid, NULL, q);
1298 	return err;
1299 
1300 create_n_graft:
1301 	if (!(n->nlmsg_flags & NLM_F_CREATE))
1302 		return -ENOENT;
1303 	if (clid == TC_H_INGRESS) {
1304 		if (dev_ingress_queue(dev))
1305 			q = qdisc_create(dev, dev_ingress_queue(dev), p,
1306 					 tcm->tcm_parent, tcm->tcm_parent,
1307 					 tca, &err);
1308 		else
1309 			err = -ENOENT;
1310 	} else {
1311 		struct netdev_queue *dev_queue;
1312 
1313 		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1314 			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1315 		else if (p)
1316 			dev_queue = p->dev_queue;
1317 		else
1318 			dev_queue = netdev_get_tx_queue(dev, 0);
1319 
1320 		q = qdisc_create(dev, dev_queue, p,
1321 				 tcm->tcm_parent, tcm->tcm_handle,
1322 				 tca, &err);
1323 	}
1324 	if (q == NULL) {
1325 		if (err == -EAGAIN)
1326 			goto replay;
1327 		return err;
1328 	}
1329 
1330 graft:
1331 	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1332 	if (err) {
1333 		if (q)
1334 			qdisc_destroy(q);
1335 		return err;
1336 	}
1337 
1338 	return 0;
1339 }
1340 
1341 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1342 			 u32 portid, u32 seq, u16 flags, int event)
1343 {
1344 	struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
1345 	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
1346 	struct tcmsg *tcm;
1347 	struct nlmsghdr  *nlh;
1348 	unsigned char *b = skb_tail_pointer(skb);
1349 	struct gnet_dump d;
1350 	struct qdisc_size_table *stab;
1351 	__u32 qlen;
1352 
1353 	cond_resched();
1354 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1355 	if (!nlh)
1356 		goto out_nlmsg_trim;
1357 	tcm = nlmsg_data(nlh);
1358 	tcm->tcm_family = AF_UNSPEC;
1359 	tcm->tcm__pad1 = 0;
1360 	tcm->tcm__pad2 = 0;
1361 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1362 	tcm->tcm_parent = clid;
1363 	tcm->tcm_handle = q->handle;
1364 	tcm->tcm_info = atomic_read(&q->refcnt);
1365 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
1366 		goto nla_put_failure;
1367 	if (q->ops->dump && q->ops->dump(q, skb) < 0)
1368 		goto nla_put_failure;
1369 	qlen = q->q.qlen;
1370 
1371 	stab = rtnl_dereference(q->stab);
1372 	if (stab && qdisc_dump_stab(skb, stab) < 0)
1373 		goto nla_put_failure;
1374 
1375 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1376 					 NULL, &d, TCA_PAD) < 0)
1377 		goto nla_put_failure;
1378 
1379 	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1380 		goto nla_put_failure;
1381 
1382 	if (qdisc_is_percpu_stats(q)) {
1383 		cpu_bstats = q->cpu_bstats;
1384 		cpu_qstats = q->cpu_qstats;
1385 	}
1386 
1387 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
1388 				  &d, cpu_bstats, &q->bstats) < 0 ||
1389 	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1390 	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
1391 		goto nla_put_failure;
1392 
1393 	if (gnet_stats_finish_copy(&d) < 0)
1394 		goto nla_put_failure;
1395 
1396 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1397 	return skb->len;
1398 
1399 out_nlmsg_trim:
1400 nla_put_failure:
1401 	nlmsg_trim(skb, b);
1402 	return -1;
1403 }
1404 
1405 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1406 {
1407 	return (q->flags & TCQ_F_BUILTIN) ? true : false;
1408 }
1409 
1410 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1411 			struct nlmsghdr *n, u32 clid,
1412 			struct Qdisc *old, struct Qdisc *new)
1413 {
1414 	struct sk_buff *skb;
1415 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1416 
1417 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1418 	if (!skb)
1419 		return -ENOBUFS;
1420 
1421 	if (old && !tc_qdisc_dump_ignore(old)) {
1422 		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1423 				  0, RTM_DELQDISC) < 0)
1424 			goto err_out;
1425 	}
1426 	if (new && !tc_qdisc_dump_ignore(new)) {
1427 		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1428 				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1429 			goto err_out;
1430 	}
1431 
1432 	if (skb->len)
1433 		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1434 				      n->nlmsg_flags & NLM_F_ECHO);
1435 
1436 err_out:
1437 	kfree_skb(skb);
1438 	return -EINVAL;
1439 }
1440 
1441 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1442 			      struct netlink_callback *cb,
1443 			      int *q_idx_p, int s_q_idx, bool recur)
1444 {
1445 	int ret = 0, q_idx = *q_idx_p;
1446 	struct Qdisc *q;
1447 	int b;
1448 
1449 	if (!root)
1450 		return 0;
1451 
1452 	q = root;
1453 	if (q_idx < s_q_idx) {
1454 		q_idx++;
1455 	} else {
1456 		if (!tc_qdisc_dump_ignore(q) &&
1457 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1458 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1459 				  RTM_NEWQDISC) <= 0)
1460 			goto done;
1461 		q_idx++;
1462 	}
1463 
1464 	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
1465 	 * itself has already been dumped.
1466 	 *
1467 	 * If we've already dumped the top-level (ingress) qdisc above and the global
1468	 * qdisc hashtable, we don't want to hit it again.
1469 	 */
1470 	if (!qdisc_dev(root) || !recur)
1471 		goto out;
1472 
1473 	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1474 		if (q_idx < s_q_idx) {
1475 			q_idx++;
1476 			continue;
1477 		}
1478 		if (!tc_qdisc_dump_ignore(q) &&
1479 		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1480 				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
1481 				  RTM_NEWQDISC) <= 0)
1482 			goto done;
1483 		q_idx++;
1484 	}
1485 
1486 out:
1487 	*q_idx_p = q_idx;
1488 	return ret;
1489 done:
1490 	ret = -1;
1491 	goto out;
1492 }
1493 
1494 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1495 {
1496 	struct net *net = sock_net(skb->sk);
1497 	int idx, q_idx;
1498 	int s_idx, s_q_idx;
1499 	struct net_device *dev;
1500 
1501 	s_idx = cb->args[0];
1502 	s_q_idx = q_idx = cb->args[1];
1503 
1504 	idx = 0;
1505 	ASSERT_RTNL();
1506 	for_each_netdev(net, dev) {
1507 		struct netdev_queue *dev_queue;
1508 
1509 		if (idx < s_idx)
1510 			goto cont;
1511 		if (idx > s_idx)
1512 			s_q_idx = 0;
1513 		q_idx = 0;
1514 
1515 		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1516 				       true) < 0)
1517 			goto done;
1518 
1519 		dev_queue = dev_ingress_queue(dev);
1520 		if (dev_queue &&
1521 		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1522 				       &q_idx, s_q_idx, false) < 0)
1523 			goto done;
1524 
1525 cont:
1526 		idx++;
1527 	}
1528 
1529 done:
1530 	cb->args[0] = idx;
1531 	cb->args[1] = q_idx;
1532 
1533 	return skb->len;
1534 }
1535 
1536 
1537 
1538 /************************************************
1539  *	Traffic classes manipulation.		*
1540  ************************************************/
1541 
1542 
1543 
1544 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
1545 {
1546 	struct net *net = sock_net(skb->sk);
1547 	struct tcmsg *tcm = nlmsg_data(n);
1548 	struct nlattr *tca[TCA_MAX + 1];
1549 	struct net_device *dev;
1550 	struct Qdisc *q = NULL;
1551 	const struct Qdisc_class_ops *cops;
1552 	unsigned long cl = 0;
1553 	unsigned long new_cl;
1554 	u32 portid;
1555 	u32 clid;
1556 	u32 qid;
1557 	int err;
1558 
1559 	if ((n->nlmsg_type != RTM_GETTCLASS) &&
1560 	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1561 		return -EPERM;
1562 
1563 	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1564 	if (err < 0)
1565 		return err;
1566 
1567 	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1568 	if (!dev)
1569 		return -ENODEV;
1570 
1571 	/*
1572 	   parent == TC_H_UNSPEC - unspecified parent.
1573 	   parent == TC_H_ROOT   - class is root, which has no parent.
1574 	   parent == X:0	 - parent is root class.
1575 	   parent == X:Y	 - parent is a node in hierarchy.
1576 	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.
1577 
1578 	   handle == 0:0	 - generate handle from kernel pool.
1579 	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
1580 	   handle == X:Y	 - clear.
1581 	   handle == X:0	 - root class.
1582 	 */
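/* Example: "tc class add dev eth0 parent 1: classid 1:10" arrives with
 * tcm_parent = 0x00010000 (1:0) and tcm_handle = 0x00010010 (1:10, tc
 * parses the minor as hex), so qid resolves to 1:0 and the class is
 * created as 1:10 under the root class of qdisc 1:.
 */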
1583 
1584 	/* Step 1. Determine qdisc handle X:0 */
1585 
1586 	portid = tcm->tcm_parent;
1587 	clid = tcm->tcm_handle;
1588 	qid = TC_H_MAJ(clid);
1589 
1590 	if (portid != TC_H_ROOT) {
1591 		u32 qid1 = TC_H_MAJ(portid);
1592 
1593 		if (qid && qid1) {
1594 			/* If both majors are known, they must be identical. */
1595 			if (qid != qid1)
1596 				return -EINVAL;
1597 		} else if (qid1) {
1598 			qid = qid1;
1599 		} else if (qid == 0)
1600 			qid = dev->qdisc->handle;
1601 
1602		/* Now qid is a genuine qdisc handle consistent
1603		 * with both parent and child.
1604 		 *
1605 		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
1606 		 */
1607 		if (portid)
1608 			portid = TC_H_MAKE(qid, portid);
1609 	} else {
1610 		if (qid == 0)
1611 			qid = dev->qdisc->handle;
1612 	}
1613 
1614 	/* OK. Locate qdisc */
1615 	q = qdisc_lookup(dev, qid);
1616 	if (!q)
1617 		return -ENOENT;
1618 
1619 	/* And check that it supports classes */
1620 	cops = q->ops->cl_ops;
1621 	if (cops == NULL)
1622 		return -EINVAL;
1623 
1624 	/* Now try to get class */
1625 	if (clid == 0) {
1626 		if (portid == TC_H_ROOT)
1627 			clid = qid;
1628 	} else
1629 		clid = TC_H_MAKE(qid, clid);
1630 
1631 	if (clid)
1632 		cl = cops->get(q, clid);
1633 
1634 	if (cl == 0) {
1635 		err = -ENOENT;
1636 		if (n->nlmsg_type != RTM_NEWTCLASS ||
1637 		    !(n->nlmsg_flags & NLM_F_CREATE))
1638 			goto out;
1639 	} else {
1640 		switch (n->nlmsg_type) {
1641 		case RTM_NEWTCLASS:
1642 			err = -EEXIST;
1643 			if (n->nlmsg_flags & NLM_F_EXCL)
1644 				goto out;
1645 			break;
1646 		case RTM_DELTCLASS:
1647 			err = -EOPNOTSUPP;
1648 			if (cops->delete)
1649 				err = cops->delete(q, cl);
1650 			if (err == 0)
1651 				tclass_notify(net, skb, n, q, cl,
1652 					      RTM_DELTCLASS);
1653 			goto out;
1654 		case RTM_GETTCLASS:
1655 			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1656 			goto out;
1657 		default:
1658 			err = -EINVAL;
1659 			goto out;
1660 		}
1661 	}
1662 
1663 	new_cl = cl;
1664 	err = -EOPNOTSUPP;
1665 	if (cops->change)
1666 		err = cops->change(q, clid, portid, tca, &new_cl);
1667 	if (err == 0)
1668 		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1669 
1670 out:
1671 	if (cl)
1672 		cops->put(q, cl);
1673 
1674 	return err;
1675 }
1676 
1677 
1678 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1679 			  unsigned long cl,
1680 			  u32 portid, u32 seq, u16 flags, int event)
1681 {
1682 	struct tcmsg *tcm;
1683 	struct nlmsghdr  *nlh;
1684 	unsigned char *b = skb_tail_pointer(skb);
1685 	struct gnet_dump d;
1686 	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1687 
1688 	cond_resched();
1689 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1690 	if (!nlh)
1691 		goto out_nlmsg_trim;
1692 	tcm = nlmsg_data(nlh);
1693 	tcm->tcm_family = AF_UNSPEC;
1694 	tcm->tcm__pad1 = 0;
1695 	tcm->tcm__pad2 = 0;
1696 	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1697 	tcm->tcm_parent = q->handle;
1698 	tcm->tcm_handle = q->handle;
1699 	tcm->tcm_info = 0;
1700 	if (nla_put_string(skb, TCA_KIND, q->ops->id))
1701 		goto nla_put_failure;
1702 	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1703 		goto nla_put_failure;
1704 
1705 	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1706 					 NULL, &d, TCA_PAD) < 0)
1707 		goto nla_put_failure;
1708 
1709 	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1710 		goto nla_put_failure;
1711 
1712 	if (gnet_stats_finish_copy(&d) < 0)
1713 		goto nla_put_failure;
1714 
1715 	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1716 	return skb->len;
1717 
1718 out_nlmsg_trim:
1719 nla_put_failure:
1720 	nlmsg_trim(skb, b);
1721 	return -1;
1722 }
1723 
1724 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1725 			 struct nlmsghdr *n, struct Qdisc *q,
1726 			 unsigned long cl, int event)
1727 {
1728 	struct sk_buff *skb;
1729 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1730 
1731 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1732 	if (!skb)
1733 		return -ENOBUFS;
1734 
1735 	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1736 		kfree_skb(skb);
1737 		return -EINVAL;
1738 	}
1739 
1740 	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1741 			      n->nlmsg_flags & NLM_F_ECHO);
1742 }
1743 
1744 struct qdisc_dump_args {
1745 	struct qdisc_walker	w;
1746 	struct sk_buff		*skb;
1747 	struct netlink_callback	*cb;
1748 };
1749 
1750 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
1751 			    struct qdisc_walker *arg)
1752 {
1753 	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1754 
1755 	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1756 			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
1757 			      RTM_NEWTCLASS);
1758 }
1759 
1760 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1761 				struct tcmsg *tcm, struct netlink_callback *cb,
1762 				int *t_p, int s_t)
1763 {
1764 	struct qdisc_dump_args arg;
1765 
1766 	if (tc_qdisc_dump_ignore(q) ||
1767 	    *t_p < s_t || !q->ops->cl_ops ||
1768 	    (tcm->tcm_parent &&
1769 	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1770 		(*t_p)++;
1771 		return 0;
1772 	}
1773 	if (*t_p > s_t)
1774 		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1775 	arg.w.fn = qdisc_class_dump;
1776 	arg.skb = skb;
1777 	arg.cb = cb;
1778 	arg.w.stop  = 0;
1779 	arg.w.skip = cb->args[1];
1780 	arg.w.count = 0;
1781 	q->ops->cl_ops->walk(q, &arg.w);
1782 	cb->args[1] = arg.w.count;
1783 	if (arg.w.stop)
1784 		return -1;
1785 	(*t_p)++;
1786 	return 0;
1787 }
1788 
1789 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1790 			       struct tcmsg *tcm, struct netlink_callback *cb,
1791 			       int *t_p, int s_t)
1792 {
1793 	struct Qdisc *q;
1794 	int b;
1795 
1796 	if (!root)
1797 		return 0;
1798 
1799 	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1800 		return -1;
1801 
1802 	if (!qdisc_dev(root))
1803 		return 0;
1804 
1805 	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1806 		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1807 			return -1;
1808 	}
1809 
1810 	return 0;
1811 }
1812 
1813 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1814 {
1815 	struct tcmsg *tcm = nlmsg_data(cb->nlh);
1816 	struct net *net = sock_net(skb->sk);
1817 	struct netdev_queue *dev_queue;
1818 	struct net_device *dev;
1819 	int t, s_t;
1820 
1821 	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
1822 		return 0;
1823 	dev = dev_get_by_index(net, tcm->tcm_ifindex);
1824 	if (!dev)
1825 		return 0;
1826 
1827 	s_t = cb->args[0];
1828 	t = 0;
1829 
1830 	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1831 		goto done;
1832 
1833 	dev_queue = dev_ingress_queue(dev);
1834 	if (dev_queue &&
1835 	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1836 				&t, s_t) < 0)
1837 		goto done;
1838 
1839 done:
1840 	cb->args[0] = t;
1841 
1842 	dev_put(dev);
1843 	return skb->len;
1844 }
1845 
1846 /* Main classifier routine: scans the classifier chain attached
1847  * to this qdisc, (optionally) tests for protocol, and asks the
1848  * specific classifiers.
1849  */
1850 int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
1851 		struct tcf_result *res, bool compat_mode)
1852 {
1853 	__be16 protocol = tc_skb_protocol(skb);
1854 #ifdef CONFIG_NET_CLS_ACT
1855 	const struct tcf_proto *old_tp = tp;
1856 	int limit = 0;
1857 
1858 reclassify:
1859 #endif
1860 	for (; tp; tp = rcu_dereference_bh(tp->next)) {
1861 		int err;
1862 
1863 		if (tp->protocol != protocol &&
1864 		    tp->protocol != htons(ETH_P_ALL))
1865 			continue;
1866 
1867 		err = tp->classify(skb, tp, res);
1868 #ifdef CONFIG_NET_CLS_ACT
1869 		if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode))
1870 			goto reset;
1871 #endif
1872 		if (err >= 0)
1873 			return err;
1874 	}
1875 
1876 	return TC_ACT_UNSPEC; /* signal: continue lookup */
1877 #ifdef CONFIG_NET_CLS_ACT
1878 reset:
1879 	if (unlikely(limit++ >= MAX_REC_LOOP)) {
1880 		net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n",
1881 				       tp->q->ops->id, tp->prio & 0xffff,
1882 				       ntohs(tp->protocol));
1883 		return TC_ACT_SHOT;
1884 	}
1885 
1886 	tp = old_tp;
1887 	protocol = tc_skb_protocol(skb);
1888 	goto reclassify;
1889 #endif
1890 }
1891 EXPORT_SYMBOL(tc_classify);
1892 
1893 bool tcf_destroy(struct tcf_proto *tp, bool force)
1894 {
1895 	if (tp->ops->destroy(tp, force)) {
1896 		module_put(tp->ops->owner);
1897 		kfree_rcu(tp, rcu);
1898 		return true;
1899 	}
1900 
1901 	return false;
1902 }
1903 
1904 void tcf_destroy_chain(struct tcf_proto __rcu **fl)
1905 {
1906 	struct tcf_proto *tp;
1907 
1908 	while ((tp = rtnl_dereference(*fl)) != NULL) {
1909 		RCU_INIT_POINTER(*fl, tp->next);
1910 		tcf_destroy(tp, true);
1911 	}
1912 }
1913 EXPORT_SYMBOL(tcf_destroy_chain);
1914 
1915 #ifdef CONFIG_PROC_FS
1916 static int psched_show(struct seq_file *seq, void *v)
1917 {
1918 	seq_printf(seq, "%08x %08x %08x %08x\n",
1919 		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1920 		   1000000,
1921 		   (u32)NSEC_PER_SEC / hrtimer_resolution);
1922 
1923 	return 0;
1924 }
1925 
1926 static int psched_open(struct inode *inode, struct file *file)
1927 {
1928 	return single_open(file, psched_show, NULL);
1929 }
1930 
1931 static const struct file_operations psched_fops = {
1932 	.owner = THIS_MODULE,
1933 	.open = psched_open,
1934 	.read  = seq_read,
1935 	.llseek = seq_lseek,
1936 	.release = single_release,
1937 };
1938 
1939 static int __net_init psched_net_init(struct net *net)
1940 {
1941 	struct proc_dir_entry *e;
1942 
1943 	e = proc_create("psched", 0, net->proc_net, &psched_fops);
1944 	if (e == NULL)
1945 		return -ENOMEM;
1946 
1947 	return 0;
1948 }
1949 
1950 static void __net_exit psched_net_exit(struct net *net)
1951 {
1952 	remove_proc_entry("psched", net->proc_net);
1953 }
1954 #else
1955 static int __net_init psched_net_init(struct net *net)
1956 {
1957 	return 0;
1958 }
1959 
1960 static void __net_exit psched_net_exit(struct net *net)
1961 {
1962 }
1963 #endif
1964 
1965 static struct pernet_operations psched_net_ops = {
1966 	.init = psched_net_init,
1967 	.exit = psched_net_exit,
1968 };
1969 
1970 static int __init pktsched_init(void)
1971 {
1972 	int err;
1973 
1974 	err = register_pernet_subsys(&psched_net_ops);
1975 	if (err) {
1976 		pr_err("pktsched_init: "
1977 		       "cannot initialize per netns operations\n");
1978 		return err;
1979 	}
1980 
1981 	register_qdisc(&pfifo_fast_ops);
1982 	register_qdisc(&pfifo_qdisc_ops);
1983 	register_qdisc(&bfifo_qdisc_ops);
1984 	register_qdisc(&pfifo_head_drop_qdisc_ops);
1985 	register_qdisc(&mq_qdisc_ops);
1986 	register_qdisc(&noqueue_qdisc_ops);
1987 
1988 	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1989 	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1990 	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
1991 		      NULL);
1992 	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1993 	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1994 	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
1995 		      NULL);
1996 
1997 	return 0;
1998 }
1999 
2000 subsys_initcall(pktsched_init);
2001