1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_cbq.c	Class-Based Queueing discipline.
4  *
5  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  */
7 
8 #include <linux/module.h>
9 #include <linux/slab.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/string.h>
13 #include <linux/errno.h>
14 #include <linux/skbuff.h>
15 #include <net/netlink.h>
16 #include <net/pkt_sched.h>
17 #include <net/pkt_cls.h>
18 
19 
20 /*	Class-Based Queueing (CBQ) algorithm.
21 	=======================================
22 
23 	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
24 		 Management Models for Packet Networks",
25 		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
26 
27 		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
28 
29 		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
30 		 Parameters", 1996
31 
32 		 [4] Sally Floyd and Michael Speer, "Experimental Results
33 		 for Class-Based Queueing", 1998, not published.
34 
35 	-----------------------------------------------------------------------
36 
37 	The algorithm skeleton was taken from the NS simulator's cbq.cc.
38 	If someone wants to check this code against the LBL version,
39 	they should take into account that ONLY the skeleton was borrowed;
40 	the implementation is different. In particular:
41 
42 	--- The WRR algorithm is different. Our version looks more
43 	reasonable (I hope) and works when quanta are allowed to be
44 	less than the MTU, which is always the case when real-time classes
45 	have small rates. Note that the statement in [3] is
46 	incomplete: delay may actually be estimated even if the class
47 	per-round allotment is less than the MTU. Namely, if the per-round
48 	allotment is W*r_i, and r_1+...+r_k = r < 1, then
49 
50 	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
51 
52 	In the worst case we have IntServ estimate with D = W*r+k*MTU
53 	and C = MTU*r. The proof (if correct at all) is trivial.
54 
55 
56 	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
57 	interpret some places, which look like wrong translations
58 	from NS. Anyone is advised to find these differences
59 	and explain to me why I am wrong 8).
60 
61 	--- Linux has no EOI event, so we cannot estimate the true class
62 	idle time. The workaround is to treat the next dequeue event
63 	as a sign that the previous packet has finished. This is wrong because of
64 	internal device queueing, but on a permanently loaded link it holds.
65 	Moreover, combined with the clock integrator, this scheme looks
66 	very close to an ideal solution.  */
67 
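/* A purely illustrative evaluation of the bound above (the numbers are
 * invented for the example): take MTU = 1500 bytes, B = 125000 bytes/s
 * (1 Mbit/s), k = 2 classes with r_1 = r_2 = 0.4 (so r = 0.8), and
 * W = 1250 bytes, giving per-round allotments W*r_i = 500 bytes < MTU.
 * Then [MTU/(W*r_i)] = 3 and
 *
 *	delay_i <= (3*1000 + 1000 + 2*1500)/125000 s = 56 ms,
 *
 * so the bound stays finite even though each per-round allotment is
 * smaller than the MTU.
 */
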
68 struct cbq_sched_data;
69 
70 
71 struct cbq_class {
72 	struct Qdisc_class_common common;
73 	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */
74 
75 /* Parameters */
76 	unsigned char		priority;	/* class priority */
77 	unsigned char		priority2;	/* priority to be used after overlimit */
78 	unsigned char		ewma_log;	/* time constant for idle time calculation */
79 
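	/* Bitmap over TC_PRIO_* values for which this class is the default
	 * at its split node (see cbq_sync_defmap()).
	 */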
80 	u32			defmap;
81 
82 	/* Link-sharing scheduler parameters */
83 	long			maxidle;	/* Class parameters: see below. */
84 	long			offtime;
85 	long			minidle;
86 	u32			avpkt;
87 	struct qdisc_rate_table	*R_tab;
88 
89 	/* General scheduler (WRR) parameters */
90 	long			allot;
91 	long			quantum;	/* Allotment per WRR round */
92 	long			weight;		/* Relative allotment: see below */
93 
94 	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
95 	struct cbq_class	*split;		/* Ptr to split node */
96 	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
97 	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
98 	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
99 						   parent otherwise */
100 	struct cbq_class	*sibling;	/* Sibling chain */
101 	struct cbq_class	*children;	/* Pointer to children chain */
102 
103 	struct Qdisc		*q;		/* Elementary queueing discipline */
104 
105 
106 /* Variables */
107 	unsigned char		cpriority;	/* Effective priority */
108 	unsigned char		delayed;
109 	unsigned char		level;		/* level of the class in hierarchy:
110 						   0 for leaf classes, and maximal
111 						   level of children + 1 for nodes.
112 						 */
113 
114 	psched_time_t		last;		/* Last end of service */
115 	psched_time_t		undertime;
116 	long			avgidle;
117 	long			deficit;	/* Saved deficit for WRR */
118 	psched_time_t		penalized;
119 	struct gnet_stats_basic_packed bstats;
120 	struct gnet_stats_queue qstats;
121 	struct net_rate_estimator __rcu *rate_est;
122 	struct tc_cbq_xstats	xstats;
123 
124 	struct tcf_proto __rcu	*filter_list;
125 	struct tcf_block	*block;
126 
127 	int			filters;
128 
129 	struct cbq_class	*defaults[TC_PRIO_MAX + 1];
130 };
131 
132 struct cbq_sched_data {
133 	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */
134 	int			nclasses[TC_CBQ_MAXPRIO + 1];
135 	unsigned int		quanta[TC_CBQ_MAXPRIO + 1];
136 
137 	struct cbq_class	link;
138 
139 	unsigned int		activemask;
140 	struct cbq_class	*active[TC_CBQ_MAXPRIO + 1];	/* List of all classes
141 								   with backlog */
142 
143 #ifdef CONFIG_NET_CLS_ACT
144 	struct cbq_class	*rx_class;
145 #endif
146 	struct cbq_class	*tx_class;
147 	struct cbq_class	*tx_borrowed;
148 	int			tx_len;
149 	psched_time_t		now;		/* Cached timestamp */
150 	unsigned int		pmask;
151 
152 	struct hrtimer		delay_timer;
153 	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
154 						   started when CBQ has
155 						   backlog, but cannot
156 						   transmit just now */
157 	psched_tdiff_t		wd_expires;
158 	int			toplevel;
159 	u32			hgenerator;
160 };
161 
162 
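/* L2T ("length to time"): transmission time, in psched ticks, of a len-byte
 * packet at the class's configured rate, looked up in its rate table.
 */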
163 #define L2T(cl, len)	qdisc_l2t((cl)->R_tab, len)
164 
165 static inline struct cbq_class *
166 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
167 {
168 	struct Qdisc_class_common *clc;
169 
170 	clc = qdisc_class_find(&q->clhash, classid);
171 	if (clc == NULL)
172 		return NULL;
173 	return container_of(clc, struct cbq_class, common);
174 }
175 
176 #ifdef CONFIG_NET_CLS_ACT
177 
178 static struct cbq_class *
179 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
180 {
181 	struct cbq_class *cl;
182 
183 	for (cl = this->tparent; cl; cl = cl->tparent) {
184 		struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
185 
186 		if (new != NULL && new != this)
187 			return new;
188 	}
189 	return NULL;
190 }
191 
192 #endif
193 
194 /* Classify packet. The procedure is pretty complicated, but
195  * it allows us to combine link sharing and priority scheduling
196  * transparently.
197  *
198  * Namely, you can put link-sharing rules (e.g. route based) at the root of CBQ,
199  * so that they resolve to split nodes. Then packets are classified
200  * by logical priority, or a more specific classifier may be attached
201  * to the split node.
202  */
203 
204 static struct cbq_class *
205 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
206 {
207 	struct cbq_sched_data *q = qdisc_priv(sch);
208 	struct cbq_class *head = &q->link;
209 	struct cbq_class **defmap;
210 	struct cbq_class *cl = NULL;
211 	u32 prio = skb->priority;
212 	struct tcf_proto *fl;
213 	struct tcf_result res;
214 
215 	/*
216 	 *  Step 1. If skb->priority points to one of our classes, use it.
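	 *  For example, if sch->handle is 1:0 (0x00010000) and skb->priority
	 *  carries the classid 1:3 (0x00010003), the major numbers match and
	 *  class 1:3 is used directly, without consulting any filters.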
217 	 */
218 	if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
219 	    (cl = cbq_class_lookup(q, prio)) != NULL)
220 		return cl;
221 
222 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
223 	for (;;) {
224 		int result = 0;
225 		defmap = head->defaults;
226 
227 		fl = rcu_dereference_bh(head->filter_list);
228 		/*
229 		 * Step 2+n. Apply classifier.
230 		 */
231 		result = tcf_classify(skb, fl, &res, true);
232 		if (!fl || result < 0)
233 			goto fallback;
234 		if (result == TC_ACT_SHOT)
235 			return NULL;
236 
237 		cl = (void *)res.class;
238 		if (!cl) {
239 			if (TC_H_MAJ(res.classid))
240 				cl = cbq_class_lookup(q, res.classid);
241 			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
242 				cl = defmap[TC_PRIO_BESTEFFORT];
243 
244 			if (cl == NULL)
245 				goto fallback;
246 		}
247 		if (cl->level >= head->level)
248 			goto fallback;
249 #ifdef CONFIG_NET_CLS_ACT
250 		switch (result) {
251 		case TC_ACT_QUEUED:
252 		case TC_ACT_STOLEN:
253 		case TC_ACT_TRAP:
254 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
255 			fallthrough;
256 		case TC_ACT_RECLASSIFY:
257 			return cbq_reclassify(skb, cl);
258 		}
259 #endif
260 		if (cl->level == 0)
261 			return cl;
262 
263 		/*
264 		 * Step 3+n. If the classifier selected a link-sharing class,
265 		 *	   apply the agency-specific classifier.
266 		 *	   Repeat this procedure until we hit a leaf node.
267 		 */
268 		head = cl;
269 	}
270 
271 fallback:
272 	cl = head;
273 
274 	/*
275 	 * Step 4. No success...
276 	 */
277 	if (TC_H_MAJ(prio) == 0 &&
278 	    !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
279 	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
280 		return head;
281 
282 	return cl;
283 }
284 
285 /*
286  * A packet has just been enqueued on an empty class.
287  * cbq_activate_class adds the class to the tail of the active-class list
288  * of its priority band.
289  */
290 
291 static inline void cbq_activate_class(struct cbq_class *cl)
292 {
293 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
294 	int prio = cl->cpriority;
295 	struct cbq_class *cl_tail;
296 
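	/* q->active[prio] points at the tail of a circular next_alive list;
	 * the newly backlogged class becomes the new tail.
	 */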
297 	cl_tail = q->active[prio];
298 	q->active[prio] = cl;
299 
300 	if (cl_tail != NULL) {
301 		cl->next_alive = cl_tail->next_alive;
302 		cl_tail->next_alive = cl;
303 	} else {
304 		cl->next_alive = cl;
305 		q->activemask |= (1<<prio);
306 	}
307 }
308 
309 /*
310  * Unlink the class from the active chain.
311  * Note that the same unlinking is also done directly in cbq_dequeue*
312  * during the round-robin procedure.
313  */
314 
315 static void cbq_deactivate_class(struct cbq_class *this)
316 {
317 	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
318 	int prio = this->cpriority;
319 	struct cbq_class *cl;
320 	struct cbq_class *cl_prev = q->active[prio];
321 
322 	do {
323 		cl = cl_prev->next_alive;
324 		if (cl == this) {
325 			cl_prev->next_alive = cl->next_alive;
326 			cl->next_alive = NULL;
327 
328 			if (cl == q->active[prio]) {
329 				q->active[prio] = cl_prev;
330 				if (cl == q->active[prio]) {
331 					q->active[prio] = NULL;
332 					q->activemask &= ~(1<<prio);
333 					return;
334 				}
335 			}
336 			return;
337 		}
338 	} while ((cl_prev = cl) != q->active[prio]);
339 }
340 
341 static void
342 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
343 {
344 	int toplevel = q->toplevel;
345 
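	/* Walk up the borrow chain and lower the toplevel cutoff to the level
	 * of the first class along it that is not currently over limit.
	 */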
346 	if (toplevel > cl->level) {
347 		psched_time_t now = psched_get_time();
348 
349 		do {
350 			if (cl->undertime < now) {
351 				q->toplevel = cl->level;
352 				return;
353 			}
354 		} while ((cl = cl->borrow) != NULL && toplevel > cl->level);
355 	}
356 }
357 
358 static int
359 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
360 	    struct sk_buff **to_free)
361 {
362 	struct cbq_sched_data *q = qdisc_priv(sch);
363 	int ret;
364 	struct cbq_class *cl = cbq_classify(skb, sch, &ret);
365 
366 #ifdef CONFIG_NET_CLS_ACT
367 	q->rx_class = cl;
368 #endif
369 	if (cl == NULL) {
370 		if (ret & __NET_XMIT_BYPASS)
371 			qdisc_qstats_drop(sch);
372 		__qdisc_drop(skb, to_free);
373 		return ret;
374 	}
375 
376 	ret = qdisc_enqueue(skb, cl->q, to_free);
377 	if (ret == NET_XMIT_SUCCESS) {
378 		sch->q.qlen++;
379 		cbq_mark_toplevel(q, cl);
380 		if (!cl->next_alive)
381 			cbq_activate_class(cl);
382 		return ret;
383 	}
384 
385 	if (net_xmit_drop_count(ret)) {
386 		qdisc_qstats_drop(sch);
387 		cbq_mark_toplevel(q, cl);
388 		cl->qstats.drops++;
389 	}
390 	return ret;
391 }
392 
393 /* Overlimit action: penalize leaf class by adding offtime */
394 static void cbq_overlimit(struct cbq_class *cl)
395 {
396 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
397 	psched_tdiff_t delay = cl->undertime - q->now;
398 
399 	if (!cl->delayed) {
400 		delay += cl->offtime;
401 
402 		/*
403 		 * The class goes to sleep, so it will have no
404 		 * chance to work off avgidle. Let's forgive it 8)
405 		 *
406 		 * BTW cbq-2.0 has a bug in this
407 		 * place: apparently they forgot to shift by cl->ewma_log.
408 		 */
409 		if (cl->avgidle < 0)
410 			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
411 		if (cl->avgidle < cl->minidle)
412 			cl->avgidle = cl->minidle;
413 		if (delay <= 0)
414 			delay = 1;
415 		cl->undertime = q->now + delay;
416 
417 		cl->xstats.overactions++;
418 		cl->delayed = 1;
419 	}
420 	if (q->wd_expires == 0 || q->wd_expires > delay)
421 		q->wd_expires = delay;
422 
423 	/* Dirty work! We must schedule wakeups based on
424 	 * real available rate, rather than leaf rate,
425 	 * which may be tiny (even zero).
426 	 */
427 	if (q->toplevel == TC_CBQ_MAXLEVEL) {
428 		struct cbq_class *b;
429 		psched_tdiff_t base_delay = q->wd_expires;
430 
431 		for (b = cl->borrow; b; b = b->borrow) {
432 			delay = b->undertime - q->now;
433 			if (delay < base_delay) {
434 				if (delay <= 0)
435 					delay = 1;
436 				base_delay = delay;
437 			}
438 		}
439 
440 		q->wd_expires = base_delay;
441 	}
442 }
443 
444 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
445 				       psched_time_t now)
446 {
447 	struct cbq_class *cl;
448 	struct cbq_class *cl_prev = q->active[prio];
449 	psched_time_t sched = now;
450 
451 	if (cl_prev == NULL)
452 		return 0;
453 
454 	do {
455 		cl = cl_prev->next_alive;
456 		if (now - cl->penalized > 0) {
457 			cl_prev->next_alive = cl->next_alive;
458 			cl->next_alive = NULL;
459 			cl->cpriority = cl->priority;
460 			cl->delayed = 0;
461 			cbq_activate_class(cl);
462 
463 			if (cl == q->active[prio]) {
464 				q->active[prio] = cl_prev;
465 				if (cl == q->active[prio]) {
466 					q->active[prio] = NULL;
467 					return 0;
468 				}
469 			}
470 
471 			cl = cl_prev->next_alive;
472 		} else if (sched - cl->penalized > 0)
473 			sched = cl->penalized;
474 	} while ((cl_prev = cl) != q->active[prio]);
475 
476 	return sched - now;
477 }
478 
479 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
480 {
481 	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
482 						delay_timer);
483 	struct Qdisc *sch = q->watchdog.qdisc;
484 	psched_time_t now;
485 	psched_tdiff_t delay = 0;
486 	unsigned int pmask;
487 
488 	now = psched_get_time();
489 
490 	pmask = q->pmask;
491 	q->pmask = 0;
492 
493 	while (pmask) {
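		/* ffz(~pmask) is the index of the lowest set bit in pmask,
		 * i.e. the lowest delayed priority band still pending.
		 */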
494 		int prio = ffz(~pmask);
495 		psched_tdiff_t tmp;
496 
497 		pmask &= ~(1<<prio);
498 
499 		tmp = cbq_undelay_prio(q, prio, now);
500 		if (tmp > 0) {
501 			q->pmask |= 1<<prio;
502 			if (tmp < delay || delay == 0)
503 				delay = tmp;
504 		}
505 	}
506 
507 	if (delay) {
508 		ktime_t time;
509 
510 		time = 0;
511 		time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
512 		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
513 	}
514 
515 	__netif_schedule(qdisc_root(sch));
516 	return HRTIMER_NORESTART;
517 }
518 
519 /*
520  * This is a mission-critical procedure.
521  *
522  * We "regenerate" the toplevel cutoff if the transmitting class
523  * has backlog and is not being regulated. This is not part of the
524  * original CBQ description, but it looks more reasonable.
525  * It may well be wrong; the question needs further investigation.
526  */
527 
528 static inline void
529 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
530 		    struct cbq_class *borrowed)
531 {
532 	if (cl && q->toplevel >= borrowed->level) {
533 		if (cl->q->q.qlen > 1) {
534 			do {
535 				if (borrowed->undertime == PSCHED_PASTPERFECT) {
536 					q->toplevel = borrowed->level;
537 					return;
538 				}
539 			} while ((borrowed = borrowed->borrow) != NULL);
540 		}
541 #if 0
542 	/* It is not necessary now. Uncommenting it
543 	   will save CPU cycles, but decrease fairness.
544 	 */
545 		q->toplevel = TC_CBQ_MAXLEVEL;
546 #endif
547 	}
548 }
549 
550 static void
551 cbq_update(struct cbq_sched_data *q)
552 {
553 	struct cbq_class *this = q->tx_class;
554 	struct cbq_class *cl = this;
555 	int len = q->tx_len;
556 	psched_time_t now;
557 
558 	q->tx_class = NULL;
559 	/* Time integrator. We calculate EOS time
560 	 * by adding expected packet transmission time.
561 	 */
562 	now = q->now + L2T(&q->link, len);
563 
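	/* Update avgidle/undertime for the transmitting class and each of its
	 * link-sharing ancestors along the ->share chain.
	 */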
564 	for ( ; cl; cl = cl->share) {
565 		long avgidle = cl->avgidle;
566 		long idle;
567 
568 		cl->bstats.packets++;
569 		cl->bstats.bytes += len;
570 
571 		/*
572 		 * (now - last) is total time between packet right edges.
573 		 * (last_pktlen/rate) is "virtual" busy time, so that
574 		 *
575 		 *	idle = (now - last) - last_pktlen/rate
576 		 */
577 
578 		idle = now - cl->last;
579 		if ((unsigned long)idle > 128*1024*1024) {
580 			avgidle = cl->maxidle;
581 		} else {
582 			idle -= L2T(cl, len);
583 
584 		/* true_avgidle := (1-W)*true_avgidle + W*idle,
585 		 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
586 		 * cl->avgidle == true_avgidle/W,
587 		 * hence:
588 		 */
589 			avgidle += idle - (avgidle>>cl->ewma_log);
590 		}
591 
592 		if (avgidle <= 0) {
593 			/* Overlimit or at-limit */
594 
595 			if (avgidle < cl->minidle)
596 				avgidle = cl->minidle;
597 
598 			cl->avgidle = avgidle;
599 
600 			/* Calculate the expected time when this class
601 			 * will be allowed to send.
602 			 * That will occur when:
603 			 * (1-W)*true_avgidle + W*delay = 0, i.e.
604 			 * idle = (1/W - 1)*(-true_avgidle)
605 			 * or
606 			 * idle = (1 - W)*(-cl->avgidle);
607 			 */
608 			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
609 
610 			/*
611 			 * That is not all.
612 			 * To maintain the rate allocated to the class,
613 			 * we add to undertime the virtual clock time
614 			 * needed to complete the transmitted packet.
615 			 * (len/phys_bandwidth has already elapsed
616 			 * by the time cbq_update runs.)
617 			 */
618 
619 			idle -= L2T(&q->link, len);
620 			idle += L2T(cl, len);
621 
622 			cl->undertime = now + idle;
623 		} else {
624 			/* Underlimit */
625 
626 			cl->undertime = PSCHED_PASTPERFECT;
627 			if (avgidle > cl->maxidle)
628 				cl->avgidle = cl->maxidle;
629 			else
630 				cl->avgidle = avgidle;
631 		}
632 		if ((s64)(now - cl->last) > 0)
633 			cl->last = now;
634 	}
635 
636 	cbq_update_toplevel(q, this, q->tx_borrowed);
637 }
638 
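/* Return cl itself if it may send now, otherwise the nearest ancestor it is
 * allowed to borrow from; return NULL if the whole borrow chain is over limit
 * (in which case the overlimit action is applied to the original class) or if
 * the toplevel cutoff forbids borrowing.
 */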
639 static inline struct cbq_class *
640 cbq_under_limit(struct cbq_class *cl)
641 {
642 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
643 	struct cbq_class *this_cl = cl;
644 
645 	if (cl->tparent == NULL)
646 		return cl;
647 
648 	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
649 		cl->delayed = 0;
650 		return cl;
651 	}
652 
653 	do {
654 		/* This is a very suspicious place. Currently the overlimit
655 		 * action is generated for non-bounded classes
656 		 * only if the link is completely congested.
657 		 * Though this agrees with the ancestor-only paradigm,
658 		 * it looks very stupid. In particular,
659 		 * it means that this chunk of code will either
660 		 * never be called or result in strong amplification
661 		 * of burstiness. Dangerous, silly, and yet
662 		 * no other solution exists.
663 		 */
664 		cl = cl->borrow;
665 		if (!cl) {
666 			this_cl->qstats.overlimits++;
667 			cbq_overlimit(this_cl);
668 			return NULL;
669 		}
670 		if (cl->level > q->toplevel)
671 			return NULL;
672 	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
673 
674 	cl->delayed = 0;
675 	return cl;
676 }
677 
678 static inline struct sk_buff *
679 cbq_dequeue_prio(struct Qdisc *sch, int prio)
680 {
681 	struct cbq_sched_data *q = qdisc_priv(sch);
682 	struct cbq_class *cl_tail, *cl_prev, *cl;
683 	struct sk_buff *skb;
684 	int deficit;
685 
686 	cl_tail = cl_prev = q->active[prio];
687 	cl = cl_prev->next_alive;
688 
689 	do {
690 		deficit = 0;
691 
692 		/* Start round */
693 		do {
694 			struct cbq_class *borrow = cl;
695 
696 			if (cl->q->q.qlen &&
697 			    (borrow = cbq_under_limit(cl)) == NULL)
698 				goto skip_class;
699 
700 			if (cl->deficit <= 0) {
701 				/* Class exhausted its allotment per
702 				 * this round. Switch to the next one.
703 				 */
704 				deficit = 1;
705 				cl->deficit += cl->quantum;
706 				goto next_class;
707 			}
708 
709 			skb = cl->q->dequeue(cl->q);
710 
711 			/* Class did not give us any skb :-(
712 			 * It can occur even if cl->q->q.qlen != 0,
713 			 * e.g. if cl->q is a "tbf" qdisc.
714 			 */
715 			if (skb == NULL)
716 				goto skip_class;
717 
718 			cl->deficit -= qdisc_pkt_len(skb);
719 			q->tx_class = cl;
720 			q->tx_borrowed = borrow;
721 			if (borrow != cl) {
722 #ifndef CBQ_XSTATS_BORROWS_BYTES
723 				borrow->xstats.borrows++;
724 				cl->xstats.borrows++;
725 #else
726 				borrow->xstats.borrows += qdisc_pkt_len(skb);
727 				cl->xstats.borrows += qdisc_pkt_len(skb);
728 #endif
729 			}
730 			q->tx_len = qdisc_pkt_len(skb);
731 
732 			if (cl->deficit <= 0) {
733 				q->active[prio] = cl;
734 				cl = cl->next_alive;
735 				cl->deficit += cl->quantum;
736 			}
737 			return skb;
738 
739 skip_class:
740 			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
741 				/* Class is empty or penalized.
742 				 * Unlink it from active chain.
743 				 */
744 				cl_prev->next_alive = cl->next_alive;
745 				cl->next_alive = NULL;
746 
747 				/* Did cl_tail point to it? */
748 				if (cl == cl_tail) {
749 					/* Repair it! */
750 					cl_tail = cl_prev;
751 
752 					/* Was it the last class in this band? */
753 					if (cl == cl_tail) {
754 						/* Kill the band! */
755 						q->active[prio] = NULL;
756 						q->activemask &= ~(1<<prio);
757 						if (cl->q->q.qlen)
758 							cbq_activate_class(cl);
759 						return NULL;
760 					}
761 
762 					q->active[prio] = cl_tail;
763 				}
764 				if (cl->q->q.qlen)
765 					cbq_activate_class(cl);
766 
767 				cl = cl_prev;
768 			}
769 
770 next_class:
771 			cl_prev = cl;
772 			cl = cl->next_alive;
773 		} while (cl_prev != cl_tail);
774 	} while (deficit);
775 
776 	q->active[prio] = cl_prev;
777 
778 	return NULL;
779 }
780 
781 static inline struct sk_buff *
782 cbq_dequeue_1(struct Qdisc *sch)
783 {
784 	struct cbq_sched_data *q = qdisc_priv(sch);
785 	struct sk_buff *skb;
786 	unsigned int activemask;
787 
788 	activemask = q->activemask & 0xFF;
789 	while (activemask) {
790 		int prio = ffz(~activemask);
791 		activemask &= ~(1<<prio);
792 		skb = cbq_dequeue_prio(sch, prio);
793 		if (skb)
794 			return skb;
795 	}
796 	return NULL;
797 }
798 
799 static struct sk_buff *
800 cbq_dequeue(struct Qdisc *sch)
801 {
802 	struct sk_buff *skb;
803 	struct cbq_sched_data *q = qdisc_priv(sch);
804 	psched_time_t now;
805 
806 	now = psched_get_time();
807 
808 	if (q->tx_class)
809 		cbq_update(q);
810 
811 	q->now = now;
812 
813 	for (;;) {
814 		q->wd_expires = 0;
815 
816 		skb = cbq_dequeue_1(sch);
817 		if (skb) {
818 			qdisc_bstats_update(sch, skb);
819 			sch->q.qlen--;
820 			return skb;
821 		}
822 
823 		/* All the classes are overlimit.
824 		 *
825 		 * It is possible, if:
826 		 *
827 		 * 1. Scheduler is empty.
828 		 * 2. Toplevel cutoff inhibited borrowing.
829 		 * 3. Root class is overlimit.
830 		 *
831 		 * Reset conditions 2 and 3 and retry.
832 		 *
833 		 * Note that NS and cbq-2.0 are buggy here: picking
834 		 * an arbitrary class is appropriate for ancestor-only
835 		 * sharing, but not for the toplevel algorithm.
836 		 *
837 		 * Our version is better but slower, because it requires
838 		 * two passes; this is unavoidable with top-level sharing.
839 		 */
840 
841 		if (q->toplevel == TC_CBQ_MAXLEVEL &&
842 		    q->link.undertime == PSCHED_PASTPERFECT)
843 			break;
844 
845 		q->toplevel = TC_CBQ_MAXLEVEL;
846 		q->link.undertime = PSCHED_PASTPERFECT;
847 	}
848 
849 	/* No packets in scheduler or nobody wants to give them to us :-(
850 	 * Sigh... start the watchdog timer in the latter case.
851 	 */
852 
853 	if (sch->q.qlen) {
854 		qdisc_qstats_overlimit(sch);
855 		if (q->wd_expires)
856 			qdisc_watchdog_schedule(&q->watchdog,
857 						now + q->wd_expires);
858 	}
859 	return NULL;
860 }
861 
862 /* CBQ class maintenance routines */
863 
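/* Recompute, for each class on the path from 'this' up to the root, the level
 * as the maximum level of its children plus one.
 */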
864 static void cbq_adjust_levels(struct cbq_class *this)
865 {
866 	if (this == NULL)
867 		return;
868 
869 	do {
870 		int level = 0;
871 		struct cbq_class *cl;
872 
873 		cl = this->children;
874 		if (cl) {
875 			do {
876 				if (cl->level > level)
877 					level = cl->level;
878 			} while ((cl = cl->sibling) != this->children);
879 		}
880 		this->level = level + 1;
881 	} while ((this = this->tparent) != NULL);
882 }
883 
884 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
885 {
886 	struct cbq_class *cl;
887 	unsigned int h;
888 
889 	if (q->quanta[prio] == 0)
890 		return;
891 
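	/* q->quanta[prio] is the sum of the weights of all classes at this
	 * priority, so each class gets quantum = allot scaled by its weight
	 * relative to the average weight in the band.
	 */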
892 	for (h = 0; h < q->clhash.hashsize; h++) {
893 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
894 			/* BUGGGG... Beware! This expression suffers from
895 			 * arithmetic overflow!
896 			 */
897 			if (cl->priority == prio) {
898 				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
899 					q->quanta[prio];
900 			}
901 			if (cl->quantum <= 0 ||
902 			    cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
903 				pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
904 					cl->common.classid, cl->quantum);
905 				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
906 			}
907 		}
908 	}
909 }
910 
911 static void cbq_sync_defmap(struct cbq_class *cl)
912 {
913 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
914 	struct cbq_class *split = cl->split;
915 	unsigned int h;
916 	int i;
917 
918 	if (split == NULL)
919 		return;
920 
921 	for (i = 0; i <= TC_PRIO_MAX; i++) {
922 		if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
923 			split->defaults[i] = NULL;
924 	}
925 
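	/* For every priority left without a default, pick the deepest
	 * (lowest-level) class under this split whose defmap covers it.
	 */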
926 	for (i = 0; i <= TC_PRIO_MAX; i++) {
927 		int level = split->level;
928 
929 		if (split->defaults[i])
930 			continue;
931 
932 		for (h = 0; h < q->clhash.hashsize; h++) {
933 			struct cbq_class *c;
934 
935 			hlist_for_each_entry(c, &q->clhash.hash[h],
936 					     common.hnode) {
937 				if (c->split == split && c->level < level &&
938 				    c->defmap & (1<<i)) {
939 					split->defaults[i] = c;
940 					level = c->level;
941 				}
942 			}
943 		}
944 	}
945 }
946 
947 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
948 {
949 	struct cbq_class *split = NULL;
950 
951 	if (splitid == 0) {
952 		split = cl->split;
953 		if (!split)
954 			return;
955 		splitid = split->common.classid;
956 	}
957 
958 	if (split == NULL || split->common.classid != splitid) {
959 		for (split = cl->tparent; split; split = split->tparent)
960 			if (split->common.classid == splitid)
961 				break;
962 	}
963 
964 	if (split == NULL)
965 		return;
966 
967 	if (cl->split != split) {
968 		cl->defmap = 0;
969 		cbq_sync_defmap(cl);
970 		cl->split = split;
971 		cl->defmap = def & mask;
972 	} else
973 		cl->defmap = (cl->defmap & ~mask) | (def & mask);
974 
975 	cbq_sync_defmap(cl);
976 }
977 
978 static void cbq_unlink_class(struct cbq_class *this)
979 {
980 	struct cbq_class *cl, **clp;
981 	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
982 
983 	qdisc_class_hash_remove(&q->clhash, &this->common);
984 
985 	if (this->tparent) {
986 		clp = &this->sibling;
987 		cl = *clp;
988 		do {
989 			if (cl == this) {
990 				*clp = cl->sibling;
991 				break;
992 			}
993 			clp = &cl->sibling;
994 		} while ((cl = *clp) != this->sibling);
995 
996 		if (this->tparent->children == this) {
997 			this->tparent->children = this->sibling;
998 			if (this->sibling == this)
999 				this->tparent->children = NULL;
1000 		}
1001 	} else {
1002 		WARN_ON(this->sibling != this);
1003 	}
1004 }
1005 
1006 static void cbq_link_class(struct cbq_class *this)
1007 {
1008 	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1009 	struct cbq_class *parent = this->tparent;
1010 
1011 	this->sibling = this;
1012 	qdisc_class_hash_insert(&q->clhash, &this->common);
1013 
1014 	if (parent == NULL)
1015 		return;
1016 
1017 	if (parent->children == NULL) {
1018 		parent->children = this;
1019 	} else {
1020 		this->sibling = parent->children->sibling;
1021 		parent->children->sibling = this;
1022 	}
1023 }
1024 
1025 static void
1026 cbq_reset(struct Qdisc *sch)
1027 {
1028 	struct cbq_sched_data *q = qdisc_priv(sch);
1029 	struct cbq_class *cl;
1030 	int prio;
1031 	unsigned int h;
1032 
1033 	q->activemask = 0;
1034 	q->pmask = 0;
1035 	q->tx_class = NULL;
1036 	q->tx_borrowed = NULL;
1037 	qdisc_watchdog_cancel(&q->watchdog);
1038 	hrtimer_cancel(&q->delay_timer);
1039 	q->toplevel = TC_CBQ_MAXLEVEL;
1040 	q->now = psched_get_time();
1041 
1042 	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1043 		q->active[prio] = NULL;
1044 
1045 	for (h = 0; h < q->clhash.hashsize; h++) {
1046 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1047 			qdisc_reset(cl->q);
1048 
1049 			cl->next_alive = NULL;
1050 			cl->undertime = PSCHED_PASTPERFECT;
1051 			cl->avgidle = cl->maxidle;
1052 			cl->deficit = cl->quantum;
1053 			cl->cpriority = cl->priority;
1054 		}
1055 	}
1056 }
1057 
1058 
1059 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1060 {
1061 	if (lss->change & TCF_CBQ_LSS_FLAGS) {
1062 		cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1063 		cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1064 	}
1065 	if (lss->change & TCF_CBQ_LSS_EWMA)
1066 		cl->ewma_log = lss->ewma_log;
1067 	if (lss->change & TCF_CBQ_LSS_AVPKT)
1068 		cl->avpkt = lss->avpkt;
1069 	if (lss->change & TCF_CBQ_LSS_MINIDLE)
1070 		cl->minidle = -(long)lss->minidle;
1071 	if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1072 		cl->maxidle = lss->maxidle;
1073 		cl->avgidle = lss->maxidle;
1074 	}
1075 	if (lss->change & TCF_CBQ_LSS_OFFTIME)
1076 		cl->offtime = lss->offtime;
1077 	return 0;
1078 }
1079 
1080 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1081 {
1082 	q->nclasses[cl->priority]--;
1083 	q->quanta[cl->priority] -= cl->weight;
1084 	cbq_normalize_quanta(q, cl->priority);
1085 }
1086 
1087 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1088 {
1089 	q->nclasses[cl->priority]++;
1090 	q->quanta[cl->priority] += cl->weight;
1091 	cbq_normalize_quanta(q, cl->priority);
1092 }
1093 
1094 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1095 {
1096 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1097 
1098 	if (wrr->allot)
1099 		cl->allot = wrr->allot;
1100 	if (wrr->weight)
1101 		cl->weight = wrr->weight;
1102 	if (wrr->priority) {
1103 		cl->priority = wrr->priority - 1;
1104 		cl->cpriority = cl->priority;
1105 		if (cl->priority >= cl->priority2)
1106 			cl->priority2 = TC_CBQ_MAXPRIO - 1;
1107 	}
1108 
1109 	cbq_addprio(q, cl);
1110 	return 0;
1111 }
1112 
1113 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1114 {
1115 	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1116 	return 0;
1117 }
1118 
1119 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1120 	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
1121 	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
1122 	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
1123 	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
1124 	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
1125 	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1126 	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
1127 };
1128 
1129 static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1],
1130 			 struct nlattr *opt,
1131 			 struct netlink_ext_ack *extack)
1132 {
1133 	int err;
1134 
1135 	if (!opt) {
1136 		NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
1137 		return -EINVAL;
1138 	}
1139 
1140 	err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt,
1141 					  cbq_policy, extack);
1142 	if (err < 0)
1143 		return err;
1144 
1145 	if (tb[TCA_CBQ_WRROPT]) {
1146 		const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);
1147 
1148 		if (wrr->priority > TC_CBQ_MAXPRIO) {
1149 			NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO");
1150 			err = -EINVAL;
1151 		}
1152 	}
1153 	return err;
1154 }
1155 
1156 static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
1157 		    struct netlink_ext_ack *extack)
1158 {
1159 	struct cbq_sched_data *q = qdisc_priv(sch);
1160 	struct nlattr *tb[TCA_CBQ_MAX + 1];
1161 	struct tc_ratespec *r;
1162 	int err;
1163 
1164 	qdisc_watchdog_init(&q->watchdog, sch);
1165 	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1166 	q->delay_timer.function = cbq_undelay;
1167 
1168 	err = cbq_opt_parse(tb, opt, extack);
1169 	if (err < 0)
1170 		return err;
1171 
1172 	if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
1173 		NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
1174 		return -EINVAL;
1175 	}
1176 
1177 	r = nla_data(tb[TCA_CBQ_RATE]);
1178 
1179 	q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
1180 	if (!q->link.R_tab)
1181 		return -EINVAL;
1182 
1183 	err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
1184 	if (err)
1185 		goto put_rtab;
1186 
1187 	err = qdisc_class_hash_init(&q->clhash);
1188 	if (err < 0)
1189 		goto put_block;
1190 
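	/* q->link is the built-in root class: it represents the link itself,
	 * carries the configured device rate and is the default parent and
	 * borrowing target for all other classes.
	 */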
1191 	q->link.sibling = &q->link;
1192 	q->link.common.classid = sch->handle;
1193 	q->link.qdisc = sch;
1194 	q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1195 				      sch->handle, NULL);
1196 	if (!q->link.q)
1197 		q->link.q = &noop_qdisc;
1198 	else
1199 		qdisc_hash_add(q->link.q, true);
1200 
1201 	q->link.priority = TC_CBQ_MAXPRIO - 1;
1202 	q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1203 	q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1204 	q->link.allot = psched_mtu(qdisc_dev(sch));
1205 	q->link.quantum = q->link.allot;
1206 	q->link.weight = q->link.R_tab->rate.rate;
1207 
1208 	q->link.ewma_log = TC_CBQ_DEF_EWMA;
1209 	q->link.avpkt = q->link.allot/2;
1210 	q->link.minidle = -0x7FFFFFFF;
1211 
1212 	q->toplevel = TC_CBQ_MAXLEVEL;
1213 	q->now = psched_get_time();
1214 
1215 	cbq_link_class(&q->link);
1216 
1217 	if (tb[TCA_CBQ_LSSOPT])
1218 		cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1219 
1220 	cbq_addprio(q, &q->link);
1221 	return 0;
1222 
1223 put_block:
1224 	tcf_block_put(q->link.block);
1225 
1226 put_rtab:
1227 	qdisc_put_rtab(q->link.R_tab);
1228 	return err;
1229 }
1230 
1231 static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1232 {
1233 	unsigned char *b = skb_tail_pointer(skb);
1234 
1235 	if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1236 		goto nla_put_failure;
1237 	return skb->len;
1238 
1239 nla_put_failure:
1240 	nlmsg_trim(skb, b);
1241 	return -1;
1242 }
1243 
1244 static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1245 {
1246 	unsigned char *b = skb_tail_pointer(skb);
1247 	struct tc_cbq_lssopt opt;
1248 
1249 	opt.flags = 0;
1250 	if (cl->borrow == NULL)
1251 		opt.flags |= TCF_CBQ_LSS_BOUNDED;
1252 	if (cl->share == NULL)
1253 		opt.flags |= TCF_CBQ_LSS_ISOLATED;
1254 	opt.ewma_log = cl->ewma_log;
1255 	opt.level = cl->level;
1256 	opt.avpkt = cl->avpkt;
1257 	opt.maxidle = cl->maxidle;
1258 	opt.minidle = (u32)(-cl->minidle);
1259 	opt.offtime = cl->offtime;
1260 	opt.change = ~0;
1261 	if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1262 		goto nla_put_failure;
1263 	return skb->len;
1264 
1265 nla_put_failure:
1266 	nlmsg_trim(skb, b);
1267 	return -1;
1268 }
1269 
1270 static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1271 {
1272 	unsigned char *b = skb_tail_pointer(skb);
1273 	struct tc_cbq_wrropt opt;
1274 
1275 	memset(&opt, 0, sizeof(opt));
1276 	opt.flags = 0;
1277 	opt.allot = cl->allot;
1278 	opt.priority = cl->priority + 1;
1279 	opt.cpriority = cl->cpriority + 1;
1280 	opt.weight = cl->weight;
1281 	if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1282 		goto nla_put_failure;
1283 	return skb->len;
1284 
1285 nla_put_failure:
1286 	nlmsg_trim(skb, b);
1287 	return -1;
1288 }
1289 
1290 static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1291 {
1292 	unsigned char *b = skb_tail_pointer(skb);
1293 	struct tc_cbq_fopt opt;
1294 
1295 	if (cl->split || cl->defmap) {
1296 		opt.split = cl->split ? cl->split->common.classid : 0;
1297 		opt.defmap = cl->defmap;
1298 		opt.defchange = ~0;
1299 		if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1300 			goto nla_put_failure;
1301 	}
1302 	return skb->len;
1303 
1304 nla_put_failure:
1305 	nlmsg_trim(skb, b);
1306 	return -1;
1307 }
1308 
1309 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1310 {
1311 	if (cbq_dump_lss(skb, cl) < 0 ||
1312 	    cbq_dump_rate(skb, cl) < 0 ||
1313 	    cbq_dump_wrr(skb, cl) < 0 ||
1314 	    cbq_dump_fopt(skb, cl) < 0)
1315 		return -1;
1316 	return 0;
1317 }
1318 
1319 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1320 {
1321 	struct cbq_sched_data *q = qdisc_priv(sch);
1322 	struct nlattr *nest;
1323 
1324 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1325 	if (nest == NULL)
1326 		goto nla_put_failure;
1327 	if (cbq_dump_attr(skb, &q->link) < 0)
1328 		goto nla_put_failure;
1329 	return nla_nest_end(skb, nest);
1330 
1331 nla_put_failure:
1332 	nla_nest_cancel(skb, nest);
1333 	return -1;
1334 }
1335 
1336 static int
1337 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1338 {
1339 	struct cbq_sched_data *q = qdisc_priv(sch);
1340 
1341 	q->link.xstats.avgidle = q->link.avgidle;
1342 	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1343 }
1344 
1345 static int
1346 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1347 	       struct sk_buff *skb, struct tcmsg *tcm)
1348 {
1349 	struct cbq_class *cl = (struct cbq_class *)arg;
1350 	struct nlattr *nest;
1351 
1352 	if (cl->tparent)
1353 		tcm->tcm_parent = cl->tparent->common.classid;
1354 	else
1355 		tcm->tcm_parent = TC_H_ROOT;
1356 	tcm->tcm_handle = cl->common.classid;
1357 	tcm->tcm_info = cl->q->handle;
1358 
1359 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1360 	if (nest == NULL)
1361 		goto nla_put_failure;
1362 	if (cbq_dump_attr(skb, cl) < 0)
1363 		goto nla_put_failure;
1364 	return nla_nest_end(skb, nest);
1365 
1366 nla_put_failure:
1367 	nla_nest_cancel(skb, nest);
1368 	return -1;
1369 }
1370 
1371 static int
1372 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1373 	struct gnet_dump *d)
1374 {
1375 	struct cbq_sched_data *q = qdisc_priv(sch);
1376 	struct cbq_class *cl = (struct cbq_class *)arg;
1377 	__u32 qlen;
1378 
1379 	cl->xstats.avgidle = cl->avgidle;
1380 	cl->xstats.undertime = 0;
1381 	qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog);
1382 
1383 	if (cl->undertime != PSCHED_PASTPERFECT)
1384 		cl->xstats.undertime = cl->undertime - q->now;
1385 
1386 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1387 				  d, NULL, &cl->bstats) < 0 ||
1388 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1389 	    gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
1390 		return -1;
1391 
1392 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1393 }
1394 
1395 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1396 		     struct Qdisc **old, struct netlink_ext_ack *extack)
1397 {
1398 	struct cbq_class *cl = (struct cbq_class *)arg;
1399 
1400 	if (new == NULL) {
1401 		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1402 					cl->common.classid, extack);
1403 		if (new == NULL)
1404 			return -ENOBUFS;
1405 	}
1406 
1407 	*old = qdisc_replace(sch, new, &cl->q);
1408 	return 0;
1409 }
1410 
1411 static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1412 {
1413 	struct cbq_class *cl = (struct cbq_class *)arg;
1414 
1415 	return cl->q;
1416 }
1417 
1418 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1419 {
1420 	struct cbq_class *cl = (struct cbq_class *)arg;
1421 
1422 	cbq_deactivate_class(cl);
1423 }
1424 
1425 static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
1426 {
1427 	struct cbq_sched_data *q = qdisc_priv(sch);
1428 
1429 	return (unsigned long)cbq_class_lookup(q, classid);
1430 }
1431 
1432 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1433 {
1434 	struct cbq_sched_data *q = qdisc_priv(sch);
1435 
1436 	WARN_ON(cl->filters);
1437 
1438 	tcf_block_put(cl->block);
1439 	qdisc_put(cl->q);
1440 	qdisc_put_rtab(cl->R_tab);
1441 	gen_kill_estimator(&cl->rate_est);
1442 	if (cl != &q->link)
1443 		kfree(cl);
1444 }
1445 
1446 static void cbq_destroy(struct Qdisc *sch)
1447 {
1448 	struct cbq_sched_data *q = qdisc_priv(sch);
1449 	struct hlist_node *next;
1450 	struct cbq_class *cl;
1451 	unsigned int h;
1452 
1453 #ifdef CONFIG_NET_CLS_ACT
1454 	q->rx_class = NULL;
1455 #endif
1456 	/*
1457 	 * Filters must be destroyed first because we don't destroy the
1458 	 * classes from root to leaves, which means that filters could still
1459 	 * be bound to classes that have already been destroyed. --TGR '04
1460 	 */
1461 	for (h = 0; h < q->clhash.hashsize; h++) {
1462 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1463 			tcf_block_put(cl->block);
1464 			cl->block = NULL;
1465 		}
1466 	}
1467 	for (h = 0; h < q->clhash.hashsize; h++) {
1468 		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1469 					  common.hnode)
1470 			cbq_destroy_class(sch, cl);
1471 	}
1472 	qdisc_class_hash_destroy(&q->clhash);
1473 }
1474 
1475 static int
1476 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1477 		 unsigned long *arg, struct netlink_ext_ack *extack)
1478 {
1479 	int err;
1480 	struct cbq_sched_data *q = qdisc_priv(sch);
1481 	struct cbq_class *cl = (struct cbq_class *)*arg;
1482 	struct nlattr *opt = tca[TCA_OPTIONS];
1483 	struct nlattr *tb[TCA_CBQ_MAX + 1];
1484 	struct cbq_class *parent;
1485 	struct qdisc_rate_table *rtab = NULL;
1486 
1487 	err = cbq_opt_parse(tb, opt, extack);
1488 	if (err < 0)
1489 		return err;
1490 
1491 	if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
1492 		NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
1493 		return -EOPNOTSUPP;
1494 	}
1495 
1496 	if (cl) {
1497 		/* Check parent */
1498 		if (parentid) {
1499 			if (cl->tparent &&
1500 			    cl->tparent->common.classid != parentid) {
1501 				NL_SET_ERR_MSG(extack, "Invalid parent id");
1502 				return -EINVAL;
1503 			}
1504 			if (!cl->tparent && parentid != TC_H_ROOT) {
1505 				NL_SET_ERR_MSG(extack, "Parent must be root");
1506 				return -EINVAL;
1507 			}
1508 		}
1509 
1510 		if (tb[TCA_CBQ_RATE]) {
1511 			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1512 					      tb[TCA_CBQ_RTAB], extack);
1513 			if (rtab == NULL)
1514 				return -EINVAL;
1515 		}
1516 
1517 		if (tca[TCA_RATE]) {
1518 			err = gen_replace_estimator(&cl->bstats, NULL,
1519 						    &cl->rate_est,
1520 						    NULL,
1521 						    qdisc_root_sleeping_running(sch),
1522 						    tca[TCA_RATE]);
1523 			if (err) {
1524 				NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
1525 				qdisc_put_rtab(rtab);
1526 				return err;
1527 			}
1528 		}
1529 
1530 		/* Change class parameters */
1531 		sch_tree_lock(sch);
1532 
1533 		if (cl->next_alive != NULL)
1534 			cbq_deactivate_class(cl);
1535 
1536 		if (rtab) {
1537 			qdisc_put_rtab(cl->R_tab);
1538 			cl->R_tab = rtab;
1539 		}
1540 
1541 		if (tb[TCA_CBQ_LSSOPT])
1542 			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1543 
1544 		if (tb[TCA_CBQ_WRROPT]) {
1545 			cbq_rmprio(q, cl);
1546 			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1547 		}
1548 
1549 		if (tb[TCA_CBQ_FOPT])
1550 			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1551 
1552 		if (cl->q->q.qlen)
1553 			cbq_activate_class(cl);
1554 
1555 		sch_tree_unlock(sch);
1556 
1557 		return 0;
1558 	}
1559 
1560 	if (parentid == TC_H_ROOT)
1561 		return -EINVAL;
1562 
1563 	if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
1564 		NL_SET_ERR_MSG(extack, "One of the following attributes MUST be specified: WRR, rate or link sharing");
1565 		return -EINVAL;
1566 	}
1567 
1568 	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
1569 			      extack);
1570 	if (rtab == NULL)
1571 		return -EINVAL;
1572 
1573 	if (classid) {
1574 		err = -EINVAL;
1575 		if (TC_H_MAJ(classid ^ sch->handle) ||
1576 		    cbq_class_lookup(q, classid)) {
1577 			NL_SET_ERR_MSG(extack, "Specified class not found");
1578 			goto failure;
1579 		}
1580 	} else {
1581 		int i;
1582 		classid = TC_H_MAKE(sch->handle, 0x8000);
1583 
1584 		for (i = 0; i < 0x8000; i++) {
1585 			if (++q->hgenerator >= 0x8000)
1586 				q->hgenerator = 1;
1587 			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1588 				break;
1589 		}
1590 		err = -ENOSR;
1591 		if (i >= 0x8000) {
1592 			NL_SET_ERR_MSG(extack, "Unable to generate classid");
1593 			goto failure;
1594 		}
1595 		classid = classid|q->hgenerator;
1596 	}
1597 
1598 	parent = &q->link;
1599 	if (parentid) {
1600 		parent = cbq_class_lookup(q, parentid);
1601 		err = -EINVAL;
1602 		if (!parent) {
1603 			NL_SET_ERR_MSG(extack, "Failed to find parentid");
1604 			goto failure;
1605 		}
1606 	}
1607 
1608 	err = -ENOBUFS;
1609 	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1610 	if (cl == NULL)
1611 		goto failure;
1612 
1613 	err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1614 	if (err) {
1615 		kfree(cl);
1616 		goto failure;
1617 	}
1618 
1619 	if (tca[TCA_RATE]) {
1620 		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1621 					NULL,
1622 					qdisc_root_sleeping_running(sch),
1623 					tca[TCA_RATE]);
1624 		if (err) {
1625 			NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
1626 			tcf_block_put(cl->block);
1627 			kfree(cl);
1628 			goto failure;
1629 		}
1630 	}
1631 
1632 	cl->R_tab = rtab;
1633 	rtab = NULL;
1634 	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
1635 				  NULL);
1636 	if (!cl->q)
1637 		cl->q = &noop_qdisc;
1638 	else
1639 		qdisc_hash_add(cl->q, true);
1640 
1641 	cl->common.classid = classid;
1642 	cl->tparent = parent;
1643 	cl->qdisc = sch;
1644 	cl->allot = parent->allot;
1645 	cl->quantum = cl->allot;
1646 	cl->weight = cl->R_tab->rate.rate;
1647 
1648 	sch_tree_lock(sch);
1649 	cbq_link_class(cl);
1650 	cl->borrow = cl->tparent;
1651 	if (cl->tparent != &q->link)
1652 		cl->share = cl->tparent;
1653 	cbq_adjust_levels(parent);
1654 	cl->minidle = -0x7FFFFFFF;
1655 	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1656 	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1657 	if (cl->ewma_log == 0)
1658 		cl->ewma_log = q->link.ewma_log;
1659 	if (cl->maxidle == 0)
1660 		cl->maxidle = q->link.maxidle;
1661 	if (cl->avpkt == 0)
1662 		cl->avpkt = q->link.avpkt;
1663 	if (tb[TCA_CBQ_FOPT])
1664 		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1665 	sch_tree_unlock(sch);
1666 
1667 	qdisc_class_hash_grow(sch, &q->clhash);
1668 
1669 	*arg = (unsigned long)cl;
1670 	return 0;
1671 
1672 failure:
1673 	qdisc_put_rtab(rtab);
1674 	return err;
1675 }
1676 
1677 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1678 {
1679 	struct cbq_sched_data *q = qdisc_priv(sch);
1680 	struct cbq_class *cl = (struct cbq_class *)arg;
1681 
1682 	if (cl->filters || cl->children || cl == &q->link)
1683 		return -EBUSY;
1684 
1685 	sch_tree_lock(sch);
1686 
1687 	qdisc_purge_queue(cl->q);
1688 
1689 	if (cl->next_alive)
1690 		cbq_deactivate_class(cl);
1691 
1692 	if (q->tx_borrowed == cl)
1693 		q->tx_borrowed = q->tx_class;
1694 	if (q->tx_class == cl) {
1695 		q->tx_class = NULL;
1696 		q->tx_borrowed = NULL;
1697 	}
1698 #ifdef CONFIG_NET_CLS_ACT
1699 	if (q->rx_class == cl)
1700 		q->rx_class = NULL;
1701 #endif
1702 
1703 	cbq_unlink_class(cl);
1704 	cbq_adjust_levels(cl->tparent);
1705 	cl->defmap = 0;
1706 	cbq_sync_defmap(cl);
1707 
1708 	cbq_rmprio(q, cl);
1709 	sch_tree_unlock(sch);
1710 
1711 	cbq_destroy_class(sch, cl);
1712 	return 0;
1713 }
1714 
1715 static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
1716 				       struct netlink_ext_ack *extack)
1717 {
1718 	struct cbq_sched_data *q = qdisc_priv(sch);
1719 	struct cbq_class *cl = (struct cbq_class *)arg;
1720 
1721 	if (cl == NULL)
1722 		cl = &q->link;
1723 
1724 	return cl->block;
1725 }
1726 
1727 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1728 				     u32 classid)
1729 {
1730 	struct cbq_sched_data *q = qdisc_priv(sch);
1731 	struct cbq_class *p = (struct cbq_class *)parent;
1732 	struct cbq_class *cl = cbq_class_lookup(q, classid);
1733 
1734 	if (cl) {
1735 		if (p && p->level <= cl->level)
1736 			return 0;
1737 		cl->filters++;
1738 		return (unsigned long)cl;
1739 	}
1740 	return 0;
1741 }
1742 
1743 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1744 {
1745 	struct cbq_class *cl = (struct cbq_class *)arg;
1746 
1747 	cl->filters--;
1748 }
1749 
1750 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1751 {
1752 	struct cbq_sched_data *q = qdisc_priv(sch);
1753 	struct cbq_class *cl;
1754 	unsigned int h;
1755 
1756 	if (arg->stop)
1757 		return;
1758 
1759 	for (h = 0; h < q->clhash.hashsize; h++) {
1760 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1761 			if (arg->count < arg->skip) {
1762 				arg->count++;
1763 				continue;
1764 			}
1765 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1766 				arg->stop = 1;
1767 				return;
1768 			}
1769 			arg->count++;
1770 		}
1771 	}
1772 }
1773 
1774 static const struct Qdisc_class_ops cbq_class_ops = {
1775 	.graft		=	cbq_graft,
1776 	.leaf		=	cbq_leaf,
1777 	.qlen_notify	=	cbq_qlen_notify,
1778 	.find		=	cbq_find,
1779 	.change		=	cbq_change_class,
1780 	.delete		=	cbq_delete,
1781 	.walk		=	cbq_walk,
1782 	.tcf_block	=	cbq_tcf_block,
1783 	.bind_tcf	=	cbq_bind_filter,
1784 	.unbind_tcf	=	cbq_unbind_filter,
1785 	.dump		=	cbq_dump_class,
1786 	.dump_stats	=	cbq_dump_class_stats,
1787 };
1788 
1789 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1790 	.next		=	NULL,
1791 	.cl_ops		=	&cbq_class_ops,
1792 	.id		=	"cbq",
1793 	.priv_size	=	sizeof(struct cbq_sched_data),
1794 	.enqueue	=	cbq_enqueue,
1795 	.dequeue	=	cbq_dequeue,
1796 	.peek		=	qdisc_peek_dequeued,
1797 	.init		=	cbq_init,
1798 	.reset		=	cbq_reset,
1799 	.destroy	=	cbq_destroy,
1800 	.change		=	NULL,
1801 	.dump		=	cbq_dump,
1802 	.dump_stats	=	cbq_dump_stats,
1803 	.owner		=	THIS_MODULE,
1804 };
1805 
1806 static int __init cbq_module_init(void)
1807 {
1808 	return register_qdisc(&cbq_qdisc_ops);
1809 }
1810 static void __exit cbq_module_exit(void)
1811 {
1812 	unregister_qdisc(&cbq_qdisc_ops);
1813 }
1814 module_init(cbq_module_init)
1815 module_exit(cbq_module_exit)
1816 MODULE_LICENSE("GPL");
1817