1 /*
2  * net/sched/sch_cbq.c	Class-Based Queueing discipline.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  */
12 
13 #include <linux/module.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <net/netlink.h>
21 #include <net/pkt_sched.h>
22 
23 
24 /*	Class-Based Queueing (CBQ) algorithm.
25 	=======================================
26 
27 	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
28 		 Management Models for Packet Networks",
29 		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
30 
31 		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
32 
33 		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
34 		 Parameters", 1996
35 
36 		 [4] Sally Floyd and Michael Speer, "Experimental Results
37 		 for Class-Based Queueing", 1998, not published.
38 
39 	-----------------------------------------------------------------------
40 
41 	Algorithm skeleton was taken from NS simulator cbq.cc.
42 	If someone wants to check this code against the LBL version,
43 	he should take into account that ONLY the skeleton was borrowed;
44 	the implementation is different. In particular:
45 
46 	--- The WRR algorithm is different. Our version looks more
47 	reasonable (I hope) and works when quanta are allowed to be
48 	less than MTU, which is always the case when real-time classes
49 	have small rates. Note that the statement of [3] is
50 	incomplete: delay may actually be estimated even if the class
51 	per-round allotment is less than MTU. Namely, if the per-round
52 	allotment is W*r_i, and r_1+...+r_k = r < 1, then
53 
54 	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
55 
56 	In the worst case we have IntServ estimate with D = W*r+k*MTU
57 	and C = MTU*r. The proof (if correct at all) is trivial.
58 
59 
60 	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
61 	interpret some places, which look like wrong translations
62 	from NS. Anyone is advised to find these differences
63 	and explain to me why I am wrong 8).
64 
65 	--- Linux has no EOI event, so we cannot estimate the true class
66 	idle time. The workaround is to consider the next dequeue event
67 	as a sign that the previous packet is finished. This is wrong because of
68 	internal device queueing, but on a permanently loaded link it holds.
69 	Moreover, combined with the clock integrator, this scheme comes
70 	very close to an ideal solution.  */
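/*	A rough worked reading of the bound above, with made-up values
	(assuming [x] means rounding up, W*r_i is the per-round byte
	allotment of class i and B is the link bandwidth in bytes/sec):

	MTU = 1500 bytes, B = 1.25e6 bytes/sec (10 Mbit/s), k = 2 classes,
	r_1 = r_2 = 0.25 (so r = 0.5), W = 2000 bytes; each class then gets
	W*r_i = 500 bytes per round, i.e. less than one MTU, and

	delay_i <= (ceil(1500/500)*W*r + W*r + k*MTU)/B
		 = (3*1000 + 1000 + 3000)/1.25e6 sec ~= 5.6 msec

	so a finite per-class delay bound exists even though the per-round
	allotment is below MTU, which is the point made against [3].  */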
71 
72 struct cbq_sched_data;
73 
74 
75 struct cbq_class {
76 	struct Qdisc_class_common common;
77 	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */
78 
79 /* Parameters */
80 	unsigned char		priority;	/* class priority */
81 	unsigned char		priority2;	/* priority to be used after overlimit */
82 	unsigned char		ewma_log;	/* time constant for idle time calculation */
83 
84 	u32			defmap;
85 
86 	/* Link-sharing scheduler parameters */
87 	long			maxidle;	/* Class parameters: see below. */
88 	long			offtime;
89 	long			minidle;
90 	u32			avpkt;
91 	struct qdisc_rate_table	*R_tab;
92 
93 	/* General scheduler (WRR) parameters */
94 	long			allot;
95 	long			quantum;	/* Allotment per WRR round */
96 	long			weight;		/* Relative allotment: see below */
97 
98 	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
99 	struct cbq_class	*split;		/* Ptr to split node */
100 	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
101 	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
102 	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
103 						   parent otherwise */
104 	struct cbq_class	*sibling;	/* Sibling chain */
105 	struct cbq_class	*children;	/* Pointer to children chain */
106 
107 	struct Qdisc		*q;		/* Elementary queueing discipline */
108 
109 
110 /* Variables */
111 	unsigned char		cpriority;	/* Effective priority */
112 	unsigned char		delayed;
113 	unsigned char		level;		/* level of the class in hierarchy:
114 						   0 for leaf classes, and maximal
115 						   level of children + 1 for nodes.
116 						 */
117 
118 	psched_time_t		last;		/* Last end of service */
119 	psched_time_t		undertime;
120 	long			avgidle;
121 	long			deficit;	/* Saved deficit for WRR */
122 	psched_time_t		penalized;
123 	struct gnet_stats_basic_packed bstats;
124 	struct gnet_stats_queue qstats;
125 	struct gnet_stats_rate_est64 rate_est;
126 	struct tc_cbq_xstats	xstats;
127 
128 	struct tcf_proto __rcu	*filter_list;
129 
130 	int			refcnt;
131 	int			filters;
132 
133 	struct cbq_class	*defaults[TC_PRIO_MAX + 1];
134 };
135 
136 struct cbq_sched_data {
137 	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */
138 	int			nclasses[TC_CBQ_MAXPRIO + 1];
139 	unsigned int		quanta[TC_CBQ_MAXPRIO + 1];
140 
141 	struct cbq_class	link;
142 
143 	unsigned int		activemask;
144 	struct cbq_class	*active[TC_CBQ_MAXPRIO + 1];	/* List of all classes
145 								   with backlog */
146 
147 #ifdef CONFIG_NET_CLS_ACT
148 	struct cbq_class	*rx_class;
149 #endif
150 	struct cbq_class	*tx_class;
151 	struct cbq_class	*tx_borrowed;
152 	int			tx_len;
153 	psched_time_t		now;		/* Cached timestamp */
154 	unsigned int		pmask;
155 
156 	struct hrtimer		delay_timer;
157 	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
158 						   started when CBQ has
159 						   backlog, but cannot
160 						   transmit just now */
161 	psched_tdiff_t		wd_expires;
162 	int			toplevel;
163 	u32			hgenerator;
164 };
165 
166 
167 #define L2T(cl, len)	qdisc_l2t((cl)->R_tab, len)
168 
169 static inline struct cbq_class *
170 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
171 {
172 	struct Qdisc_class_common *clc;
173 
174 	clc = qdisc_class_find(&q->clhash, classid);
175 	if (clc == NULL)
176 		return NULL;
177 	return container_of(clc, struct cbq_class, common);
178 }
179 
180 #ifdef CONFIG_NET_CLS_ACT
181 
182 static struct cbq_class *
183 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
184 {
185 	struct cbq_class *cl;
186 
187 	for (cl = this->tparent; cl; cl = cl->tparent) {
188 		struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
189 
190 		if (new != NULL && new != this)
191 			return new;
192 	}
193 	return NULL;
194 }
195 
196 #endif
197 
198 /* Classify packet. The procedure is pretty complicated, but
199  * it allows us to combine link sharing and priority scheduling
200  * transparently.
201  *
202  * Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
203  * so that it resolves to split nodes. Then packets are classified
204  * by logical priority, or a more specific classifier may be attached
205  * to the split node.
206  */
207 
208 static struct cbq_class *
209 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
210 {
211 	struct cbq_sched_data *q = qdisc_priv(sch);
212 	struct cbq_class *head = &q->link;
213 	struct cbq_class **defmap;
214 	struct cbq_class *cl = NULL;
215 	u32 prio = skb->priority;
216 	struct tcf_proto *fl;
217 	struct tcf_result res;
218 
219 	/*
220 	 *  Step 1. If skb->priority points to one of our classes, use it.
221 	 */
222 	if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
223 	    (cl = cbq_class_lookup(q, prio)) != NULL)
224 		return cl;
225 
226 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
227 	for (;;) {
228 		int result = 0;
229 		defmap = head->defaults;
230 
231 		fl = rcu_dereference_bh(head->filter_list);
232 		/*
233 		 * Step 2+n. Apply classifier.
234 		 */
235 		result = tc_classify(skb, fl, &res, true);
236 		if (!fl || result < 0)
237 			goto fallback;
238 
239 		cl = (void *)res.class;
240 		if (!cl) {
241 			if (TC_H_MAJ(res.classid))
242 				cl = cbq_class_lookup(q, res.classid);
243 			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
244 				cl = defmap[TC_PRIO_BESTEFFORT];
245 
246 			if (cl == NULL)
247 				goto fallback;
248 		}
249 		if (cl->level >= head->level)
250 			goto fallback;
251 #ifdef CONFIG_NET_CLS_ACT
252 		switch (result) {
253 		case TC_ACT_QUEUED:
254 		case TC_ACT_STOLEN:
255 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
256 		case TC_ACT_SHOT:
257 			return NULL;
258 		case TC_ACT_RECLASSIFY:
259 			return cbq_reclassify(skb, cl);
260 		}
261 #endif
262 		if (cl->level == 0)
263 			return cl;
264 
265 		/*
266 		 * Step 3+n. If classifier selected a link sharing class,
267 		 *	   apply its agency-specific classifier.
268 		 *	   Repeat this procedure until we hit a leaf node.
269 		 */
270 		head = cl;
271 	}
272 
273 fallback:
274 	cl = head;
275 
276 	/*
277 	 * Step 4. No success...
278 	 */
279 	if (TC_H_MAJ(prio) == 0 &&
280 	    !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
281 	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
282 		return head;
283 
284 	return cl;
285 }
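/* A hypothetical walk-through of the classification above (made-up
 * handle/classid values):
 *
 * Suppose sch->handle is 1:0 (0x00010000) and skb->priority is 0x00010002
 * (i.e. 1:2). Step 1 matches, so class 1:2 is used directly and no filter
 * is consulted.
 *
 * If skb->priority does not name one of our classes, the filter list of
 * the current head (initially the root link class) is run. A verdict that
 * carries a classid selects that class; a bare minor number indexes
 * head->defaults[], with defaults[TC_PRIO_BESTEFFORT] as the catch-all.
 * A non-leaf result becomes the new head and its own filters are run,
 * until a leaf (level 0) class is found or we fall back to head itself.
 */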
286 
287 /*
288  * A packet has just been enqueued on the empty class.
289  * cbq_activate_class adds it to the tail of active class list
290  * of its priority band.
291  */
292 
293 static inline void cbq_activate_class(struct cbq_class *cl)
294 {
295 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
296 	int prio = cl->cpriority;
297 	struct cbq_class *cl_tail;
298 
299 	cl_tail = q->active[prio];
300 	q->active[prio] = cl;
301 
302 	if (cl_tail != NULL) {
303 		cl->next_alive = cl_tail->next_alive;
304 		cl_tail->next_alive = cl;
305 	} else {
306 		cl->next_alive = cl;
307 		q->activemask |= (1<<prio);
308 	}
309 }
310 
311 /*
312  * Unlink class from active chain.
313  * Note that this same procedure is done directly in cbq_dequeue*
314  * during round-robin procedure.
315  */
316 
317 static void cbq_deactivate_class(struct cbq_class *this)
318 {
319 	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
320 	int prio = this->cpriority;
321 	struct cbq_class *cl;
322 	struct cbq_class *cl_prev = q->active[prio];
323 
324 	do {
325 		cl = cl_prev->next_alive;
326 		if (cl == this) {
327 			cl_prev->next_alive = cl->next_alive;
328 			cl->next_alive = NULL;
329 
330 			if (cl == q->active[prio]) {
331 				q->active[prio] = cl_prev;
332 				if (cl == q->active[prio]) {
333 					q->active[prio] = NULL;
334 					q->activemask &= ~(1<<prio);
335 					return;
336 				}
337 			}
338 			return;
339 		}
340 	} while ((cl_prev = cl) != q->active[prio]);
341 }
342 
343 static void
344 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
345 {
346 	int toplevel = q->toplevel;
347 
348 	if (toplevel > cl->level) {
349 		psched_time_t now = psched_get_time();
350 
351 		do {
352 			if (cl->undertime < now) {
353 				q->toplevel = cl->level;
354 				return;
355 			}
356 		} while ((cl = cl->borrow) != NULL && toplevel > cl->level);
357 	}
358 }
359 
360 static int
361 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
362 	    struct sk_buff **to_free)
363 {
364 	struct cbq_sched_data *q = qdisc_priv(sch);
365 	int uninitialized_var(ret);
366 	struct cbq_class *cl = cbq_classify(skb, sch, &ret);
367 
368 #ifdef CONFIG_NET_CLS_ACT
369 	q->rx_class = cl;
370 #endif
371 	if (cl == NULL) {
372 		if (ret & __NET_XMIT_BYPASS)
373 			qdisc_qstats_drop(sch);
374 		__qdisc_drop(skb, to_free);
375 		return ret;
376 	}
377 
378 	ret = qdisc_enqueue(skb, cl->q, to_free);
379 	if (ret == NET_XMIT_SUCCESS) {
380 		sch->q.qlen++;
381 		cbq_mark_toplevel(q, cl);
382 		if (!cl->next_alive)
383 			cbq_activate_class(cl);
384 		return ret;
385 	}
386 
387 	if (net_xmit_drop_count(ret)) {
388 		qdisc_qstats_drop(sch);
389 		cbq_mark_toplevel(q, cl);
390 		cl->qstats.drops++;
391 	}
392 	return ret;
393 }
394 
395 /* Overlimit action: penalize leaf class by adding offtime */
396 static void cbq_overlimit(struct cbq_class *cl)
397 {
398 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
399 	psched_tdiff_t delay = cl->undertime - q->now;
400 
401 	if (!cl->delayed) {
402 		delay += cl->offtime;
403 
404 		/*
405 		 * Class goes to sleep, so that it will have no
406 		 * chance to work avgidle. Let's forgive it 8)
407 		 *
408 		 * BTW cbq-2.0 has a bug in this
409 		 * place: apparently they forgot to shift it by cl->ewma_log.
410 		 */
411 		if (cl->avgidle < 0)
412 			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
413 		if (cl->avgidle < cl->minidle)
414 			cl->avgidle = cl->minidle;
415 		if (delay <= 0)
416 			delay = 1;
417 		cl->undertime = q->now + delay;
418 
419 		cl->xstats.overactions++;
420 		cl->delayed = 1;
421 	}
422 	if (q->wd_expires == 0 || q->wd_expires > delay)
423 		q->wd_expires = delay;
424 
425 	/* Dirty work! We must schedule wakeups based on
426 	 * real available rate, rather than leaf rate,
427 	 * which may be tiny (even zero).
428 	 */
429 	if (q->toplevel == TC_CBQ_MAXLEVEL) {
430 		struct cbq_class *b;
431 		psched_tdiff_t base_delay = q->wd_expires;
432 
433 		for (b = cl->borrow; b; b = b->borrow) {
434 			delay = b->undertime - q->now;
435 			if (delay < base_delay) {
436 				if (delay <= 0)
437 					delay = 1;
438 				base_delay = delay;
439 			}
440 		}
441 
442 		q->wd_expires = base_delay;
443 	}
444 }
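/* A small numeric illustration of the penalty above (made-up tick values):
 * suppose cl->undertime - q->now = 200, cl->offtime = 1000,
 * cl->avgidle = -320 and cl->ewma_log = 5. Then
 *
 *	delay = 200 + 1000 - (320 - (320 >> 5)) = 1200 - 310 = 890 ticks
 *
 * so the class sleeps until q->now + 890 and the watchdog is scheduled no
 * later than that (or earlier, from an ancestor on the borrow chain, when
 * no toplevel cutoff is active).
 */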
445 
446 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
447 				       psched_time_t now)
448 {
449 	struct cbq_class *cl;
450 	struct cbq_class *cl_prev = q->active[prio];
451 	psched_time_t sched = now;
452 
453 	if (cl_prev == NULL)
454 		return 0;
455 
456 	do {
457 		cl = cl_prev->next_alive;
458 		if (now - cl->penalized > 0) {
459 			cl_prev->next_alive = cl->next_alive;
460 			cl->next_alive = NULL;
461 			cl->cpriority = cl->priority;
462 			cl->delayed = 0;
463 			cbq_activate_class(cl);
464 
465 			if (cl == q->active[prio]) {
466 				q->active[prio] = cl_prev;
467 				if (cl == q->active[prio]) {
468 					q->active[prio] = NULL;
469 					return 0;
470 				}
471 			}
472 
473 			cl = cl_prev->next_alive;
474 		} else if (sched - cl->penalized > 0)
475 			sched = cl->penalized;
476 	} while ((cl_prev = cl) != q->active[prio]);
477 
478 	return sched - now;
479 }
480 
481 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
482 {
483 	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
484 						delay_timer);
485 	struct Qdisc *sch = q->watchdog.qdisc;
486 	psched_time_t now;
487 	psched_tdiff_t delay = 0;
488 	unsigned int pmask;
489 
490 	now = psched_get_time();
491 
492 	pmask = q->pmask;
493 	q->pmask = 0;
494 
495 	while (pmask) {
496 		int prio = ffz(~pmask);
497 		psched_tdiff_t tmp;
498 
499 		pmask &= ~(1<<prio);
500 
501 		tmp = cbq_undelay_prio(q, prio, now);
502 		if (tmp > 0) {
503 			q->pmask |= 1<<prio;
504 			if (tmp < delay || delay == 0)
505 				delay = tmp;
506 		}
507 	}
508 
509 	if (delay) {
510 		ktime_t time;
511 
512 		time = ktime_set(0, 0);
513 		time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
514 		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
515 	}
516 
517 	__netif_schedule(qdisc_root(sch));
518 	return HRTIMER_NORESTART;
519 }
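/* Note on the bitmask walk above: each bit of q->pmask marks a priority
 * band with penalized classes pending, and ffz(~pmask) returns the index
 * of the lowest set bit. E.g. with pmask == 0x28 (bits 3 and 5) band 3 is
 * handled first, then band 5; bands that still have penalized classes are
 * put back into q->pmask and the hrtimer is re-armed for the earliest
 * pending penalty time.
 */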
520 
521 /*
522  * This is a mission-critical procedure.
523  *
524  * We "regenerate" the toplevel cutoff if the transmitting class
525  * has backlog and is not regulated. This is not part of the
526  * original CBQ description, but it looks more reasonable.
527  * Probably it is wrong. This question needs further investigation.
528  */
529 
530 static inline void
531 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
532 		    struct cbq_class *borrowed)
533 {
534 	if (cl && q->toplevel >= borrowed->level) {
535 		if (cl->q->q.qlen > 1) {
536 			do {
537 				if (borrowed->undertime == PSCHED_PASTPERFECT) {
538 					q->toplevel = borrowed->level;
539 					return;
540 				}
541 			} while ((borrowed = borrowed->borrow) != NULL);
542 		}
543 #if 0
544 	/* It is not necessary now. Uncommenting it
545 	   will save CPU cycles, but decrease fairness.
546 	 */
547 		q->toplevel = TC_CBQ_MAXLEVEL;
548 #endif
549 	}
550 }
551 
552 static void
553 cbq_update(struct cbq_sched_data *q)
554 {
555 	struct cbq_class *this = q->tx_class;
556 	struct cbq_class *cl = this;
557 	int len = q->tx_len;
558 	psched_time_t now;
559 
560 	q->tx_class = NULL;
561 	/* Time integrator. We calculate EOS time
562 	 * by adding expected packet transmission time.
563 	 */
564 	now = q->now + L2T(&q->link, len);
565 
566 	for ( ; cl; cl = cl->share) {
567 		long avgidle = cl->avgidle;
568 		long idle;
569 
570 		cl->bstats.packets++;
571 		cl->bstats.bytes += len;
572 
573 		/*
574 		 * (now - last) is total time between packet right edges.
575 		 * (last_pktlen/rate) is "virtual" busy time, so that
576 		 *
577 		 *	idle = (now - last) - last_pktlen/rate
578 		 */
579 
580 		idle = now - cl->last;
581 		if ((unsigned long)idle > 128*1024*1024) {
582 			avgidle = cl->maxidle;
583 		} else {
584 			idle -= L2T(cl, len);
585 
586 		/* true_avgidle := (1-W)*true_avgidle + W*idle,
587 		 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
588 		 * cl->avgidle == true_avgidle/W,
589 		 * hence:
590 		 */
591 			avgidle += idle - (avgidle>>cl->ewma_log);
592 		}
593 
594 		if (avgidle <= 0) {
595 			/* Overlimit or at-limit */
596 
597 			if (avgidle < cl->minidle)
598 				avgidle = cl->minidle;
599 
600 			cl->avgidle = avgidle;
601 
602 			/* Calculate expected time, when this class
603 			 * will be allowed to send.
604 			 * It will occur, when:
605 			 * (1-W)*true_avgidle + W*delay = 0, i.e.
606 			 * idle = (1/W - 1)*(-true_avgidle)
607 			 * or
608 			 * idle = (1 - W)*(-cl->avgidle);
609 			 */
610 			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
611 
612 			/*
613 			 * That is not all.
614 			 * To maintain the rate allocated to the class,
615 			 * we add to undertime virtual clock,
616 			 * necessary to complete transmitted packet.
617 			 * (len/phys_bandwidth has been already passed
618 			 * to the moment of cbq_update)
619 			 */
620 
621 			idle -= L2T(&q->link, len);
622 			idle += L2T(cl, len);
623 
624 			cl->undertime = now + idle;
625 		} else {
626 			/* Underlimit */
627 
628 			cl->undertime = PSCHED_PASTPERFECT;
629 			if (avgidle > cl->maxidle)
630 				cl->avgidle = cl->maxidle;
631 			else
632 				cl->avgidle = avgidle;
633 		}
634 		if ((s64)(now - cl->last) > 0)
635 			cl->last = now;
636 	}
637 
638 	cbq_update_toplevel(q, this, q->tx_borrowed);
639 }
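/* A worked example of the scaled EWMA above, with made-up tick values and
 * ewma_log = 5 (i.e. W = 1/32):
 *
 *	cl->avgidle = 3200 (true_avgidle = 100) and idle = -500 for this
 *	packet (the class finished it 500 ticks "too early" for its rate):
 *
 *	avgidle += idle - (avgidle >> 5) = 3200 - 500 - 100 = 2600
 *
 *	i.e. true_avgidle moves from 100 to 2600/32 = 81.25, which is
 *	exactly (31/32)*100 + (1/32)*(-500).
 *
 *	If avgidle later goes negative, say to -320, the class is overlimit
 *	and must stay idle for roughly (320 - (320 >> 5)) = 310 ticks
 *	(corrected by the class/link transmission times), so
 *	undertime ~= now + 310.
 */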
640 
641 static inline struct cbq_class *
642 cbq_under_limit(struct cbq_class *cl)
643 {
644 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
645 	struct cbq_class *this_cl = cl;
646 
647 	if (cl->tparent == NULL)
648 		return cl;
649 
650 	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
651 		cl->delayed = 0;
652 		return cl;
653 	}
654 
655 	do {
656 		/* This is a very suspicious place. Currently the overlimit
657 		 * action is generated for non-bounded classes
658 		 * only if the link is completely congested.
659 		 * Though this agrees with the ancestor-only paradigm,
660 		 * it looks very stupid. In particular,
661 		 * it means that this chunk of code will either
662 		 * never be called or result in strong amplification
663 		 * of burstiness. Dangerous and silly, but
664 		 * no other solution exists.
665 		 */
666 		cl = cl->borrow;
667 		if (!cl) {
668 			this_cl->qstats.overlimits++;
669 			cbq_overlimit(this_cl);
670 			return NULL;
671 		}
672 		if (cl->level > q->toplevel)
673 			return NULL;
674 	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
675 
676 	cl->delayed = 0;
677 	return cl;
678 }
679 
680 static inline struct sk_buff *
681 cbq_dequeue_prio(struct Qdisc *sch, int prio)
682 {
683 	struct cbq_sched_data *q = qdisc_priv(sch);
684 	struct cbq_class *cl_tail, *cl_prev, *cl;
685 	struct sk_buff *skb;
686 	int deficit;
687 
688 	cl_tail = cl_prev = q->active[prio];
689 	cl = cl_prev->next_alive;
690 
691 	do {
692 		deficit = 0;
693 
694 		/* Start round */
695 		do {
696 			struct cbq_class *borrow = cl;
697 
698 			if (cl->q->q.qlen &&
699 			    (borrow = cbq_under_limit(cl)) == NULL)
700 				goto skip_class;
701 
702 			if (cl->deficit <= 0) {
703 				/* Class exhausted its allotment per
704 				 * this round. Switch to the next one.
705 				 */
706 				deficit = 1;
707 				cl->deficit += cl->quantum;
708 				goto next_class;
709 			}
710 
711 			skb = cl->q->dequeue(cl->q);
712 
713 			/* Class did not give us any skb :-(
714 			 * It could occur even if cl->q->q.qlen != 0
715 			 * f.e. if cl->q == "tbf"
716 			 */
717 			if (skb == NULL)
718 				goto skip_class;
719 
720 			cl->deficit -= qdisc_pkt_len(skb);
721 			q->tx_class = cl;
722 			q->tx_borrowed = borrow;
723 			if (borrow != cl) {
724 #ifndef CBQ_XSTATS_BORROWS_BYTES
725 				borrow->xstats.borrows++;
726 				cl->xstats.borrows++;
727 #else
728 				borrow->xstats.borrows += qdisc_pkt_len(skb);
729 				cl->xstats.borrows += qdisc_pkt_len(skb);
730 #endif
731 			}
732 			q->tx_len = qdisc_pkt_len(skb);
733 
734 			if (cl->deficit <= 0) {
735 				q->active[prio] = cl;
736 				cl = cl->next_alive;
737 				cl->deficit += cl->quantum;
738 			}
739 			return skb;
740 
741 skip_class:
742 			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
743 				/* Class is empty or penalized.
744 				 * Unlink it from active chain.
745 				 */
746 				cl_prev->next_alive = cl->next_alive;
747 				cl->next_alive = NULL;
748 
749 				/* Did cl_tail point to it? */
750 				if (cl == cl_tail) {
751 					/* Repair it! */
752 					cl_tail = cl_prev;
753 
754 					/* Was it the last class in this band? */
755 					if (cl == cl_tail) {
756 						/* Kill the band! */
757 						q->active[prio] = NULL;
758 						q->activemask &= ~(1<<prio);
759 						if (cl->q->q.qlen)
760 							cbq_activate_class(cl);
761 						return NULL;
762 					}
763 
764 					q->active[prio] = cl_tail;
765 				}
766 				if (cl->q->q.qlen)
767 					cbq_activate_class(cl);
768 
769 				cl = cl_prev;
770 			}
771 
772 next_class:
773 			cl_prev = cl;
774 			cl = cl->next_alive;
775 		} while (cl_prev != cl_tail);
776 	} while (deficit);
777 
778 	q->active[prio] = cl_prev;
779 
780 	return NULL;
781 }
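/* Note on the round-robin above: within a band a class may keep sending
 * while its saved deficit is positive; each dequeued packet charges
 * qdisc_pkt_len(skb) against the deficit, and a class whose deficit has
 * run out gets cl->quantum added back and yields until the band comes
 * around again. Roughly, with quantum = 1000 and 1514-byte packets a class
 * averages one packet per one-and-a-half rounds (unused budget carries
 * over), while quantum = 3000 allows about two packets per round.
 */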
782 
783 static inline struct sk_buff *
784 cbq_dequeue_1(struct Qdisc *sch)
785 {
786 	struct cbq_sched_data *q = qdisc_priv(sch);
787 	struct sk_buff *skb;
788 	unsigned int activemask;
789 
790 	activemask = q->activemask & 0xFF;
791 	while (activemask) {
792 		int prio = ffz(~activemask);
793 		activemask &= ~(1<<prio);
794 		skb = cbq_dequeue_prio(sch, prio);
795 		if (skb)
796 			return skb;
797 	}
798 	return NULL;
799 }
800 
801 static struct sk_buff *
802 cbq_dequeue(struct Qdisc *sch)
803 {
804 	struct sk_buff *skb;
805 	struct cbq_sched_data *q = qdisc_priv(sch);
806 	psched_time_t now;
807 
808 	now = psched_get_time();
809 
810 	if (q->tx_class)
811 		cbq_update(q);
812 
813 	q->now = now;
814 
815 	for (;;) {
816 		q->wd_expires = 0;
817 
818 		skb = cbq_dequeue_1(sch);
819 		if (skb) {
820 			qdisc_bstats_update(sch, skb);
821 			sch->q.qlen--;
822 			return skb;
823 		}
824 
825 		/* All the classes are overlimit.
826 		 *
827 		 * It is possible, if:
828 		 *
829 		 * 1. Scheduler is empty.
830 		 * 2. Toplevel cutoff inhibited borrowing.
831 		 * 3. Root class is overlimit.
832 		 *
833 		 * Reset the 2nd and 3rd conditions and retry.
834 		 *
835 		 * Note that NS and cbq-2.0 are buggy: peeking
836 		 * an arbitrary class is appropriate for ancestor-only
837 		 * sharing, but not for the toplevel algorithm.
838 		 *
839 		 * Our version is better but slower, because it requires
840 		 * two passes; that is unavoidable with top-level sharing.
841 		 */
842 
843 		if (q->toplevel == TC_CBQ_MAXLEVEL &&
844 		    q->link.undertime == PSCHED_PASTPERFECT)
845 			break;
846 
847 		q->toplevel = TC_CBQ_MAXLEVEL;
848 		q->link.undertime = PSCHED_PASTPERFECT;
849 	}
850 
851 	/* No packets in the scheduler, or nobody wants to give them to us :-(
852 	 * Sigh... start the watchdog timer in the latter case.
853 	 */
854 
855 	if (sch->q.qlen) {
856 		qdisc_qstats_overlimit(sch);
857 		if (q->wd_expires)
858 			qdisc_watchdog_schedule(&q->watchdog,
859 						now + q->wd_expires);
860 	}
861 	return NULL;
862 }
863 
864 /* CBQ class maintenance routines */
865 
866 static void cbq_adjust_levels(struct cbq_class *this)
867 {
868 	if (this == NULL)
869 		return;
870 
871 	do {
872 		int level = 0;
873 		struct cbq_class *cl;
874 
875 		cl = this->children;
876 		if (cl) {
877 			do {
878 				if (cl->level > level)
879 					level = cl->level;
880 			} while ((cl = cl->sibling) != this->children);
881 		}
882 		this->level = level + 1;
883 	} while ((this = this->tparent) != NULL);
884 }
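/* Level bookkeeping in short: leaves are level 0 and every interior class
 * is one more than its deepest child, e.g. for root <- A <- {B, C} the
 * leaves B and C stay at 0, A becomes 1 and the root becomes 2.
 * cbq_adjust_levels() recomputes this along the ancestor chain of the
 * class that was just added or removed.
 */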
885 
886 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
887 {
888 	struct cbq_class *cl;
889 	unsigned int h;
890 
891 	if (q->quanta[prio] == 0)
892 		return;
893 
894 	for (h = 0; h < q->clhash.hashsize; h++) {
895 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
896 			/* BUGGGG... Beware! This expression suffers from
897 			 * arithmetic overflows!
898 			 */
899 			if (cl->priority == prio) {
900 				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
901 					q->quanta[prio];
902 			}
903 			if (cl->quantum <= 0 ||
904 			    cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
905 				pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
906 					cl->common.classid, cl->quantum);
907 				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
908 			}
909 		}
910 	}
911 }
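/* A worked example of the normalization above (ignoring the overflow
 * caveat): two classes share priority band 1 with allot = 1514 and weights
 * 1 and 3, so nclasses[1] = 2 and quanta[1] = 4. Then
 *
 *	quantum(weight 1) = 1*1514*2/4 =  757 bytes
 *	quantum(weight 3) = 3*1514*2/4 = 2271 bytes
 *
 * i.e. the quanta average out to allot and are split 1:3 by weight; any
 * result outside (0, 32*MTU] is clamped to mtu/2 + 1.
 */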
912 
913 static void cbq_sync_defmap(struct cbq_class *cl)
914 {
915 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
916 	struct cbq_class *split = cl->split;
917 	unsigned int h;
918 	int i;
919 
920 	if (split == NULL)
921 		return;
922 
923 	for (i = 0; i <= TC_PRIO_MAX; i++) {
924 		if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
925 			split->defaults[i] = NULL;
926 	}
927 
928 	for (i = 0; i <= TC_PRIO_MAX; i++) {
929 		int level = split->level;
930 
931 		if (split->defaults[i])
932 			continue;
933 
934 		for (h = 0; h < q->clhash.hashsize; h++) {
935 			struct cbq_class *c;
936 
937 			hlist_for_each_entry(c, &q->clhash.hash[h],
938 					     common.hnode) {
939 				if (c->split == split && c->level < level &&
940 				    c->defmap & (1<<i)) {
941 					split->defaults[i] = c;
942 					level = c->level;
943 				}
944 			}
945 		}
946 	}
947 }
948 
949 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
950 {
951 	struct cbq_class *split = NULL;
952 
953 	if (splitid == 0) {
954 		split = cl->split;
955 		if (!split)
956 			return;
957 		splitid = split->common.classid;
958 	}
959 
960 	if (split == NULL || split->common.classid != splitid) {
961 		for (split = cl->tparent; split; split = split->tparent)
962 			if (split->common.classid == splitid)
963 				break;
964 	}
965 
966 	if (split == NULL)
967 		return;
968 
969 	if (cl->split != split) {
970 		cl->defmap = 0;
971 		cbq_sync_defmap(cl);
972 		cl->split = split;
973 		cl->defmap = def & mask;
974 	} else
975 		cl->defmap = (cl->defmap & ~mask) | (def & mask);
976 
977 	cbq_sync_defmap(cl);
978 }
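/* Note on defmap handling: cl->defmap is a bitmask over the TC_PRIO_*
 * values 0..TC_PRIO_MAX, and bit p set means "this class volunteers as the
 * default for priority p under its split node". cbq_change_defmap() only
 * touches the bits selected by mask, e.g. with defmap = 0x0f, def = 0x30
 * and mask = 0xf0 the result is (0x0f & ~0xf0) | (0x30 & 0xf0) = 0x3f.
 * cbq_sync_defmap() then rebuilds split->defaults[], picking for each
 * priority the volunteering class closest to the leaves (lowest level).
 */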
979 
980 static void cbq_unlink_class(struct cbq_class *this)
981 {
982 	struct cbq_class *cl, **clp;
983 	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
984 
985 	qdisc_class_hash_remove(&q->clhash, &this->common);
986 
987 	if (this->tparent) {
988 		clp = &this->sibling;
989 		cl = *clp;
990 		do {
991 			if (cl == this) {
992 				*clp = cl->sibling;
993 				break;
994 			}
995 			clp = &cl->sibling;
996 		} while ((cl = *clp) != this->sibling);
997 
998 		if (this->tparent->children == this) {
999 			this->tparent->children = this->sibling;
1000 			if (this->sibling == this)
1001 				this->tparent->children = NULL;
1002 		}
1003 	} else {
1004 		WARN_ON(this->sibling != this);
1005 	}
1006 }
1007 
1008 static void cbq_link_class(struct cbq_class *this)
1009 {
1010 	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1011 	struct cbq_class *parent = this->tparent;
1012 
1013 	this->sibling = this;
1014 	qdisc_class_hash_insert(&q->clhash, &this->common);
1015 
1016 	if (parent == NULL)
1017 		return;
1018 
1019 	if (parent->children == NULL) {
1020 		parent->children = this;
1021 	} else {
1022 		this->sibling = parent->children->sibling;
1023 		parent->children->sibling = this;
1024 	}
1025 }
1026 
1027 static void
1028 cbq_reset(struct Qdisc *sch)
1029 {
1030 	struct cbq_sched_data *q = qdisc_priv(sch);
1031 	struct cbq_class *cl;
1032 	int prio;
1033 	unsigned int h;
1034 
1035 	q->activemask = 0;
1036 	q->pmask = 0;
1037 	q->tx_class = NULL;
1038 	q->tx_borrowed = NULL;
1039 	qdisc_watchdog_cancel(&q->watchdog);
1040 	hrtimer_cancel(&q->delay_timer);
1041 	q->toplevel = TC_CBQ_MAXLEVEL;
1042 	q->now = psched_get_time();
1043 
1044 	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1045 		q->active[prio] = NULL;
1046 
1047 	for (h = 0; h < q->clhash.hashsize; h++) {
1048 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1049 			qdisc_reset(cl->q);
1050 
1051 			cl->next_alive = NULL;
1052 			cl->undertime = PSCHED_PASTPERFECT;
1053 			cl->avgidle = cl->maxidle;
1054 			cl->deficit = cl->quantum;
1055 			cl->cpriority = cl->priority;
1056 		}
1057 	}
1058 	sch->q.qlen = 0;
1059 }
1060 
1061 
1062 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1063 {
1064 	if (lss->change & TCF_CBQ_LSS_FLAGS) {
1065 		cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1066 		cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1067 	}
1068 	if (lss->change & TCF_CBQ_LSS_EWMA)
1069 		cl->ewma_log = lss->ewma_log;
1070 	if (lss->change & TCF_CBQ_LSS_AVPKT)
1071 		cl->avpkt = lss->avpkt;
1072 	if (lss->change & TCF_CBQ_LSS_MINIDLE)
1073 		cl->minidle = -(long)lss->minidle;
1074 	if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1075 		cl->maxidle = lss->maxidle;
1076 		cl->avgidle = lss->maxidle;
1077 	}
1078 	if (lss->change & TCF_CBQ_LSS_OFFTIME)
1079 		cl->offtime = lss->offtime;
1080 	return 0;
1081 }
1082 
1083 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1084 {
1085 	q->nclasses[cl->priority]--;
1086 	q->quanta[cl->priority] -= cl->weight;
1087 	cbq_normalize_quanta(q, cl->priority);
1088 }
1089 
1090 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1091 {
1092 	q->nclasses[cl->priority]++;
1093 	q->quanta[cl->priority] += cl->weight;
1094 	cbq_normalize_quanta(q, cl->priority);
1095 }
1096 
1097 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1098 {
1099 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1100 
1101 	if (wrr->allot)
1102 		cl->allot = wrr->allot;
1103 	if (wrr->weight)
1104 		cl->weight = wrr->weight;
1105 	if (wrr->priority) {
1106 		cl->priority = wrr->priority - 1;
1107 		cl->cpriority = cl->priority;
1108 		if (cl->priority >= cl->priority2)
1109 			cl->priority2 = TC_CBQ_MAXPRIO - 1;
1110 	}
1111 
1112 	cbq_addprio(q, cl);
1113 	return 0;
1114 }
1115 
1116 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1117 {
1118 	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1119 	return 0;
1120 }
1121 
1122 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1123 	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
1124 	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
1125 	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
1126 	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
1127 	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
1128 	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1129 	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
1130 };
1131 
1132 static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1133 {
1134 	struct cbq_sched_data *q = qdisc_priv(sch);
1135 	struct nlattr *tb[TCA_CBQ_MAX + 1];
1136 	struct tc_ratespec *r;
1137 	int err;
1138 
1139 	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1140 	if (err < 0)
1141 		return err;
1142 
1143 	if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1144 		return -EINVAL;
1145 
1146 	r = nla_data(tb[TCA_CBQ_RATE]);
1147 
1148 	if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1149 		return -EINVAL;
1150 
1151 	err = qdisc_class_hash_init(&q->clhash);
1152 	if (err < 0)
1153 		goto put_rtab;
1154 
1155 	q->link.refcnt = 1;
1156 	q->link.sibling = &q->link;
1157 	q->link.common.classid = sch->handle;
1158 	q->link.qdisc = sch;
1159 	q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1160 				      sch->handle);
1161 	if (!q->link.q)
1162 		q->link.q = &noop_qdisc;
1163 
1164 	q->link.priority = TC_CBQ_MAXPRIO - 1;
1165 	q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1166 	q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1167 	q->link.allot = psched_mtu(qdisc_dev(sch));
1168 	q->link.quantum = q->link.allot;
1169 	q->link.weight = q->link.R_tab->rate.rate;
1170 
1171 	q->link.ewma_log = TC_CBQ_DEF_EWMA;
1172 	q->link.avpkt = q->link.allot/2;
1173 	q->link.minidle = -0x7FFFFFFF;
1174 
1175 	qdisc_watchdog_init(&q->watchdog, sch);
1176 	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1177 	q->delay_timer.function = cbq_undelay;
1178 	q->toplevel = TC_CBQ_MAXLEVEL;
1179 	q->now = psched_get_time();
1180 
1181 	cbq_link_class(&q->link);
1182 
1183 	if (tb[TCA_CBQ_LSSOPT])
1184 		cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1185 
1186 	cbq_addprio(q, &q->link);
1187 	return 0;
1188 
1189 put_rtab:
1190 	qdisc_put_rtab(q->link.R_tab);
1191 	return err;
1192 }
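/* For orientation, a rough sketch of how this init path is driven from
 * user space, in the spirit of the classic LARTC examples (exact option
 * names and defaults depend on the iproute2 version):
 *
 *	tc qdisc add dev eth0 root handle 1: cbq bandwidth 10Mbit avpkt 1000
 *	tc class add dev eth0 parent 1: classid 1:1 cbq bandwidth 10Mbit \
 *		rate 2Mbit allot 1514 prio 5 avpkt 1000 maxburst 20 bounded
 *
 * The first command supplies TCA_CBQ_RATE/TCA_CBQ_RTAB (and optionally
 * TCA_CBQ_LSSOPT) for the root link class handled here; the second is
 * processed by cbq_change_class() further below.
 */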
1193 
1194 static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1195 {
1196 	unsigned char *b = skb_tail_pointer(skb);
1197 
1198 	if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1199 		goto nla_put_failure;
1200 	return skb->len;
1201 
1202 nla_put_failure:
1203 	nlmsg_trim(skb, b);
1204 	return -1;
1205 }
1206 
1207 static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1208 {
1209 	unsigned char *b = skb_tail_pointer(skb);
1210 	struct tc_cbq_lssopt opt;
1211 
1212 	opt.flags = 0;
1213 	if (cl->borrow == NULL)
1214 		opt.flags |= TCF_CBQ_LSS_BOUNDED;
1215 	if (cl->share == NULL)
1216 		opt.flags |= TCF_CBQ_LSS_ISOLATED;
1217 	opt.ewma_log = cl->ewma_log;
1218 	opt.level = cl->level;
1219 	opt.avpkt = cl->avpkt;
1220 	opt.maxidle = cl->maxidle;
1221 	opt.minidle = (u32)(-cl->minidle);
1222 	opt.offtime = cl->offtime;
1223 	opt.change = ~0;
1224 	if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1225 		goto nla_put_failure;
1226 	return skb->len;
1227 
1228 nla_put_failure:
1229 	nlmsg_trim(skb, b);
1230 	return -1;
1231 }
1232 
1233 static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1234 {
1235 	unsigned char *b = skb_tail_pointer(skb);
1236 	struct tc_cbq_wrropt opt;
1237 
1238 	memset(&opt, 0, sizeof(opt));
1239 	opt.flags = 0;
1240 	opt.allot = cl->allot;
1241 	opt.priority = cl->priority + 1;
1242 	opt.cpriority = cl->cpriority + 1;
1243 	opt.weight = cl->weight;
1244 	if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1245 		goto nla_put_failure;
1246 	return skb->len;
1247 
1248 nla_put_failure:
1249 	nlmsg_trim(skb, b);
1250 	return -1;
1251 }
1252 
1253 static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1254 {
1255 	unsigned char *b = skb_tail_pointer(skb);
1256 	struct tc_cbq_fopt opt;
1257 
1258 	if (cl->split || cl->defmap) {
1259 		opt.split = cl->split ? cl->split->common.classid : 0;
1260 		opt.defmap = cl->defmap;
1261 		opt.defchange = ~0;
1262 		if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1263 			goto nla_put_failure;
1264 	}
1265 	return skb->len;
1266 
1267 nla_put_failure:
1268 	nlmsg_trim(skb, b);
1269 	return -1;
1270 }
1271 
1272 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1273 {
1274 	if (cbq_dump_lss(skb, cl) < 0 ||
1275 	    cbq_dump_rate(skb, cl) < 0 ||
1276 	    cbq_dump_wrr(skb, cl) < 0 ||
1277 	    cbq_dump_fopt(skb, cl) < 0)
1278 		return -1;
1279 	return 0;
1280 }
1281 
1282 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1283 {
1284 	struct cbq_sched_data *q = qdisc_priv(sch);
1285 	struct nlattr *nest;
1286 
1287 	nest = nla_nest_start(skb, TCA_OPTIONS);
1288 	if (nest == NULL)
1289 		goto nla_put_failure;
1290 	if (cbq_dump_attr(skb, &q->link) < 0)
1291 		goto nla_put_failure;
1292 	return nla_nest_end(skb, nest);
1293 
1294 nla_put_failure:
1295 	nla_nest_cancel(skb, nest);
1296 	return -1;
1297 }
1298 
1299 static int
1300 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1301 {
1302 	struct cbq_sched_data *q = qdisc_priv(sch);
1303 
1304 	q->link.xstats.avgidle = q->link.avgidle;
1305 	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1306 }
1307 
1308 static int
1309 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1310 	       struct sk_buff *skb, struct tcmsg *tcm)
1311 {
1312 	struct cbq_class *cl = (struct cbq_class *)arg;
1313 	struct nlattr *nest;
1314 
1315 	if (cl->tparent)
1316 		tcm->tcm_parent = cl->tparent->common.classid;
1317 	else
1318 		tcm->tcm_parent = TC_H_ROOT;
1319 	tcm->tcm_handle = cl->common.classid;
1320 	tcm->tcm_info = cl->q->handle;
1321 
1322 	nest = nla_nest_start(skb, TCA_OPTIONS);
1323 	if (nest == NULL)
1324 		goto nla_put_failure;
1325 	if (cbq_dump_attr(skb, cl) < 0)
1326 		goto nla_put_failure;
1327 	return nla_nest_end(skb, nest);
1328 
1329 nla_put_failure:
1330 	nla_nest_cancel(skb, nest);
1331 	return -1;
1332 }
1333 
1334 static int
1335 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1336 	struct gnet_dump *d)
1337 {
1338 	struct cbq_sched_data *q = qdisc_priv(sch);
1339 	struct cbq_class *cl = (struct cbq_class *)arg;
1340 
1341 	cl->xstats.avgidle = cl->avgidle;
1342 	cl->xstats.undertime = 0;
1343 
1344 	if (cl->undertime != PSCHED_PASTPERFECT)
1345 		cl->xstats.undertime = cl->undertime - q->now;
1346 
1347 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1348 				  d, NULL, &cl->bstats) < 0 ||
1349 	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
1350 	    gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
1351 		return -1;
1352 
1353 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1354 }
1355 
1356 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1357 		     struct Qdisc **old)
1358 {
1359 	struct cbq_class *cl = (struct cbq_class *)arg;
1360 
1361 	if (new == NULL) {
1362 		new = qdisc_create_dflt(sch->dev_queue,
1363 					&pfifo_qdisc_ops, cl->common.classid);
1364 		if (new == NULL)
1365 			return -ENOBUFS;
1366 	}
1367 
1368 	*old = qdisc_replace(sch, new, &cl->q);
1369 	return 0;
1370 }
1371 
1372 static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1373 {
1374 	struct cbq_class *cl = (struct cbq_class *)arg;
1375 
1376 	return cl->q;
1377 }
1378 
1379 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1380 {
1381 	struct cbq_class *cl = (struct cbq_class *)arg;
1382 
1383 	if (cl->q->q.qlen == 0)
1384 		cbq_deactivate_class(cl);
1385 }
1386 
1387 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1388 {
1389 	struct cbq_sched_data *q = qdisc_priv(sch);
1390 	struct cbq_class *cl = cbq_class_lookup(q, classid);
1391 
1392 	if (cl) {
1393 		cl->refcnt++;
1394 		return (unsigned long)cl;
1395 	}
1396 	return 0;
1397 }
1398 
1399 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1400 {
1401 	struct cbq_sched_data *q = qdisc_priv(sch);
1402 
1403 	WARN_ON(cl->filters);
1404 
1405 	tcf_destroy_chain(&cl->filter_list);
1406 	qdisc_destroy(cl->q);
1407 	qdisc_put_rtab(cl->R_tab);
1408 	gen_kill_estimator(&cl->bstats, &cl->rate_est);
1409 	if (cl != &q->link)
1410 		kfree(cl);
1411 }
1412 
1413 static void cbq_destroy(struct Qdisc *sch)
1414 {
1415 	struct cbq_sched_data *q = qdisc_priv(sch);
1416 	struct hlist_node *next;
1417 	struct cbq_class *cl;
1418 	unsigned int h;
1419 
1420 #ifdef CONFIG_NET_CLS_ACT
1421 	q->rx_class = NULL;
1422 #endif
1423 	/*
1424 	 * Filters must be destroyed first because we don't destroy the
1425 	 * classes from root to leaves, which means that filters can still
1426 	 * be bound to classes that have been destroyed already. --TGR '04
1427 	 */
1428 	for (h = 0; h < q->clhash.hashsize; h++) {
1429 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode)
1430 			tcf_destroy_chain(&cl->filter_list);
1431 	}
1432 	for (h = 0; h < q->clhash.hashsize; h++) {
1433 		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1434 					  common.hnode)
1435 			cbq_destroy_class(sch, cl);
1436 	}
1437 	qdisc_class_hash_destroy(&q->clhash);
1438 }
1439 
1440 static void cbq_put(struct Qdisc *sch, unsigned long arg)
1441 {
1442 	struct cbq_class *cl = (struct cbq_class *)arg;
1443 
1444 	if (--cl->refcnt == 0) {
1445 #ifdef CONFIG_NET_CLS_ACT
1446 		spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1447 		struct cbq_sched_data *q = qdisc_priv(sch);
1448 
1449 		spin_lock_bh(root_lock);
1450 		if (q->rx_class == cl)
1451 			q->rx_class = NULL;
1452 		spin_unlock_bh(root_lock);
1453 #endif
1454 
1455 		cbq_destroy_class(sch, cl);
1456 	}
1457 }
1458 
1459 static int
1460 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1461 		 unsigned long *arg)
1462 {
1463 	int err;
1464 	struct cbq_sched_data *q = qdisc_priv(sch);
1465 	struct cbq_class *cl = (struct cbq_class *)*arg;
1466 	struct nlattr *opt = tca[TCA_OPTIONS];
1467 	struct nlattr *tb[TCA_CBQ_MAX + 1];
1468 	struct cbq_class *parent;
1469 	struct qdisc_rate_table *rtab = NULL;
1470 
1471 	if (opt == NULL)
1472 		return -EINVAL;
1473 
1474 	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1475 	if (err < 0)
1476 		return err;
1477 
1478 	if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE])
1479 		return -EOPNOTSUPP;
1480 
1481 	if (cl) {
1482 		/* Check parent */
1483 		if (parentid) {
1484 			if (cl->tparent &&
1485 			    cl->tparent->common.classid != parentid)
1486 				return -EINVAL;
1487 			if (!cl->tparent && parentid != TC_H_ROOT)
1488 				return -EINVAL;
1489 		}
1490 
1491 		if (tb[TCA_CBQ_RATE]) {
1492 			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1493 					      tb[TCA_CBQ_RTAB]);
1494 			if (rtab == NULL)
1495 				return -EINVAL;
1496 		}
1497 
1498 		if (tca[TCA_RATE]) {
1499 			err = gen_replace_estimator(&cl->bstats, NULL,
1500 						    &cl->rate_est,
1501 						    NULL,
1502 						    qdisc_root_sleeping_running(sch),
1503 						    tca[TCA_RATE]);
1504 			if (err) {
1505 				qdisc_put_rtab(rtab);
1506 				return err;
1507 			}
1508 		}
1509 
1510 		/* Change class parameters */
1511 		sch_tree_lock(sch);
1512 
1513 		if (cl->next_alive != NULL)
1514 			cbq_deactivate_class(cl);
1515 
1516 		if (rtab) {
1517 			qdisc_put_rtab(cl->R_tab);
1518 			cl->R_tab = rtab;
1519 		}
1520 
1521 		if (tb[TCA_CBQ_LSSOPT])
1522 			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1523 
1524 		if (tb[TCA_CBQ_WRROPT]) {
1525 			cbq_rmprio(q, cl);
1526 			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1527 		}
1528 
1529 		if (tb[TCA_CBQ_FOPT])
1530 			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1531 
1532 		if (cl->q->q.qlen)
1533 			cbq_activate_class(cl);
1534 
1535 		sch_tree_unlock(sch);
1536 
1537 		return 0;
1538 	}
1539 
1540 	if (parentid == TC_H_ROOT)
1541 		return -EINVAL;
1542 
1543 	if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1544 	    tb[TCA_CBQ_LSSOPT] == NULL)
1545 		return -EINVAL;
1546 
1547 	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1548 	if (rtab == NULL)
1549 		return -EINVAL;
1550 
1551 	if (classid) {
1552 		err = -EINVAL;
1553 		if (TC_H_MAJ(classid ^ sch->handle) ||
1554 		    cbq_class_lookup(q, classid))
1555 			goto failure;
1556 	} else {
1557 		int i;
1558 		classid = TC_H_MAKE(sch->handle, 0x8000);
1559 
1560 		for (i = 0; i < 0x8000; i++) {
1561 			if (++q->hgenerator >= 0x8000)
1562 				q->hgenerator = 1;
1563 			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1564 				break;
1565 		}
1566 		err = -ENOSR;
1567 		if (i >= 0x8000)
1568 			goto failure;
1569 		classid = classid|q->hgenerator;
1570 	}
1571 
1572 	parent = &q->link;
1573 	if (parentid) {
1574 		parent = cbq_class_lookup(q, parentid);
1575 		err = -EINVAL;
1576 		if (parent == NULL)
1577 			goto failure;
1578 	}
1579 
1580 	err = -ENOBUFS;
1581 	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1582 	if (cl == NULL)
1583 		goto failure;
1584 
1585 	if (tca[TCA_RATE]) {
1586 		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1587 					NULL,
1588 					qdisc_root_sleeping_running(sch),
1589 					tca[TCA_RATE]);
1590 		if (err) {
1591 			kfree(cl);
1592 			goto failure;
1593 		}
1594 	}
1595 
1596 	cl->R_tab = rtab;
1597 	rtab = NULL;
1598 	cl->refcnt = 1;
1599 	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
1600 	if (!cl->q)
1601 		cl->q = &noop_qdisc;
1602 	cl->common.classid = classid;
1603 	cl->tparent = parent;
1604 	cl->qdisc = sch;
1605 	cl->allot = parent->allot;
1606 	cl->quantum = cl->allot;
1607 	cl->weight = cl->R_tab->rate.rate;
1608 
1609 	sch_tree_lock(sch);
1610 	cbq_link_class(cl);
1611 	cl->borrow = cl->tparent;
1612 	if (cl->tparent != &q->link)
1613 		cl->share = cl->tparent;
1614 	cbq_adjust_levels(parent);
1615 	cl->minidle = -0x7FFFFFFF;
1616 	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1617 	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1618 	if (cl->ewma_log == 0)
1619 		cl->ewma_log = q->link.ewma_log;
1620 	if (cl->maxidle == 0)
1621 		cl->maxidle = q->link.maxidle;
1622 	if (cl->avpkt == 0)
1623 		cl->avpkt = q->link.avpkt;
1624 	if (tb[TCA_CBQ_FOPT])
1625 		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1626 	sch_tree_unlock(sch);
1627 
1628 	qdisc_class_hash_grow(sch, &q->clhash);
1629 
1630 	*arg = (unsigned long)cl;
1631 	return 0;
1632 
1633 failure:
1634 	qdisc_put_rtab(rtab);
1635 	return err;
1636 }
1637 
1638 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1639 {
1640 	struct cbq_sched_data *q = qdisc_priv(sch);
1641 	struct cbq_class *cl = (struct cbq_class *)arg;
1642 	unsigned int qlen, backlog;
1643 
1644 	if (cl->filters || cl->children || cl == &q->link)
1645 		return -EBUSY;
1646 
1647 	sch_tree_lock(sch);
1648 
1649 	qlen = cl->q->q.qlen;
1650 	backlog = cl->q->qstats.backlog;
1651 	qdisc_reset(cl->q);
1652 	qdisc_tree_reduce_backlog(cl->q, qlen, backlog);
1653 
1654 	if (cl->next_alive)
1655 		cbq_deactivate_class(cl);
1656 
1657 	if (q->tx_borrowed == cl)
1658 		q->tx_borrowed = q->tx_class;
1659 	if (q->tx_class == cl) {
1660 		q->tx_class = NULL;
1661 		q->tx_borrowed = NULL;
1662 	}
1663 #ifdef CONFIG_NET_CLS_ACT
1664 	if (q->rx_class == cl)
1665 		q->rx_class = NULL;
1666 #endif
1667 
1668 	cbq_unlink_class(cl);
1669 	cbq_adjust_levels(cl->tparent);
1670 	cl->defmap = 0;
1671 	cbq_sync_defmap(cl);
1672 
1673 	cbq_rmprio(q, cl);
1674 	sch_tree_unlock(sch);
1675 
1676 	BUG_ON(--cl->refcnt == 0);
1677 	/*
1678 	 * This shouldn't happen: we "hold" one cops->get() when called
1679 	 * from tc_ctl_tclass; the destroy method is done from cops->put().
1680 	 */
1681 
1682 	return 0;
1683 }
1684 
1685 static struct tcf_proto __rcu **cbq_find_tcf(struct Qdisc *sch,
1686 					     unsigned long arg)
1687 {
1688 	struct cbq_sched_data *q = qdisc_priv(sch);
1689 	struct cbq_class *cl = (struct cbq_class *)arg;
1690 
1691 	if (cl == NULL)
1692 		cl = &q->link;
1693 
1694 	return &cl->filter_list;
1695 }
1696 
1697 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1698 				     u32 classid)
1699 {
1700 	struct cbq_sched_data *q = qdisc_priv(sch);
1701 	struct cbq_class *p = (struct cbq_class *)parent;
1702 	struct cbq_class *cl = cbq_class_lookup(q, classid);
1703 
1704 	if (cl) {
1705 		if (p && p->level <= cl->level)
1706 			return 0;
1707 		cl->filters++;
1708 		return (unsigned long)cl;
1709 	}
1710 	return 0;
1711 }
1712 
1713 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1714 {
1715 	struct cbq_class *cl = (struct cbq_class *)arg;
1716 
1717 	cl->filters--;
1718 }
1719 
1720 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1721 {
1722 	struct cbq_sched_data *q = qdisc_priv(sch);
1723 	struct cbq_class *cl;
1724 	unsigned int h;
1725 
1726 	if (arg->stop)
1727 		return;
1728 
1729 	for (h = 0; h < q->clhash.hashsize; h++) {
1730 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1731 			if (arg->count < arg->skip) {
1732 				arg->count++;
1733 				continue;
1734 			}
1735 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1736 				arg->stop = 1;
1737 				return;
1738 			}
1739 			arg->count++;
1740 		}
1741 	}
1742 }
1743 
1744 static const struct Qdisc_class_ops cbq_class_ops = {
1745 	.graft		=	cbq_graft,
1746 	.leaf		=	cbq_leaf,
1747 	.qlen_notify	=	cbq_qlen_notify,
1748 	.get		=	cbq_get,
1749 	.put		=	cbq_put,
1750 	.change		=	cbq_change_class,
1751 	.delete		=	cbq_delete,
1752 	.walk		=	cbq_walk,
1753 	.tcf_chain	=	cbq_find_tcf,
1754 	.bind_tcf	=	cbq_bind_filter,
1755 	.unbind_tcf	=	cbq_unbind_filter,
1756 	.dump		=	cbq_dump_class,
1757 	.dump_stats	=	cbq_dump_class_stats,
1758 };
1759 
1760 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1761 	.next		=	NULL,
1762 	.cl_ops		=	&cbq_class_ops,
1763 	.id		=	"cbq",
1764 	.priv_size	=	sizeof(struct cbq_sched_data),
1765 	.enqueue	=	cbq_enqueue,
1766 	.dequeue	=	cbq_dequeue,
1767 	.peek		=	qdisc_peek_dequeued,
1768 	.init		=	cbq_init,
1769 	.reset		=	cbq_reset,
1770 	.destroy	=	cbq_destroy,
1771 	.change		=	NULL,
1772 	.dump		=	cbq_dump,
1773 	.dump_stats	=	cbq_dump_stats,
1774 	.owner		=	THIS_MODULE,
1775 };
1776 
1777 static int __init cbq_module_init(void)
1778 {
1779 	return register_qdisc(&cbq_qdisc_ops);
1780 }
1781 static void __exit cbq_module_exit(void)
1782 {
1783 	unregister_qdisc(&cbq_qdisc_ops);
1784 }
1785 module_init(cbq_module_init)
1786 module_exit(cbq_module_exit)
1787 MODULE_LICENSE("GPL");
1788