/*
 * net/sched/sch_choke.c	CHOKE scheduler
 *
 * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
 * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/red.h>
#include <net/flow_dissector.h>

/*
   CHOKe stateless AQM for fair bandwidth allocation
   =================================================

   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
   unresponsive flows) is a variant of RED that penalizes misbehaving flows but
   maintains no flow state. The difference from RED is an additional step
   during the enqueuing process. If average queue size is over the
   low threshold (qmin), a packet is chosen at random from the queue.
   If both the new and chosen packet are from the same flow, both
   are dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
   needs to access packets in queue randomly. It has a minimal class
   interface to allow overriding the builtin flow classifier with
   filters.

   Source:
   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
   IEEE INFOCOM, 2000.

   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
   Characteristics", IEEE/ACM Transactions on Networking, 2004

 */
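
/* Example configuration (illustrative only; parameter values are arbitrary
 * and the accepted options depend on the tc-choke(8) version in use):
 *
 *	tc qdisc add dev eth0 root choke limit 1000 bandwidth 10mbit \
 *		min 8250 max 25000 avpkt 1000
 */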

/* Upper bound on size of sk_buff table (packets) */
#define CHOKE_MAX_QUEUE	(128*1024 - 1)

struct choke_sched_data {
/* Parameters */
	u32		 limit;
	unsigned char	 flags;

	struct red_parms parms;

/* Variables */
	struct red_vars  vars;
	struct tcf_proto __rcu *filter_list;
	struct {
		u32	prob_drop;	/* Early probability drops */
		u32	prob_mark;	/* Early probability marks */
		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
		u32	pdrop;          /* Drops due to queue limits */
		u32	other;          /* Drops due to drop() calls */
		u32	matched;	/* Drops to flow match */
	} stats;

	unsigned int	 head;
	unsigned int	 tail;

	unsigned int	 tab_mask; /* size - 1 */

	struct sk_buff **tab;
};

/* number of elements in queue including holes */
static unsigned int choke_len(const struct choke_sched_data *q)
{
	return (q->tail - q->head) & q->tab_mask;
}

/* Is ECN parameter configured */
static int use_ecn(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over max just be dropped (versus marked) */
static int use_harddrop(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_HARDDROP;
}

/* Move head pointer forward to skip over holes */
static void choke_zap_head_holes(struct choke_sched_data *q)
{
	do {
		q->head = (q->head + 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->head] == NULL);
}

/* Move tail pointer backwards to reuse holes */
static void choke_zap_tail_holes(struct choke_sched_data *q)
{
	do {
		q->tail = (q->tail - 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->tail] == NULL);
}

/* Drop packet from queue array by creating a "hole" */
static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx,
			      struct sk_buff **to_free)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb = q->tab[idx];

	q->tab[idx] = NULL;

	if (idx == q->head)
		choke_zap_head_holes(q);
	if (idx == q->tail)
		choke_zap_tail_holes(q);

	qdisc_qstats_backlog_dec(sch, skb);
	qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
	qdisc_drop(skb, sch, to_free);
	--sch->q.qlen;
}

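/* Per-packet state kept in skb->cb: the classid assigned at enqueue time
 * and a cached flow-keys digest, so each packet's flow is dissected at
 * most once when comparing flows.
 */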
struct choke_skb_cb {
	u16			classid;
	u8			keys_valid;
	struct			flow_keys_digest keys;
};

static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
}

static inline void choke_set_classid(struct sk_buff *skb, u16 classid)
{
	choke_skb_cb(skb)->classid = classid;
}

static u16 choke_get_classid(const struct sk_buff *skb)
{
	return choke_skb_cb(skb)->classid;
}

/*
 * Compare flow of two packets
 *  Returns true only if source and destination address and port match.
 *          false for special cases
 */
static bool choke_match_flow(struct sk_buff *skb1,
			     struct sk_buff *skb2)
{
	struct flow_keys temp;

	if (skb1->protocol != skb2->protocol)
		return false;

	if (!choke_skb_cb(skb1)->keys_valid) {
		choke_skb_cb(skb1)->keys_valid = 1;
		skb_flow_dissect_flow_keys(skb1, &temp, 0);
		make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
	}

	if (!choke_skb_cb(skb2)->keys_valid) {
		choke_skb_cb(skb2)->keys_valid = 1;
		skb_flow_dissect_flow_keys(skb2, &temp, 0);
		make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
	}

	return !memcmp(&choke_skb_cb(skb1)->keys,
		       &choke_skb_cb(skb2)->keys,
		       sizeof(choke_skb_cb(skb1)->keys));
}

/*
 * Classify flow using either:
 *  1. pre-existing classification result in skb
 *  2. fast internal classification
 *  3. use TC filter based classification
 */
static bool choke_classify(struct sk_buff *skb,
			   struct Qdisc *sch, int *qerr)

{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	struct tcf_proto *fl;
	int result;

	fl = rcu_dereference_bh(q->filter_list);
	result = tc_classify(skb, fl, &res, false);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
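			/* fall through */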
		case TC_ACT_SHOT:
			return false;
		}
#endif
		choke_set_classid(skb, TC_H_MIN(res.classid));
		return true;
	}

	return false;
}

/*
 * Select a packet at random from queue
 * HACK: since queue can have holes from previous deletion; retry several
 *   times to find a random skb but then just give up and return the head
 * Will return NULL if queue is empty (q->head == q->tail)
 */
static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
					 unsigned int *pidx)
{
	struct sk_buff *skb;
	int retrys = 3;

	do {
		*pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
		skb = q->tab[*pidx];
		if (skb)
			return skb;
	} while (--retrys > 0);

	return q->tab[*pidx = q->head];
}

/*
 * Compare new packet with random packet in queue
 * returns true if matched and sets *pidx
 */
static bool choke_match_random(const struct choke_sched_data *q,
			       struct sk_buff *nskb,
			       unsigned int *pidx)
{
	struct sk_buff *oskb;

	if (q->head == q->tail)
		return false;

	oskb = choke_peek_random(q, pidx);
	if (rcu_access_pointer(q->filter_list))
		return choke_get_classid(nskb) == choke_get_classid(oskb);

	return choke_match_flow(oskb, nskb);
}

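/* Enqueue: classify the packet if external filters are attached, update the
 * RED average queue size, then apply the CHOKe test (compare against a
 * randomly chosen queued packet and drop both on a flow match) before the
 * usual RED mark/drop decision and the hard queue limit check.
 */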
static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			 struct sk_buff **to_free)
{
	int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	struct choke_sched_data *q = qdisc_priv(sch);
	const struct red_parms *p = &q->parms;

	if (rcu_access_pointer(q->filter_list)) {
		/* If using external classifiers, get result and record it. */
		if (!choke_classify(skb, sch, &ret))
			goto other_drop;	/* Packet was eaten by filter */
	}

	choke_skb_cb(skb)->keys_valid = 0;
	/* Compute average queue usage (see RED) */
	q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
	if (red_is_idling(&q->vars))
		red_end_of_idle_period(&q->vars);

	/* Is queue small? */
	if (q->vars.qavg <= p->qth_min)
		q->vars.qcount = -1;
	else {
		unsigned int idx;

		/* Draw a packet at random from queue and compare flow */
		if (choke_match_random(q, skb, &idx)) {
			q->stats.matched++;
			choke_drop_by_idx(sch, idx, to_free);
			goto congestion_drop;
		}

		/* Queue is large, always mark/drop */
		if (q->vars.qavg > p->qth_max) {
			q->vars.qcount = -1;

			qdisc_qstats_overlimit(sch);
			if (use_harddrop(q) || !use_ecn(q) ||
			    !INET_ECN_set_ce(skb)) {
				q->stats.forced_drop++;
				goto congestion_drop;
			}

			q->stats.forced_mark++;
		} else if (++q->vars.qcount) {
			if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
				q->vars.qcount = 0;
				q->vars.qR = red_random(p);

				qdisc_qstats_overlimit(sch);
				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
					q->stats.prob_drop++;
					goto congestion_drop;
				}

				q->stats.prob_mark++;
			}
		} else
			q->vars.qR = red_random(p);
	}

	/* Admit new packet */
	if (sch->q.qlen < q->limit) {
		q->tab[q->tail] = skb;
		q->tail = (q->tail + 1) & q->tab_mask;
		++sch->q.qlen;
		qdisc_qstats_backlog_inc(sch, skb);
		return NET_XMIT_SUCCESS;
	}

	q->stats.pdrop++;
	return qdisc_drop(skb, sch, to_free);

congestion_drop:
	qdisc_drop(skb, sch, to_free);
	return NET_XMIT_CN;

other_drop:
	if (ret & __NET_XMIT_BYPASS)
		qdisc_qstats_drop(sch);
	__qdisc_drop(skb, to_free);
	return ret;
}

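/* Dequeue: take the packet at the head of the table, skip any holes left by
 * earlier CHOKe drops, and start a RED idle period when the queue is empty.
 */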
static struct sk_buff *choke_dequeue(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	if (q->head == q->tail) {
		if (!red_is_idling(&q->vars))
			red_start_of_idle_period(&q->vars);
		return NULL;
	}

	skb = q->tab[q->head];
	q->tab[q->head] = NULL;
	choke_zap_head_holes(q);
	--sch->q.qlen;
	qdisc_qstats_backlog_dec(sch, skb);
	qdisc_bstats_update(sch, skb);

	return skb;
}

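/* Drop all queued packets, clear the table and restart the RED state */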
static void choke_reset(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	while (q->head != q->tail) {
		struct sk_buff *skb = q->tab[q->head];

		q->head = (q->head + 1) & q->tab_mask;
		if (!skb)
			continue;
		rtnl_qdisc_drop(skb, sch);
	}

	sch->q.qlen = 0;
	sch->qstats.backlog = 0;
	memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
	q->head = q->tail = 0;
	red_restart(&q->vars);
}

static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 },
};


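/* The table is allocated with kcalloc() or vzalloc(); kvfree() handles both */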
static void choke_free(void *addr)
{
	kvfree(addr);
}

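/* Parse netlink attributes, resize the packet table if the limit changed
 * (migrating queued packets and dropping any overflow), and update the RED
 * parameters under the qdisc tree lock.
 */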
static int choke_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_CHOKE_MAX + 1];
	const struct tc_red_qopt *ctl;
	int err;
	struct sk_buff **old = NULL;
	unsigned int mask;
	u32 max_P;

	if (opt == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
	if (err < 0)
		return err;

	if (tb[TCA_CHOKE_PARMS] == NULL ||
	    tb[TCA_CHOKE_STAB] == NULL)
		return -EINVAL;

	max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;

	ctl = nla_data(tb[TCA_CHOKE_PARMS]);

	if (!red_check_params(ctl->qth_min, ctl->qth_max, ctl->Wlog))
		return -EINVAL;

	if (ctl->limit > CHOKE_MAX_QUEUE)
		return -EINVAL;

	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
	if (mask != q->tab_mask) {
		struct sk_buff **ntab;

		ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
			       GFP_KERNEL | __GFP_NOWARN);
		if (!ntab)
			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
		if (!ntab)
			return -ENOMEM;

		sch_tree_lock(sch);
		old = q->tab;
		if (old) {
			unsigned int oqlen = sch->q.qlen, tail = 0;
			unsigned dropped = 0;

			while (q->head != q->tail) {
				struct sk_buff *skb = q->tab[q->head];

				q->head = (q->head + 1) & q->tab_mask;
				if (!skb)
					continue;
				if (tail < mask) {
					ntab[tail++] = skb;
					continue;
				}
				dropped += qdisc_pkt_len(skb);
				qdisc_qstats_backlog_dec(sch, skb);
				--sch->q.qlen;
				rtnl_qdisc_drop(skb, sch);
			}
			qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped);
			q->head = 0;
			q->tail = tail;
		}

		q->tab_mask = mask;
		q->tab = ntab;
	} else
		sch_tree_lock(sch);

	q->flags = ctl->flags;
	q->limit = ctl->limit;

	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
		      ctl->Plog, ctl->Scell_log,
		      nla_data(tb[TCA_CHOKE_STAB]),
		      max_P);
	red_set_vars(&q->vars);

	if (q->head == q->tail)
		red_end_of_idle_period(&q->vars);

	sch_tree_unlock(sch);
	choke_free(old);
	return 0;
}

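/* Initial configuration uses the same netlink parsing as a later change */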
static int choke_init(struct Qdisc *sch, struct nlattr *opt)
{
	return choke_change(sch, opt);
}

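/* Dump the current RED/CHOKe parameters as nested netlink attributes */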
static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts = NULL;
	struct tc_red_qopt opt = {
		.limit		= q->limit,
		.flags		= q->flags,
		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
		.Wlog		= q->parms.Wlog,
		.Plog		= q->parms.Plog,
		.Scell_log	= q->parms.Scell_log,
	};

	opts = nla_nest_start(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
	    nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
		goto nla_put_failure;
	return nla_nest_end(skb, opts);

nla_put_failure:
	nla_nest_cancel(skb, opts);
	return -EMSGSIZE;
}

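/* Report drop/mark counters to userspace as tc_choke_xstats */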
static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct tc_choke_xstats st = {
		.early	= q->stats.prob_drop + q->stats.forced_drop,
		.marked	= q->stats.prob_mark + q->stats.forced_mark,
		.pdrop	= q->stats.pdrop,
		.other	= q->stats.other,
		.matched = q->stats.matched,
	};

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

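/* Release attached filters and the packet table */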
static void choke_destroy(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	choke_free(q->tab);
}

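/* Return the head packet without removing it from the queue */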
static struct sk_buff *choke_peek_head(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	return (q->head != q->tail) ? q->tab[q->head] : NULL;
}

static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
	.id		=	"choke",
	.priv_size	=	sizeof(struct choke_sched_data),

	.enqueue	=	choke_enqueue,
	.dequeue	=	choke_dequeue,
	.peek		=	choke_peek_head,
	.init		=	choke_init,
	.destroy	=	choke_destroy,
	.reset		=	choke_reset,
	.change		=	choke_change,
	.dump		=	choke_dump,
	.dump_stats	=	choke_dump_stats,
	.owner		=	THIS_MODULE,
};

static int __init choke_module_init(void)
{
	return register_qdisc(&choke_qdisc_ops);
}

static void __exit choke_module_exit(void)
{
	unregister_qdisc(&choke_qdisc_ops);
}

module_init(choke_module_init)
module_exit(choke_module_exit)

MODULE_LICENSE("GPL");