1 /*
2  * net/sched/sch_netem.c	Network emulator
3  *
4  * 		This program is free software; you can redistribute it and/or
5  * 		modify it under the terms of the GNU General Public License
6  * 		as published by the Free Software Foundation; either version
7  * 		2 of the License.
8  *
9  *  		Many of the algorithms and ideas for this came from
10  *		NIST Net which is not copyrighted.
11  *
12  * Authors:	Stephen Hemminger <shemminger@osdl.org>
13  *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
14  */
15 
16 #include <linux/mm.h>
17 #include <linux/module.h>
18 #include <linux/slab.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/errno.h>
22 #include <linux/skbuff.h>
23 #include <linux/vmalloc.h>
24 #include <linux/rtnetlink.h>
25 #include <linux/reciprocal_div.h>
26 #include <linux/rbtree.h>
27 
28 #include <net/netlink.h>
29 #include <net/pkt_sched.h>
30 #include <net/inet_ecn.h>
31 
32 #define VERSION "1.3"
33 
34 /*	Network Emulation Queuing algorithm.
35 	====================================
36 
37 	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
38 		 Network Emulation Tool"
39 		 [2] Luigi Rizzo, DummyNet for FreeBSD
40 
41 	 ----------------------------------------------------------------
42 
43 	 This started out as a simple way to delay outgoing packets to
44 	 test TCP but has grown to include most of the functionality
45 	 of a full blown network emulator like NISTnet. It can delay
46 	 packets and add random jitter (and correlation). The random
47 	 distribution can be loaded from a table as well to provide
48 	 normal, Pareto, or experimental curves. Packet loss,
49 	 duplication, and reordering can also be emulated.
50 
51 	 This qdisc does not do classification; that can be handled by
52 	 layering other disciplines.  It does not need to do bandwidth
53 	 control either, since that can be handled by using token
54 	 bucket or other rate control.
55 
56      Correlated Loss Generator models
57 
58 	Added generation of correlated loss according to a 4-state
59 	Markov chain (the GI model) and the "Gilbert-Elliot" model.
60 
61 	References:
62 	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
63 	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
64 	and intuitive loss model for packet networks and its implementation
65 	in the Netem module in the Linux kernel", available in [1]
66 
67 	Authors: Stefano Salsano <stefano.salsano at uniroma2.it
68 		 Fabio Ludovici <fabio.ludovici at yahoo.it>
69 */
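
/* Example usage (illustrative only; the interface name and numbers below
 * are hypothetical): netem is normally configured from user space with
 * tc(8), e.g.
 *
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25%
 *	tc qdisc change dev eth0 root netem delay 100ms loss 0.3% 25%
 *
 * The first command adds a 100ms delay with 10ms of jitter and 25%
 * correlation; the second keeps the delay and adds 0.3% correlated loss.
 */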
70 
71 struct netem_sched_data {
72 	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
73 	struct rb_root t_root;
74 
75 	/* optional qdisc for classful handling (NULL at netem init) */
76 	struct Qdisc	*qdisc;
77 
78 	struct qdisc_watchdog watchdog;
79 
80 	psched_tdiff_t latency;
81 	psched_tdiff_t jitter;
82 
83 	u32 loss;
84 	u32 ecn;
85 	u32 limit;
86 	u32 counter;
87 	u32 gap;
88 	u32 duplicate;
89 	u32 reorder;
90 	u32 corrupt;
91 	u64 rate;
92 	s32 packet_overhead;
93 	u32 cell_size;
94 	struct reciprocal_value cell_size_reciprocal;
95 	s32 cell_overhead;
96 
97 	struct crndstate {
98 		u32 last;
99 		u32 rho;
100 	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
101 
102 	struct disttable {
103 		u32  size;
104 		s16 table[0];
105 	} *delay_dist;
106 
107 	enum  {
108 		CLG_RANDOM,
109 		CLG_4_STATES,
110 		CLG_GILB_ELL,
111 	} loss_model;
112 
113 	enum {
114 		TX_IN_GAP_PERIOD = 1,
115 		TX_IN_BURST_PERIOD,
116 		LOST_IN_GAP_PERIOD,
117 		LOST_IN_BURST_PERIOD,
118 	} _4_state_model;
119 
120 	enum {
121 		GOOD_STATE = 1,
122 		BAD_STATE,
123 	} GE_state_model;
124 
125 	/* Correlated Loss Generation models */
126 	struct clgstate {
127 		/* state of the Markov chain */
128 		u8 state;
129 
130 		/* 4-states and Gilbert-Elliot models */
131 		u32 a1;	/* p13 for 4-states or p for GE */
132 		u32 a2;	/* p31 for 4-states or r for GE */
133 		u32 a3;	/* p32 for 4-states or h for GE */
134 		u32 a4;	/* p14 for 4-states or 1-k for GE */
135 		u32 a5; /* p23 used only in 4-states */
136 	} clg;
137 
138 };
139 
140 /* Time stamp put into socket buffer control block
141  * Only valid when skbs are in our internal t(ime)fifo queue.
142  *
143  * As skb->rbnode uses the same storage as skb->next, skb->prev and skb->tstamp,
144  * and skb->next & skb->prev are scratch space for a qdisc,
145  * we save skb->tstamp value in skb->cb[] before destroying it.
146  */
147 struct netem_skb_cb {
148 	psched_time_t	time_to_send;
149 	ktime_t		tstamp_save;
150 };
151 
152 
153 static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
154 {
155 	return container_of(rb, struct sk_buff, rbnode);
156 }
157 
158 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
159 {
160 	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
161 	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
162 	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
163 }
164 
165 /* init_crandom - initialize correlated random number generator
166  * Use entropy source for initial seed.
167  */
168 static void init_crandom(struct crndstate *state, unsigned long rho)
169 {
170 	state->rho = rho;
171 	state->last = prandom_u32();
172 }
173 
174 /* get_crandom - correlated random number generator
175  * Next number depends on last value.
176  * rho is scaled to avoid floating point.
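 *
 * Sketch of the recurrence used below (fixed point, 32-bit scale):
 *	answer = ((2^32 - (rho + 1)) * U + (rho + 1) * last) >> 32
 * where U is a fresh uniform 32-bit value, i.e. an exponentially
 * weighted blend of new randomness and the previous output.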
177  */
178 static u32 get_crandom(struct crndstate *state)
179 {
180 	u64 value, rho;
181 	unsigned long answer;
182 
183 	if (state->rho == 0)	/* no correlation */
184 		return prandom_u32();
185 
186 	value = prandom_u32();
187 	rho = (u64)state->rho + 1;
188 	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
189 	state->last = answer;
190 	return answer;
191 }
192 
193 /* loss_4state - 4-state model loss generator
194  * Generates losses according to the 4-state Markov chain adopted in
195  * the GI (General and Intuitive) loss model.
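 *
 * The transition probabilities come from struct clgstate: a1 = p13,
 * a2 = p31, a3 = p32, a4 = p14 and a5 = p23, each scaled to the full
 * 32-bit range so they can be compared directly against prandom_u32().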
196  */
197 static bool loss_4state(struct netem_sched_data *q)
198 {
199 	struct clgstate *clg = &q->clg;
200 	u32 rnd = prandom_u32();
201 
202 	/*
203 	 * Makes a comparison between rnd and the transition
204 	 * probabilities outgoing from the current state, then decides the
205 	 * next state and if the next packet has to be transmitted or lost.
206 	 * The four states correspond to:
207 	 *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
208 	 *   LOST_IN_BURST_PERIOD => isolated losses within a gap period
209 	 *   LOST_IN_GAP_PERIOD => lost packets within a burst period
210 	 *   TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
211 	 */
212 	switch (clg->state) {
213 	case TX_IN_GAP_PERIOD:
214 		if (rnd < clg->a4) {
215 			clg->state = LOST_IN_BURST_PERIOD;
216 			return true;
217 		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
218 			clg->state = LOST_IN_GAP_PERIOD;
219 			return true;
220 		} else if (clg->a1 + clg->a4 < rnd) {
221 			clg->state = TX_IN_GAP_PERIOD;
222 		}
223 
224 		break;
225 	case TX_IN_BURST_PERIOD:
226 		if (rnd < clg->a5) {
227 			clg->state = LOST_IN_GAP_PERIOD;
228 			return true;
229 		} else {
230 			clg->state = TX_IN_BURST_PERIOD;
231 		}
232 
233 		break;
234 	case LOST_IN_GAP_PERIOD:
235 		if (rnd < clg->a3)
236 			clg->state = TX_IN_BURST_PERIOD;
237 		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
238 			clg->state = TX_IN_GAP_PERIOD;
239 		} else if (clg->a2 + clg->a3 < rnd) {
240 			clg->state = LOST_IN_GAP_PERIOD;
241 			return true;
242 		}
243 		break;
244 	case LOST_IN_BURST_PERIOD:
245 		clg->state = TX_IN_GAP_PERIOD;
246 		break;
247 	}
248 
249 	return false;
250 }
251 
252 /* loss_gilb_ell - Gilbert-Elliot model loss generator
253  * Generates losses according to the Gilbert-Elliot loss model or
254  * its special cases  (Gilbert or Simple Gilbert)
255  *
256  * Makes a comparison between random number and the transition
257  * probabilities outgoing from the current state, then decides the
258  * next state. A second random number is extracted and the comparison
259  * with the loss probability of the current state decides if the next
260  * packet will be transmitted or lost.
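 *
 * Parameter mapping (see struct clgstate): a1 = p (good -> bad),
 * a2 = r (bad -> good), a3 = h so packets are dropped with probability
 * 1 - h in the bad state, and a4 = 1 - k, the drop probability in the
 * good state.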
261  */
262 static bool loss_gilb_ell(struct netem_sched_data *q)
263 {
264 	struct clgstate *clg = &q->clg;
265 
266 	switch (clg->state) {
267 	case GOOD_STATE:
268 		if (prandom_u32() < clg->a1)
269 			clg->state = BAD_STATE;
270 		if (prandom_u32() < clg->a4)
271 			return true;
272 		break;
273 	case BAD_STATE:
274 		if (prandom_u32() < clg->a2)
275 			clg->state = GOOD_STATE;
276 		if (prandom_u32() > clg->a3)
277 			return true;
278 	}
279 
280 	return false;
281 }
282 
283 static bool loss_event(struct netem_sched_data *q)
284 {
285 	switch (q->loss_model) {
286 	case CLG_RANDOM:
287 		/* Random packet drop 0 => none, ~0 => all */
288 		return q->loss && q->loss >= get_crandom(&q->loss_cor);
289 
290 	case CLG_4_STATES:
291 		/* 4state loss model algorithm (used also for GI model)
292 		* Extracts a value from the Markov 4-state loss generator,
293 		* if it is 1 drops a packet and if needed writes the event in
294 		* the kernel logs
295 		*/
296 		return loss_4state(q);
297 
298 	case CLG_GILB_ELL:
299 		/* Gilbert-Elliot loss model algorithm
300 		* Extracts a value from the Gilbert-Elliot loss generator,
301 		* if it is 1 drops a packet and if needed writes the event in
302 		* the kernel logs
303 		*/
304 		return loss_gilb_ell(q);
305 	}
306 
307 	return false;	/* not reached */
308 }
309 
310 
311 /* tabledist - return a pseudo-randomly distributed value with mean mu and
312  * std deviation sigma.  Uses table lookup to approximate the desired
313  * distribution, and a uniformly-distributed pseudo-random source.
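 *
 * Rough sketch of the arithmetic below: the table holds samples of the
 * target distribution scaled by NETEM_DIST_SCALE, so the result is
 * approximately
 *	mu + t * sigma / NETEM_DIST_SCALE
 * with sigma split into quotient and remainder to avoid overflow.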
314  */
315 static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
316 				struct crndstate *state,
317 				const struct disttable *dist)
318 {
319 	psched_tdiff_t x;
320 	long t;
321 	u32 rnd;
322 
323 	if (sigma == 0)
324 		return mu;
325 
326 	rnd = get_crandom(state);
327 
328 	/* default uniform distribution */
329 	if (dist == NULL)
330 		return (rnd % (2*sigma)) - sigma + mu;
331 
332 	t = dist->table[rnd % dist->size];
333 	x = (sigma % NETEM_DIST_SCALE) * t;
334 	if (x >= 0)
335 		x += NETEM_DIST_SCALE/2;
336 	else
337 		x -= NETEM_DIST_SCALE/2;
338 
339 	return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
340 }
341 
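/* packet_len_2_sched_time - transmission time of a 'len' byte packet at
 * q->rate (bytes per second), including per-packet overhead and, when a
 * cell size is configured, rounding up to whole (cell_size + cell_overhead)
 * cells.  Roughly PSCHED_NS2TICKS(len * NSEC_PER_SEC / rate).
 */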
342 static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
343 {
344 	u64 ticks;
345 
346 	len += q->packet_overhead;
347 
348 	if (q->cell_size) {
349 		u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
350 
351 		if (len > cells * q->cell_size)	/* extra cell needed for remainder */
352 			cells++;
353 		len = cells * (q->cell_size + q->cell_overhead);
354 	}
355 
356 	ticks = (u64)len * NSEC_PER_SEC;
357 
358 	do_div(ticks, q->rate);
359 	return PSCHED_NS2TICKS(ticks);
360 }
361 
362 static void tfifo_reset(struct Qdisc *sch)
363 {
364 	struct netem_sched_data *q = qdisc_priv(sch);
365 	struct rb_node *p;
366 
367 	while ((p = rb_first(&q->t_root))) {
368 		struct sk_buff *skb = netem_rb_to_skb(p);
369 
370 		rb_erase(p, &q->t_root);
371 		rtnl_kfree_skbs(skb, skb);
372 	}
373 }
374 
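/* tfifo_enqueue - insert the skb into the rbtree keyed by
 * cb->time_to_send so that dequeue can release packets in order of
 * increasing departure time.  Equal timestamps go to the right,
 * preserving FIFO order between them.
 */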
375 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
376 {
377 	struct netem_sched_data *q = qdisc_priv(sch);
378 	psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
379 	struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
380 
381 	while (*p) {
382 		struct sk_buff *skb;
383 
384 		parent = *p;
385 		skb = netem_rb_to_skb(parent);
386 		if (tnext >= netem_skb_cb(skb)->time_to_send)
387 			p = &parent->rb_right;
388 		else
389 			p = &parent->rb_left;
390 	}
391 	rb_link_node(&nskb->rbnode, parent, p);
392 	rb_insert_color(&nskb->rbnode, &q->t_root);
393 	sch->q.qlen++;
394 }
395 
396 /* netem can't properly corrupt a megapacket (like we get from GSO), so when
397  * we statistically choose to corrupt one, we instead segment it, returning
398  * the first packet to be corrupted, and re-enqueue the remaining frames.
399  */
400 static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
401 				     struct sk_buff **to_free)
402 {
403 	struct sk_buff *segs;
404 	netdev_features_t features = netif_skb_features(skb);
405 
406 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
407 
408 	if (IS_ERR_OR_NULL(segs)) {
409 		qdisc_drop(skb, sch, to_free);
410 		return NULL;
411 	}
412 	consume_skb(skb);
413 	return segs;
414 }
415 
416 static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb)
417 {
418 	skb->next = qh->head;
419 
420 	if (!qh->head)
421 		qh->tail = skb;
422 	qh->head = skb;
423 	qh->qlen++;
424 }
425 
426 /*
427  * Insert one skb into qdisc.
428  * Note: parent depends on return value to account for queue length.
429  * 	NET_XMIT_DROP: queue length didn't change.
430  *      NET_XMIT_SUCCESS: one skb was queued.
431  */
432 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
433 			 struct sk_buff **to_free)
434 {
435 	struct netem_sched_data *q = qdisc_priv(sch);
436 	/* We don't fill cb now as skb_unshare() may invalidate it */
437 	struct netem_skb_cb *cb;
438 	struct sk_buff *skb2;
439 	struct sk_buff *segs = NULL;
440 	unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb);
441 	int nb = 0;
442 	int count = 1;
443 	int rc = NET_XMIT_SUCCESS;
444 
445 	/* Random duplication */
446 	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
447 		++count;
448 
449 	/* Drop packet? */
450 	if (loss_event(q)) {
451 		if (q->ecn && INET_ECN_set_ce(skb))
452 			qdisc_qstats_drop(sch); /* mark packet */
453 		else
454 			--count;
455 	}
456 	if (count == 0) {
457 		qdisc_qstats_drop(sch);
458 		__qdisc_drop(skb, to_free);
459 		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
460 	}
461 
462 	/* If a delay is expected, orphan the skb. (orphaning usually takes
463 	 * place at TX completion time, so _before_ the link transit delay)
464 	 */
465 	if (q->latency || q->jitter || q->rate)
466 		skb_orphan_partial(skb);
467 
468 	/*
469 	 * If we need to duplicate the packet, then re-insert it at the top
470 	 * of the qdisc tree, since the parent queuer expects that only one
471 	 * skb will be queued.
472 	 */
473 	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
474 		struct Qdisc *rootq = qdisc_root(sch);
475 		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
476 
477 		q->duplicate = 0;
478 		rootq->enqueue(skb2, rootq, to_free);
479 		q->duplicate = dupsave;
480 	}
481 
482 	/*
483 	 * Randomized packet corruption.
484 	 * Make a copy if needed since we are modifying the data.
485 	 * If the packet is going to be hardware checksummed, then
486 	 * do it now in software before we mangle it.
487 	 */
488 	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
489 		if (skb_is_gso(skb)) {
490 			segs = netem_segment(skb, sch, to_free);
491 			if (!segs)
492 				return NET_XMIT_DROP;
493 		} else {
494 			segs = skb;
495 		}
496 
497 		skb = segs;
498 		segs = segs->next;
499 
500 		skb = skb_unshare(skb, GFP_ATOMIC);
501 		if (unlikely(!skb)) {
502 			qdisc_qstats_drop(sch);
503 			goto finish_segs;
504 		}
505 		if (skb->ip_summed == CHECKSUM_PARTIAL &&
506 		    skb_checksum_help(skb)) {
507 			qdisc_drop(skb, sch, to_free);
508 			goto finish_segs;
509 		}
510 
511 		skb->data[prandom_u32() % skb_headlen(skb)] ^=
512 			1<<(prandom_u32() % 8);
513 	}
514 
515 	if (unlikely(sch->q.qlen >= sch->limit))
516 		return qdisc_drop_all(skb, sch, to_free);
517 
518 	qdisc_qstats_backlog_inc(sch, skb);
519 
520 	cb = netem_skb_cb(skb);
521 	if (q->gap == 0 ||		/* not doing reordering */
522 	    q->counter < q->gap - 1 ||	/* inside last reordering gap */
523 	    q->reorder < get_crandom(&q->reorder_cor)) {
524 		psched_time_t now;
525 		psched_tdiff_t delay;
526 
527 		delay = tabledist(q->latency, q->jitter,
528 				  &q->delay_cor, q->delay_dist);
529 
530 		now = psched_get_time();
531 
532 		if (q->rate) {
533 			struct netem_skb_cb *last = NULL;
534 
535 			if (sch->q.tail)
536 				last = netem_skb_cb(sch->q.tail);
537 			if (q->t_root.rb_node) {
538 				struct sk_buff *t_skb;
539 				struct netem_skb_cb *t_last;
540 
541 				t_skb = netem_rb_to_skb(rb_last(&q->t_root));
542 				t_last = netem_skb_cb(t_skb);
543 				if (!last ||
544 				    t_last->time_to_send > last->time_to_send) {
545 					last = t_last;
546 				}
547 			}
548 
549 			if (last) {
550 				/*
551 				 * The last packet in the queue is the reference point (now);
552 				 * calculate its time bonus and subtract it
553 				 * from the delay.
554 				 */
555 				delay -= last->time_to_send - now;
556 				delay = max_t(psched_tdiff_t, 0, delay);
557 				now = last->time_to_send;
558 			}
559 
560 			delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q);
561 		}
562 
563 		cb->time_to_send = now + delay;
564 		cb->tstamp_save = skb->tstamp;
565 		++q->counter;
566 		tfifo_enqueue(skb, sch);
567 	} else {
568 		/*
569 		 * Do re-ordering by putting one out of N packets at the front
570 		 * of the queue.
571 		 */
572 		cb->time_to_send = psched_get_time();
573 		q->counter = 0;
574 
575 		netem_enqueue_skb_head(&sch->q, skb);
576 		sch->qstats.requeues++;
577 	}
578 
579 finish_segs:
580 	if (segs) {
581 		while (segs) {
582 			skb2 = segs->next;
583 			segs->next = NULL;
584 			qdisc_skb_cb(segs)->pkt_len = segs->len;
585 			last_len = segs->len;
586 			rc = qdisc_enqueue(segs, sch, to_free);
587 			if (rc != NET_XMIT_SUCCESS) {
588 				if (net_xmit_drop_count(rc))
589 					qdisc_qstats_drop(sch);
590 			} else {
591 				nb++;
592 				len += last_len;
593 			}
594 			segs = skb2;
595 		}
596 		sch->q.qlen += nb;
597 		if (nb > 1)
598 			qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
599 	}
600 	return NET_XMIT_SUCCESS;
601 }
602 
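/* netem_dequeue - deliver packets whose time_to_send has been reached.
 * Packets parked in the tfifo rbtree are released once their departure
 * time passes; if a child qdisc is attached they are fed through it and
 * dequeued from there, otherwise they are delivered directly.  When
 * nothing is ready yet, the watchdog is armed for the earliest pending
 * packet.
 */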
603 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
604 {
605 	struct netem_sched_data *q = qdisc_priv(sch);
606 	struct sk_buff *skb;
607 	struct rb_node *p;
608 
609 tfifo_dequeue:
610 	skb = __qdisc_dequeue_head(&sch->q);
611 	if (skb) {
612 		qdisc_qstats_backlog_dec(sch, skb);
613 deliver:
614 		qdisc_bstats_update(sch, skb);
615 		return skb;
616 	}
617 	p = rb_first(&q->t_root);
618 	if (p) {
619 		psched_time_t time_to_send;
620 
621 		skb = netem_rb_to_skb(p);
622 
623 		/* is more time remaining? */
624 		time_to_send = netem_skb_cb(skb)->time_to_send;
625 		if (time_to_send <= psched_get_time()) {
626 			rb_erase(p, &q->t_root);
627 
628 			sch->q.qlen--;
629 			qdisc_qstats_backlog_dec(sch, skb);
630 			skb->next = NULL;
631 			skb->prev = NULL;
632 			skb->tstamp = netem_skb_cb(skb)->tstamp_save;
633 
634 #ifdef CONFIG_NET_CLS_ACT
635 			/*
636 			 * If it's at ingress let's pretend the delay is
637 			 * from the network (tstamp will be updated).
638 			 */
639 			if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
640 				skb->tstamp.tv64 = 0;
641 #endif
642 
643 			if (q->qdisc) {
644 				unsigned int pkt_len = qdisc_pkt_len(skb);
645 				struct sk_buff *to_free = NULL;
646 				int err;
647 
648 				err = qdisc_enqueue(skb, q->qdisc, &to_free);
649 				kfree_skb_list(to_free);
650 				if (err != NET_XMIT_SUCCESS &&
651 				    net_xmit_drop_count(err)) {
652 					qdisc_qstats_drop(sch);
653 					qdisc_tree_reduce_backlog(sch, 1,
654 								  pkt_len);
655 				}
656 				goto tfifo_dequeue;
657 			}
658 			goto deliver;
659 		}
660 
661 		if (q->qdisc) {
662 			skb = q->qdisc->ops->dequeue(q->qdisc);
663 			if (skb)
664 				goto deliver;
665 		}
666 		qdisc_watchdog_schedule(&q->watchdog, time_to_send);
667 	}
668 
669 	if (q->qdisc) {
670 		skb = q->qdisc->ops->dequeue(q->qdisc);
671 		if (skb)
672 			goto deliver;
673 	}
674 	return NULL;
675 }
676 
677 static void netem_reset(struct Qdisc *sch)
678 {
679 	struct netem_sched_data *q = qdisc_priv(sch);
680 
681 	qdisc_reset_queue(sch);
682 	tfifo_reset(sch);
683 	if (q->qdisc)
684 		qdisc_reset(q->qdisc);
685 	qdisc_watchdog_cancel(&q->watchdog);
686 }
687 
688 static void dist_free(struct disttable *d)
689 {
690 	kvfree(d);
691 }
692 
693 /*
694  * Distribution data is a variable size payload containing
695  * signed 16 bit values.
696  */
697 static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
698 {
699 	struct netem_sched_data *q = qdisc_priv(sch);
700 	size_t n = nla_len(attr)/sizeof(__s16);
701 	const __s16 *data = nla_data(attr);
702 	spinlock_t *root_lock;
703 	struct disttable *d;
704 	int i;
705 	size_t s;
706 
707 	if (n > NETEM_DIST_MAX)
708 		return -EINVAL;
709 
710 	s = sizeof(struct disttable) + n * sizeof(s16);
711 	d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN);
712 	if (!d)
713 		d = vmalloc(s);
714 	if (!d)
715 		return -ENOMEM;
716 
717 	d->size = n;
718 	for (i = 0; i < n; i++)
719 		d->table[i] = data[i];
720 
721 	root_lock = qdisc_root_sleeping_lock(sch);
722 
723 	spin_lock_bh(root_lock);
724 	swap(q->delay_dist, d);
725 	spin_unlock_bh(root_lock);
726 
727 	dist_free(d);
728 	return 0;
729 }
730 
731 static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
732 {
733 	const struct tc_netem_corr *c = nla_data(attr);
734 
735 	init_crandom(&q->delay_cor, c->delay_corr);
736 	init_crandom(&q->loss_cor, c->loss_corr);
737 	init_crandom(&q->dup_cor, c->dup_corr);
738 }
739 
740 static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
741 {
742 	const struct tc_netem_reorder *r = nla_data(attr);
743 
744 	q->reorder = r->probability;
745 	init_crandom(&q->reorder_cor, r->correlation);
746 }
747 
748 static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
749 {
750 	const struct tc_netem_corrupt *r = nla_data(attr);
751 
752 	q->corrupt = r->probability;
753 	init_crandom(&q->corrupt_cor, r->correlation);
754 }
755 
756 static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
757 {
758 	const struct tc_netem_rate *r = nla_data(attr);
759 
760 	q->rate = r->rate;
761 	q->packet_overhead = r->packet_overhead;
762 	q->cell_size = r->cell_size;
763 	q->cell_overhead = r->cell_overhead;
764 	if (q->cell_size)
765 		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
766 	else
767 		q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
768 }
769 
770 static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
771 {
772 	const struct nlattr *la;
773 	int rem;
774 
775 	nla_for_each_nested(la, attr, rem) {
776 		u16 type = nla_type(la);
777 
778 		switch (type) {
779 		case NETEM_LOSS_GI: {
780 			const struct tc_netem_gimodel *gi = nla_data(la);
781 
782 			if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
783 				pr_info("netem: incorrect gi model size\n");
784 				return -EINVAL;
785 			}
786 
787 			q->loss_model = CLG_4_STATES;
788 
789 			q->clg.state = TX_IN_GAP_PERIOD;
790 			q->clg.a1 = gi->p13;
791 			q->clg.a2 = gi->p31;
792 			q->clg.a3 = gi->p32;
793 			q->clg.a4 = gi->p14;
794 			q->clg.a5 = gi->p23;
795 			break;
796 		}
797 
798 		case NETEM_LOSS_GE: {
799 			const struct tc_netem_gemodel *ge = nla_data(la);
800 
801 			if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
802 				pr_info("netem: incorrect ge model size\n");
803 				return -EINVAL;
804 			}
805 
806 			q->loss_model = CLG_GILB_ELL;
807 			q->clg.state = GOOD_STATE;
808 			q->clg.a1 = ge->p;
809 			q->clg.a2 = ge->r;
810 			q->clg.a3 = ge->h;
811 			q->clg.a4 = ge->k1;
812 			break;
813 		}
814 
815 		default:
816 			pr_info("netem: unknown loss type %u\n", type);
817 			return -EINVAL;
818 		}
819 	}
820 
821 	return 0;
822 }
823 
824 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
825 	[TCA_NETEM_CORR]	= { .len = sizeof(struct tc_netem_corr) },
826 	[TCA_NETEM_REORDER]	= { .len = sizeof(struct tc_netem_reorder) },
827 	[TCA_NETEM_CORRUPT]	= { .len = sizeof(struct tc_netem_corrupt) },
828 	[TCA_NETEM_RATE]	= { .len = sizeof(struct tc_netem_rate) },
829 	[TCA_NETEM_LOSS]	= { .type = NLA_NESTED },
830 	[TCA_NETEM_ECN]		= { .type = NLA_U32 },
831 	[TCA_NETEM_RATE64]	= { .type = NLA_U64 },
832 };
833 
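/* parse_attr - netem options start with a fixed struct tc_netem_qopt;
 * optional netlink attributes (see the policy above) follow it.  Parse
 * whatever sits past the initial structure, or zero the table if
 * nothing does.
 */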
834 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
835 		      const struct nla_policy *policy, int len)
836 {
837 	int nested_len = nla_len(nla) - NLA_ALIGN(len);
838 
839 	if (nested_len < 0) {
840 		pr_info("netem: invalid attributes len %d\n", nested_len);
841 		return -EINVAL;
842 	}
843 
844 	if (nested_len >= nla_attr_size(0))
845 		return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
846 				 nested_len, policy);
847 
848 	memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
849 	return 0;
850 }
851 
852 /* Parse netlink message to set options */
853 static int netem_change(struct Qdisc *sch, struct nlattr *opt)
854 {
855 	struct netem_sched_data *q = qdisc_priv(sch);
856 	struct nlattr *tb[TCA_NETEM_MAX + 1];
857 	struct tc_netem_qopt *qopt;
858 	struct clgstate old_clg;
859 	int old_loss_model = CLG_RANDOM;
860 	int ret;
861 
862 	if (opt == NULL)
863 		return -EINVAL;
864 
865 	qopt = nla_data(opt);
866 	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
867 	if (ret < 0)
868 		return ret;
869 
870 	/* backup q->clg and q->loss_model */
871 	old_clg = q->clg;
872 	old_loss_model = q->loss_model;
873 
874 	if (tb[TCA_NETEM_LOSS]) {
875 		ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
876 		if (ret) {
877 			q->loss_model = old_loss_model;
878 			return ret;
879 		}
880 	} else {
881 		q->loss_model = CLG_RANDOM;
882 	}
883 
884 	if (tb[TCA_NETEM_DELAY_DIST]) {
885 		ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
886 		if (ret) {
887 			/* recover clg and loss_model, in case
888 			 * q->clg and q->loss_model were modified
889 			 * in get_loss_clg()
890 			 */
891 			q->clg = old_clg;
892 			q->loss_model = old_loss_model;
893 			return ret;
894 		}
895 	}
896 
897 	sch->limit = qopt->limit;
898 
899 	q->latency = qopt->latency;
900 	q->jitter = qopt->jitter;
901 	q->limit = qopt->limit;
902 	q->gap = qopt->gap;
903 	q->counter = 0;
904 	q->loss = qopt->loss;
905 	q->duplicate = qopt->duplicate;
906 
907 	/* for compatibility with earlier versions.
908 	 * if gap is set, need to assume 100% probability
909 	 */
910 	if (q->gap)
911 		q->reorder = ~0;
912 
913 	if (tb[TCA_NETEM_CORR])
914 		get_correlation(q, tb[TCA_NETEM_CORR]);
915 
916 	if (tb[TCA_NETEM_REORDER])
917 		get_reorder(q, tb[TCA_NETEM_REORDER]);
918 
919 	if (tb[TCA_NETEM_CORRUPT])
920 		get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
921 
922 	if (tb[TCA_NETEM_RATE])
923 		get_rate(q, tb[TCA_NETEM_RATE]);
924 
925 	if (tb[TCA_NETEM_RATE64])
926 		q->rate = max_t(u64, q->rate,
927 				nla_get_u64(tb[TCA_NETEM_RATE64]));
928 
929 	if (tb[TCA_NETEM_ECN])
930 		q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
931 
932 	return ret;
933 }
934 
935 static int netem_init(struct Qdisc *sch, struct nlattr *opt)
936 {
937 	struct netem_sched_data *q = qdisc_priv(sch);
938 	int ret;
939 
940 	if (!opt)
941 		return -EINVAL;
942 
943 	qdisc_watchdog_init(&q->watchdog, sch);
944 
945 	q->loss_model = CLG_RANDOM;
946 	ret = netem_change(sch, opt);
947 	if (ret)
948 		pr_info("netem: change failed\n");
949 	return ret;
950 }
951 
952 static void netem_destroy(struct Qdisc *sch)
953 {
954 	struct netem_sched_data *q = qdisc_priv(sch);
955 
956 	qdisc_watchdog_cancel(&q->watchdog);
957 	if (q->qdisc)
958 		qdisc_destroy(q->qdisc);
959 	dist_free(q->delay_dist);
960 }
961 
962 static int dump_loss_model(const struct netem_sched_data *q,
963 			   struct sk_buff *skb)
964 {
965 	struct nlattr *nest;
966 
967 	nest = nla_nest_start(skb, TCA_NETEM_LOSS);
968 	if (nest == NULL)
969 		goto nla_put_failure;
970 
971 	switch (q->loss_model) {
972 	case CLG_RANDOM:
973 		/* legacy loss model */
974 		nla_nest_cancel(skb, nest);
975 		return 0;	/* no data */
976 
977 	case CLG_4_STATES: {
978 		struct tc_netem_gimodel gi = {
979 			.p13 = q->clg.a1,
980 			.p31 = q->clg.a2,
981 			.p32 = q->clg.a3,
982 			.p14 = q->clg.a4,
983 			.p23 = q->clg.a5,
984 		};
985 
986 		if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
987 			goto nla_put_failure;
988 		break;
989 	}
990 	case CLG_GILB_ELL: {
991 		struct tc_netem_gemodel ge = {
992 			.p = q->clg.a1,
993 			.r = q->clg.a2,
994 			.h = q->clg.a3,
995 			.k1 = q->clg.a4,
996 		};
997 
998 		if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
999 			goto nla_put_failure;
1000 		break;
1001 	}
1002 	}
1003 
1004 	nla_nest_end(skb, nest);
1005 	return 0;
1006 
1007 nla_put_failure:
1008 	nla_nest_cancel(skb, nest);
1009 	return -1;
1010 }
1011 
1012 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
1013 {
1014 	const struct netem_sched_data *q = qdisc_priv(sch);
1015 	struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
1016 	struct tc_netem_qopt qopt;
1017 	struct tc_netem_corr cor;
1018 	struct tc_netem_reorder reorder;
1019 	struct tc_netem_corrupt corrupt;
1020 	struct tc_netem_rate rate;
1021 
1022 	qopt.latency = q->latency;
1023 	qopt.jitter = q->jitter;
1024 	qopt.limit = q->limit;
1025 	qopt.loss = q->loss;
1026 	qopt.gap = q->gap;
1027 	qopt.duplicate = q->duplicate;
1028 	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
1029 		goto nla_put_failure;
1030 
1031 	cor.delay_corr = q->delay_cor.rho;
1032 	cor.loss_corr = q->loss_cor.rho;
1033 	cor.dup_corr = q->dup_cor.rho;
1034 	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
1035 		goto nla_put_failure;
1036 
1037 	reorder.probability = q->reorder;
1038 	reorder.correlation = q->reorder_cor.rho;
1039 	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
1040 		goto nla_put_failure;
1041 
1042 	corrupt.probability = q->corrupt;
1043 	corrupt.correlation = q->corrupt_cor.rho;
1044 	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
1045 		goto nla_put_failure;
1046 
1047 	if (q->rate >= (1ULL << 32)) {
1048 		if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
1049 				      TCA_NETEM_PAD))
1050 			goto nla_put_failure;
1051 		rate.rate = ~0U;
1052 	} else {
1053 		rate.rate = q->rate;
1054 	}
1055 	rate.packet_overhead = q->packet_overhead;
1056 	rate.cell_size = q->cell_size;
1057 	rate.cell_overhead = q->cell_overhead;
1058 	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
1059 		goto nla_put_failure;
1060 
1061 	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
1062 		goto nla_put_failure;
1063 
1064 	if (dump_loss_model(q, skb) != 0)
1065 		goto nla_put_failure;
1066 
1067 	return nla_nest_end(skb, nla);
1068 
1069 nla_put_failure:
1070 	nlmsg_trim(skb, nla);
1071 	return -1;
1072 }
1073 
1074 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
1075 			  struct sk_buff *skb, struct tcmsg *tcm)
1076 {
1077 	struct netem_sched_data *q = qdisc_priv(sch);
1078 
1079 	if (cl != 1 || !q->qdisc) 	/* only one class */
1080 		return -ENOENT;
1081 
1082 	tcm->tcm_handle |= TC_H_MIN(1);
1083 	tcm->tcm_info = q->qdisc->handle;
1084 
1085 	return 0;
1086 }
1087 
1088 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1089 		     struct Qdisc **old)
1090 {
1091 	struct netem_sched_data *q = qdisc_priv(sch);
1092 
1093 	*old = qdisc_replace(sch, new, &q->qdisc);
1094 	return 0;
1095 }
1096 
1097 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1098 {
1099 	struct netem_sched_data *q = qdisc_priv(sch);
1100 	return q->qdisc;
1101 }
1102 
1103 static unsigned long netem_get(struct Qdisc *sch, u32 classid)
1104 {
1105 	return 1;
1106 }
1107 
1108 static void netem_put(struct Qdisc *sch, unsigned long arg)
1109 {
1110 }
1111 
1112 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1113 {
1114 	if (!walker->stop) {
1115 		if (walker->count >= walker->skip)
1116 			if (walker->fn(sch, 1, walker) < 0) {
1117 				walker->stop = 1;
1118 				return;
1119 			}
1120 		walker->count++;
1121 	}
1122 }
1123 
1124 static const struct Qdisc_class_ops netem_class_ops = {
1125 	.graft		=	netem_graft,
1126 	.leaf		=	netem_leaf,
1127 	.get		=	netem_get,
1128 	.put		=	netem_put,
1129 	.walk		=	netem_walk,
1130 	.dump		=	netem_dump_class,
1131 };
1132 
1133 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1134 	.id		=	"netem",
1135 	.cl_ops		=	&netem_class_ops,
1136 	.priv_size	=	sizeof(struct netem_sched_data),
1137 	.enqueue	=	netem_enqueue,
1138 	.dequeue	=	netem_dequeue,
1139 	.peek		=	qdisc_peek_dequeued,
1140 	.init		=	netem_init,
1141 	.reset		=	netem_reset,
1142 	.destroy	=	netem_destroy,
1143 	.change		=	netem_change,
1144 	.dump		=	netem_dump,
1145 	.owner		=	THIS_MODULE,
1146 };
1147 
1148 
1149 static int __init netem_module_init(void)
1150 {
1151 	pr_info("netem: version " VERSION "\n");
1152 	return register_qdisc(&netem_qdisc_ops);
1153 }
1154 static void __exit netem_module_exit(void)
1155 {
1156 	unregister_qdisc(&netem_qdisc_ops);
1157 }
1158 module_init(netem_module_init)
1159 module_exit(netem_module_exit)
1160 MODULE_LICENSE("GPL");
1161