1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * net/sched/sch_netem.c Network emulator
4 *
5 * Many of the algorithms and ideas for this came from
6 * NIST Net which is not copyrighted.
7 *
8 * Authors: Stephen Hemminger <shemminger@osdl.org>
9 * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
10 */
11
12 #include <linux/mm.h>
13 #include <linux/module.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/errno.h>
18 #include <linux/skbuff.h>
19 #include <linux/vmalloc.h>
20 #include <linux/rtnetlink.h>
21 #include <linux/reciprocal_div.h>
22 #include <linux/rbtree.h>
23
24 #include <net/netlink.h>
25 #include <net/pkt_sched.h>
26 #include <net/inet_ecn.h>
27
28 #define VERSION "1.3"
29
30 /* Network Emulation Queuing algorithm.
31 ====================================
32
33 Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
34 Network Emulation Tool"
35 [2] Luigi Rizzo, DummyNet for FreeBSD
36
37 ----------------------------------------------------------------
38
39 This started out as a simple way to delay outgoing packets to
40 test TCP but has grown to include most of the functionality
41 of a full blown network emulator like NISTnet. It can delay
42 packets and add random jitter (and correlation). The random
43 distribution can be loaded from a table as well to provide
44 normal, Pareto, or experimental curves. Packet loss,
45 duplication, and reordering can also be emulated.
46
47 This qdisc does not do classification; that can be handled by
48 layering other disciplines. It does not need to do bandwidth
49 control either since that can be handled by using token
50 bucket or other rate control.
51
52 Correlated Loss Generator models
53
54 Added generation of correlated loss according to the
55 "Gilbert-Elliot" model and a 4-state Markov (GI) model.
56
57 References:
58 [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
59 [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
60 and intuitive loss model for packet networks and its implementation
61 in the Netem module in the Linux kernel", available in [1]
62
63 Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
64 Fabio Ludovici <fabio.ludovici at yahoo.it>
65 */
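/* Example configuration (illustrative only; see tc-netem(8)):
 *
 *   tc qdisc add dev eth0 root netem delay 100ms 10ms 25% loss 0.3% \
 *      duplicate 1% corrupt 0.1% reorder 25% 50% rate 1mbit
 *
 * which maps onto the delay/jitter/correlation, loss, duplication,
 * corruption, reordering and rate parameters handled by netem_change()
 * below.
 */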
66
67 struct disttable {
68 u32 size;
69 s16 table[];
70 };
71
72 struct netem_sched_data {
73 /* internal t(ime)fifo qdisc uses t_root and sch->limit */
74 struct rb_root t_root;
75
76 /* a linear queue; reduces rbtree rebalancing when jitter is low */
77 struct sk_buff *t_head;
78 struct sk_buff *t_tail;
79
80 u32 t_len;
81
82 /* optional qdisc for classful handling (NULL at netem init) */
83 struct Qdisc *qdisc;
84
85 struct qdisc_watchdog watchdog;
86
87 s64 latency;
88 s64 jitter;
89
90 u32 loss;
91 u32 ecn;
92 u32 limit;
93 u32 counter;
94 u32 gap;
95 u32 duplicate;
96 u32 reorder;
97 u32 corrupt;
98 u64 rate;
99 s32 packet_overhead;
100 u32 cell_size;
101 struct reciprocal_value cell_size_reciprocal;
102 s32 cell_overhead;
103
104 struct crndstate {
105 u32 last;
106 u32 rho;
107 } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
108
109 struct disttable *delay_dist;
110
111 enum {
112 CLG_RANDOM,
113 CLG_4_STATES,
114 CLG_GILB_ELL,
115 } loss_model;
116
117 enum {
118 TX_IN_GAP_PERIOD = 1,
119 TX_IN_BURST_PERIOD,
120 LOST_IN_GAP_PERIOD,
121 LOST_IN_BURST_PERIOD,
122 } _4_state_model;
123
124 enum {
125 GOOD_STATE = 1,
126 BAD_STATE,
127 } GE_state_model;
128
129 /* Correlated Loss Generation models */
130 struct clgstate {
131 /* state of the Markov chain */
132 u8 state;
133
134 /* 4-states and Gilbert-Elliot models */
135 u32 a1; /* p13 for 4-states or p for GE */
136 u32 a2; /* p31 for 4-states or r for GE */
137 u32 a3; /* p32 for 4-states or h for GE */
138 u32 a4; /* p14 for 4-states or 1-k for GE */
139 u32 a5; /* p23 used only in 4-states */
140 } clg;
141
142 struct tc_netem_slot slot_config;
143 struct slotstate {
144 u64 slot_next;
145 s32 packets_left;
146 s32 bytes_left;
147 } slot;
148
149 struct disttable *slot_dist;
150 };
151
152 /* Time stamp put into socket buffer control block
153 * Only valid when skbs are in our internal t(ime)fifo queue.
154 *
155 * As skb->rbnode uses the same storage as skb->next, skb->prev and skb->tstamp,
156 * and skb->next & skb->prev are scratch space for a qdisc,
157 * we save skb->tstamp value in skb->cb[] before destroying it.
158 */
159 struct netem_skb_cb {
160 u64 time_to_send;
161 };
162
163 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
164 {
165 /* we assume we can use skb next/prev/tstamp as storage for rb_node */
166 qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
167 return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
168 }
169
170 /* init_crandom - initialize correlated random number generator
171 * Use entropy source for initial seed.
172 */
173 static void init_crandom(struct crndstate *state, unsigned long rho)
174 {
175 state->rho = rho;
176 state->last = prandom_u32();
177 }
178
179 /* get_crandom - correlated random number generator
180 * Next number depends on last value.
181 * rho is scaled to avoid floating point.
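* With r = rho + 1, the code below computes
*   answer = (U * (2^32 - r) + last * r) >> 32
* for a fresh uniform 32-bit value U, i.e. a fixed-point weighted blend
* of the new draw and the previous output.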
182 */
183 static u32 get_crandom(struct crndstate *state)
184 {
185 u64 value, rho;
186 unsigned long answer;
187
188 if (!state || state->rho == 0) /* no correlation */
189 return prandom_u32();
190
191 value = prandom_u32();
192 rho = (u64)state->rho + 1;
193 answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
194 state->last = answer;
195 return answer;
196 }
197
198 /* loss_4state - 4-state model loss generator
199 * Generates losses according to the 4-state Markov chain adopted in
200 * the GI (General and Intuitive) loss model.
201 */
202 static bool loss_4state(struct netem_sched_data *q)
203 {
204 struct clgstate *clg = &q->clg;
205 u32 rnd = prandom_u32();
206
207 /*
208 * Makes a comparison between rnd and the transition
209 * probabilities outgoing from the current state, then decides the
210 * next state and if the next packet has to be transmitted or lost.
211 * The four states correspond to:
212 * TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
213 * LOST_IN_BURST_PERIOD => isolated losses within a gap period
214 * LOST_IN_GAP_PERIOD => lost packets within a burst period
215 * TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period
216 */
217 switch (clg->state) {
218 case TX_IN_GAP_PERIOD:
219 if (rnd < clg->a4) {
220 clg->state = LOST_IN_BURST_PERIOD;
221 return true;
222 } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
223 clg->state = LOST_IN_GAP_PERIOD;
224 return true;
225 } else if (clg->a1 + clg->a4 < rnd) {
226 clg->state = TX_IN_GAP_PERIOD;
227 }
228
229 break;
230 case TX_IN_BURST_PERIOD:
231 if (rnd < clg->a5) {
232 clg->state = LOST_IN_GAP_PERIOD;
233 return true;
234 } else {
235 clg->state = TX_IN_BURST_PERIOD;
236 }
237
238 break;
239 case LOST_IN_GAP_PERIOD:
240 if (rnd < clg->a3)
241 clg->state = TX_IN_BURST_PERIOD;
242 else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
243 clg->state = TX_IN_GAP_PERIOD;
244 } else if (clg->a2 + clg->a3 < rnd) {
245 clg->state = LOST_IN_GAP_PERIOD;
246 return true;
247 }
248 break;
249 case LOST_IN_BURST_PERIOD:
250 clg->state = TX_IN_GAP_PERIOD;
251 break;
252 }
253
254 return false;
255 }
256
257 /* loss_gilb_ell - Gilbert-Elliot model loss generator
258 * Generates losses according to the Gilbert-Elliot loss model or
259 * its special cases (Gilbert or Simple Gilbert)
260 *
261 * Makes a comparison between random number and the transition
262 * probabilities outgoing from the current state, then decides the
263 * next state. A second random number is extracted and the comparison
264 * with the loss probability of the current state decides if the next
265 * packet will be transmitted or lost.
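* In GOOD_STATE a packet is lost with probability a4/2^32 (i.e. 1-k);
* in BAD_STATE it is lost with probability ~1 - a3/2^32 (i.e. 1-h).
* a1 (p) and a2 (r) drive the good->bad and bad->good transitions.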
266 */
267 static bool loss_gilb_ell(struct netem_sched_data *q)
268 {
269 struct clgstate *clg = &q->clg;
270
271 switch (clg->state) {
272 case GOOD_STATE:
273 if (prandom_u32() < clg->a1)
274 clg->state = BAD_STATE;
275 if (prandom_u32() < clg->a4)
276 return true;
277 break;
278 case BAD_STATE:
279 if (prandom_u32() < clg->a2)
280 clg->state = GOOD_STATE;
281 if (prandom_u32() > clg->a3)
282 return true;
283 }
284
285 return false;
286 }
287
288 static bool loss_event(struct netem_sched_data *q)
289 {
290 switch (q->loss_model) {
291 case CLG_RANDOM:
292 /* Random packet drop 0 => none, ~0 => all */
293 return q->loss && q->loss >= get_crandom(&q->loss_cor);
294
295 case CLG_4_STATES:
296 /* 4-state loss model algorithm (used also for the GI model)
297 * Asks the 4-state Markov loss generator whether the
298 * next packet has to be dropped and updates the chain
299 * state accordingly.
300 */
301 return loss_4state(q);
302
303 case CLG_GILB_ELL:
304 /* Gilbert-Elliot loss model algorithm
305 * Asks the Gilbert-Elliot loss generator whether the
306 * next packet has to be dropped and updates the chain
307 * state accordingly.
308 */
309 return loss_gilb_ell(q);
310 }
311
312 return false; /* not reached */
313 }
314
315
316 /* tabledist - return a pseudo-randomly distributed value with mean mu and
317 * std deviation sigma. Uses table lookup to approximate the desired
318 * distribution, and a uniformly-distributed pseudo-random source.
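* Table entries are samples of the target distribution scaled by
* NETEM_DIST_SCALE, so with a table the result is approximately
*   mu + t * sigma / NETEM_DIST_SCALE
* (computed in two steps below using integer arithmetic); without a
* table the value is uniform in [mu - sigma, mu + sigma).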
319 */
320 static s64 tabledist(s64 mu, s32 sigma,
321 struct crndstate *state,
322 const struct disttable *dist)
323 {
324 s64 x;
325 long t;
326 u32 rnd;
327
328 if (sigma == 0)
329 return mu;
330
331 rnd = get_crandom(state);
332
333 /* default uniform distribution */
334 if (dist == NULL)
335 return ((rnd % (2 * (u32)sigma)) + mu) - sigma;
336
337 t = dist->table[rnd % dist->size];
338 x = (sigma % NETEM_DIST_SCALE) * t;
339 if (x >= 0)
340 x += NETEM_DIST_SCALE/2;
341 else
342 x -= NETEM_DIST_SCALE/2;
343
344 return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
345 }
346
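/* packet_time_ns - transmission time, in ns, of a @len byte packet at
 * q->rate, including q->packet_overhead and, when q->cell_size is set,
 * rounding up to a whole number of cells of q->cell_size bytes, each
 * accounted with q->cell_overhead extra.
 */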
347 static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
348 {
349 len += q->packet_overhead;
350
351 if (q->cell_size) {
352 u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
353
354 if (len > cells * q->cell_size) /* extra cell needed for remainder */
355 cells++;
356 len = cells * (q->cell_size + q->cell_overhead);
357 }
358
359 return div64_u64(len * NSEC_PER_SEC, q->rate);
360 }
361
362 static void tfifo_reset(struct Qdisc *sch)
363 {
364 struct netem_sched_data *q = qdisc_priv(sch);
365 struct rb_node *p = rb_first(&q->t_root);
366
367 while (p) {
368 struct sk_buff *skb = rb_to_skb(p);
369
370 p = rb_next(p);
371 rb_erase(&skb->rbnode, &q->t_root);
372 rtnl_kfree_skbs(skb, skb);
373 }
374
375 rtnl_kfree_skbs(q->t_head, q->t_tail);
376 q->t_head = NULL;
377 q->t_tail = NULL;
378 q->t_len = 0;
379 }
380
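/* tfifo_enqueue - insert @nskb ordered by time_to_send. The common case
 * (send time not earlier than the current tail) appends to the linear
 * t_head/t_tail list; out-of-order packets go into the rbtree keyed on
 * time_to_send.
 */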
381 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
382 {
383 struct netem_sched_data *q = qdisc_priv(sch);
384 u64 tnext = netem_skb_cb(nskb)->time_to_send;
385
386 if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) {
387 if (q->t_tail)
388 q->t_tail->next = nskb;
389 else
390 q->t_head = nskb;
391 q->t_tail = nskb;
392 } else {
393 struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
394
395 while (*p) {
396 struct sk_buff *skb;
397
398 parent = *p;
399 skb = rb_to_skb(parent);
400 if (tnext >= netem_skb_cb(skb)->time_to_send)
401 p = &parent->rb_right;
402 else
403 p = &parent->rb_left;
404 }
405 rb_link_node(&nskb->rbnode, parent, p);
406 rb_insert_color(&nskb->rbnode, &q->t_root);
407 }
408 q->t_len++;
409 sch->q.qlen++;
410 }
411
412 /* netem can't properly corrupt a megapacket (like we get from GSO), so
413 * when we statistically choose to corrupt one, we instead segment it,
414 * returning the first packet to be corrupted, and re-enqueue the
415 * remaining frames. */
416 static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
417 struct sk_buff **to_free)
418 {
419 struct sk_buff *segs;
420 netdev_features_t features = netif_skb_features(skb);
421
422 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
423
424 if (IS_ERR_OR_NULL(segs)) {
425 qdisc_drop(skb, sch, to_free);
426 return NULL;
427 }
428 consume_skb(skb);
429 return segs;
430 }
431
432 /*
433 * Insert one skb into qdisc.
434 * Note: parent depends on return value to account for queue length.
435 * NET_XMIT_DROP: queue length didn't change.
436 * NET_XMIT_SUCCESS: one skb was queued.
437 */
438 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
439 struct sk_buff **to_free)
440 {
441 struct netem_sched_data *q = qdisc_priv(sch);
442 /* We don't fill cb now as skb_unshare() may invalidate it */
443 struct netem_skb_cb *cb;
444 struct sk_buff *skb2;
445 struct sk_buff *segs = NULL;
446 unsigned int prev_len = qdisc_pkt_len(skb);
447 int count = 1;
448 int rc = NET_XMIT_SUCCESS;
449 int rc_drop = NET_XMIT_DROP;
450
451 /* Do not fool qdisc_drop_all() */
452 skb->prev = NULL;
453
454 /* Random duplication */
455 if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
456 ++count;
457
458 /* Drop packet? */
459 if (loss_event(q)) {
460 if (q->ecn && INET_ECN_set_ce(skb))
461 qdisc_qstats_drop(sch); /* mark packet */
462 else
463 --count;
464 }
465 if (count == 0) {
466 qdisc_qstats_drop(sch);
467 __qdisc_drop(skb, to_free);
468 return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
469 }
470
471 /* If a delay is expected, orphan the skb. (orphaning usually takes
472 * place at TX completion time, so _before_ the link transit delay)
473 */
474 if (q->latency || q->jitter || q->rate)
475 skb_orphan_partial(skb);
476
477 /*
478 * If we need to duplicate packet, then re-insert at top of the
479 * qdisc tree, since parent queuer expects that only one
480 * skb will be queued.
481 */
482 if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
483 struct Qdisc *rootq = qdisc_root_bh(sch);
484 u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
485
486 q->duplicate = 0;
487 rootq->enqueue(skb2, rootq, to_free);
488 q->duplicate = dupsave;
489 rc_drop = NET_XMIT_SUCCESS;
490 }
491
492 /*
493 * Randomized packet corruption.
494 * Make a copy if needed since we are modifying the payload.
495 * If packet is going to be hardware checksummed, then
496 * do it now in software before we mangle it.
497 */
498 if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
499 if (skb_is_gso(skb)) {
500 skb = netem_segment(skb, sch, to_free);
501 if (!skb)
502 return rc_drop;
503 segs = skb->next;
504 skb_mark_not_on_list(skb);
505 qdisc_skb_cb(skb)->pkt_len = skb->len;
506 }
507
508 skb = skb_unshare(skb, GFP_ATOMIC);
509 if (unlikely(!skb)) {
510 qdisc_qstats_drop(sch);
511 goto finish_segs;
512 }
513 if (skb->ip_summed == CHECKSUM_PARTIAL &&
514 skb_checksum_help(skb)) {
515 qdisc_drop(skb, sch, to_free);
516 skb = NULL;
517 goto finish_segs;
518 }
519
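/* flip one random bit in a random byte of the skb linear data */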
520 skb->data[prandom_u32() % skb_headlen(skb)] ^=
521 1<<(prandom_u32() % 8);
522 }
523
524 if (unlikely(q->t_len >= sch->limit)) {
525 /* re-link segs, so that qdisc_drop_all() frees them all */
526 skb->next = segs;
527 qdisc_drop_all(skb, sch, to_free);
528 return rc_drop;
529 }
530
531 qdisc_qstats_backlog_inc(sch, skb);
532
533 cb = netem_skb_cb(skb);
534 if (q->gap == 0 || /* not doing reordering */
535 q->counter < q->gap - 1 || /* inside last reordering gap */
536 q->reorder < get_crandom(&q->reorder_cor)) {
537 u64 now;
538 s64 delay;
539
540 delay = tabledist(q->latency, q->jitter,
541 &q->delay_cor, q->delay_dist);
542
543 now = ktime_get_ns();
544
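/* With rate emulation the queue behaves like a serial link: the
 * reference time is the send time of the last queued packet, and this
 * packet's own transmission time is added on top of the remaining
 * random delay.
 */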
545 if (q->rate) {
546 struct netem_skb_cb *last = NULL;
547
548 if (sch->q.tail)
549 last = netem_skb_cb(sch->q.tail);
550 if (q->t_root.rb_node) {
551 struct sk_buff *t_skb;
552 struct netem_skb_cb *t_last;
553
554 t_skb = skb_rb_last(&q->t_root);
555 t_last = netem_skb_cb(t_skb);
556 if (!last ||
557 t_last->time_to_send > last->time_to_send)
558 last = t_last;
559 }
560 if (q->t_tail) {
561 struct netem_skb_cb *t_last =
562 netem_skb_cb(q->t_tail);
563
564 if (!last ||
565 t_last->time_to_send > last->time_to_send)
566 last = t_last;
567 }
568
569 if (last) {
570 /*
571 * The last packet in the queue is the reference point:
572 * the time until its send time is already covered,
573 * so subtract it from this packet's delay.
574 */
575 delay -= last->time_to_send - now;
576 delay = max_t(s64, 0, delay);
577 now = last->time_to_send;
578 }
579
580 delay += packet_time_ns(qdisc_pkt_len(skb), q);
581 }
582
583 cb->time_to_send = now + delay;
584 ++q->counter;
585 tfifo_enqueue(skb, sch);
586 } else {
587 /*
588 * Do re-ordering by putting one out of N packets at the front
589 * of the queue.
590 */
591 cb->time_to_send = ktime_get_ns();
592 q->counter = 0;
593
594 __qdisc_enqueue_head(skb, &sch->q);
595 sch->qstats.requeues++;
596 }
597
598 finish_segs:
599 if (segs) {
600 unsigned int len, last_len;
601 int nb;
602
603 len = skb ? skb->len : 0;
604 nb = skb ? 1 : 0;
605
606 while (segs) {
607 skb2 = segs->next;
608 skb_mark_not_on_list(segs);
609 qdisc_skb_cb(segs)->pkt_len = segs->len;
610 last_len = segs->len;
611 rc = qdisc_enqueue(segs, sch, to_free);
612 if (rc != NET_XMIT_SUCCESS) {
613 if (net_xmit_drop_count(rc))
614 qdisc_qstats_drop(sch);
615 } else {
616 nb++;
617 len += last_len;
618 }
619 segs = skb2;
620 }
621 /* Parent qdiscs accounted for 1 skb of size @prev_len */
622 qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
623 } else if (!skb) {
624 return NET_XMIT_DROP;
625 }
626 return NET_XMIT_SUCCESS;
627 }
628
629 /* Schedule the next delivery slot at a future time, with a fresh
630 * budget of bytes and packets.
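* Without a slot distribution table the slot start is drawn uniformly
* between min_delay and max_delay; with a table, tabledist() is used
* with dist_delay and dist_jitter.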
631 */
632
633 static void get_slot_next(struct netem_sched_data *q, u64 now)
634 {
635 s64 next_delay;
636
637 if (!q->slot_dist)
638 next_delay = q->slot_config.min_delay +
639 (prandom_u32() *
640 (q->slot_config.max_delay -
641 q->slot_config.min_delay) >> 32);
642 else
643 next_delay = tabledist(q->slot_config.dist_delay,
644 (s32)(q->slot_config.dist_jitter),
645 NULL, q->slot_dist);
646
647 q->slot.slot_next = now + next_delay;
648 q->slot.packets_left = q->slot_config.max_packets;
649 q->slot.bytes_left = q->slot_config.max_bytes;
650 }
651
652 static struct sk_buff *netem_peek(struct netem_sched_data *q)
653 {
654 struct sk_buff *skb = skb_rb_first(&q->t_root);
655 u64 t1, t2;
656
657 if (!skb)
658 return q->t_head;
659 if (!q->t_head)
660 return skb;
661
662 t1 = netem_skb_cb(skb)->time_to_send;
663 t2 = netem_skb_cb(q->t_head)->time_to_send;
664 if (t1 < t2)
665 return skb;
666 return q->t_head;
667 }
668
669 static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
670 {
671 if (skb == q->t_head) {
672 q->t_head = skb->next;
673 if (!q->t_head)
674 q->t_tail = NULL;
675 } else {
676 rb_erase(&skb->rbnode, &q->t_root);
677 }
678 }
679
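/* netem_dequeue - deliver packets whose time_to_send (and, when slotting
 * is enabled, whose slot) has arrived. If an inner qdisc is attached,
 * matured packets are first fed through it; otherwise they are delivered
 * directly. The watchdog is armed for the time the head packet (and its
 * slot) becomes ready.
 */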
680 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
681 {
682 struct netem_sched_data *q = qdisc_priv(sch);
683 struct sk_buff *skb;
684
685 tfifo_dequeue:
686 skb = __qdisc_dequeue_head(&sch->q);
687 if (skb) {
688 deliver:
689 qdisc_qstats_backlog_dec(sch, skb);
690 qdisc_bstats_update(sch, skb);
691 return skb;
692 }
693 skb = netem_peek(q);
694 if (skb) {
695 u64 time_to_send;
696 u64 now = ktime_get_ns();
697
698 /* is it time to send this packet yet? */
699 time_to_send = netem_skb_cb(skb)->time_to_send;
700 if (q->slot.slot_next && q->slot.slot_next < time_to_send)
701 get_slot_next(q, now);
702
703 if (time_to_send <= now && q->slot.slot_next <= now) {
704 netem_erase_head(q, skb);
705 q->t_len--;
706 skb->next = NULL;
707 skb->prev = NULL;
708 /* skb->dev shares skb->rbnode area,
709 * we need to restore its value.
710 */
711 skb->dev = qdisc_dev(sch);
712
713 if (q->slot.slot_next) {
714 q->slot.packets_left--;
715 q->slot.bytes_left -= qdisc_pkt_len(skb);
716 if (q->slot.packets_left <= 0 ||
717 q->slot.bytes_left <= 0)
718 get_slot_next(q, now);
719 }
720
721 if (q->qdisc) {
722 unsigned int pkt_len = qdisc_pkt_len(skb);
723 struct sk_buff *to_free = NULL;
724 int err;
725
726 err = qdisc_enqueue(skb, q->qdisc, &to_free);
727 kfree_skb_list(to_free);
728 if (err != NET_XMIT_SUCCESS) {
729 if (net_xmit_drop_count(err))
730 qdisc_qstats_drop(sch);
731 qdisc_tree_reduce_backlog(sch, 1, pkt_len);
732 sch->qstats.backlog -= pkt_len;
733 sch->q.qlen--;
734 }
735 goto tfifo_dequeue;
736 }
737 sch->q.qlen--;
738 goto deliver;
739 }
740
741 if (q->qdisc) {
742 skb = q->qdisc->ops->dequeue(q->qdisc);
743 if (skb) {
744 sch->q.qlen--;
745 goto deliver;
746 }
747 }
748
749 qdisc_watchdog_schedule_ns(&q->watchdog,
750 max(time_to_send,
751 q->slot.slot_next));
752 }
753
754 if (q->qdisc) {
755 skb = q->qdisc->ops->dequeue(q->qdisc);
756 if (skb) {
757 sch->q.qlen--;
758 goto deliver;
759 }
760 }
761 return NULL;
762 }
763
764 static void netem_reset(struct Qdisc *sch)
765 {
766 struct netem_sched_data *q = qdisc_priv(sch);
767
768 qdisc_reset_queue(sch);
769 tfifo_reset(sch);
770 if (q->qdisc)
771 qdisc_reset(q->qdisc);
772 qdisc_watchdog_cancel(&q->watchdog);
773 }
774
775 static void dist_free(struct disttable *d)
776 {
777 kvfree(d);
778 }
779
780 /*
781 * Distribution data is a variable size payload containing
782 * signed 16 bit values.
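* The netlink payload is copied verbatim into a disttable that
* tabledist() later indexes with a correlated random value.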
783 */
784
785 static int get_dist_table(struct disttable **tbl, const struct nlattr *attr)
786 {
787 size_t n = nla_len(attr)/sizeof(__s16);
788 const __s16 *data = nla_data(attr);
789 struct disttable *d;
790 int i;
791
792 if (!n || n > NETEM_DIST_MAX)
793 return -EINVAL;
794
795 d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
796 if (!d)
797 return -ENOMEM;
798
799 d->size = n;
800 for (i = 0; i < n; i++)
801 d->table[i] = data[i];
802
803 *tbl = d;
804 return 0;
805 }
806
807 static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
808 {
809 const struct tc_netem_slot *c = nla_data(attr);
810
811 q->slot_config = *c;
812 if (q->slot_config.max_packets == 0)
813 q->slot_config.max_packets = INT_MAX;
814 if (q->slot_config.max_bytes == 0)
815 q->slot_config.max_bytes = INT_MAX;
816
817 /* capping dist_jitter to the range acceptable by tabledist() */
818 q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter));
819
820 q->slot.packets_left = q->slot_config.max_packets;
821 q->slot.bytes_left = q->slot_config.max_bytes;
822 if (q->slot_config.min_delay | q->slot_config.max_delay |
823 q->slot_config.dist_jitter)
824 q->slot.slot_next = ktime_get_ns();
825 else
826 q->slot.slot_next = 0;
827 }
828
829 static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
830 {
831 const struct tc_netem_corr *c = nla_data(attr);
832
833 init_crandom(&q->delay_cor, c->delay_corr);
834 init_crandom(&q->loss_cor, c->loss_corr);
835 init_crandom(&q->dup_cor, c->dup_corr);
836 }
837
838 static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
839 {
840 const struct tc_netem_reorder *r = nla_data(attr);
841
842 q->reorder = r->probability;
843 init_crandom(&q->reorder_cor, r->correlation);
844 }
845
846 static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
847 {
848 const struct tc_netem_corrupt *r = nla_data(attr);
849
850 q->corrupt = r->probability;
851 init_crandom(&q->corrupt_cor, r->correlation);
852 }
853
854 static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
855 {
856 const struct tc_netem_rate *r = nla_data(attr);
857
858 q->rate = r->rate;
859 q->packet_overhead = r->packet_overhead;
860 q->cell_size = r->cell_size;
861 q->cell_overhead = r->cell_overhead;
862 if (q->cell_size)
863 q->cell_size_reciprocal = reciprocal_value(q->cell_size);
864 else
865 q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
866 }
867
868 static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
869 {
870 const struct nlattr *la;
871 int rem;
872
873 nla_for_each_nested(la, attr, rem) {
874 u16 type = nla_type(la);
875
876 switch (type) {
877 case NETEM_LOSS_GI: {
878 const struct tc_netem_gimodel *gi = nla_data(la);
879
880 if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
881 pr_info("netem: incorrect gi model size\n");
882 return -EINVAL;
883 }
884
885 q->loss_model = CLG_4_STATES;
886
887 q->clg.state = TX_IN_GAP_PERIOD;
888 q->clg.a1 = gi->p13;
889 q->clg.a2 = gi->p31;
890 q->clg.a3 = gi->p32;
891 q->clg.a4 = gi->p14;
892 q->clg.a5 = gi->p23;
893 break;
894 }
895
896 case NETEM_LOSS_GE: {
897 const struct tc_netem_gemodel *ge = nla_data(la);
898
899 if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
900 pr_info("netem: incorrect ge model size\n");
901 return -EINVAL;
902 }
903
904 q->loss_model = CLG_GILB_ELL;
905 q->clg.state = GOOD_STATE;
906 q->clg.a1 = ge->p;
907 q->clg.a2 = ge->r;
908 q->clg.a3 = ge->h;
909 q->clg.a4 = ge->k1;
910 break;
911 }
912
913 default:
914 pr_info("netem: unknown loss type %u\n", type);
915 return -EINVAL;
916 }
917 }
918
919 return 0;
920 }
921
922 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
923 [TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) },
924 [TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) },
925 [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) },
926 [TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) },
927 [TCA_NETEM_LOSS] = { .type = NLA_NESTED },
928 [TCA_NETEM_ECN] = { .type = NLA_U32 },
929 [TCA_NETEM_RATE64] = { .type = NLA_U64 },
930 [TCA_NETEM_LATENCY64] = { .type = NLA_S64 },
931 [TCA_NETEM_JITTER64] = { .type = NLA_S64 },
932 [TCA_NETEM_SLOT] = { .len = sizeof(struct tc_netem_slot) },
933 };
934
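/* The TCA_OPTIONS payload begins with struct tc_netem_qopt; any nested
 * TCA_NETEM_* attributes follow it, so parsing skips the first @len
 * (aligned) bytes.
 */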
935 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
936 const struct nla_policy *policy, int len)
937 {
938 int nested_len = nla_len(nla) - NLA_ALIGN(len);
939
940 if (nested_len < 0) {
941 pr_info("netem: invalid attributes len %d\n", nested_len);
942 return -EINVAL;
943 }
944
945 if (nested_len >= nla_attr_size(0))
946 return nla_parse_deprecated(tb, maxtype,
947 nla_data(nla) + NLA_ALIGN(len),
948 nested_len, policy, NULL);
949
950 memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
951 return 0;
952 }
953
954 /* Parse netlink message to set options */
955 static int netem_change(struct Qdisc *sch, struct nlattr *opt,
956 struct netlink_ext_ack *extack)
957 {
958 struct netem_sched_data *q = qdisc_priv(sch);
959 struct nlattr *tb[TCA_NETEM_MAX + 1];
960 struct disttable *delay_dist = NULL;
961 struct disttable *slot_dist = NULL;
962 struct tc_netem_qopt *qopt;
963 struct clgstate old_clg;
964 int old_loss_model = CLG_RANDOM;
965 int ret;
966
967 if (opt == NULL)
968 return -EINVAL;
969
970 qopt = nla_data(opt);
971 ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
972 if (ret < 0)
973 return ret;
974
975 if (tb[TCA_NETEM_DELAY_DIST]) {
976 ret = get_dist_table(&delay_dist, tb[TCA_NETEM_DELAY_DIST]);
977 if (ret)
978 goto table_free;
979 }
980
981 if (tb[TCA_NETEM_SLOT_DIST]) {
982 ret = get_dist_table(&slot_dist, tb[TCA_NETEM_SLOT_DIST]);
983 if (ret)
984 goto table_free;
985 }
986
987 sch_tree_lock(sch);
988 /* backup q->clg and q->loss_model */
989 old_clg = q->clg;
990 old_loss_model = q->loss_model;
991
992 if (tb[TCA_NETEM_LOSS]) {
993 ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
994 if (ret) {
995 q->loss_model = old_loss_model;
996 q->clg = old_clg;
997 goto unlock;
998 }
999 } else {
1000 q->loss_model = CLG_RANDOM;
1001 }
1002
1003 if (delay_dist)
1004 swap(q->delay_dist, delay_dist);
1005 if (slot_dist)
1006 swap(q->slot_dist, slot_dist);
1007 sch->limit = qopt->limit;
1008
1009 q->latency = PSCHED_TICKS2NS(qopt->latency);
1010 q->jitter = PSCHED_TICKS2NS(qopt->jitter);
1011 q->limit = qopt->limit;
1012 q->gap = qopt->gap;
1013 q->counter = 0;
1014 q->loss = qopt->loss;
1015 q->duplicate = qopt->duplicate;
1016
1017 /* for compatibility with earlier versions.
1018 * if gap is set, assume 100% reorder probability
1019 */
1020 if (q->gap)
1021 q->reorder = ~0;
1022
1023 if (tb[TCA_NETEM_CORR])
1024 get_correlation(q, tb[TCA_NETEM_CORR]);
1025
1026 if (tb[TCA_NETEM_REORDER])
1027 get_reorder(q, tb[TCA_NETEM_REORDER]);
1028
1029 if (tb[TCA_NETEM_CORRUPT])
1030 get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
1031
1032 if (tb[TCA_NETEM_RATE])
1033 get_rate(q, tb[TCA_NETEM_RATE]);
1034
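/* The 64-bit attributes refine the legacy 32-bit fields: RATE64 keeps
 * the larger of the two rates, while LATENCY64/JITTER64 replace the
 * psched-tick values taken from tc_netem_qopt above.
 */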
1035 if (tb[TCA_NETEM_RATE64])
1036 q->rate = max_t(u64, q->rate,
1037 nla_get_u64(tb[TCA_NETEM_RATE64]));
1038
1039 if (tb[TCA_NETEM_LATENCY64])
1040 q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);
1041
1042 if (tb[TCA_NETEM_JITTER64])
1043 q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);
1044
1045 if (tb[TCA_NETEM_ECN])
1046 q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
1047
1048 if (tb[TCA_NETEM_SLOT])
1049 get_slot(q, tb[TCA_NETEM_SLOT]);
1050
1051 /* capping jitter to the range acceptable by tabledist() */
1052 q->jitter = min_t(s64, abs(q->jitter), INT_MAX);
1053
1054 unlock:
1055 sch_tree_unlock(sch);
1056
1057 table_free:
1058 dist_free(delay_dist);
1059 dist_free(slot_dist);
1060 return ret;
1061 }
1062
1063 static int netem_init(struct Qdisc *sch, struct nlattr *opt,
1064 struct netlink_ext_ack *extack)
1065 {
1066 struct netem_sched_data *q = qdisc_priv(sch);
1067 int ret;
1068
1069 qdisc_watchdog_init(&q->watchdog, sch);
1070
1071 if (!opt)
1072 return -EINVAL;
1073
1074 q->loss_model = CLG_RANDOM;
1075 ret = netem_change(sch, opt, extack);
1076 if (ret)
1077 pr_info("netem: change failed\n");
1078 return ret;
1079 }
1080
1081 static void netem_destroy(struct Qdisc *sch)
1082 {
1083 struct netem_sched_data *q = qdisc_priv(sch);
1084
1085 qdisc_watchdog_cancel(&q->watchdog);
1086 if (q->qdisc)
1087 qdisc_put(q->qdisc);
1088 dist_free(q->delay_dist);
1089 dist_free(q->slot_dist);
1090 }
1091
1092 static int dump_loss_model(const struct netem_sched_data *q,
1093 struct sk_buff *skb)
1094 {
1095 struct nlattr *nest;
1096
1097 nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS);
1098 if (nest == NULL)
1099 goto nla_put_failure;
1100
1101 switch (q->loss_model) {
1102 case CLG_RANDOM:
1103 /* legacy loss model */
1104 nla_nest_cancel(skb, nest);
1105 return 0; /* no data */
1106
1107 case CLG_4_STATES: {
1108 struct tc_netem_gimodel gi = {
1109 .p13 = q->clg.a1,
1110 .p31 = q->clg.a2,
1111 .p32 = q->clg.a3,
1112 .p14 = q->clg.a4,
1113 .p23 = q->clg.a5,
1114 };
1115
1116 if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
1117 goto nla_put_failure;
1118 break;
1119 }
1120 case CLG_GILB_ELL: {
1121 struct tc_netem_gemodel ge = {
1122 .p = q->clg.a1,
1123 .r = q->clg.a2,
1124 .h = q->clg.a3,
1125 .k1 = q->clg.a4,
1126 };
1127
1128 if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
1129 goto nla_put_failure;
1130 break;
1131 }
1132 }
1133
1134 nla_nest_end(skb, nest);
1135 return 0;
1136
1137 nla_put_failure:
1138 nla_nest_cancel(skb, nest);
1139 return -1;
1140 }
1141
1142 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
1143 {
1144 const struct netem_sched_data *q = qdisc_priv(sch);
1145 struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
1146 struct tc_netem_qopt qopt;
1147 struct tc_netem_corr cor;
1148 struct tc_netem_reorder reorder;
1149 struct tc_netem_corrupt corrupt;
1150 struct tc_netem_rate rate;
1151 struct tc_netem_slot slot;
1152
1153 qopt.latency = min_t(psched_time_t, PSCHED_NS2TICKS(q->latency),
1154 UINT_MAX);
1155 qopt.jitter = min_t(psched_time_t, PSCHED_NS2TICKS(q->jitter),
1156 UINT_MAX);
1157 qopt.limit = q->limit;
1158 qopt.loss = q->loss;
1159 qopt.gap = q->gap;
1160 qopt.duplicate = q->duplicate;
1161 if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
1162 goto nla_put_failure;
1163
1164 if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
1165 goto nla_put_failure;
1166
1167 if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
1168 goto nla_put_failure;
1169
1170 cor.delay_corr = q->delay_cor.rho;
1171 cor.loss_corr = q->loss_cor.rho;
1172 cor.dup_corr = q->dup_cor.rho;
1173 if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
1174 goto nla_put_failure;
1175
1176 reorder.probability = q->reorder;
1177 reorder.correlation = q->reorder_cor.rho;
1178 if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
1179 goto nla_put_failure;
1180
1181 corrupt.probability = q->corrupt;
1182 corrupt.correlation = q->corrupt_cor.rho;
1183 if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
1184 goto nla_put_failure;
1185
1186 if (q->rate >= (1ULL << 32)) {
1187 if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
1188 TCA_NETEM_PAD))
1189 goto nla_put_failure;
1190 rate.rate = ~0U;
1191 } else {
1192 rate.rate = q->rate;
1193 }
1194 rate.packet_overhead = q->packet_overhead;
1195 rate.cell_size = q->cell_size;
1196 rate.cell_overhead = q->cell_overhead;
1197 if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
1198 goto nla_put_failure;
1199
1200 if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
1201 goto nla_put_failure;
1202
1203 if (dump_loss_model(q, skb) != 0)
1204 goto nla_put_failure;
1205
1206 if (q->slot_config.min_delay | q->slot_config.max_delay |
1207 q->slot_config.dist_jitter) {
1208 slot = q->slot_config;
1209 if (slot.max_packets == INT_MAX)
1210 slot.max_packets = 0;
1211 if (slot.max_bytes == INT_MAX)
1212 slot.max_bytes = 0;
1213 if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
1214 goto nla_put_failure;
1215 }
1216
1217 return nla_nest_end(skb, nla);
1218
1219 nla_put_failure:
1220 nlmsg_trim(skb, nla);
1221 return -1;
1222 }
1223
1224 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
1225 struct sk_buff *skb, struct tcmsg *tcm)
1226 {
1227 struct netem_sched_data *q = qdisc_priv(sch);
1228
1229 if (cl != 1 || !q->qdisc) /* only one class */
1230 return -ENOENT;
1231
1232 tcm->tcm_handle |= TC_H_MIN(1);
1233 tcm->tcm_info = q->qdisc->handle;
1234
1235 return 0;
1236 }
1237
1238 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1239 struct Qdisc **old, struct netlink_ext_ack *extack)
1240 {
1241 struct netem_sched_data *q = qdisc_priv(sch);
1242
1243 *old = qdisc_replace(sch, new, &q->qdisc);
1244 return 0;
1245 }
1246
1247 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1248 {
1249 struct netem_sched_data *q = qdisc_priv(sch);
1250 return q->qdisc;
1251 }
1252
1253 static unsigned long netem_find(struct Qdisc *sch, u32 classid)
1254 {
1255 return 1;
1256 }
1257
1258 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1259 {
1260 if (!walker->stop) {
1261 if (walker->count >= walker->skip)
1262 if (walker->fn(sch, 1, walker) < 0) {
1263 walker->stop = 1;
1264 return;
1265 }
1266 walker->count++;
1267 }
1268 }
1269
1270 static const struct Qdisc_class_ops netem_class_ops = {
1271 .graft = netem_graft,
1272 .leaf = netem_leaf,
1273 .find = netem_find,
1274 .walk = netem_walk,
1275 .dump = netem_dump_class,
1276 };
1277
1278 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1279 .id = "netem",
1280 .cl_ops = &netem_class_ops,
1281 .priv_size = sizeof(struct netem_sched_data),
1282 .enqueue = netem_enqueue,
1283 .dequeue = netem_dequeue,
1284 .peek = qdisc_peek_dequeued,
1285 .init = netem_init,
1286 .reset = netem_reset,
1287 .destroy = netem_destroy,
1288 .change = netem_change,
1289 .dump = netem_dump,
1290 .owner = THIS_MODULE,
1291 };
1292
1293
1294 static int __init netem_module_init(void)
1295 {
1296 pr_info("netem: version " VERSION "\n");
1297 return register_qdisc(&netem_qdisc_ops);
1298 }
1299 static void __exit netem_module_exit(void)
1300 {
1301 unregister_qdisc(&netem_qdisc_ops);
1302 }
1303 module_init(netem_module_init)
1304 module_exit(netem_module_exit)
1305 MODULE_LICENSE("GPL");
1306