• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *		   (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (eg. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
52 
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/gso.h>
69 #include <net/pkt_sched.h>
70 #include <net/pkt_cls.h>
71 #include <net/tcp.h>
72 #include <net/flow_dissector.h>
73 
74 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
75 #include <net/netfilter/nf_conntrack_core.h>
76 #endif
77 
78 #define CAKE_SET_WAYS (8)
79 #define CAKE_MAX_TINS (8)
80 #define CAKE_QUEUES (1024)
81 #define CAKE_FLOW_MASK 63
82 #define CAKE_FLOW_NAT_FLAG 64
83 
84 /* struct cobalt_params - contains codel and blue parameters
85  * @interval:	codel initial drop rate
86  * @target:     maximum persistent sojourn time & blue update rate
87  * @mtu_time:   serialisation delay of maximum-size packet
88  * @p_inc:      increment of blue drop probability (0.32 fxp)
89  * @p_dec:      decrement of blue drop probability (0.32 fxp)
90  */
91 struct cobalt_params {
92 	u64	interval;
93 	u64	target;
94 	u64	mtu_time;
95 	u32	p_inc;
96 	u32	p_dec;
97 };
98 
99 /* struct cobalt_vars - contains codel and blue variables
100  * @count:		codel dropping frequency
101  * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
102  * @drop_next:		time to drop next packet, or when we dropped last
103  * @blue_timer:		Blue time to next drop
104  * @p_drop:		BLUE drop probability (0.32 fxp)
105  * @dropping:		set if in dropping state
106  * @ecn_marked:		set if marked
107  */
108 struct cobalt_vars {
109 	u32	count;
110 	u32	rec_inv_sqrt;
111 	ktime_t	drop_next;
112 	ktime_t	blue_timer;
113 	u32     p_drop;
114 	bool	dropping;
115 	bool    ecn_marked;
116 };
117 
118 enum {
119 	CAKE_SET_NONE = 0,
120 	CAKE_SET_SPARSE,
121 	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
122 	CAKE_SET_BULK,
123 	CAKE_SET_DECAYING
124 };
125 
126 struct cake_flow {
127 	/* this stuff is all needed per-flow at dequeue time */
128 	struct sk_buff	  *head;
129 	struct sk_buff	  *tail;
130 	struct list_head  flowchain;
131 	s32		  deficit;
132 	u32		  dropped;
133 	struct cobalt_vars cvars;
134 	u16		  srchost; /* index into cake_host table */
135 	u16		  dsthost;
136 	u8		  set;
137 }; /* please try to keep this structure <= 64 bytes */
138 
139 struct cake_host {
140 	u32 srchost_tag;
141 	u32 dsthost_tag;
142 	u16 srchost_bulk_flow_count;
143 	u16 dsthost_bulk_flow_count;
144 };
145 
146 struct cake_heap_entry {
147 	u16 t:3, b:10;
148 };
149 
150 struct cake_tin_data {
151 	struct cake_flow flows[CAKE_QUEUES];
152 	u32	backlogs[CAKE_QUEUES];
153 	u32	tags[CAKE_QUEUES]; /* for set association */
154 	u16	overflow_idx[CAKE_QUEUES];
155 	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
156 	u16	flow_quantum;
157 
158 	struct cobalt_params cparams;
159 	u32	drop_overlimit;
160 	u16	bulk_flow_count;
161 	u16	sparse_flow_count;
162 	u16	decaying_flow_count;
163 	u16	unresponsive_flow_count;
164 
165 	u32	max_skblen;
166 
167 	struct list_head new_flows;
168 	struct list_head old_flows;
169 	struct list_head decaying_flows;
170 
171 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
172 	ktime_t	time_next_packet;
173 	u64	tin_rate_ns;
174 	u64	tin_rate_bps;
175 	u16	tin_rate_shft;
176 
177 	u16	tin_quantum;
178 	s32	tin_deficit;
179 	u32	tin_backlog;
180 	u32	tin_dropped;
181 	u32	tin_ecn_mark;
182 
183 	u32	packets;
184 	u64	bytes;
185 
186 	u32	ack_drops;
187 
188 	/* moving averages */
189 	u64 avge_delay;
190 	u64 peak_delay;
191 	u64 base_delay;
192 
193 	/* hash function stats */
194 	u32	way_directs;
195 	u32	way_hits;
196 	u32	way_misses;
197 	u32	way_collisions;
198 }; /* number of tins is small, so size of this struct doesn't matter much */
199 
200 struct cake_sched_data {
201 	struct tcf_proto __rcu *filter_list; /* optional external classifier */
202 	struct tcf_block *block;
203 	struct cake_tin_data *tins;
204 
205 	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
206 	u16		overflow_timeout;
207 
208 	u16		tin_cnt;
209 	u8		tin_mode;
210 	u8		flow_mode;
211 	u8		ack_filter;
212 	u8		atm_mode;
213 
214 	u32		fwmark_mask;
215 	u16		fwmark_shft;
216 
217 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
218 	u16		rate_shft;
219 	ktime_t		time_next_packet;
220 	ktime_t		failsafe_next_packet;
221 	u64		rate_ns;
222 	u64		rate_bps;
223 	u16		rate_flags;
224 	s16		rate_overhead;
225 	u16		rate_mpu;
226 	u64		interval;
227 	u64		target;
228 
229 	/* resource tracking */
230 	u32		buffer_used;
231 	u32		buffer_max_used;
232 	u32		buffer_limit;
233 	u32		buffer_config_limit;
234 
235 	/* indices for dequeue */
236 	u16		cur_tin;
237 	u16		cur_flow;
238 
239 	struct qdisc_watchdog watchdog;
240 	const u8	*tin_index;
241 	const u8	*tin_order;
242 
243 	/* bandwidth capacity estimate */
244 	ktime_t		last_packet_time;
245 	ktime_t		avg_window_begin;
246 	u64		avg_packet_interval;
247 	u64		avg_window_bytes;
248 	u64		avg_peak_bandwidth;
249 	ktime_t		last_reconfig_time;
250 
251 	/* packet length stats */
252 	u32		avg_netoff;
253 	u16		max_netlen;
254 	u16		max_adjlen;
255 	u16		min_netlen;
256 	u16		min_adjlen;
257 };
258 
259 enum {
260 	CAKE_FLAG_OVERHEAD	   = BIT(0),
261 	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
262 	CAKE_FLAG_INGRESS	   = BIT(2),
263 	CAKE_FLAG_WASH		   = BIT(3),
264 	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
265 };
266 
267 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
268  * obtain the best features of each.  Codel is excellent on flows which
269  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
270  * unresponsive flows.
271  */
272 
273 struct cobalt_skb_cb {
274 	ktime_t enqueue_time;
275 	u32     adjusted_len;
276 };
277 
us_to_ns(u64 us)278 static u64 us_to_ns(u64 us)
279 {
280 	return us * NSEC_PER_USEC;
281 }
282 
get_cobalt_cb(const struct sk_buff * skb)283 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
284 {
285 	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
286 	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
287 }
288 
cobalt_get_enqueue_time(const struct sk_buff * skb)289 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
290 {
291 	return get_cobalt_cb(skb)->enqueue_time;
292 }
293 
cobalt_set_enqueue_time(struct sk_buff * skb,ktime_t now)294 static void cobalt_set_enqueue_time(struct sk_buff *skb,
295 				    ktime_t now)
296 {
297 	get_cobalt_cb(skb)->enqueue_time = now;
298 }
299 
300 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
301 
302 /* Diffserv lookup tables */
303 
304 static const u8 precedence[] = {
305 	0, 0, 0, 0, 0, 0, 0, 0,
306 	1, 1, 1, 1, 1, 1, 1, 1,
307 	2, 2, 2, 2, 2, 2, 2, 2,
308 	3, 3, 3, 3, 3, 3, 3, 3,
309 	4, 4, 4, 4, 4, 4, 4, 4,
310 	5, 5, 5, 5, 5, 5, 5, 5,
311 	6, 6, 6, 6, 6, 6, 6, 6,
312 	7, 7, 7, 7, 7, 7, 7, 7,
313 };
314 
315 static const u8 diffserv8[] = {
316 	2, 0, 1, 2, 4, 2, 2, 2,
317 	1, 2, 1, 2, 1, 2, 1, 2,
318 	5, 2, 4, 2, 4, 2, 4, 2,
319 	3, 2, 3, 2, 3, 2, 3, 2,
320 	6, 2, 3, 2, 3, 2, 3, 2,
321 	6, 2, 2, 2, 6, 2, 6, 2,
322 	7, 2, 2, 2, 2, 2, 2, 2,
323 	7, 2, 2, 2, 2, 2, 2, 2,
324 };
325 
326 static const u8 diffserv4[] = {
327 	0, 1, 0, 0, 2, 0, 0, 0,
328 	1, 0, 0, 0, 0, 0, 0, 0,
329 	2, 0, 2, 0, 2, 0, 2, 0,
330 	2, 0, 2, 0, 2, 0, 2, 0,
331 	3, 0, 2, 0, 2, 0, 2, 0,
332 	3, 0, 0, 0, 3, 0, 3, 0,
333 	3, 0, 0, 0, 0, 0, 0, 0,
334 	3, 0, 0, 0, 0, 0, 0, 0,
335 };
336 
337 static const u8 diffserv3[] = {
338 	0, 1, 0, 0, 2, 0, 0, 0,
339 	1, 0, 0, 0, 0, 0, 0, 0,
340 	0, 0, 0, 0, 0, 0, 0, 0,
341 	0, 0, 0, 0, 0, 0, 0, 0,
342 	0, 0, 0, 0, 0, 0, 0, 0,
343 	0, 0, 0, 0, 2, 0, 2, 0,
344 	2, 0, 0, 0, 0, 0, 0, 0,
345 	2, 0, 0, 0, 0, 0, 0, 0,
346 };
347 
348 static const u8 besteffort[] = {
349 	0, 0, 0, 0, 0, 0, 0, 0,
350 	0, 0, 0, 0, 0, 0, 0, 0,
351 	0, 0, 0, 0, 0, 0, 0, 0,
352 	0, 0, 0, 0, 0, 0, 0, 0,
353 	0, 0, 0, 0, 0, 0, 0, 0,
354 	0, 0, 0, 0, 0, 0, 0, 0,
355 	0, 0, 0, 0, 0, 0, 0, 0,
356 	0, 0, 0, 0, 0, 0, 0, 0,
357 };
358 
359 /* tin priority order for stats dumping */
360 
361 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
362 static const u8 bulk_order[] = {1, 0, 2, 3};
363 
364 /* There is a big difference in timing between the accurate values placed in the
365  * cache and the approximations given by a single Newton step for small count
366  * values, particularly when stepping from count 1 to 2 or vice versa. Hence,
367  * these values are calculated using eight Newton steps, using the
368  * implementation below. Above 16, a single Newton step gives sufficient
369  * accuracy in either direction, given the precision stored.
370  *
371  * The magnitude of the error when stepping up to count 2 is such as to give the
372  * value that *should* have been produced at count 4.
373  */
374 
375 #define REC_INV_SQRT_CACHE (16)
376 static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = {
377 		~0,         ~0, 3037000500, 2479700525,
378 	2147483647, 1920767767, 1753413056, 1623345051,
379 	1518500250, 1431655765, 1358187914, 1294981364,
380 	1239850263, 1191209601, 1147878294, 1108955788
381 };
382 
383 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
384  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
385  *
386  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
387  */
388 
cobalt_newton_step(struct cobalt_vars * vars)389 static void cobalt_newton_step(struct cobalt_vars *vars)
390 {
391 	u32 invsqrt, invsqrt2;
392 	u64 val;
393 
394 	invsqrt = vars->rec_inv_sqrt;
395 	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
396 	val = (3LL << 32) - ((u64)vars->count * invsqrt2);
397 
398 	val >>= 2; /* avoid overflow in following multiply */
399 	val = (val * invsqrt) >> (32 - 2 + 1);
400 
401 	vars->rec_inv_sqrt = val;
402 }
403 
cobalt_invsqrt(struct cobalt_vars * vars)404 static void cobalt_invsqrt(struct cobalt_vars *vars)
405 {
406 	if (vars->count < REC_INV_SQRT_CACHE)
407 		vars->rec_inv_sqrt = inv_sqrt_cache[vars->count];
408 	else
409 		cobalt_newton_step(vars);
410 }
411 
cobalt_vars_init(struct cobalt_vars * vars)412 static void cobalt_vars_init(struct cobalt_vars *vars)
413 {
414 	memset(vars, 0, sizeof(*vars));
415 }
416 
417 /* CoDel control_law is t + interval/sqrt(count)
418  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
419  * both sqrt() and divide operation.
420  */
cobalt_control(ktime_t t,u64 interval,u32 rec_inv_sqrt)421 static ktime_t cobalt_control(ktime_t t,
422 			      u64 interval,
423 			      u32 rec_inv_sqrt)
424 {
425 	return ktime_add_ns(t, reciprocal_scale(interval,
426 						rec_inv_sqrt));
427 }
428 
429 /* Call this when a packet had to be dropped due to queue overflow.  Returns
430  * true if the BLUE state was quiescent before but active after this call.
431  */
cobalt_queue_full(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now)432 static bool cobalt_queue_full(struct cobalt_vars *vars,
433 			      struct cobalt_params *p,
434 			      ktime_t now)
435 {
436 	bool up = false;
437 
438 	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
439 		up = !vars->p_drop;
440 		vars->p_drop += p->p_inc;
441 		if (vars->p_drop < p->p_inc)
442 			vars->p_drop = ~0;
443 		vars->blue_timer = now;
444 	}
445 	vars->dropping = true;
446 	vars->drop_next = now;
447 	if (!vars->count)
448 		vars->count = 1;
449 
450 	return up;
451 }
452 
453 /* Call this when the queue was serviced but turned out to be empty.  Returns
454  * true if the BLUE state was active before but quiescent after this call.
455  */
cobalt_queue_empty(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now)456 static bool cobalt_queue_empty(struct cobalt_vars *vars,
457 			       struct cobalt_params *p,
458 			       ktime_t now)
459 {
460 	bool down = false;
461 
462 	if (vars->p_drop &&
463 	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
464 		if (vars->p_drop < p->p_dec)
465 			vars->p_drop = 0;
466 		else
467 			vars->p_drop -= p->p_dec;
468 		vars->blue_timer = now;
469 		down = !vars->p_drop;
470 	}
471 	vars->dropping = false;
472 
473 	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
474 		vars->count--;
475 		cobalt_invsqrt(vars);
476 		vars->drop_next = cobalt_control(vars->drop_next,
477 						 p->interval,
478 						 vars->rec_inv_sqrt);
479 	}
480 
481 	return down;
482 }
483 
484 /* Call this with a freshly dequeued packet for possible congestion marking.
485  * Returns true as an instruction to drop the packet, false for delivery.
486  */
cobalt_should_drop(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now,struct sk_buff * skb,u32 bulk_flows)487 static bool cobalt_should_drop(struct cobalt_vars *vars,
488 			       struct cobalt_params *p,
489 			       ktime_t now,
490 			       struct sk_buff *skb,
491 			       u32 bulk_flows)
492 {
493 	bool next_due, over_target, drop = false;
494 	ktime_t schedule;
495 	u64 sojourn;
496 
497 /* The 'schedule' variable records, in its sign, whether 'now' is before or
498  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
499  * scheduling decision is actually branched, without destroying that
500  * information.  Similarly, the first 'schedule' value calculated is preserved
501  * in the boolean 'next_due'.
502  *
503  * As for 'drop_next', we take advantage of the fact that 'interval' is both
504  * the delay between first exceeding 'target' and the first signalling event,
505  * *and* the scaling factor for the signalling frequency.  It's therefore very
506  * natural to use a single mechanism for both purposes, and eliminates a
507  * significant amount of reference Codel's spaghetti code.  To help with this,
508  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
509  * as possible to 1.0 in fixed-point.
510  */
511 
512 	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
513 	schedule = ktime_sub(now, vars->drop_next);
514 	over_target = sojourn > p->target &&
515 		      sojourn > p->mtu_time * bulk_flows * 2 &&
516 		      sojourn > p->mtu_time * 4;
517 	next_due = vars->count && ktime_to_ns(schedule) >= 0;
518 
519 	vars->ecn_marked = false;
520 
521 	if (over_target) {
522 		if (!vars->dropping) {
523 			vars->dropping = true;
524 			vars->drop_next = cobalt_control(now,
525 							 p->interval,
526 							 vars->rec_inv_sqrt);
527 		}
528 		if (!vars->count)
529 			vars->count = 1;
530 	} else if (vars->dropping) {
531 		vars->dropping = false;
532 	}
533 
534 	if (next_due && vars->dropping) {
535 		/* Use ECN mark if possible, otherwise drop */
536 		drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
537 
538 		vars->count++;
539 		if (!vars->count)
540 			vars->count--;
541 		cobalt_invsqrt(vars);
542 		vars->drop_next = cobalt_control(vars->drop_next,
543 						 p->interval,
544 						 vars->rec_inv_sqrt);
545 		schedule = ktime_sub(now, vars->drop_next);
546 	} else {
547 		while (next_due) {
548 			vars->count--;
549 			cobalt_invsqrt(vars);
550 			vars->drop_next = cobalt_control(vars->drop_next,
551 							 p->interval,
552 							 vars->rec_inv_sqrt);
553 			schedule = ktime_sub(now, vars->drop_next);
554 			next_due = vars->count && ktime_to_ns(schedule) >= 0;
555 		}
556 	}
557 
558 	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
559 	if (vars->p_drop)
560 		drop |= (get_random_u32() < vars->p_drop);
561 
562 	/* Overload the drop_next field as an activity timeout */
563 	if (!vars->count)
564 		vars->drop_next = ktime_add_ns(now, p->interval);
565 	else if (ktime_to_ns(schedule) > 0 && !drop)
566 		vars->drop_next = now;
567 
568 	return drop;
569 }
570 
cake_update_flowkeys(struct flow_keys * keys,const struct sk_buff * skb)571 static bool cake_update_flowkeys(struct flow_keys *keys,
572 				 const struct sk_buff *skb)
573 {
574 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
575 	struct nf_conntrack_tuple tuple = {};
576 	bool rev = !skb->_nfct, upd = false;
577 	__be32 ip;
578 
579 	if (skb_protocol(skb, true) != htons(ETH_P_IP))
580 		return false;
581 
582 	if (!nf_ct_get_tuple_skb(&tuple, skb))
583 		return false;
584 
585 	ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
586 	if (ip != keys->addrs.v4addrs.src) {
587 		keys->addrs.v4addrs.src = ip;
588 		upd = true;
589 	}
590 	ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
591 	if (ip != keys->addrs.v4addrs.dst) {
592 		keys->addrs.v4addrs.dst = ip;
593 		upd = true;
594 	}
595 
596 	if (keys->ports.ports) {
597 		__be16 port;
598 
599 		port = rev ? tuple.dst.u.all : tuple.src.u.all;
600 		if (port != keys->ports.src) {
601 			keys->ports.src = port;
602 			upd = true;
603 		}
604 		port = rev ? tuple.src.u.all : tuple.dst.u.all;
605 		if (port != keys->ports.dst) {
606 			port = keys->ports.dst;
607 			upd = true;
608 		}
609 	}
610 	return upd;
611 #else
612 	return false;
613 #endif
614 }
615 
616 /* Cake has several subtle multiple bit settings. In these cases you
617  *  would be matching triple isolate mode as well.
618  */
619 
cake_dsrc(int flow_mode)620 static bool cake_dsrc(int flow_mode)
621 {
622 	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
623 }
624 
cake_ddst(int flow_mode)625 static bool cake_ddst(int flow_mode)
626 {
627 	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
628 }
629 
cake_dec_srchost_bulk_flow_count(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)630 static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q,
631 					     struct cake_flow *flow,
632 					     int flow_mode)
633 {
634 	if (likely(cake_dsrc(flow_mode) &&
635 		   q->hosts[flow->srchost].srchost_bulk_flow_count))
636 		q->hosts[flow->srchost].srchost_bulk_flow_count--;
637 }
638 
cake_inc_srchost_bulk_flow_count(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)639 static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q,
640 					     struct cake_flow *flow,
641 					     int flow_mode)
642 {
643 	if (likely(cake_dsrc(flow_mode) &&
644 		   q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES))
645 		q->hosts[flow->srchost].srchost_bulk_flow_count++;
646 }
647 
cake_dec_dsthost_bulk_flow_count(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)648 static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q,
649 					     struct cake_flow *flow,
650 					     int flow_mode)
651 {
652 	if (likely(cake_ddst(flow_mode) &&
653 		   q->hosts[flow->dsthost].dsthost_bulk_flow_count))
654 		q->hosts[flow->dsthost].dsthost_bulk_flow_count--;
655 }
656 
cake_inc_dsthost_bulk_flow_count(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)657 static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q,
658 					     struct cake_flow *flow,
659 					     int flow_mode)
660 {
661 	if (likely(cake_ddst(flow_mode) &&
662 		   q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES))
663 		q->hosts[flow->dsthost].dsthost_bulk_flow_count++;
664 }
665 
cake_get_flow_quantum(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)666 static u16 cake_get_flow_quantum(struct cake_tin_data *q,
667 				 struct cake_flow *flow,
668 				 int flow_mode)
669 {
670 	u16 host_load = 1;
671 
672 	if (cake_dsrc(flow_mode))
673 		host_load = max(host_load,
674 				q->hosts[flow->srchost].srchost_bulk_flow_count);
675 
676 	if (cake_ddst(flow_mode))
677 		host_load = max(host_load,
678 				q->hosts[flow->dsthost].dsthost_bulk_flow_count);
679 
680 	/* The get_random_u16() is a way to apply dithering to avoid
681 	 * accumulating roundoff errors
682 	 */
683 	return (q->flow_quantum * quantum_div[host_load] +
684 		get_random_u16()) >> 16;
685 }
686 
cake_hash(struct cake_tin_data * q,const struct sk_buff * skb,int flow_mode,u16 flow_override,u16 host_override)687 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
688 		     int flow_mode, u16 flow_override, u16 host_override)
689 {
690 	bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
691 	bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
692 	bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
693 	u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
694 	u16 reduced_hash, srchost_idx, dsthost_idx;
695 	struct flow_keys keys, host_keys;
696 	bool use_skbhash = skb->l4_hash;
697 
698 	if (unlikely(flow_mode == CAKE_FLOW_NONE))
699 		return 0;
700 
701 	/* If both overrides are set, or we can use the SKB hash and nat mode is
702 	 * disabled, we can skip packet dissection entirely. If nat mode is
703 	 * enabled there's another check below after doing the conntrack lookup.
704 	 */
705 	if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
706 		goto skip_hash;
707 
708 	skb_flow_dissect_flow_keys(skb, &keys,
709 				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
710 
711 	/* Don't use the SKB hash if we change the lookup keys from conntrack */
712 	if (nat_enabled && cake_update_flowkeys(&keys, skb))
713 		use_skbhash = false;
714 
715 	/* If we can still use the SKB hash and don't need the host hash, we can
716 	 * skip the rest of the hashing procedure
717 	 */
718 	if (use_skbhash && !hash_hosts)
719 		goto skip_hash;
720 
721 	/* flow_hash_from_keys() sorts the addresses by value, so we have
722 	 * to preserve their order in a separate data structure to treat
723 	 * src and dst host addresses as independently selectable.
724 	 */
725 	host_keys = keys;
726 	host_keys.ports.ports     = 0;
727 	host_keys.basic.ip_proto  = 0;
728 	host_keys.keyid.keyid     = 0;
729 	host_keys.tags.flow_label = 0;
730 
731 	switch (host_keys.control.addr_type) {
732 	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
733 		host_keys.addrs.v4addrs.src = 0;
734 		dsthost_hash = flow_hash_from_keys(&host_keys);
735 		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
736 		host_keys.addrs.v4addrs.dst = 0;
737 		srchost_hash = flow_hash_from_keys(&host_keys);
738 		break;
739 
740 	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
741 		memset(&host_keys.addrs.v6addrs.src, 0,
742 		       sizeof(host_keys.addrs.v6addrs.src));
743 		dsthost_hash = flow_hash_from_keys(&host_keys);
744 		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
745 		memset(&host_keys.addrs.v6addrs.dst, 0,
746 		       sizeof(host_keys.addrs.v6addrs.dst));
747 		srchost_hash = flow_hash_from_keys(&host_keys);
748 		break;
749 
750 	default:
751 		dsthost_hash = 0;
752 		srchost_hash = 0;
753 	}
754 
755 	/* This *must* be after the above switch, since as a
756 	 * side-effect it sorts the src and dst addresses.
757 	 */
758 	if (hash_flows && !use_skbhash)
759 		flow_hash = flow_hash_from_keys(&keys);
760 
761 skip_hash:
762 	if (flow_override)
763 		flow_hash = flow_override - 1;
764 	else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
765 		flow_hash = skb->hash;
766 	if (host_override) {
767 		dsthost_hash = host_override - 1;
768 		srchost_hash = host_override - 1;
769 	}
770 
771 	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
772 		if (flow_mode & CAKE_FLOW_SRC_IP)
773 			flow_hash ^= srchost_hash;
774 
775 		if (flow_mode & CAKE_FLOW_DST_IP)
776 			flow_hash ^= dsthost_hash;
777 	}
778 
779 	reduced_hash = flow_hash % CAKE_QUEUES;
780 
781 	/* set-associative hashing */
782 	/* fast path if no hash collision (direct lookup succeeds) */
783 	if (likely(q->tags[reduced_hash] == flow_hash &&
784 		   q->flows[reduced_hash].set)) {
785 		q->way_directs++;
786 	} else {
787 		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
788 		u32 outer_hash = reduced_hash - inner_hash;
789 		bool allocate_src = false;
790 		bool allocate_dst = false;
791 		u32 i, k;
792 
793 		/* check if any active queue in the set is reserved for
794 		 * this flow.
795 		 */
796 		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
797 		     i++, k = (k + 1) % CAKE_SET_WAYS) {
798 			if (q->tags[outer_hash + k] == flow_hash) {
799 				if (i)
800 					q->way_hits++;
801 
802 				if (!q->flows[outer_hash + k].set) {
803 					/* need to increment host refcnts */
804 					allocate_src = cake_dsrc(flow_mode);
805 					allocate_dst = cake_ddst(flow_mode);
806 				}
807 
808 				goto found;
809 			}
810 		}
811 
812 		/* no queue is reserved for this flow, look for an
813 		 * empty one.
814 		 */
815 		for (i = 0; i < CAKE_SET_WAYS;
816 			 i++, k = (k + 1) % CAKE_SET_WAYS) {
817 			if (!q->flows[outer_hash + k].set) {
818 				q->way_misses++;
819 				allocate_src = cake_dsrc(flow_mode);
820 				allocate_dst = cake_ddst(flow_mode);
821 				goto found;
822 			}
823 		}
824 
825 		/* With no empty queues, default to the original
826 		 * queue, accept the collision, update the host tags.
827 		 */
828 		q->way_collisions++;
829 		allocate_src = cake_dsrc(flow_mode);
830 		allocate_dst = cake_ddst(flow_mode);
831 
832 		if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
833 			cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
834 			cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
835 		}
836 found:
837 		/* reserve queue for future packets in same flow */
838 		reduced_hash = outer_hash + k;
839 		q->tags[reduced_hash] = flow_hash;
840 
841 		if (allocate_src) {
842 			srchost_idx = srchost_hash % CAKE_QUEUES;
843 			inner_hash = srchost_idx % CAKE_SET_WAYS;
844 			outer_hash = srchost_idx - inner_hash;
845 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
846 				i++, k = (k + 1) % CAKE_SET_WAYS) {
847 				if (q->hosts[outer_hash + k].srchost_tag ==
848 				    srchost_hash)
849 					goto found_src;
850 			}
851 			for (i = 0; i < CAKE_SET_WAYS;
852 				i++, k = (k + 1) % CAKE_SET_WAYS) {
853 				if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
854 					break;
855 			}
856 			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
857 found_src:
858 			srchost_idx = outer_hash + k;
859 			q->flows[reduced_hash].srchost = srchost_idx;
860 
861 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
862 				cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
863 		}
864 
865 		if (allocate_dst) {
866 			dsthost_idx = dsthost_hash % CAKE_QUEUES;
867 			inner_hash = dsthost_idx % CAKE_SET_WAYS;
868 			outer_hash = dsthost_idx - inner_hash;
869 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
870 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
871 				if (q->hosts[outer_hash + k].dsthost_tag ==
872 				    dsthost_hash)
873 					goto found_dst;
874 			}
875 			for (i = 0; i < CAKE_SET_WAYS;
876 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
877 				if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
878 					break;
879 			}
880 			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
881 found_dst:
882 			dsthost_idx = outer_hash + k;
883 			q->flows[reduced_hash].dsthost = dsthost_idx;
884 
885 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
886 				cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
887 		}
888 	}
889 
890 	return reduced_hash;
891 }
892 
893 /* helper functions : might be changed when/if skb use a standard list_head */
894 /* remove one skb from head of slot queue */
895 
dequeue_head(struct cake_flow * flow)896 static struct sk_buff *dequeue_head(struct cake_flow *flow)
897 {
898 	struct sk_buff *skb = flow->head;
899 
900 	if (skb) {
901 		flow->head = skb->next;
902 		skb_mark_not_on_list(skb);
903 	}
904 
905 	return skb;
906 }
907 
908 /* add skb to flow queue (tail add) */
909 
flow_queue_add(struct cake_flow * flow,struct sk_buff * skb)910 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
911 {
912 	if (!flow->head)
913 		flow->head = skb;
914 	else
915 		flow->tail->next = skb;
916 	flow->tail = skb;
917 	skb->next = NULL;
918 }
919 
cake_get_iphdr(const struct sk_buff * skb,struct ipv6hdr * buf)920 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
921 				    struct ipv6hdr *buf)
922 {
923 	unsigned int offset = skb_network_offset(skb);
924 	struct iphdr *iph;
925 
926 	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
927 
928 	if (!iph)
929 		return NULL;
930 
931 	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
932 		return skb_header_pointer(skb, offset + iph->ihl * 4,
933 					  sizeof(struct ipv6hdr), buf);
934 
935 	else if (iph->version == 4)
936 		return iph;
937 
938 	else if (iph->version == 6)
939 		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
940 					  buf);
941 
942 	return NULL;
943 }
944 
cake_get_tcphdr(const struct sk_buff * skb,void * buf,unsigned int bufsize)945 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
946 				      void *buf, unsigned int bufsize)
947 {
948 	unsigned int offset = skb_network_offset(skb);
949 	const struct ipv6hdr *ipv6h;
950 	const struct tcphdr *tcph;
951 	const struct iphdr *iph;
952 	struct ipv6hdr _ipv6h;
953 	struct tcphdr _tcph;
954 
955 	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
956 
957 	if (!ipv6h)
958 		return NULL;
959 
960 	if (ipv6h->version == 4) {
961 		iph = (struct iphdr *)ipv6h;
962 		offset += iph->ihl * 4;
963 
964 		/* special-case 6in4 tunnelling, as that is a common way to get
965 		 * v6 connectivity in the home
966 		 */
967 		if (iph->protocol == IPPROTO_IPV6) {
968 			ipv6h = skb_header_pointer(skb, offset,
969 						   sizeof(_ipv6h), &_ipv6h);
970 
971 			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
972 				return NULL;
973 
974 			offset += sizeof(struct ipv6hdr);
975 
976 		} else if (iph->protocol != IPPROTO_TCP) {
977 			return NULL;
978 		}
979 
980 	} else if (ipv6h->version == 6) {
981 		if (ipv6h->nexthdr != IPPROTO_TCP)
982 			return NULL;
983 
984 		offset += sizeof(struct ipv6hdr);
985 	} else {
986 		return NULL;
987 	}
988 
989 	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
990 	if (!tcph || tcph->doff < 5)
991 		return NULL;
992 
993 	return skb_header_pointer(skb, offset,
994 				  min(__tcp_hdrlen(tcph), bufsize), buf);
995 }
996 
cake_get_tcpopt(const struct tcphdr * tcph,int code,int * oplen)997 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
998 				   int code, int *oplen)
999 {
1000 	/* inspired by tcp_parse_options in tcp_input.c */
1001 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1002 	const u8 *ptr = (const u8 *)(tcph + 1);
1003 
1004 	while (length > 0) {
1005 		int opcode = *ptr++;
1006 		int opsize;
1007 
1008 		if (opcode == TCPOPT_EOL)
1009 			break;
1010 		if (opcode == TCPOPT_NOP) {
1011 			length--;
1012 			continue;
1013 		}
1014 		if (length < 2)
1015 			break;
1016 		opsize = *ptr++;
1017 		if (opsize < 2 || opsize > length)
1018 			break;
1019 
1020 		if (opcode == code) {
1021 			*oplen = opsize;
1022 			return ptr;
1023 		}
1024 
1025 		ptr += opsize - 2;
1026 		length -= opsize;
1027 	}
1028 
1029 	return NULL;
1030 }
1031 
1032 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
1033  * bytes than the other. In the case where both sequences ACKs bytes that the
1034  * other doesn't, A is considered greater. DSACKs in A also makes A be
1035  * considered greater.
1036  *
1037  * @return -1, 0 or 1 as normal compare functions
1038  */
cake_tcph_sack_compare(const struct tcphdr * tcph_a,const struct tcphdr * tcph_b)1039 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
1040 				  const struct tcphdr *tcph_b)
1041 {
1042 	const struct tcp_sack_block_wire *sack_a, *sack_b;
1043 	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
1044 	u32 bytes_a = 0, bytes_b = 0;
1045 	int oplen_a, oplen_b;
1046 	bool first = true;
1047 
1048 	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
1049 	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
1050 
1051 	/* pointers point to option contents */
1052 	oplen_a -= TCPOLEN_SACK_BASE;
1053 	oplen_b -= TCPOLEN_SACK_BASE;
1054 
1055 	if (sack_a && oplen_a >= sizeof(*sack_a) &&
1056 	    (!sack_b || oplen_b < sizeof(*sack_b)))
1057 		return -1;
1058 	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
1059 		 (!sack_a || oplen_a < sizeof(*sack_a)))
1060 		return 1;
1061 	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
1062 		 (!sack_b || oplen_b < sizeof(*sack_b)))
1063 		return 0;
1064 
1065 	while (oplen_a >= sizeof(*sack_a)) {
1066 		const struct tcp_sack_block_wire *sack_tmp = sack_b;
1067 		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
1068 		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1069 		int oplen_tmp = oplen_b;
1070 		bool found = false;
1071 
1072 		/* DSACK; always considered greater to prevent dropping */
1073 		if (before(start_a, ack_seq_a))
1074 			return -1;
1075 
1076 		bytes_a += end_a - start_a;
1077 
1078 		while (oplen_tmp >= sizeof(*sack_tmp)) {
1079 			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
1080 			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1081 
1082 			/* first time through we count the total size */
1083 			if (first)
1084 				bytes_b += end_b - start_b;
1085 
1086 			if (!after(start_b, start_a) && !before(end_b, end_a)) {
1087 				found = true;
1088 				if (!first)
1089 					break;
1090 			}
1091 			oplen_tmp -= sizeof(*sack_tmp);
1092 			sack_tmp++;
1093 		}
1094 
1095 		if (!found)
1096 			return -1;
1097 
1098 		oplen_a -= sizeof(*sack_a);
1099 		sack_a++;
1100 		first = false;
1101 	}
1102 
1103 	/* If we made it this far, all ranges SACKed by A are covered by B, so
1104 	 * either the SACKs are equal, or B SACKs more bytes.
1105 	 */
1106 	return bytes_b > bytes_a ? 1 : 0;
1107 }
1108 
cake_tcph_get_tstamp(const struct tcphdr * tcph,u32 * tsval,u32 * tsecr)1109 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1110 				 u32 *tsval, u32 *tsecr)
1111 {
1112 	const u8 *ptr;
1113 	int opsize;
1114 
1115 	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1116 
1117 	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1118 		*tsval = get_unaligned_be32(ptr);
1119 		*tsecr = get_unaligned_be32(ptr + 4);
1120 	}
1121 }
1122 
cake_tcph_may_drop(const struct tcphdr * tcph,u32 tstamp_new,u32 tsecr_new)1123 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1124 			       u32 tstamp_new, u32 tsecr_new)
1125 {
1126 	/* inspired by tcp_parse_options in tcp_input.c */
1127 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1128 	const u8 *ptr = (const u8 *)(tcph + 1);
1129 	u32 tstamp, tsecr;
1130 
1131 	/* 3 reserved flags must be unset to avoid future breakage
1132 	 * ACK must be set
1133 	 * ECE/CWR are handled separately
1134 	 * All other flags URG/PSH/RST/SYN/FIN must be unset
1135 	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1136 	 * 0x00C00000 = CWR/ECE (handled separately)
1137 	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1138 	 */
1139 	if (((tcp_flag_word(tcph) &
1140 	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1141 		return false;
1142 
1143 	while (length > 0) {
1144 		int opcode = *ptr++;
1145 		int opsize;
1146 
1147 		if (opcode == TCPOPT_EOL)
1148 			break;
1149 		if (opcode == TCPOPT_NOP) {
1150 			length--;
1151 			continue;
1152 		}
1153 		if (length < 2)
1154 			break;
1155 		opsize = *ptr++;
1156 		if (opsize < 2 || opsize > length)
1157 			break;
1158 
1159 		switch (opcode) {
1160 		case TCPOPT_MD5SIG: /* doesn't influence state */
1161 			break;
1162 
1163 		case TCPOPT_SACK: /* stricter checking performed later */
1164 			if (opsize % 8 != 2)
1165 				return false;
1166 			break;
1167 
1168 		case TCPOPT_TIMESTAMP:
1169 			/* only drop timestamps lower than new */
1170 			if (opsize != TCPOLEN_TIMESTAMP)
1171 				return false;
1172 			tstamp = get_unaligned_be32(ptr);
1173 			tsecr = get_unaligned_be32(ptr + 4);
1174 			if (after(tstamp, tstamp_new) ||
1175 			    after(tsecr, tsecr_new))
1176 				return false;
1177 			break;
1178 
1179 		case TCPOPT_MSS:  /* these should only be set on SYN */
1180 		case TCPOPT_WINDOW:
1181 		case TCPOPT_SACK_PERM:
1182 		case TCPOPT_FASTOPEN:
1183 		case TCPOPT_EXP:
1184 		default: /* don't drop if any unknown options are present */
1185 			return false;
1186 		}
1187 
1188 		ptr += opsize - 2;
1189 		length -= opsize;
1190 	}
1191 
1192 	return true;
1193 }
1194 
cake_ack_filter(struct cake_sched_data * q,struct cake_flow * flow)1195 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1196 				       struct cake_flow *flow)
1197 {
1198 	bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1199 	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1200 	struct sk_buff *skb_check, *skb_prev = NULL;
1201 	const struct ipv6hdr *ipv6h, *ipv6h_check;
1202 	unsigned char _tcph[64], _tcph_check[64];
1203 	const struct tcphdr *tcph, *tcph_check;
1204 	const struct iphdr *iph, *iph_check;
1205 	struct ipv6hdr _iph, _iph_check;
1206 	const struct sk_buff *skb;
1207 	int seglen, num_found = 0;
1208 	u32 tstamp = 0, tsecr = 0;
1209 	__be32 elig_flags = 0;
1210 	int sack_comp;
1211 
1212 	/* no other possible ACKs to filter */
1213 	if (flow->head == flow->tail)
1214 		return NULL;
1215 
1216 	skb = flow->tail;
1217 	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1218 	iph = cake_get_iphdr(skb, &_iph);
1219 	if (!tcph)
1220 		return NULL;
1221 
1222 	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1223 
1224 	/* the 'triggering' packet need only have the ACK flag set.
1225 	 * also check that SYN is not set, as there won't be any previous ACKs.
1226 	 */
1227 	if ((tcp_flag_word(tcph) &
1228 	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1229 		return NULL;
1230 
1231 	/* the 'triggering' ACK is at the tail of the queue, we have already
1232 	 * returned if it is the only packet in the flow. loop through the rest
1233 	 * of the queue looking for pure ACKs with the same 5-tuple as the
1234 	 * triggering one.
1235 	 */
1236 	for (skb_check = flow->head;
1237 	     skb_check && skb_check != skb;
1238 	     skb_prev = skb_check, skb_check = skb_check->next) {
1239 		iph_check = cake_get_iphdr(skb_check, &_iph_check);
1240 		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1241 					     sizeof(_tcph_check));
1242 
1243 		/* only TCP packets with matching 5-tuple are eligible, and only
1244 		 * drop safe headers
1245 		 */
1246 		if (!tcph_check || iph->version != iph_check->version ||
1247 		    tcph_check->source != tcph->source ||
1248 		    tcph_check->dest != tcph->dest)
1249 			continue;
1250 
1251 		if (iph_check->version == 4) {
1252 			if (iph_check->saddr != iph->saddr ||
1253 			    iph_check->daddr != iph->daddr)
1254 				continue;
1255 
1256 			seglen = iph_totlen(skb, iph_check) -
1257 				       (4 * iph_check->ihl);
1258 		} else if (iph_check->version == 6) {
1259 			ipv6h = (struct ipv6hdr *)iph;
1260 			ipv6h_check = (struct ipv6hdr *)iph_check;
1261 
1262 			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1263 			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1264 				continue;
1265 
1266 			seglen = ntohs(ipv6h_check->payload_len);
1267 		} else {
1268 			WARN_ON(1);  /* shouldn't happen */
1269 			continue;
1270 		}
1271 
1272 		/* If the ECE/CWR flags changed from the previous eligible
1273 		 * packet in the same flow, we should no longer be dropping that
1274 		 * previous packet as this would lose information.
1275 		 */
1276 		if (elig_ack && (tcp_flag_word(tcph_check) &
1277 				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1278 			elig_ack = NULL;
1279 			elig_ack_prev = NULL;
1280 			num_found--;
1281 		}
1282 
1283 		/* Check TCP options and flags, don't drop ACKs with segment
1284 		 * data, and don't drop ACKs with a higher cumulative ACK
1285 		 * counter than the triggering packet. Check ACK seqno here to
1286 		 * avoid parsing SACK options of packets we are going to exclude
1287 		 * anyway.
1288 		 */
1289 		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1290 		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1291 		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1292 			continue;
1293 
1294 		/* Check SACK options. The triggering packet must SACK more data
1295 		 * than the ACK under consideration, or SACK the same range but
1296 		 * have a larger cumulative ACK counter. The latter is a
1297 		 * pathological case, but is contained in the following check
1298 		 * anyway, just to be safe.
1299 		 */
1300 		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1301 
1302 		if (sack_comp < 0 ||
1303 		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1304 		     sack_comp == 0))
1305 			continue;
1306 
1307 		/* At this point we have found an eligible pure ACK to drop; if
1308 		 * we are in aggressive mode, we are done. Otherwise, keep
1309 		 * searching unless this is the second eligible ACK we
1310 		 * found.
1311 		 *
1312 		 * Since we want to drop ACK closest to the head of the queue,
1313 		 * save the first eligible ACK we find, even if we need to loop
1314 		 * again.
1315 		 */
1316 		if (!elig_ack) {
1317 			elig_ack = skb_check;
1318 			elig_ack_prev = skb_prev;
1319 			elig_flags = (tcp_flag_word(tcph_check)
1320 				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1321 		}
1322 
1323 		if (num_found++ > 0)
1324 			goto found;
1325 	}
1326 
1327 	/* We made it through the queue without finding two eligible ACKs . If
1328 	 * we found a single eligible ACK we can drop it in aggressive mode if
1329 	 * we can guarantee that this does not interfere with ECN flag
1330 	 * information. We ensure this by dropping it only if the enqueued
1331 	 * packet is consecutive with the eligible ACK, and their flags match.
1332 	 */
1333 	if (elig_ack && aggressive && elig_ack->next == skb &&
1334 	    (elig_flags == (tcp_flag_word(tcph) &
1335 			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1336 		goto found;
1337 
1338 	return NULL;
1339 
1340 found:
1341 	if (elig_ack_prev)
1342 		elig_ack_prev->next = elig_ack->next;
1343 	else
1344 		flow->head = elig_ack->next;
1345 
1346 	skb_mark_not_on_list(elig_ack);
1347 
1348 	return elig_ack;
1349 }
1350 
cake_ewma(u64 avg,u64 sample,u32 shift)1351 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1352 {
1353 	avg -= avg >> shift;
1354 	avg += sample >> shift;
1355 	return avg;
1356 }
1357 
cake_calc_overhead(struct cake_sched_data * q,u32 len,u32 off)1358 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1359 {
1360 	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1361 		len -= off;
1362 
1363 	if (q->max_netlen < len)
1364 		q->max_netlen = len;
1365 	if (q->min_netlen > len)
1366 		q->min_netlen = len;
1367 
1368 	len += q->rate_overhead;
1369 
1370 	if (len < q->rate_mpu)
1371 		len = q->rate_mpu;
1372 
1373 	if (q->atm_mode == CAKE_ATM_ATM) {
1374 		len += 47;
1375 		len /= 48;
1376 		len *= 53;
1377 	} else if (q->atm_mode == CAKE_ATM_PTM) {
1378 		/* Add one byte per 64 bytes or part thereof.
1379 		 * This is conservative and easier to calculate than the
1380 		 * precise value.
1381 		 */
1382 		len += (len + 63) / 64;
1383 	}
1384 
1385 	if (q->max_adjlen < len)
1386 		q->max_adjlen = len;
1387 	if (q->min_adjlen > len)
1388 		q->min_adjlen = len;
1389 
1390 	return len;
1391 }
1392 
cake_overhead(struct cake_sched_data * q,const struct sk_buff * skb)1393 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1394 {
1395 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
1396 	unsigned int hdr_len, last_len = 0;
1397 	u32 off = skb_network_offset(skb);
1398 	u32 len = qdisc_pkt_len(skb);
1399 	u16 segs = 1;
1400 
1401 	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1402 
1403 	if (!shinfo->gso_size)
1404 		return cake_calc_overhead(q, len, off);
1405 
1406 	/* borrowed from qdisc_pkt_len_init() */
1407 	hdr_len = skb_transport_offset(skb);
1408 
1409 	/* + transport layer */
1410 	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1411 						SKB_GSO_TCPV6))) {
1412 		const struct tcphdr *th;
1413 		struct tcphdr _tcphdr;
1414 
1415 		th = skb_header_pointer(skb, hdr_len,
1416 					sizeof(_tcphdr), &_tcphdr);
1417 		if (likely(th))
1418 			hdr_len += __tcp_hdrlen(th);
1419 	} else {
1420 		struct udphdr _udphdr;
1421 
1422 		if (skb_header_pointer(skb, hdr_len,
1423 				       sizeof(_udphdr), &_udphdr))
1424 			hdr_len += sizeof(struct udphdr);
1425 	}
1426 
1427 	if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1428 		segs = DIV_ROUND_UP(skb->len - hdr_len,
1429 				    shinfo->gso_size);
1430 	else
1431 		segs = shinfo->gso_segs;
1432 
1433 	len = shinfo->gso_size + hdr_len;
1434 	last_len = skb->len - shinfo->gso_size * (segs - 1);
1435 
1436 	return (cake_calc_overhead(q, len, off) * (segs - 1) +
1437 		cake_calc_overhead(q, last_len, off));
1438 }
1439 
cake_heap_swap(struct cake_sched_data * q,u16 i,u16 j)1440 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1441 {
1442 	struct cake_heap_entry ii = q->overflow_heap[i];
1443 	struct cake_heap_entry jj = q->overflow_heap[j];
1444 
1445 	q->overflow_heap[i] = jj;
1446 	q->overflow_heap[j] = ii;
1447 
1448 	q->tins[ii.t].overflow_idx[ii.b] = j;
1449 	q->tins[jj.t].overflow_idx[jj.b] = i;
1450 }
1451 
cake_heap_get_backlog(const struct cake_sched_data * q,u16 i)1452 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1453 {
1454 	struct cake_heap_entry ii = q->overflow_heap[i];
1455 
1456 	return q->tins[ii.t].backlogs[ii.b];
1457 }
1458 
cake_heapify(struct cake_sched_data * q,u16 i)1459 static void cake_heapify(struct cake_sched_data *q, u16 i)
1460 {
1461 	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1462 	u32 mb = cake_heap_get_backlog(q, i);
1463 	u32 m = i;
1464 
1465 	while (m < a) {
1466 		u32 l = m + m + 1;
1467 		u32 r = l + 1;
1468 
1469 		if (l < a) {
1470 			u32 lb = cake_heap_get_backlog(q, l);
1471 
1472 			if (lb > mb) {
1473 				m  = l;
1474 				mb = lb;
1475 			}
1476 		}
1477 
1478 		if (r < a) {
1479 			u32 rb = cake_heap_get_backlog(q, r);
1480 
1481 			if (rb > mb) {
1482 				m  = r;
1483 				mb = rb;
1484 			}
1485 		}
1486 
1487 		if (m != i) {
1488 			cake_heap_swap(q, i, m);
1489 			i = m;
1490 		} else {
1491 			break;
1492 		}
1493 	}
1494 }
1495 
cake_heapify_up(struct cake_sched_data * q,u16 i)1496 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1497 {
1498 	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1499 		u16 p = (i - 1) >> 1;
1500 		u32 ib = cake_heap_get_backlog(q, i);
1501 		u32 pb = cake_heap_get_backlog(q, p);
1502 
1503 		if (ib > pb) {
1504 			cake_heap_swap(q, i, p);
1505 			i = p;
1506 		} else {
1507 			break;
1508 		}
1509 	}
1510 }
1511 
cake_advance_shaper(struct cake_sched_data * q,struct cake_tin_data * b,struct sk_buff * skb,ktime_t now,bool drop)1512 static int cake_advance_shaper(struct cake_sched_data *q,
1513 			       struct cake_tin_data *b,
1514 			       struct sk_buff *skb,
1515 			       ktime_t now, bool drop)
1516 {
1517 	u32 len = get_cobalt_cb(skb)->adjusted_len;
1518 
1519 	/* charge packet bandwidth to this tin
1520 	 * and to the global shaper.
1521 	 */
1522 	if (q->rate_ns) {
1523 		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1524 		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1525 		u64 failsafe_dur = global_dur + (global_dur >> 1);
1526 
1527 		if (ktime_before(b->time_next_packet, now))
1528 			b->time_next_packet = ktime_add_ns(b->time_next_packet,
1529 							   tin_dur);
1530 
1531 		else if (ktime_before(b->time_next_packet,
1532 				      ktime_add_ns(now, tin_dur)))
1533 			b->time_next_packet = ktime_add_ns(now, tin_dur);
1534 
1535 		q->time_next_packet = ktime_add_ns(q->time_next_packet,
1536 						   global_dur);
1537 		if (!drop)
1538 			q->failsafe_next_packet = \
1539 				ktime_add_ns(q->failsafe_next_packet,
1540 					     failsafe_dur);
1541 	}
1542 	return len;
1543 }
1544 
cake_drop(struct Qdisc * sch,struct sk_buff ** to_free)1545 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1546 {
1547 	struct cake_sched_data *q = qdisc_priv(sch);
1548 	ktime_t now = ktime_get();
1549 	u32 idx = 0, tin = 0, len;
1550 	struct cake_heap_entry qq;
1551 	struct cake_tin_data *b;
1552 	struct cake_flow *flow;
1553 	struct sk_buff *skb;
1554 
1555 	if (!q->overflow_timeout) {
1556 		int i;
1557 		/* Build fresh max-heap */
1558 		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2 - 1; i >= 0; i--)
1559 			cake_heapify(q, i);
1560 	}
1561 	q->overflow_timeout = 65535;
1562 
1563 	/* select longest queue for pruning */
1564 	qq  = q->overflow_heap[0];
1565 	tin = qq.t;
1566 	idx = qq.b;
1567 
1568 	b = &q->tins[tin];
1569 	flow = &b->flows[idx];
1570 	skb = dequeue_head(flow);
1571 	if (unlikely(!skb)) {
1572 		/* heap has gone wrong, rebuild it next time */
1573 		q->overflow_timeout = 0;
1574 		return idx + (tin << 16);
1575 	}
1576 
1577 	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1578 		b->unresponsive_flow_count++;
1579 
1580 	len = qdisc_pkt_len(skb);
1581 	q->buffer_used      -= skb->truesize;
1582 	b->backlogs[idx]    -= len;
1583 	b->tin_backlog      -= len;
1584 	sch->qstats.backlog -= len;
1585 
1586 	flow->dropped++;
1587 	b->tin_dropped++;
1588 	sch->qstats.drops++;
1589 
1590 	if (q->rate_flags & CAKE_FLAG_INGRESS)
1591 		cake_advance_shaper(q, b, skb, now, true);
1592 
1593 	__qdisc_drop(skb, to_free);
1594 	sch->q.qlen--;
1595 	qdisc_tree_reduce_backlog(sch, 1, len);
1596 
1597 	cake_heapify(q, 0);
1598 
1599 	return idx + (tin << 16);
1600 }
1601 
cake_handle_diffserv(struct sk_buff * skb,bool wash)1602 static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1603 {
1604 	const int offset = skb_network_offset(skb);
1605 	u16 *buf, buf_;
1606 	u8 dscp;
1607 
1608 	switch (skb_protocol(skb, true)) {
1609 	case htons(ETH_P_IP):
1610 		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1611 		if (unlikely(!buf))
1612 			return 0;
1613 
1614 		/* ToS is in the second byte of iphdr */
1615 		dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1616 
1617 		if (wash && dscp) {
1618 			const int wlen = offset + sizeof(struct iphdr);
1619 
1620 			if (!pskb_may_pull(skb, wlen) ||
1621 			    skb_try_make_writable(skb, wlen))
1622 				return 0;
1623 
1624 			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1625 		}
1626 
1627 		return dscp;
1628 
1629 	case htons(ETH_P_IPV6):
1630 		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1631 		if (unlikely(!buf))
1632 			return 0;
1633 
1634 		/* Traffic class is in the first and second bytes of ipv6hdr */
1635 		dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1636 
1637 		if (wash && dscp) {
1638 			const int wlen = offset + sizeof(struct ipv6hdr);
1639 
1640 			if (!pskb_may_pull(skb, wlen) ||
1641 			    skb_try_make_writable(skb, wlen))
1642 				return 0;
1643 
1644 			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1645 		}
1646 
1647 		return dscp;
1648 
1649 	case htons(ETH_P_ARP):
1650 		return 0x38;  /* CS7 - Net Control */
1651 
1652 	default:
1653 		/* If there is no Diffserv field, treat as best-effort */
1654 		return 0;
1655 	}
1656 }
1657 
cake_select_tin(struct Qdisc * sch,struct sk_buff * skb)1658 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1659 					     struct sk_buff *skb)
1660 {
1661 	struct cake_sched_data *q = qdisc_priv(sch);
1662 	u32 tin, mark;
1663 	bool wash;
1664 	u8 dscp;
1665 
1666 	/* Tin selection: Default to diffserv-based selection, allow overriding
1667 	 * using firewall marks or skb->priority. Call DSCP parsing early if
1668 	 * wash is enabled, otherwise defer to below to skip unneeded parsing.
1669 	 */
1670 	mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1671 	wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1672 	if (wash)
1673 		dscp = cake_handle_diffserv(skb, wash);
1674 
1675 	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1676 		tin = 0;
1677 
1678 	else if (mark && mark <= q->tin_cnt)
1679 		tin = q->tin_order[mark - 1];
1680 
1681 	else if (TC_H_MAJ(skb->priority) == sch->handle &&
1682 		 TC_H_MIN(skb->priority) > 0 &&
1683 		 TC_H_MIN(skb->priority) <= q->tin_cnt)
1684 		tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1685 
1686 	else {
1687 		if (!wash)
1688 			dscp = cake_handle_diffserv(skb, wash);
1689 		tin = q->tin_index[dscp];
1690 
1691 		if (unlikely(tin >= q->tin_cnt))
1692 			tin = 0;
1693 	}
1694 
1695 	return &q->tins[tin];
1696 }
1697 
cake_classify(struct Qdisc * sch,struct cake_tin_data ** t,struct sk_buff * skb,int flow_mode,int * qerr)1698 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1699 			 struct sk_buff *skb, int flow_mode, int *qerr)
1700 {
1701 	struct cake_sched_data *q = qdisc_priv(sch);
1702 	struct tcf_proto *filter;
1703 	struct tcf_result res;
1704 	u16 flow = 0, host = 0;
1705 	int result;
1706 
1707 	filter = rcu_dereference_bh(q->filter_list);
1708 	if (!filter)
1709 		goto hash;
1710 
1711 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1712 	result = tcf_classify(skb, NULL, filter, &res, false);
1713 
1714 	if (result >= 0) {
1715 #ifdef CONFIG_NET_CLS_ACT
1716 		switch (result) {
1717 		case TC_ACT_STOLEN:
1718 		case TC_ACT_QUEUED:
1719 		case TC_ACT_TRAP:
1720 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1721 			fallthrough;
1722 		case TC_ACT_SHOT:
1723 			return 0;
1724 		}
1725 #endif
1726 		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1727 			flow = TC_H_MIN(res.classid);
1728 		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1729 			host = TC_H_MAJ(res.classid) >> 16;
1730 	}
1731 hash:
1732 	*t = cake_select_tin(sch, skb);
1733 	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1734 }
1735 
1736 static void cake_reconfigure(struct Qdisc *sch);
1737 
cake_enqueue(struct sk_buff * skb,struct Qdisc * sch,struct sk_buff ** to_free)1738 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1739 			struct sk_buff **to_free)
1740 {
1741 	struct cake_sched_data *q = qdisc_priv(sch);
1742 	int len = qdisc_pkt_len(skb);
1743 	int ret;
1744 	struct sk_buff *ack = NULL;
1745 	ktime_t now = ktime_get();
1746 	struct cake_tin_data *b;
1747 	struct cake_flow *flow;
1748 	u32 idx, tin;
1749 
1750 	/* choose flow to insert into */
1751 	idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1752 	if (idx == 0) {
1753 		if (ret & __NET_XMIT_BYPASS)
1754 			qdisc_qstats_drop(sch);
1755 		__qdisc_drop(skb, to_free);
1756 		return ret;
1757 	}
1758 	tin = (u32)(b - q->tins);
1759 	idx--;
1760 	flow = &b->flows[idx];
1761 
1762 	/* ensure shaper state isn't stale */
1763 	if (!b->tin_backlog) {
1764 		if (ktime_before(b->time_next_packet, now))
1765 			b->time_next_packet = now;
1766 
1767 		if (!sch->q.qlen) {
1768 			if (ktime_before(q->time_next_packet, now)) {
1769 				q->failsafe_next_packet = now;
1770 				q->time_next_packet = now;
1771 			} else if (ktime_after(q->time_next_packet, now) &&
1772 				   ktime_after(q->failsafe_next_packet, now)) {
1773 				u64 next = \
1774 					min(ktime_to_ns(q->time_next_packet),
1775 					    ktime_to_ns(
1776 						   q->failsafe_next_packet));
1777 				sch->qstats.overlimits++;
1778 				qdisc_watchdog_schedule_ns(&q->watchdog, next);
1779 			}
1780 		}
1781 	}
1782 
1783 	if (unlikely(len > b->max_skblen))
1784 		b->max_skblen = len;
1785 
1786 	if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1787 		struct sk_buff *segs, *nskb;
1788 		netdev_features_t features = netif_skb_features(skb);
1789 		unsigned int slen = 0, numsegs = 0;
1790 
1791 		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1792 		if (IS_ERR_OR_NULL(segs))
1793 			return qdisc_drop(skb, sch, to_free);
1794 
1795 		skb_list_walk_safe(segs, segs, nskb) {
1796 			skb_mark_not_on_list(segs);
1797 			qdisc_skb_cb(segs)->pkt_len = segs->len;
1798 			cobalt_set_enqueue_time(segs, now);
1799 			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1800 									  segs);
1801 			flow_queue_add(flow, segs);
1802 
1803 			sch->q.qlen++;
1804 			numsegs++;
1805 			slen += segs->len;
1806 			q->buffer_used += segs->truesize;
1807 			b->packets++;
1808 		}
1809 
1810 		/* stats */
1811 		b->bytes	    += slen;
1812 		b->backlogs[idx]    += slen;
1813 		b->tin_backlog      += slen;
1814 		sch->qstats.backlog += slen;
1815 		q->avg_window_bytes += slen;
1816 
1817 		qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1818 		consume_skb(skb);
1819 	} else {
1820 		/* not splitting */
1821 		cobalt_set_enqueue_time(skb, now);
1822 		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1823 		flow_queue_add(flow, skb);
1824 
1825 		if (q->ack_filter)
1826 			ack = cake_ack_filter(q, flow);
1827 
1828 		if (ack) {
1829 			b->ack_drops++;
1830 			sch->qstats.drops++;
1831 			b->bytes += qdisc_pkt_len(ack);
1832 			len -= qdisc_pkt_len(ack);
1833 			q->buffer_used += skb->truesize - ack->truesize;
1834 			if (q->rate_flags & CAKE_FLAG_INGRESS)
1835 				cake_advance_shaper(q, b, ack, now, true);
1836 
1837 			qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1838 			consume_skb(ack);
1839 		} else {
1840 			sch->q.qlen++;
1841 			q->buffer_used      += skb->truesize;
1842 		}
1843 
1844 		/* stats */
1845 		b->packets++;
1846 		b->bytes	    += len;
1847 		b->backlogs[idx]    += len;
1848 		b->tin_backlog      += len;
1849 		sch->qstats.backlog += len;
1850 		q->avg_window_bytes += len;
1851 	}
1852 
1853 	if (q->overflow_timeout)
1854 		cake_heapify_up(q, b->overflow_idx[idx]);
1855 
1856 	/* incoming bandwidth capacity estimate */
1857 	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1858 		u64 packet_interval = \
1859 			ktime_to_ns(ktime_sub(now, q->last_packet_time));
1860 
1861 		if (packet_interval > NSEC_PER_SEC)
1862 			packet_interval = NSEC_PER_SEC;
1863 
1864 		/* filter out short-term bursts, eg. wifi aggregation */
1865 		q->avg_packet_interval = \
1866 			cake_ewma(q->avg_packet_interval,
1867 				  packet_interval,
1868 				  (packet_interval > q->avg_packet_interval ?
1869 					  2 : 8));
1870 
1871 		q->last_packet_time = now;
1872 
1873 		if (packet_interval > q->avg_packet_interval) {
1874 			u64 window_interval = \
1875 				ktime_to_ns(ktime_sub(now,
1876 						      q->avg_window_begin));
1877 			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1878 
1879 			b = div64_u64(b, window_interval);
1880 			q->avg_peak_bandwidth =
1881 				cake_ewma(q->avg_peak_bandwidth, b,
1882 					  b > q->avg_peak_bandwidth ? 2 : 8);
1883 			q->avg_window_bytes = 0;
1884 			q->avg_window_begin = now;
1885 
1886 			if (ktime_after(now,
1887 					ktime_add_ms(q->last_reconfig_time,
1888 						     250))) {
1889 				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1890 				cake_reconfigure(sch);
1891 			}
1892 		}
1893 	} else {
1894 		q->avg_window_bytes = 0;
1895 		q->last_packet_time = now;
1896 	}
1897 
1898 	/* flowchain */
1899 	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1900 		if (!flow->set) {
1901 			list_add_tail(&flow->flowchain, &b->new_flows);
1902 		} else {
1903 			b->decaying_flow_count--;
1904 			list_move_tail(&flow->flowchain, &b->new_flows);
1905 		}
1906 		flow->set = CAKE_SET_SPARSE;
1907 		b->sparse_flow_count++;
1908 
1909 		flow->deficit = cake_get_flow_quantum(b, flow, q->flow_mode);
1910 	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1911 		/* this flow was empty, accounted as a sparse flow, but actually
1912 		 * in the bulk rotation.
1913 		 */
1914 		flow->set = CAKE_SET_BULK;
1915 		b->sparse_flow_count--;
1916 		b->bulk_flow_count++;
1917 
1918 		cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode);
1919 		cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode);
1920 	}
1921 
1922 	if (q->buffer_used > q->buffer_max_used)
1923 		q->buffer_max_used = q->buffer_used;
1924 
1925 	if (q->buffer_used > q->buffer_limit) {
1926 		bool same_flow = false;
1927 		u32 dropped = 0;
1928 		u32 drop_id;
1929 
1930 		while (q->buffer_used > q->buffer_limit) {
1931 			dropped++;
1932 			drop_id = cake_drop(sch, to_free);
1933 
1934 			if ((drop_id >> 16) == tin &&
1935 			    (drop_id & 0xFFFF) == idx)
1936 				same_flow = true;
1937 		}
1938 		b->drop_overlimit += dropped;
1939 
1940 		if (same_flow)
1941 			return NET_XMIT_CN;
1942 	}
1943 	return NET_XMIT_SUCCESS;
1944 }
1945 
cake_dequeue_one(struct Qdisc * sch)1946 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1947 {
1948 	struct cake_sched_data *q = qdisc_priv(sch);
1949 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1950 	struct cake_flow *flow = &b->flows[q->cur_flow];
1951 	struct sk_buff *skb = NULL;
1952 	u32 len;
1953 
1954 	if (flow->head) {
1955 		skb = dequeue_head(flow);
1956 		len = qdisc_pkt_len(skb);
1957 		b->backlogs[q->cur_flow] -= len;
1958 		b->tin_backlog		 -= len;
1959 		sch->qstats.backlog      -= len;
1960 		q->buffer_used		 -= skb->truesize;
1961 		sch->q.qlen--;
1962 
1963 		if (q->overflow_timeout)
1964 			cake_heapify(q, b->overflow_idx[q->cur_flow]);
1965 	}
1966 	return skb;
1967 }
1968 
1969 /* Discard leftover packets from a tin no longer in use. */
cake_clear_tin(struct Qdisc * sch,u16 tin)1970 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1971 {
1972 	struct cake_sched_data *q = qdisc_priv(sch);
1973 	struct sk_buff *skb;
1974 
1975 	q->cur_tin = tin;
1976 	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1977 		while (!!(skb = cake_dequeue_one(sch)))
1978 			kfree_skb(skb);
1979 }
1980 
cake_dequeue(struct Qdisc * sch)1981 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1982 {
1983 	struct cake_sched_data *q = qdisc_priv(sch);
1984 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1985 	ktime_t now = ktime_get();
1986 	struct cake_flow *flow;
1987 	struct list_head *head;
1988 	bool first_flow = true;
1989 	struct sk_buff *skb;
1990 	u64 delay;
1991 	u32 len;
1992 
1993 begin:
1994 	if (!sch->q.qlen)
1995 		return NULL;
1996 
1997 	/* global hard shaper */
1998 	if (ktime_after(q->time_next_packet, now) &&
1999 	    ktime_after(q->failsafe_next_packet, now)) {
2000 		u64 next = min(ktime_to_ns(q->time_next_packet),
2001 			       ktime_to_ns(q->failsafe_next_packet));
2002 
2003 		sch->qstats.overlimits++;
2004 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2005 		return NULL;
2006 	}
2007 
2008 	/* Choose a class to work on. */
2009 	if (!q->rate_ns) {
2010 		/* In unlimited mode, can't rely on shaper timings, just balance
2011 		 * with DRR
2012 		 */
2013 		bool wrapped = false, empty = true;
2014 
2015 		while (b->tin_deficit < 0 ||
2016 		       !(b->sparse_flow_count + b->bulk_flow_count)) {
2017 			if (b->tin_deficit <= 0)
2018 				b->tin_deficit += b->tin_quantum;
2019 			if (b->sparse_flow_count + b->bulk_flow_count)
2020 				empty = false;
2021 
2022 			q->cur_tin++;
2023 			b++;
2024 			if (q->cur_tin >= q->tin_cnt) {
2025 				q->cur_tin = 0;
2026 				b = q->tins;
2027 
2028 				if (wrapped) {
2029 					/* It's possible for q->qlen to be
2030 					 * nonzero when we actually have no
2031 					 * packets anywhere.
2032 					 */
2033 					if (empty)
2034 						return NULL;
2035 				} else {
2036 					wrapped = true;
2037 				}
2038 			}
2039 		}
2040 	} else {
2041 		/* In shaped mode, choose:
2042 		 * - Highest-priority tin with queue and meeting schedule, or
2043 		 * - The earliest-scheduled tin with queue.
2044 		 */
2045 		ktime_t best_time = KTIME_MAX;
2046 		int tin, best_tin = 0;
2047 
2048 		for (tin = 0; tin < q->tin_cnt; tin++) {
2049 			b = q->tins + tin;
2050 			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
2051 				ktime_t time_to_pkt = \
2052 					ktime_sub(b->time_next_packet, now);
2053 
2054 				if (ktime_to_ns(time_to_pkt) <= 0 ||
2055 				    ktime_compare(time_to_pkt,
2056 						  best_time) <= 0) {
2057 					best_time = time_to_pkt;
2058 					best_tin = tin;
2059 				}
2060 			}
2061 		}
2062 
2063 		q->cur_tin = best_tin;
2064 		b = q->tins + best_tin;
2065 
2066 		/* No point in going further if no packets to deliver. */
2067 		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
2068 			return NULL;
2069 	}
2070 
2071 retry:
2072 	/* service this class */
2073 	head = &b->decaying_flows;
2074 	if (!first_flow || list_empty(head)) {
2075 		head = &b->new_flows;
2076 		if (list_empty(head)) {
2077 			head = &b->old_flows;
2078 			if (unlikely(list_empty(head))) {
2079 				head = &b->decaying_flows;
2080 				if (unlikely(list_empty(head)))
2081 					goto begin;
2082 			}
2083 		}
2084 	}
2085 	flow = list_first_entry(head, struct cake_flow, flowchain);
2086 	q->cur_flow = flow - b->flows;
2087 	first_flow = false;
2088 
2089 	/* flow isolation (DRR++) */
2090 	if (flow->deficit <= 0) {
2091 		/* Keep all flows with deficits out of the sparse and decaying
2092 		 * rotations.  No non-empty flow can go into the decaying
2093 		 * rotation, so they can't get deficits
2094 		 */
2095 		if (flow->set == CAKE_SET_SPARSE) {
2096 			if (flow->head) {
2097 				b->sparse_flow_count--;
2098 				b->bulk_flow_count++;
2099 
2100 				cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode);
2101 				cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode);
2102 
2103 				flow->set = CAKE_SET_BULK;
2104 			} else {
2105 				/* we've moved it to the bulk rotation for
2106 				 * correct deficit accounting but we still want
2107 				 * to count it as a sparse flow, not a bulk one.
2108 				 */
2109 				flow->set = CAKE_SET_SPARSE_WAIT;
2110 			}
2111 		}
2112 
2113 		flow->deficit += cake_get_flow_quantum(b, flow, q->flow_mode);
2114 		list_move_tail(&flow->flowchain, &b->old_flows);
2115 
2116 		goto retry;
2117 	}
2118 
2119 	/* Retrieve a packet via the AQM */
2120 	while (1) {
2121 		skb = cake_dequeue_one(sch);
2122 		if (!skb) {
2123 			/* this queue was actually empty */
2124 			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2125 				b->unresponsive_flow_count--;
2126 
2127 			if (flow->cvars.p_drop || flow->cvars.count ||
2128 			    ktime_before(now, flow->cvars.drop_next)) {
2129 				/* keep in the flowchain until the state has
2130 				 * decayed to rest
2131 				 */
2132 				list_move_tail(&flow->flowchain,
2133 					       &b->decaying_flows);
2134 				if (flow->set == CAKE_SET_BULK) {
2135 					b->bulk_flow_count--;
2136 
2137 					cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode);
2138 					cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode);
2139 
2140 					b->decaying_flow_count++;
2141 				} else if (flow->set == CAKE_SET_SPARSE ||
2142 					   flow->set == CAKE_SET_SPARSE_WAIT) {
2143 					b->sparse_flow_count--;
2144 					b->decaying_flow_count++;
2145 				}
2146 				flow->set = CAKE_SET_DECAYING;
2147 			} else {
2148 				/* remove empty queue from the flowchain */
2149 				list_del_init(&flow->flowchain);
2150 				if (flow->set == CAKE_SET_SPARSE ||
2151 				    flow->set == CAKE_SET_SPARSE_WAIT)
2152 					b->sparse_flow_count--;
2153 				else if (flow->set == CAKE_SET_BULK) {
2154 					b->bulk_flow_count--;
2155 
2156 					cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode);
2157 					cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode);
2158 				} else
2159 					b->decaying_flow_count--;
2160 
2161 				flow->set = CAKE_SET_NONE;
2162 			}
2163 			goto begin;
2164 		}
2165 
2166 		/* Last packet in queue may be marked, shouldn't be dropped */
2167 		if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2168 					(b->bulk_flow_count *
2169 					 !!(q->rate_flags &
2170 					    CAKE_FLAG_INGRESS))) ||
2171 		    !flow->head)
2172 			break;
2173 
2174 		/* drop this packet, get another one */
2175 		if (q->rate_flags & CAKE_FLAG_INGRESS) {
2176 			len = cake_advance_shaper(q, b, skb,
2177 						  now, true);
2178 			flow->deficit -= len;
2179 			b->tin_deficit -= len;
2180 		}
2181 		flow->dropped++;
2182 		b->tin_dropped++;
2183 		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2184 		qdisc_qstats_drop(sch);
2185 		kfree_skb(skb);
2186 		if (q->rate_flags & CAKE_FLAG_INGRESS)
2187 			goto retry;
2188 	}
2189 
2190 	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2191 	qdisc_bstats_update(sch, skb);
2192 
2193 	/* collect delay stats */
2194 	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2195 	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2196 	b->peak_delay = cake_ewma(b->peak_delay, delay,
2197 				  delay > b->peak_delay ? 2 : 8);
2198 	b->base_delay = cake_ewma(b->base_delay, delay,
2199 				  delay < b->base_delay ? 2 : 8);
2200 
2201 	len = cake_advance_shaper(q, b, skb, now, false);
2202 	flow->deficit -= len;
2203 	b->tin_deficit -= len;
2204 
2205 	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2206 		u64 next = min(ktime_to_ns(q->time_next_packet),
2207 			       ktime_to_ns(q->failsafe_next_packet));
2208 
2209 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2210 	} else if (!sch->q.qlen) {
2211 		int i;
2212 
2213 		for (i = 0; i < q->tin_cnt; i++) {
2214 			if (q->tins[i].decaying_flow_count) {
2215 				ktime_t next = \
2216 					ktime_add_ns(now,
2217 						     q->tins[i].cparams.target);
2218 
2219 				qdisc_watchdog_schedule_ns(&q->watchdog,
2220 							   ktime_to_ns(next));
2221 				break;
2222 			}
2223 		}
2224 	}
2225 
2226 	if (q->overflow_timeout)
2227 		q->overflow_timeout--;
2228 
2229 	return skb;
2230 }
2231 
cake_reset(struct Qdisc * sch)2232 static void cake_reset(struct Qdisc *sch)
2233 {
2234 	struct cake_sched_data *q = qdisc_priv(sch);
2235 	u32 c;
2236 
2237 	if (!q->tins)
2238 		return;
2239 
2240 	for (c = 0; c < CAKE_MAX_TINS; c++)
2241 		cake_clear_tin(sch, c);
2242 }
2243 
2244 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2245 	[TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2246 	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2247 	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
2248 	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2249 	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2250 	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
2251 	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
2252 	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2253 	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
2254 	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
2255 	[TCA_CAKE_RAW]		 = { .type = NLA_U32 },
2256 	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
2257 	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
2258 	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
2259 	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
2260 	[TCA_CAKE_SPLIT_GSO]	 = { .type = NLA_U32 },
2261 	[TCA_CAKE_FWMARK]	 = { .type = NLA_U32 },
2262 };
2263 
cake_set_rate(struct cake_tin_data * b,u64 rate,u32 mtu,u64 target_ns,u64 rtt_est_ns)2264 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2265 			  u64 target_ns, u64 rtt_est_ns)
2266 {
2267 	/* convert byte-rate into time-per-byte
2268 	 * so it will always unwedge in reasonable time.
2269 	 */
2270 	static const u64 MIN_RATE = 64;
2271 	u32 byte_target = mtu;
2272 	u64 byte_target_ns;
2273 	u8  rate_shft = 0;
2274 	u64 rate_ns = 0;
2275 
2276 	b->flow_quantum = 1514;
2277 	if (rate) {
2278 		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2279 		rate_shft = 34;
2280 		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2281 		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2282 		while (!!(rate_ns >> 34)) {
2283 			rate_ns >>= 1;
2284 			rate_shft--;
2285 		}
2286 	} /* else unlimited, ie. zero delay */
2287 
2288 	b->tin_rate_bps  = rate;
2289 	b->tin_rate_ns   = rate_ns;
2290 	b->tin_rate_shft = rate_shft;
2291 
2292 	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2293 
2294 	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2295 	b->cparams.interval = max(rtt_est_ns +
2296 				     b->cparams.target - target_ns,
2297 				     b->cparams.target * 2);
2298 	b->cparams.mtu_time = byte_target_ns;
2299 	b->cparams.p_inc = 1 << 24; /* 1/256 */
2300 	b->cparams.p_dec = 1 << 20; /* 1/4096 */
2301 }
2302 
cake_config_besteffort(struct Qdisc * sch)2303 static int cake_config_besteffort(struct Qdisc *sch)
2304 {
2305 	struct cake_sched_data *q = qdisc_priv(sch);
2306 	struct cake_tin_data *b = &q->tins[0];
2307 	u32 mtu = psched_mtu(qdisc_dev(sch));
2308 	u64 rate = q->rate_bps;
2309 
2310 	q->tin_cnt = 1;
2311 
2312 	q->tin_index = besteffort;
2313 	q->tin_order = normal_order;
2314 
2315 	cake_set_rate(b, rate, mtu,
2316 		      us_to_ns(q->target), us_to_ns(q->interval));
2317 	b->tin_quantum = 65535;
2318 
2319 	return 0;
2320 }
2321 
cake_config_precedence(struct Qdisc * sch)2322 static int cake_config_precedence(struct Qdisc *sch)
2323 {
2324 	/* convert high-level (user visible) parameters into internal format */
2325 	struct cake_sched_data *q = qdisc_priv(sch);
2326 	u32 mtu = psched_mtu(qdisc_dev(sch));
2327 	u64 rate = q->rate_bps;
2328 	u32 quantum = 256;
2329 	u32 i;
2330 
2331 	q->tin_cnt = 8;
2332 	q->tin_index = precedence;
2333 	q->tin_order = normal_order;
2334 
2335 	for (i = 0; i < q->tin_cnt; i++) {
2336 		struct cake_tin_data *b = &q->tins[i];
2337 
2338 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2339 			      us_to_ns(q->interval));
2340 
2341 		b->tin_quantum = max_t(u16, 1U, quantum);
2342 
2343 		/* calculate next class's parameters */
2344 		rate  *= 7;
2345 		rate >>= 3;
2346 
2347 		quantum  *= 7;
2348 		quantum >>= 3;
2349 	}
2350 
2351 	return 0;
2352 }
2353 
2354 /*	List of known Diffserv codepoints:
2355  *
2356  *	Default Forwarding (DF/CS0) - Best Effort
2357  *	Max Throughput (TOS2)
2358  *	Min Delay (TOS4)
2359  *	LLT "La" (TOS5)
2360  *	Assured Forwarding 1 (AF1x) - x3
2361  *	Assured Forwarding 2 (AF2x) - x3
2362  *	Assured Forwarding 3 (AF3x) - x3
2363  *	Assured Forwarding 4 (AF4x) - x3
2364  *	Precedence Class 1 (CS1)
2365  *	Precedence Class 2 (CS2)
2366  *	Precedence Class 3 (CS3)
2367  *	Precedence Class 4 (CS4)
2368  *	Precedence Class 5 (CS5)
2369  *	Precedence Class 6 (CS6)
2370  *	Precedence Class 7 (CS7)
2371  *	Voice Admit (VA)
2372  *	Expedited Forwarding (EF)
2373  *	Lower Effort (LE)
2374  *
2375  *	Total 26 codepoints.
2376  */
2377 
2378 /*	List of traffic classes in RFC 4594, updated by RFC 8622:
2379  *		(roughly descending order of contended priority)
2380  *		(roughly ascending order of uncontended throughput)
2381  *
2382  *	Network Control (CS6,CS7)      - routing traffic
2383  *	Telephony (EF,VA)         - aka. VoIP streams
2384  *	Signalling (CS5)               - VoIP setup
2385  *	Multimedia Conferencing (AF4x) - aka. video calls
2386  *	Realtime Interactive (CS4)     - eg. games
2387  *	Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2388  *	Broadcast Video (CS3)
2389  *	Low-Latency Data (AF2x,TOS4)      - eg. database
2390  *	Ops, Admin, Management (CS2)      - eg. ssh
2391  *	Standard Service (DF & unrecognised codepoints)
2392  *	High-Throughput Data (AF1x,TOS2)  - eg. web traffic
2393  *	Low-Priority Data (LE,CS1)        - eg. BitTorrent
2394  *
2395  *	Total 12 traffic classes.
2396  */
2397 
cake_config_diffserv8(struct Qdisc * sch)2398 static int cake_config_diffserv8(struct Qdisc *sch)
2399 {
2400 /*	Pruned list of traffic classes for typical applications:
2401  *
2402  *		Network Control          (CS6, CS7)
2403  *		Minimum Latency          (EF, VA, CS5, CS4)
2404  *		Interactive Shell        (CS2)
2405  *		Low Latency Transactions (AF2x, TOS4)
2406  *		Video Streaming          (AF4x, AF3x, CS3)
2407  *		Bog Standard             (DF etc.)
2408  *		High Throughput          (AF1x, TOS2, CS1)
2409  *		Background Traffic       (LE)
2410  *
2411  *		Total 8 traffic classes.
2412  */
2413 
2414 	struct cake_sched_data *q = qdisc_priv(sch);
2415 	u32 mtu = psched_mtu(qdisc_dev(sch));
2416 	u64 rate = q->rate_bps;
2417 	u32 quantum = 256;
2418 	u32 i;
2419 
2420 	q->tin_cnt = 8;
2421 
2422 	/* codepoint to class mapping */
2423 	q->tin_index = diffserv8;
2424 	q->tin_order = normal_order;
2425 
2426 	/* class characteristics */
2427 	for (i = 0; i < q->tin_cnt; i++) {
2428 		struct cake_tin_data *b = &q->tins[i];
2429 
2430 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2431 			      us_to_ns(q->interval));
2432 
2433 		b->tin_quantum = max_t(u16, 1U, quantum);
2434 
2435 		/* calculate next class's parameters */
2436 		rate  *= 7;
2437 		rate >>= 3;
2438 
2439 		quantum  *= 7;
2440 		quantum >>= 3;
2441 	}
2442 
2443 	return 0;
2444 }
2445 
cake_config_diffserv4(struct Qdisc * sch)2446 static int cake_config_diffserv4(struct Qdisc *sch)
2447 {
2448 /*  Further pruned list of traffic classes for four-class system:
2449  *
2450  *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2451  *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2)
2452  *	    Best Effort        (DF, AF1x, TOS2, and those not specified)
2453  *	    Background Traffic (LE, CS1)
2454  *
2455  *		Total 4 traffic classes.
2456  */
2457 
2458 	struct cake_sched_data *q = qdisc_priv(sch);
2459 	u32 mtu = psched_mtu(qdisc_dev(sch));
2460 	u64 rate = q->rate_bps;
2461 	u32 quantum = 1024;
2462 
2463 	q->tin_cnt = 4;
2464 
2465 	/* codepoint to class mapping */
2466 	q->tin_index = diffserv4;
2467 	q->tin_order = bulk_order;
2468 
2469 	/* class characteristics */
2470 	cake_set_rate(&q->tins[0], rate, mtu,
2471 		      us_to_ns(q->target), us_to_ns(q->interval));
2472 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2473 		      us_to_ns(q->target), us_to_ns(q->interval));
2474 	cake_set_rate(&q->tins[2], rate >> 1, mtu,
2475 		      us_to_ns(q->target), us_to_ns(q->interval));
2476 	cake_set_rate(&q->tins[3], rate >> 2, mtu,
2477 		      us_to_ns(q->target), us_to_ns(q->interval));
2478 
2479 	/* bandwidth-sharing weights */
2480 	q->tins[0].tin_quantum = quantum;
2481 	q->tins[1].tin_quantum = quantum >> 4;
2482 	q->tins[2].tin_quantum = quantum >> 1;
2483 	q->tins[3].tin_quantum = quantum >> 2;
2484 
2485 	return 0;
2486 }
2487 
cake_config_diffserv3(struct Qdisc * sch)2488 static int cake_config_diffserv3(struct Qdisc *sch)
2489 {
2490 /*  Simplified Diffserv structure with 3 tins.
2491  *		Latency Sensitive	(CS7, CS6, EF, VA, TOS4)
2492  *		Best Effort
2493  *		Low Priority		(LE, CS1)
2494  */
2495 	struct cake_sched_data *q = qdisc_priv(sch);
2496 	u32 mtu = psched_mtu(qdisc_dev(sch));
2497 	u64 rate = q->rate_bps;
2498 	u32 quantum = 1024;
2499 
2500 	q->tin_cnt = 3;
2501 
2502 	/* codepoint to class mapping */
2503 	q->tin_index = diffserv3;
2504 	q->tin_order = bulk_order;
2505 
2506 	/* class characteristics */
2507 	cake_set_rate(&q->tins[0], rate, mtu,
2508 		      us_to_ns(q->target), us_to_ns(q->interval));
2509 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2510 		      us_to_ns(q->target), us_to_ns(q->interval));
2511 	cake_set_rate(&q->tins[2], rate >> 2, mtu,
2512 		      us_to_ns(q->target), us_to_ns(q->interval));
2513 
2514 	/* bandwidth-sharing weights */
2515 	q->tins[0].tin_quantum = quantum;
2516 	q->tins[1].tin_quantum = quantum >> 4;
2517 	q->tins[2].tin_quantum = quantum >> 2;
2518 
2519 	return 0;
2520 }
2521 
cake_reconfigure(struct Qdisc * sch)2522 static void cake_reconfigure(struct Qdisc *sch)
2523 {
2524 	struct cake_sched_data *q = qdisc_priv(sch);
2525 	int c, ft;
2526 
2527 	switch (q->tin_mode) {
2528 	case CAKE_DIFFSERV_BESTEFFORT:
2529 		ft = cake_config_besteffort(sch);
2530 		break;
2531 
2532 	case CAKE_DIFFSERV_PRECEDENCE:
2533 		ft = cake_config_precedence(sch);
2534 		break;
2535 
2536 	case CAKE_DIFFSERV_DIFFSERV8:
2537 		ft = cake_config_diffserv8(sch);
2538 		break;
2539 
2540 	case CAKE_DIFFSERV_DIFFSERV4:
2541 		ft = cake_config_diffserv4(sch);
2542 		break;
2543 
2544 	case CAKE_DIFFSERV_DIFFSERV3:
2545 	default:
2546 		ft = cake_config_diffserv3(sch);
2547 		break;
2548 	}
2549 
2550 	for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2551 		cake_clear_tin(sch, c);
2552 		q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2553 	}
2554 
2555 	q->rate_ns   = q->tins[ft].tin_rate_ns;
2556 	q->rate_shft = q->tins[ft].tin_rate_shft;
2557 
2558 	if (q->buffer_config_limit) {
2559 		q->buffer_limit = q->buffer_config_limit;
2560 	} else if (q->rate_bps) {
2561 		u64 t = q->rate_bps * q->interval;
2562 
2563 		do_div(t, USEC_PER_SEC / 4);
2564 		q->buffer_limit = max_t(u32, t, 4U << 20);
2565 	} else {
2566 		q->buffer_limit = ~0;
2567 	}
2568 
2569 	sch->flags &= ~TCQ_F_CAN_BYPASS;
2570 
2571 	q->buffer_limit = min(q->buffer_limit,
2572 			      max(sch->limit * psched_mtu(qdisc_dev(sch)),
2573 				  q->buffer_config_limit));
2574 }
2575 
cake_change(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)2576 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2577 		       struct netlink_ext_ack *extack)
2578 {
2579 	struct cake_sched_data *q = qdisc_priv(sch);
2580 	struct nlattr *tb[TCA_CAKE_MAX + 1];
2581 	u16 rate_flags;
2582 	u8 flow_mode;
2583 	int err;
2584 
2585 	err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2586 					  extack);
2587 	if (err < 0)
2588 		return err;
2589 
2590 	flow_mode = q->flow_mode;
2591 	if (tb[TCA_CAKE_NAT]) {
2592 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2593 		flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2594 		flow_mode |= CAKE_FLOW_NAT_FLAG *
2595 			!!nla_get_u32(tb[TCA_CAKE_NAT]);
2596 #else
2597 		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2598 				    "No conntrack support in kernel");
2599 		return -EOPNOTSUPP;
2600 #endif
2601 	}
2602 
2603 	if (tb[TCA_CAKE_BASE_RATE64])
2604 		WRITE_ONCE(q->rate_bps,
2605 			   nla_get_u64(tb[TCA_CAKE_BASE_RATE64]));
2606 
2607 	if (tb[TCA_CAKE_DIFFSERV_MODE])
2608 		WRITE_ONCE(q->tin_mode,
2609 			   nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]));
2610 
2611 	rate_flags = q->rate_flags;
2612 	if (tb[TCA_CAKE_WASH]) {
2613 		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2614 			rate_flags |= CAKE_FLAG_WASH;
2615 		else
2616 			rate_flags &= ~CAKE_FLAG_WASH;
2617 	}
2618 
2619 	if (tb[TCA_CAKE_FLOW_MODE])
2620 		flow_mode = ((flow_mode & CAKE_FLOW_NAT_FLAG) |
2621 				(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2622 					CAKE_FLOW_MASK));
2623 
2624 	if (tb[TCA_CAKE_ATM])
2625 		WRITE_ONCE(q->atm_mode,
2626 			   nla_get_u32(tb[TCA_CAKE_ATM]));
2627 
2628 	if (tb[TCA_CAKE_OVERHEAD]) {
2629 		WRITE_ONCE(q->rate_overhead,
2630 			   nla_get_s32(tb[TCA_CAKE_OVERHEAD]));
2631 		rate_flags |= CAKE_FLAG_OVERHEAD;
2632 
2633 		q->max_netlen = 0;
2634 		q->max_adjlen = 0;
2635 		q->min_netlen = ~0;
2636 		q->min_adjlen = ~0;
2637 	}
2638 
2639 	if (tb[TCA_CAKE_RAW]) {
2640 		rate_flags &= ~CAKE_FLAG_OVERHEAD;
2641 
2642 		q->max_netlen = 0;
2643 		q->max_adjlen = 0;
2644 		q->min_netlen = ~0;
2645 		q->min_adjlen = ~0;
2646 	}
2647 
2648 	if (tb[TCA_CAKE_MPU])
2649 		WRITE_ONCE(q->rate_mpu,
2650 			   nla_get_u32(tb[TCA_CAKE_MPU]));
2651 
2652 	if (tb[TCA_CAKE_RTT]) {
2653 		u32 interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2654 
2655 		WRITE_ONCE(q->interval, max(interval, 1U));
2656 	}
2657 
2658 	if (tb[TCA_CAKE_TARGET]) {
2659 		u32 target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2660 
2661 		WRITE_ONCE(q->target, max(target, 1U));
2662 	}
2663 
2664 	if (tb[TCA_CAKE_AUTORATE]) {
2665 		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2666 			rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2667 		else
2668 			rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2669 	}
2670 
2671 	if (tb[TCA_CAKE_INGRESS]) {
2672 		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2673 			rate_flags |= CAKE_FLAG_INGRESS;
2674 		else
2675 			rate_flags &= ~CAKE_FLAG_INGRESS;
2676 	}
2677 
2678 	if (tb[TCA_CAKE_ACK_FILTER])
2679 		WRITE_ONCE(q->ack_filter,
2680 			   nla_get_u32(tb[TCA_CAKE_ACK_FILTER]));
2681 
2682 	if (tb[TCA_CAKE_MEMORY])
2683 		WRITE_ONCE(q->buffer_config_limit,
2684 			   nla_get_u32(tb[TCA_CAKE_MEMORY]));
2685 
2686 	if (tb[TCA_CAKE_SPLIT_GSO]) {
2687 		if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2688 			rate_flags |= CAKE_FLAG_SPLIT_GSO;
2689 		else
2690 			rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2691 	}
2692 
2693 	if (tb[TCA_CAKE_FWMARK]) {
2694 		WRITE_ONCE(q->fwmark_mask, nla_get_u32(tb[TCA_CAKE_FWMARK]));
2695 		WRITE_ONCE(q->fwmark_shft,
2696 			   q->fwmark_mask ? __ffs(q->fwmark_mask) : 0);
2697 	}
2698 
2699 	WRITE_ONCE(q->rate_flags, rate_flags);
2700 	WRITE_ONCE(q->flow_mode, flow_mode);
2701 	if (q->tins) {
2702 		sch_tree_lock(sch);
2703 		cake_reconfigure(sch);
2704 		sch_tree_unlock(sch);
2705 	}
2706 
2707 	return 0;
2708 }
2709 
cake_destroy(struct Qdisc * sch)2710 static void cake_destroy(struct Qdisc *sch)
2711 {
2712 	struct cake_sched_data *q = qdisc_priv(sch);
2713 
2714 	qdisc_watchdog_cancel(&q->watchdog);
2715 	tcf_block_put(q->block);
2716 	kvfree(q->tins);
2717 }
2718 
cake_init(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)2719 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2720 		     struct netlink_ext_ack *extack)
2721 {
2722 	struct cake_sched_data *q = qdisc_priv(sch);
2723 	int i, j, err;
2724 
2725 	sch->limit = 10240;
2726 	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2727 	q->flow_mode  = CAKE_FLOW_TRIPLE;
2728 
2729 	q->rate_bps = 0; /* unlimited by default */
2730 
2731 	q->interval = 100000; /* 100ms default */
2732 	q->target   =   5000; /* 5ms: codel RFC argues
2733 			       * for 5 to 10% of interval
2734 			       */
2735 	q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2736 	q->cur_tin = 0;
2737 	q->cur_flow  = 0;
2738 
2739 	qdisc_watchdog_init(&q->watchdog, sch);
2740 
2741 	if (opt) {
2742 		err = cake_change(sch, opt, extack);
2743 
2744 		if (err)
2745 			return err;
2746 	}
2747 
2748 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2749 	if (err)
2750 		return err;
2751 
2752 	quantum_div[0] = ~0;
2753 	for (i = 1; i <= CAKE_QUEUES; i++)
2754 		quantum_div[i] = 65535 / i;
2755 
2756 	q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2757 			   GFP_KERNEL);
2758 	if (!q->tins)
2759 		return -ENOMEM;
2760 
2761 	for (i = 0; i < CAKE_MAX_TINS; i++) {
2762 		struct cake_tin_data *b = q->tins + i;
2763 
2764 		INIT_LIST_HEAD(&b->new_flows);
2765 		INIT_LIST_HEAD(&b->old_flows);
2766 		INIT_LIST_HEAD(&b->decaying_flows);
2767 		b->sparse_flow_count = 0;
2768 		b->bulk_flow_count = 0;
2769 		b->decaying_flow_count = 0;
2770 
2771 		for (j = 0; j < CAKE_QUEUES; j++) {
2772 			struct cake_flow *flow = b->flows + j;
2773 			u32 k = j * CAKE_MAX_TINS + i;
2774 
2775 			INIT_LIST_HEAD(&flow->flowchain);
2776 			cobalt_vars_init(&flow->cvars);
2777 
2778 			q->overflow_heap[k].t = i;
2779 			q->overflow_heap[k].b = j;
2780 			b->overflow_idx[j] = k;
2781 		}
2782 	}
2783 
2784 	cake_reconfigure(sch);
2785 	q->avg_peak_bandwidth = q->rate_bps;
2786 	q->min_netlen = ~0;
2787 	q->min_adjlen = ~0;
2788 	return 0;
2789 }
2790 
cake_dump(struct Qdisc * sch,struct sk_buff * skb)2791 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2792 {
2793 	struct cake_sched_data *q = qdisc_priv(sch);
2794 	struct nlattr *opts;
2795 	u16 rate_flags;
2796 	u8 flow_mode;
2797 
2798 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2799 	if (!opts)
2800 		goto nla_put_failure;
2801 
2802 	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64,
2803 			      READ_ONCE(q->rate_bps), TCA_CAKE_PAD))
2804 		goto nla_put_failure;
2805 
2806 	flow_mode = READ_ONCE(q->flow_mode);
2807 	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, flow_mode & CAKE_FLOW_MASK))
2808 		goto nla_put_failure;
2809 
2810 	if (nla_put_u32(skb, TCA_CAKE_RTT, READ_ONCE(q->interval)))
2811 		goto nla_put_failure;
2812 
2813 	if (nla_put_u32(skb, TCA_CAKE_TARGET, READ_ONCE(q->target)))
2814 		goto nla_put_failure;
2815 
2816 	if (nla_put_u32(skb, TCA_CAKE_MEMORY,
2817 			READ_ONCE(q->buffer_config_limit)))
2818 		goto nla_put_failure;
2819 
2820 	rate_flags = READ_ONCE(q->rate_flags);
2821 	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2822 			!!(rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2823 		goto nla_put_failure;
2824 
2825 	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2826 			!!(rate_flags & CAKE_FLAG_INGRESS)))
2827 		goto nla_put_failure;
2828 
2829 	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, READ_ONCE(q->ack_filter)))
2830 		goto nla_put_failure;
2831 
2832 	if (nla_put_u32(skb, TCA_CAKE_NAT,
2833 			!!(flow_mode & CAKE_FLOW_NAT_FLAG)))
2834 		goto nla_put_failure;
2835 
2836 	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, READ_ONCE(q->tin_mode)))
2837 		goto nla_put_failure;
2838 
2839 	if (nla_put_u32(skb, TCA_CAKE_WASH,
2840 			!!(rate_flags & CAKE_FLAG_WASH)))
2841 		goto nla_put_failure;
2842 
2843 	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, READ_ONCE(q->rate_overhead)))
2844 		goto nla_put_failure;
2845 
2846 	if (!(rate_flags & CAKE_FLAG_OVERHEAD))
2847 		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2848 			goto nla_put_failure;
2849 
2850 	if (nla_put_u32(skb, TCA_CAKE_ATM, READ_ONCE(q->atm_mode)))
2851 		goto nla_put_failure;
2852 
2853 	if (nla_put_u32(skb, TCA_CAKE_MPU, READ_ONCE(q->rate_mpu)))
2854 		goto nla_put_failure;
2855 
2856 	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2857 			!!(rate_flags & CAKE_FLAG_SPLIT_GSO)))
2858 		goto nla_put_failure;
2859 
2860 	if (nla_put_u32(skb, TCA_CAKE_FWMARK, READ_ONCE(q->fwmark_mask)))
2861 		goto nla_put_failure;
2862 
2863 	return nla_nest_end(skb, opts);
2864 
2865 nla_put_failure:
2866 	return -1;
2867 }
2868 
cake_dump_stats(struct Qdisc * sch,struct gnet_dump * d)2869 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2870 {
2871 	struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2872 	struct cake_sched_data *q = qdisc_priv(sch);
2873 	struct nlattr *tstats, *ts;
2874 	int i;
2875 
2876 	if (!stats)
2877 		return -1;
2878 
2879 #define PUT_STAT_U32(attr, data) do {				       \
2880 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2881 			goto nla_put_failure;			       \
2882 	} while (0)
2883 #define PUT_STAT_U64(attr, data) do {				       \
2884 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2885 					data, TCA_CAKE_STATS_PAD)) \
2886 			goto nla_put_failure;			       \
2887 	} while (0)
2888 
2889 	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2890 	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2891 	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2892 	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2893 	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2894 	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2895 	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2896 	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2897 
2898 #undef PUT_STAT_U32
2899 #undef PUT_STAT_U64
2900 
2901 	tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
2902 	if (!tstats)
2903 		goto nla_put_failure;
2904 
2905 #define PUT_TSTAT_U32(attr, data) do {					\
2906 		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2907 			goto nla_put_failure;				\
2908 	} while (0)
2909 #define PUT_TSTAT_U64(attr, data) do {					\
2910 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2911 					data, TCA_CAKE_TIN_STATS_PAD))	\
2912 			goto nla_put_failure;				\
2913 	} while (0)
2914 
2915 	for (i = 0; i < q->tin_cnt; i++) {
2916 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2917 
2918 		ts = nla_nest_start_noflag(d->skb, i + 1);
2919 		if (!ts)
2920 			goto nla_put_failure;
2921 
2922 		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2923 		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2924 		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2925 
2926 		PUT_TSTAT_U32(TARGET_US,
2927 			      ktime_to_us(ns_to_ktime(b->cparams.target)));
2928 		PUT_TSTAT_U32(INTERVAL_US,
2929 			      ktime_to_us(ns_to_ktime(b->cparams.interval)));
2930 
2931 		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2932 		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2933 		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2934 		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2935 
2936 		PUT_TSTAT_U32(PEAK_DELAY_US,
2937 			      ktime_to_us(ns_to_ktime(b->peak_delay)));
2938 		PUT_TSTAT_U32(AVG_DELAY_US,
2939 			      ktime_to_us(ns_to_ktime(b->avge_delay)));
2940 		PUT_TSTAT_U32(BASE_DELAY_US,
2941 			      ktime_to_us(ns_to_ktime(b->base_delay)));
2942 
2943 		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2944 		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2945 		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2946 
2947 		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2948 					    b->decaying_flow_count);
2949 		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2950 		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2951 		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2952 
2953 		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2954 		nla_nest_end(d->skb, ts);
2955 	}
2956 
2957 #undef PUT_TSTAT_U32
2958 #undef PUT_TSTAT_U64
2959 
2960 	nla_nest_end(d->skb, tstats);
2961 	return nla_nest_end(d->skb, stats);
2962 
2963 nla_put_failure:
2964 	nla_nest_cancel(d->skb, stats);
2965 	return -1;
2966 }
2967 
cake_leaf(struct Qdisc * sch,unsigned long arg)2968 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2969 {
2970 	return NULL;
2971 }
2972 
cake_find(struct Qdisc * sch,u32 classid)2973 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2974 {
2975 	return 0;
2976 }
2977 
cake_bind(struct Qdisc * sch,unsigned long parent,u32 classid)2978 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2979 			       u32 classid)
2980 {
2981 	return 0;
2982 }
2983 
cake_unbind(struct Qdisc * q,unsigned long cl)2984 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2985 {
2986 }
2987 
cake_tcf_block(struct Qdisc * sch,unsigned long cl,struct netlink_ext_ack * extack)2988 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2989 					struct netlink_ext_ack *extack)
2990 {
2991 	struct cake_sched_data *q = qdisc_priv(sch);
2992 
2993 	if (cl)
2994 		return NULL;
2995 	return q->block;
2996 }
2997 
cake_dump_class(struct Qdisc * sch,unsigned long cl,struct sk_buff * skb,struct tcmsg * tcm)2998 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2999 			   struct sk_buff *skb, struct tcmsg *tcm)
3000 {
3001 	tcm->tcm_handle |= TC_H_MIN(cl);
3002 	return 0;
3003 }
3004 
cake_dump_class_stats(struct Qdisc * sch,unsigned long cl,struct gnet_dump * d)3005 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
3006 				 struct gnet_dump *d)
3007 {
3008 	struct cake_sched_data *q = qdisc_priv(sch);
3009 	const struct cake_flow *flow = NULL;
3010 	struct gnet_stats_queue qs = { 0 };
3011 	struct nlattr *stats;
3012 	u32 idx = cl - 1;
3013 
3014 	if (idx < CAKE_QUEUES * q->tin_cnt) {
3015 		const struct cake_tin_data *b = \
3016 			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
3017 		const struct sk_buff *skb;
3018 
3019 		flow = &b->flows[idx % CAKE_QUEUES];
3020 
3021 		if (flow->head) {
3022 			sch_tree_lock(sch);
3023 			skb = flow->head;
3024 			while (skb) {
3025 				qs.qlen++;
3026 				skb = skb->next;
3027 			}
3028 			sch_tree_unlock(sch);
3029 		}
3030 		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
3031 		qs.drops = flow->dropped;
3032 	}
3033 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
3034 		return -1;
3035 	if (flow) {
3036 		ktime_t now = ktime_get();
3037 
3038 		stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
3039 		if (!stats)
3040 			return -1;
3041 
3042 #define PUT_STAT_U32(attr, data) do {				       \
3043 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3044 			goto nla_put_failure;			       \
3045 	} while (0)
3046 #define PUT_STAT_S32(attr, data) do {				       \
3047 		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3048 			goto nla_put_failure;			       \
3049 	} while (0)
3050 
3051 		PUT_STAT_S32(DEFICIT, flow->deficit);
3052 		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
3053 		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
3054 		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
3055 		if (flow->cvars.p_drop) {
3056 			PUT_STAT_S32(BLUE_TIMER_US,
3057 				     ktime_to_us(
3058 					     ktime_sub(now,
3059 						       flow->cvars.blue_timer)));
3060 		}
3061 		if (flow->cvars.dropping) {
3062 			PUT_STAT_S32(DROP_NEXT_US,
3063 				     ktime_to_us(
3064 					     ktime_sub(now,
3065 						       flow->cvars.drop_next)));
3066 		}
3067 
3068 		if (nla_nest_end(d->skb, stats) < 0)
3069 			return -1;
3070 	}
3071 
3072 	return 0;
3073 
3074 nla_put_failure:
3075 	nla_nest_cancel(d->skb, stats);
3076 	return -1;
3077 }
3078 
cake_walk(struct Qdisc * sch,struct qdisc_walker * arg)3079 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
3080 {
3081 	struct cake_sched_data *q = qdisc_priv(sch);
3082 	unsigned int i, j;
3083 
3084 	if (arg->stop)
3085 		return;
3086 
3087 	for (i = 0; i < q->tin_cnt; i++) {
3088 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3089 
3090 		for (j = 0; j < CAKE_QUEUES; j++) {
3091 			if (list_empty(&b->flows[j].flowchain)) {
3092 				arg->count++;
3093 				continue;
3094 			}
3095 			if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1,
3096 						 arg))
3097 				break;
3098 		}
3099 	}
3100 }
3101 
3102 static const struct Qdisc_class_ops cake_class_ops = {
3103 	.leaf		=	cake_leaf,
3104 	.find		=	cake_find,
3105 	.tcf_block	=	cake_tcf_block,
3106 	.bind_tcf	=	cake_bind,
3107 	.unbind_tcf	=	cake_unbind,
3108 	.dump		=	cake_dump_class,
3109 	.dump_stats	=	cake_dump_class_stats,
3110 	.walk		=	cake_walk,
3111 };
3112 
3113 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3114 	.cl_ops		=	&cake_class_ops,
3115 	.id		=	"cake",
3116 	.priv_size	=	sizeof(struct cake_sched_data),
3117 	.enqueue	=	cake_enqueue,
3118 	.dequeue	=	cake_dequeue,
3119 	.peek		=	qdisc_peek_dequeued,
3120 	.init		=	cake_init,
3121 	.reset		=	cake_reset,
3122 	.destroy	=	cake_destroy,
3123 	.change		=	cake_change,
3124 	.dump		=	cake_dump,
3125 	.dump_stats	=	cake_dump_stats,
3126 	.owner		=	THIS_MODULE,
3127 };
3128 MODULE_ALIAS_NET_SCH("cake");
3129 
cake_module_init(void)3130 static int __init cake_module_init(void)
3131 {
3132 	return register_qdisc(&cake_qdisc_ops);
3133 }
3134 
cake_module_exit(void)3135 static void __exit cake_module_exit(void)
3136 {
3137 	unregister_qdisc(&cake_qdisc_ops);
3138 }
3139 
3140 module_init(cake_module_init)
3141 module_exit(cake_module_exit)
3142 MODULE_AUTHOR("Jonathan Morton");
3143 MODULE_LICENSE("Dual BSD/GPL");
3144 MODULE_DESCRIPTION("The CAKE shaper.");
3145