1 /* SPDX-License-Identifier: GPL-2.0
2  *
3  * IO cost model based controller.
4  *
5  * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
6  * Copyright (C) 2019 Andy Newell <newella@fb.com>
7  * Copyright (C) 2019 Facebook
8  *
9  * One challenge of controlling IO resources is the lack of trivially
10  * observable cost metric.  This is distinguished from CPU and memory where
11  * wallclock time and the number of bytes can serve as accurate enough
12  * approximations.
13  *
14  * Bandwidth and iops are the most commonly used metrics for IO devices but
15  * depending on the type and specifics of the device, different IO patterns
16  * easily lead to multiple orders of magnitude variations rendering them
17  * useless for the purpose of IO capacity distribution.  While on-device
18  * time, with a lot of crutches, could serve as a useful approximation for
19  * non-queued rotational devices, this is no longer viable with modern
20  * devices, even the rotational ones.
21  *
22  * While there is no cost metric we can trivially observe, it isn't a
23  * complete mystery.  For example, on a rotational device, seek cost
24  * dominates while a contiguous transfer contributes a smaller amount
25  * proportional to the size.  If we can characterize at least the relative
26  * costs of these different types of IOs, it should be possible to
27  * implement a reasonable work-conserving proportional IO resource
28  * distribution.
29  *
30  * 1. IO Cost Model
31  *
32  * IO cost model estimates the cost of an IO given its basic parameters and
33  * history (e.g. the end sector of the last IO).  The cost is measured in
34  * device time.  If a given IO is estimated to cost 10ms, the device should
35  * be able to process ~100 of those IOs in a second.
36  *
37  * Currently, there's only one builtin cost model - linear.  Each IO is
38  * classified as sequential or random and given a base cost accordingly.
39  * On top of that, a size cost proportional to the length of the IO is
40  * added.  While simple, this model captures the operational
41  * characteristics of a wide variety of devices well enough.  Default
42  * parameters for several different classes of devices are provided and the
43  * parameters can be configured from userspace via
44  * /sys/fs/cgroup/io.cost.model.
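 *
 * As a purely illustrative sketch (the 8:16 device number is made up;
 * the exact key syntax is described in
 * Documentation/admin-guide/cgroup-v2.rst), configuring the linear
 * model with the HDD defaults from the autop table below and costing a
 * 64k random read would look roughly like:
 *
 *   # echo "8:16 ctrl=user model=linear rbps=174019176 rseqiops=41708" \
 *          "rrandiops=370 wbps=178075866 wseqiops=42705 wrandiops=378" \
 *          > /sys/fs/cgroup/io.cost.model
 *
 *   abs_cost ~= 1s/370 + 16 pages * 1s/(174019176/4096)
 *            ~= 2.70ms + 16 * 23.5us ~= 3.1ms of device time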
45  *
46  * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
47  * device-specific coefficients.
48  *
52  * 2. Control Strategy
53  *
54  * The device virtual time (vtime) is used as the primary control metric.
55  * The control strategy is composed of the following three parts.
56  *
57  * 2-1. Vtime Distribution
58  *
59  * When a cgroup becomes active in terms of IOs, its hierarchical share is
60  * calculated.  Please consider the following hierarchy where the numbers
61  * inside parentheses denote the configured weights.
62  *
63  *           root
64  *         /       \
65  *      A (w:100)  B (w:300)
66  *      /       \
67  *  A0 (w:100)  A1 (w:100)
68  *
69  * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
70  * of equal weight, each gets 50% share.  If then B starts issuing IOs, B
71  * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest,
72  * 12.5% each.  The distribution mechanism only cares about these flattened
73  * shares.  They're called hweights (hierarchical weights) and always add
74  * up to 1 (HWEIGHT_WHOLE).
75  *
76  * A given cgroup's vtime runs slower in inverse proportion to its hweight.
77  * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
78  * against the device vtime - an IO which takes 10ms on the underlying
79  * device is considered to take 80ms on A0.
80  *
81  * This constitutes the basis of IO capacity distribution.  Each cgroup's
82  * vtime is running at a rate determined by its hweight.  A cgroup tracks
83  * the vtime consumed by past IOs and can issue a new IO iff doing so
84  * wouldn't outrun the current device vtime.  Otherwise, the IO is
85  * suspended until the vtime has progressed enough to cover it.
86  *
87  * 2-2. Vrate Adjustment
88  *
89  * It's unrealistic to expect the cost model to be perfect.  There are too
90  * many devices and even on the same device the overall performance
91  * fluctuates depending on numerous factors such as IO mixture and device
92  * internal garbage collection.  The controller needs to adapt dynamically.
93  *
94  * This is achieved by adjusting the overall IO rate according to how busy
95  * the device is.  If the device becomes overloaded, we're sending down too
96  * many IOs and should generally slow down.  If there are waiting issuers
97  * but the device isn't saturated, we're issuing too few and should
98  * generally speed up.
99  *
100  * To slow down, we lower the vrate - the rate at which the device vtime
101  * passes compared to the wall clock.  For example, if the vtime is running
102  * at the vrate of 75%, all cgroups added up would only be able to issue
103  * 750ms worth of IOs per second, and vice-versa for speeding up.
104  *
105  * Device busyness is determined using two criteria - rq wait and
106  * completion latencies.
107  *
108  * When a device gets saturated, the on-device and then the request queues
109  * fill up and a bio which is ready to be issued has to wait for a request
110  * to become available.  When this delay becomes noticeable, it's a clear
111  * indication that the device is saturated and we lower the vrate.  This
112  * saturation signal is fairly conservative as it only triggers when both
113  * hardware and software queues are filled up, and is used as the default
114  * busy signal.
115  *
116  * As devices can have deep queues and be unfair in how the queued commands
117  * are executed, solely depending on rq wait may not result in satisfactory
118  * control quality.  For a better control quality, completion latency QoS
119  * parameters can be configured so that the device is considered saturated
120  * if N'th percentile completion latency rises above the set point.
121  *
122  * The completion latency requirements are a function of both the
123  * underlying device characteristics and the desired IO latency quality of
124  * service.  There is an inherent trade-off - the tighter the latency QoS,
125  * the higher the bandwidth loss.  Latency QoS is disabled by default
126  * and can be set through /sys/fs/cgroup/io.cost.qos.
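 *
 * For example, to consider the device saturated when the p95 read or
 * write completion latency exceeds 25ms (again with a made-up 8:16
 * device number; see Documentation/admin-guide/cgroup-v2.rst for the
 * exact syntax), something along these lines would do:
 *
 *   # echo "8:16 enable=1 ctrl=user rpct=95 rlat=25000 wpct=95 wlat=25000" \
 *          > /sys/fs/cgroup/io.cost.qos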
127  *
128  * 2-3. Work Conservation
129  *
130  * Imagine two cgroups A and B with equal weights.  A is issuing a small IO
131  * periodically while B is sending out enough parallel IOs to saturate the
132  * device on its own.  Let's say A's usage amounts to 100ms worth of IO
133  * cost per second, i.e., 10% of the device capacity.  The naive
134  * distribution of half and half would lead to 60% utilization of the
135  * device, a significant reduction in the total amount of work done
136  * compared to free-for-all competition.  This is too high a cost to pay
137  * for IO control.
138  *
139  * To conserve the total amount of work done, we keep track of how much
140  * each active cgroup is actually using and yield part of its weight if
141  * there are other cgroups which can make use of it.  In the above case,
142  * A's weight will be lowered so that it hovers above the actual usage and
143  * B would be able to use the rest.
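 *
 * With the surplus constants used further down (usage scaled by 125%
 * plus a 2% absolute margin), A's hweight_inuse in this example would
 * hover around 10% * 1.25 + 2% = 14.5%, leaving B roughly 85.5% of the
 * device.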
144  *
145  * As we don't want to penalize a cgroup for donating its weight, the
146  * surplus weight adjustment factors in a margin and has an immediate
147  * snapback mechanism in case the cgroup needs more IO vtime for itself.
148  *
149  * Note that adjusting down surplus weights has the same effects as
150  * accelerating vtime for other cgroups and work conservation can also be
151  * implemented by adjusting vrate dynamically.  However, deciding who can
152  * donate and who should take back how much requires hweight propagations
153  * anyway, making it easier to implement and understand as a separate
154  * mechanism.
155  *
156  * 3. Monitoring
157  *
158  * Instead of debugfs or other clumsy monitoring mechanisms, this
159  * controller uses a drgn based monitoring script -
160  * tools/cgroup/iocost_monitor.py.  For details on drgn, please see
161  * https://github.com/osandov/drgn.  The output looks like the following.
162  *
163  *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
164  *                 active      weight      hweight% inflt% dbt  delay usages%
165  *  test/a              *    50/   50  33.33/ 33.33  27.65   2  0*041 033:033:033
166  *  test/b              *   100/  100  66.67/ 66.67  17.56   0  0*000 066:079:077
167  *
168  * - per	: Timer period
169  * - cur_per	: Internal wall and device vtime clock
170  * - vrate	: Device virtual time rate against wall clock
171  * - weight	: Surplus-adjusted and configured weights
172  * - hweight	: Surplus-adjusted and configured hierarchical weights
173  * - inflt	: The percentage of in-flight IO cost at the end of last period
174  * - del_ms	: Deferred issuer delay induction level and duration
175  * - usages	: Usage history
176  */
177 
178 #include <linux/kernel.h>
179 #include <linux/module.h>
180 #include <linux/timer.h>
181 #include <linux/time64.h>
182 #include <linux/parser.h>
183 #include <linux/sched/signal.h>
184 #include <linux/blk-cgroup.h>
185 #include "blk-rq-qos.h"
186 #include "blk-stat.h"
187 #include "blk-wbt.h"
188 
189 #ifdef CONFIG_TRACEPOINTS
190 
191 /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
192 #define TRACE_IOCG_PATH_LEN 1024
193 static DEFINE_SPINLOCK(trace_iocg_path_lock);
194 static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
195 
196 #define TRACE_IOCG_PATH(type, iocg, ...)					\
197 	do {									\
198 		unsigned long flags;						\
199 		if (trace_iocost_##type##_enabled()) {				\
200 			spin_lock_irqsave(&trace_iocg_path_lock, flags);	\
201 			cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup,	\
202 				    trace_iocg_path, TRACE_IOCG_PATH_LEN);	\
203 			trace_iocost_##type(iocg, trace_iocg_path,		\
204 					      ##__VA_ARGS__);			\
205 			spin_unlock_irqrestore(&trace_iocg_path_lock, flags);	\
206 		}								\
207 	} while (0)
208 
209 #else	/* CONFIG_TRACEPOINTS */
210 #define TRACE_IOCG_PATH(type, iocg, ...)	do { } while (0)
211 #endif	/* CONFIG_TRACEPOINTS */
212 
213 enum {
214 	MILLION			= 1000000,
215 
216 	/* timer period is calculated from latency requirements, bound it */
217 	MIN_PERIOD		= USEC_PER_MSEC,
218 	MAX_PERIOD		= USEC_PER_SEC,
219 
220 	/*
221 	 * A cgroup's vtime can run 50% behind the device vtime, which
222 	 * serves as its IO credit buffer.  Surplus weight adjustment is
223 	 * immediately canceled if the vtime margin runs below 10%.
224 	 */
225 	MARGIN_PCT		= 50,
226 	INUSE_MARGIN_PCT	= 10,
227 
228 	/* Have some play in waitq timer operations */
229 	WAITQ_TIMER_MARGIN_PCT	= 5,
230 
231 	/*
232 	 * vtime can wrap well within a reasonable uptime when vrate is
233 	 * consistently raised.  Don't trust recorded cgroup vtime if the
234 	 * period counter indicates that it's older than 5mins.
235 	 */
236 	VTIME_VALID_DUR		= 300 * USEC_PER_SEC,
237 
238 	/*
239 	 * Remember the past three non-zero usages and use the max for
240 	 * surplus calculation.  Three slots guarantee that we remember one
241 	 * full period usage from the last active stretch even after
242 	 * partial deactivation and re-activation periods.  Don't start
243 	 * giving away weight before collecting two data points to prevent
244 	 * hweight adjustments based on one partial activation period.
245 	 */
246 	NR_USAGE_SLOTS		= 3,
247 	MIN_VALID_USAGES	= 2,
248 
249 	/* 1/64k is granular enough and can easily be handled w/ u32 */
250 	HWEIGHT_WHOLE		= 1 << 16,
251 };
252 
253 enum {
254 	/*
255 	 * As vtime is used to calculate the cost of each IO, it needs to
256 	 * be fairly high precision.  For example, it should be able to
257 	 * represent the cost of a single page worth of discard with
258 	 * sufficient accuracy.  At the same time, it should be able to
259 	 * represent reasonably long enough durations to be useful and
260 	 * convenient during operation.
261 	 *
262 	 * 1s worth of vtime is 2^37.  This gives us both sub-nanosecond
263 	 * granularity and days of wrap-around time even at extreme vrates.
264 	 */
265 	VTIME_PER_SEC_SHIFT	= 37,
266 	VTIME_PER_SEC		= 1LLU << VTIME_PER_SEC_SHIFT,
267 	VTIME_PER_USEC		= VTIME_PER_SEC / USEC_PER_SEC,
268 
269 	/* bound vrate adjustments within two orders of magnitude */
270 	VRATE_MIN_PPM		= 10000,	/* 1% */
271 	VRATE_MAX_PPM		= 100000000,	/* 10000% */
272 
273 	VRATE_MIN		= VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
274 	VRATE_CLAMP_ADJ_PCT	= 4,
275 
276 	/* switch iff the conditions are met for longer than this */
277 	AUTOP_CYCLE_NSEC	= 10LLU * NSEC_PER_SEC,
278 };
279 
280 enum {
281 	/* if IOs end up waiting for requests, issue less */
282 	RQ_WAIT_BUSY_PCT	= 5,
283 
284 	/* unbusy hysteresis */
285 	UNBUSY_THR_PCT		= 75,
286 
287 	/* don't let cmds which take a very long time pin lagging for too long */
288 	MAX_LAGGING_PERIODS	= 10,
289 
290 	/*
291 	 * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
292 	 * donate the surplus.
293 	 */
294 	SURPLUS_SCALE_PCT	= 125,			/* * 125% */
295 	SURPLUS_SCALE_ABS	= HWEIGHT_WHOLE / 50,	/* + 2% */
296 	SURPLUS_MIN_ADJ_DELTA	= HWEIGHT_WHOLE / 33,	/* 3% */
297 
298 	/*
299 	 * Count IO size in 4k pages.  The 12bit shift helps keep the
300 	 * size-proportional components of the cost calculation within a
301 	 * similar number of digits as the per-IO cost components.
302 	 */
303 	IOC_PAGE_SHIFT		= 12,
304 	IOC_PAGE_SIZE		= 1 << IOC_PAGE_SHIFT,
305 	IOC_SECT_TO_PAGE_SHIFT	= IOC_PAGE_SHIFT - SECTOR_SHIFT,
306 
307 	/* if apart further than 16M, consider randio for linear model */
308 	LCOEF_RANDIO_PAGES	= 4096,
309 };
310 
311 enum ioc_running {
312 	IOC_IDLE,
313 	IOC_RUNNING,
314 	IOC_STOP,
315 };
316 
317 /* io.cost.qos controls including per-dev enable of the whole controller */
318 enum {
319 	QOS_ENABLE,
320 	QOS_CTRL,
321 	NR_QOS_CTRL_PARAMS,
322 };
323 
324 /* io.cost.qos params */
325 enum {
326 	QOS_RPPM,
327 	QOS_RLAT,
328 	QOS_WPPM,
329 	QOS_WLAT,
330 	QOS_MIN,
331 	QOS_MAX,
332 	NR_QOS_PARAMS,
333 };
334 
335 /* io.cost.model controls */
336 enum {
337 	COST_CTRL,
338 	COST_MODEL,
339 	NR_COST_CTRL_PARAMS,
340 };
341 
342 /* builtin linear cost model coefficients */
343 enum {
344 	I_LCOEF_RBPS,
345 	I_LCOEF_RSEQIOPS,
346 	I_LCOEF_RRANDIOPS,
347 	I_LCOEF_WBPS,
348 	I_LCOEF_WSEQIOPS,
349 	I_LCOEF_WRANDIOPS,
350 	NR_I_LCOEFS,
351 };
352 
353 enum {
354 	LCOEF_RPAGE,
355 	LCOEF_RSEQIO,
356 	LCOEF_RRANDIO,
357 	LCOEF_WPAGE,
358 	LCOEF_WSEQIO,
359 	LCOEF_WRANDIO,
360 	NR_LCOEFS,
361 };
362 
363 enum {
364 	AUTOP_INVALID,
365 	AUTOP_HDD,
366 	AUTOP_SSD_QD1,
367 	AUTOP_SSD_DFL,
368 	AUTOP_SSD_FAST,
369 };
370 
371 struct ioc_gq;
372 
373 struct ioc_params {
374 	u32				qos[NR_QOS_PARAMS];
375 	u64				i_lcoefs[NR_I_LCOEFS];
376 	u64				lcoefs[NR_LCOEFS];
377 	u32				too_fast_vrate_pct;
378 	u32				too_slow_vrate_pct;
379 };
380 
381 struct ioc_missed {
382 	u32				nr_met;
383 	u32				nr_missed;
384 	u32				last_met;
385 	u32				last_missed;
386 };
387 
388 struct ioc_pcpu_stat {
389 	struct ioc_missed		missed[2];
390 
391 	u64				rq_wait_ns;
392 	u64				last_rq_wait_ns;
393 };
394 
395 /* per device */
396 struct ioc {
397 	struct rq_qos			rqos;
398 
399 	bool				enabled;
400 
401 	struct ioc_params		params;
402 	u32				period_us;
403 	u32				margin_us;
404 	u64				vrate_min;
405 	u64				vrate_max;
406 
407 	spinlock_t			lock;
408 	struct timer_list		timer;
409 	struct list_head		active_iocgs;	/* active cgroups */
410 	struct ioc_pcpu_stat __percpu	*pcpu_stat;
411 
412 	enum ioc_running		running;
413 	atomic64_t			vtime_rate;
414 
415 	seqcount_t			period_seqcount;
416 	u32				period_at;	/* wallclock starttime */
417 	u64				period_at_vtime; /* vtime starttime */
418 
419 	atomic64_t			cur_period;	/* inc'd each period */
420 	int				busy_level;	/* saturation history */
421 
422 	u64				inuse_margin_vtime;
423 	bool				weights_updated;
424 	atomic_t			hweight_gen;	/* for lazy hweights */
425 
426 	u64				autop_too_fast_at;
427 	u64				autop_too_slow_at;
428 	int				autop_idx;
429 	bool				user_qos_params:1;
430 	bool				user_cost_model:1;
431 };
432 
433 /* per device-cgroup pair */
434 struct ioc_gq {
435 	struct blkg_policy_data		pd;
436 	struct ioc			*ioc;
437 
438 	/*
439 	 * An iocg can get its weight from two sources - an explicit
440 	 * per-device-cgroup configuration or the default weight of the
441 	 * cgroup.  `cfg_weight` is the explicit per-device-cgroup
442 	 * configuration.  `weight` is the effective weight considering both
443 	 * sources.
444 	 *
445 	 * When an idle cgroup becomes active its `active` goes from 0 to
446 	 * `weight`.  `inuse` is the surplus adjusted active weight.
447 	 * `active` and `inuse` are used to calculate `hweight_active` and
448 	 * `hweight_inuse`.
449 	 *
450 	 * `last_inuse` remembers `inuse` while an iocg is idle to persist
451 	 * surplus adjustments.
452 	 */
453 	u32				cfg_weight;
454 	u32				weight;
455 	u32				active;
456 	u32				inuse;
457 	u32				last_inuse;
458 
459 	sector_t			cursor;		/* to detect randio */
460 
461 	/*
462 	 * `vtime` is this iocg's vtime cursor which progresses as IOs are
463 	 * issued.  If lagging behind device vtime, the delta represents
464 	 * the currently available IO budget.  If running ahead, the
465 	 * overage.
466 	 *
467 	 * `done_vtime` is the same but progressed on completion rather
468 	 * than issue.  The delta behind `vtime` represents the cost of
469 	 * currently in-flight IOs.
470 	 *
471 	 * `last_vtime` is used to remember `vtime` at the end of the last
472 	 * period to calculate utilization.
473 	 */
474 	atomic64_t			vtime;
475 	atomic64_t			done_vtime;
476 	u64				abs_vdebt;
477 	u64				last_vtime;
478 
479 	/*
480 	 * The period this iocg was last active in.  Used for deactivation
481 	 * and invalidating `vtime`.
482 	 */
483 	atomic64_t			active_period;
484 	struct list_head		active_list;
485 
486 	/* see __propagate_active_weight() and current_hweight() for details */
487 	u64				child_active_sum;
488 	u64				child_inuse_sum;
489 	int				hweight_gen;
490 	u32				hweight_active;
491 	u32				hweight_inuse;
492 	bool				has_surplus;
493 
494 	struct wait_queue_head		waitq;
495 	struct hrtimer			waitq_timer;
496 	struct hrtimer			delay_timer;
497 
498 	/* usage is recorded as fractions of HWEIGHT_WHOLE */
499 	int				usage_idx;
500 	u32				usages[NR_USAGE_SLOTS];
501 
502 	/* this iocg's depth in the hierarchy and ancestors including self */
503 	int				level;
504 	struct ioc_gq			*ancestors[];
505 };
506 
507 /* per cgroup */
508 struct ioc_cgrp {
509 	struct blkcg_policy_data	cpd;
510 	unsigned int			dfl_weight;
511 };
512 
513 struct ioc_now {
514 	u64				now_ns;
515 	u32				now;
516 	u64				vnow;
517 	u64				vrate;
518 };
519 
520 struct iocg_wait {
521 	struct wait_queue_entry		wait;
522 	struct bio			*bio;
523 	u64				abs_cost;
524 	bool				committed;
525 };
526 
527 struct iocg_wake_ctx {
528 	struct ioc_gq			*iocg;
529 	u32				hw_inuse;
530 	s64				vbudget;
531 };
532 
533 static const struct ioc_params autop[] = {
534 	[AUTOP_HDD] = {
535 		.qos				= {
536 			[QOS_RLAT]		=        250000, /* 250ms */
537 			[QOS_WLAT]		=        250000,
538 			[QOS_MIN]		= VRATE_MIN_PPM,
539 			[QOS_MAX]		= VRATE_MAX_PPM,
540 		},
541 		.i_lcoefs			= {
542 			[I_LCOEF_RBPS]		=     174019176,
543 			[I_LCOEF_RSEQIOPS]	=         41708,
544 			[I_LCOEF_RRANDIOPS]	=           370,
545 			[I_LCOEF_WBPS]		=     178075866,
546 			[I_LCOEF_WSEQIOPS]	=         42705,
547 			[I_LCOEF_WRANDIOPS]	=           378,
548 		},
549 	},
550 	[AUTOP_SSD_QD1] = {
551 		.qos				= {
552 			[QOS_RLAT]		=         25000, /* 25ms */
553 			[QOS_WLAT]		=         25000,
554 			[QOS_MIN]		= VRATE_MIN_PPM,
555 			[QOS_MAX]		= VRATE_MAX_PPM,
556 		},
557 		.i_lcoefs			= {
558 			[I_LCOEF_RBPS]		=     245855193,
559 			[I_LCOEF_RSEQIOPS]	=         61575,
560 			[I_LCOEF_RRANDIOPS]	=          6946,
561 			[I_LCOEF_WBPS]		=     141365009,
562 			[I_LCOEF_WSEQIOPS]	=         33716,
563 			[I_LCOEF_WRANDIOPS]	=         26796,
564 		},
565 	},
566 	[AUTOP_SSD_DFL] = {
567 		.qos				= {
568 			[QOS_RLAT]		=         25000, /* 25ms */
569 			[QOS_WLAT]		=         25000,
570 			[QOS_MIN]		= VRATE_MIN_PPM,
571 			[QOS_MAX]		= VRATE_MAX_PPM,
572 		},
573 		.i_lcoefs			= {
574 			[I_LCOEF_RBPS]		=     488636629,
575 			[I_LCOEF_RSEQIOPS]	=          8932,
576 			[I_LCOEF_RRANDIOPS]	=          8518,
577 			[I_LCOEF_WBPS]		=     427891549,
578 			[I_LCOEF_WSEQIOPS]	=         28755,
579 			[I_LCOEF_WRANDIOPS]	=         21940,
580 		},
581 		.too_fast_vrate_pct		=           500,
582 	},
583 	[AUTOP_SSD_FAST] = {
584 		.qos				= {
585 			[QOS_RLAT]		=          5000, /* 5ms */
586 			[QOS_WLAT]		=          5000,
587 			[QOS_MIN]		= VRATE_MIN_PPM,
588 			[QOS_MAX]		= VRATE_MAX_PPM,
589 		},
590 		.i_lcoefs			= {
591 			[I_LCOEF_RBPS]		=    3102524156LLU,
592 			[I_LCOEF_RSEQIOPS]	=        724816,
593 			[I_LCOEF_RRANDIOPS]	=        778122,
594 			[I_LCOEF_WBPS]		=    1742780862LLU,
595 			[I_LCOEF_WSEQIOPS]	=        425702,
596 			[I_LCOEF_WRANDIOPS]	=        443193,
597 		},
598 		.too_slow_vrate_pct		=            10,
599 	},
600 };
601 
602 /*
603  * vrate adjust percentages indexed by ioc->busy_level.  We adjust up on
604  * vtime credit shortage and down on device saturation.
605  */
606 static u32 vrate_adj_pct[] =
607 	{ 0, 0, 0, 0,
608 	  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
609 	  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
610 	  4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
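
/*
 * For example, a sustained busy_level of 8 nudges vrate by 1% each
 * period, a level around 40 by 4%, and the adjustment saturates at 16%
 * once abs(busy_level) reaches the end of the table.
 */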
611 
612 static struct blkcg_policy blkcg_policy_iocost;
613 
614 /* accessors and helpers */
615 static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
616 {
617 	return container_of(rqos, struct ioc, rqos);
618 }
619 
620 static struct ioc *q_to_ioc(struct request_queue *q)
621 {
622 	return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
623 }
624 
625 static const char *q_name(struct request_queue *q)
626 {
627 	if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
628 		return kobject_name(q->kobj.parent);
629 	else
630 		return "<unknown>";
631 }
632 
633 static const char __maybe_unused *ioc_name(struct ioc *ioc)
634 {
635 	return q_name(ioc->rqos.q);
636 }
637 
638 static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
639 {
640 	return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
641 }
642 
643 static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
644 {
645 	return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
646 }
647 
648 static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
649 {
650 	return pd_to_blkg(&iocg->pd);
651 }
652 
653 static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
654 {
655 	return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
656 			    struct ioc_cgrp, cpd);
657 }
658 
659 /*
660  * Scale @abs_cost to the inverse of @hw_inuse.  The lower the hierarchical
661  * weight, the more expensive each IO.  Must round up.
662  */
663 static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
664 {
665 	return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
666 }
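
/*
 * Example: with hw_inuse at 50% of HWEIGHT_WHOLE, an IO whose absolute
 * cost is 1ms worth of vtime is charged as 2ms against the cgroup.
 */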
667 
668 /*
669  * The inverse of abs_cost_to_cost().  Must round up.
670  */
671 static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
672 {
673 	return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
674 }
675 
676 static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
677 {
678 	bio->bi_iocost_cost = cost;
679 	atomic64_add(cost, &iocg->vtime);
680 }
681 
682 #define CREATE_TRACE_POINTS
683 #include <trace/events/iocost.h>
684 
685 /* latency QoS params changed, update period_us and all the dependent params */
686 static void ioc_refresh_period_us(struct ioc *ioc)
687 {
688 	u32 ppm, lat, multi, period_us;
689 
690 	lockdep_assert_held(&ioc->lock);
691 
692 	/* pick the higher latency target */
693 	if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
694 		ppm = ioc->params.qos[QOS_RPPM];
695 		lat = ioc->params.qos[QOS_RLAT];
696 	} else {
697 		ppm = ioc->params.qos[QOS_WPPM];
698 		lat = ioc->params.qos[QOS_WLAT];
699 	}
700 
701 	/*
702 	 * We want the period to be long enough to contain a healthy number
703 	 * of IOs while short enough for granular control.  Define it as a
704 	 * multiple of the latency target.  Ideally, the multiplier should
705 	 * be scaled according to the percentile so that it would nominally
706 	 * contain a certain number of requests.  Let's be simpler and
707 	 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
708 	 */
709 	if (ppm)
710 		multi = max_t(u32, (MILLION - ppm) / 50000, 2);
711 	else
712 		multi = 2;
713 	period_us = multi * lat;
714 	period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
715 
716 	/* calculate dependent params */
717 	ioc->period_us = period_us;
718 	ioc->margin_us = period_us * MARGIN_PCT / 100;
719 	ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
720 			period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
721 }
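
/*
 * Example: with a 95th percentile target (ppm == 950000) and a 25ms
 * latency target, multi clamps to 2 and period_us becomes 50ms; a 50th
 * percentile target would stretch the period to 10x the latency target.
 */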
722 
723 static int ioc_autop_idx(struct ioc *ioc)
724 {
725 	int idx = ioc->autop_idx;
726 	const struct ioc_params *p = &autop[idx];
727 	u32 vrate_pct;
728 	u64 now_ns;
729 
730 	/* rotational? */
731 	if (!blk_queue_nonrot(ioc->rqos.q))
732 		return AUTOP_HDD;
733 
734 	/* handle SATA SSDs w/ broken NCQ */
735 	if (blk_queue_depth(ioc->rqos.q) == 1)
736 		return AUTOP_SSD_QD1;
737 
738 	/* use one of the normal ssd sets */
739 	if (idx < AUTOP_SSD_DFL)
740 		return AUTOP_SSD_DFL;
741 
742 	/* if user is overriding anything, maintain what was there */
743 	if (ioc->user_qos_params || ioc->user_cost_model)
744 		return idx;
745 
746 	/* step up/down based on the vrate */
747 	vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
748 			      VTIME_PER_USEC);
749 	now_ns = ktime_get_ns();
750 
751 	if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
752 		if (!ioc->autop_too_fast_at)
753 			ioc->autop_too_fast_at = now_ns;
754 		if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
755 			return idx + 1;
756 	} else {
757 		ioc->autop_too_fast_at = 0;
758 	}
759 
760 	if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
761 		if (!ioc->autop_too_slow_at)
762 			ioc->autop_too_slow_at = now_ns;
763 		if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
764 			return idx - 1;
765 	} else {
766 		ioc->autop_too_slow_at = 0;
767 	}
768 
769 	return idx;
770 }
771 
772 /*
773  * Take the following as input
774  *
775  *  @bps	maximum sequential throughput
776  *  @seqiops	maximum sequential 4k iops
777  *  @randiops	maximum random 4k iops
778  *
779  * and calculate the linear model cost coefficients.
780  *
781  *  *@page	per-page cost		1s / (@bps / 4096)
782  *  *@seqio	base cost of a seq IO	max((1s / @seqiops) - *@page, 0)
783  *  *@randio	base cost of a rand IO	max((1s / @randiops) - *@page, 0)
784  */
785 static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
786 			u64 *page, u64 *seqio, u64 *randio)
787 {
788 	u64 v;
789 
790 	*page = *seqio = *randio = 0;
791 
792 	if (bps) {
793 		u64 bps_pages = DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE);
794 
795 		if (bps_pages)
796 			*page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, bps_pages);
797 		else
798 			*page = 1;
799 	}
800 
801 	if (seqiops) {
802 		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
803 		if (v > *page)
804 			*seqio = v - *page;
805 	}
806 
807 	if (randiops) {
808 		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
809 		if (v > *page)
810 			*randio = v - *page;
811 	}
812 }
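
/*
 * Example, using the AUTOP_SSD_DFL numbers above: rbps=488636629 is
 * ~119k pages/s, i.e. ~8.4us of vtime per 4k page; rrandiops=8518
 * gives 1s/8518 ~= 117us per IO, so *randio works out to ~109us.
 */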
813 
814 static void ioc_refresh_lcoefs(struct ioc *ioc)
815 {
816 	u64 *u = ioc->params.i_lcoefs;
817 	u64 *c = ioc->params.lcoefs;
818 
819 	calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
820 		    &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
821 	calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
822 		    &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
823 }
824 
825 static bool ioc_refresh_params(struct ioc *ioc, bool force)
826 {
827 	const struct ioc_params *p;
828 	int idx;
829 
830 	lockdep_assert_held(&ioc->lock);
831 
832 	idx = ioc_autop_idx(ioc);
833 	p = &autop[idx];
834 
835 	if (idx == ioc->autop_idx && !force)
836 		return false;
837 
838 	if (idx != ioc->autop_idx)
839 		atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
840 
841 	ioc->autop_idx = idx;
842 	ioc->autop_too_fast_at = 0;
843 	ioc->autop_too_slow_at = 0;
844 
845 	if (!ioc->user_qos_params)
846 		memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
847 	if (!ioc->user_cost_model)
848 		memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
849 
850 	ioc_refresh_period_us(ioc);
851 	ioc_refresh_lcoefs(ioc);
852 
853 	ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
854 					    VTIME_PER_USEC, MILLION);
855 	ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
856 				   VTIME_PER_USEC, MILLION);
857 
858 	return true;
859 }
860 
861 /* take a snapshot of the current [v]time and vrate */
862 static void ioc_now(struct ioc *ioc, struct ioc_now *now)
863 {
864 	unsigned seq;
865 
866 	now->now_ns = ktime_get();
867 	now->now = ktime_to_us(now->now_ns);
868 	now->vrate = atomic64_read(&ioc->vtime_rate);
869 
870 	/*
871 	 * The current vtime is
872 	 *
873 	 *   vtime at period start + (wallclock time since the start) * vrate
874 	 *
875 	 * As a consistent snapshot of `period_at_vtime` and `period_at` is
876 	 * needed, they're seqcount protected.
877 	 */
878 	do {
879 		seq = read_seqcount_begin(&ioc->period_seqcount);
880 		now->vnow = ioc->period_at_vtime +
881 			(now->now - ioc->period_at) * now->vrate;
882 	} while (read_seqcount_retry(&ioc->period_seqcount, seq));
883 }
884 
885 static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
886 {
887 	lockdep_assert_held(&ioc->lock);
888 	WARN_ON_ONCE(ioc->running != IOC_RUNNING);
889 
890 	write_seqcount_begin(&ioc->period_seqcount);
891 	ioc->period_at = now->now;
892 	ioc->period_at_vtime = now->vnow;
893 	write_seqcount_end(&ioc->period_seqcount);
894 
895 	ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
896 	add_timer(&ioc->timer);
897 }
898 
899 /*
900  * Update @iocg's `active` and `inuse` to @active and @inuse, update level
901  * weight sums and propagate upwards accordingly.
902  */
903 static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
904 {
905 	struct ioc *ioc = iocg->ioc;
906 	int lvl;
907 
908 	lockdep_assert_held(&ioc->lock);
909 
910 	inuse = min(active, inuse);
911 
912 	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
913 		struct ioc_gq *parent = iocg->ancestors[lvl];
914 		struct ioc_gq *child = iocg->ancestors[lvl + 1];
915 		u32 parent_active = 0, parent_inuse = 0;
916 
917 		/* update the level sums */
918 		parent->child_active_sum += (s32)(active - child->active);
919 		parent->child_inuse_sum += (s32)(inuse - child->inuse);
920 		/* apply the updates */
921 		child->active = active;
922 		child->inuse = inuse;
923 
924 		/*
925 		 * The delta between the inuse and active sums indicates how
926 		 * much of the weight is being given away.  Parent's inuse
927 		 * and active should reflect the ratio.
928 		 */
929 		if (parent->child_active_sum) {
930 			parent_active = parent->weight;
931 			parent_inuse = DIV64_U64_ROUND_UP(
932 				parent_active * parent->child_inuse_sum,
933 				parent->child_active_sum);
934 		}
935 
936 		/* do we need to keep walking up? */
937 		if (parent_active == parent->active &&
938 		    parent_inuse == parent->inuse)
939 			break;
940 
941 		active = parent_active;
942 		inuse = parent_inuse;
943 	}
944 
945 	ioc->weights_updated = true;
946 }
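
/*
 * For example, take the hierarchy from the header comment: if A0 drops
 * its inuse to half of its active weight while A1 keeps all of its
 * share, A's child_inuse_sum becomes 3/4 of child_active_sum and A's
 * own inuse is scaled down to 3/4 of its weight in the same pass.
 */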
947 
948 static void commit_active_weights(struct ioc *ioc)
949 {
950 	lockdep_assert_held(&ioc->lock);
951 
952 	if (ioc->weights_updated) {
953 		/* paired with rmb in current_hweight(), see there */
954 		smp_wmb();
955 		atomic_inc(&ioc->hweight_gen);
956 		ioc->weights_updated = false;
957 	}
958 }
959 
960 static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
961 {
962 	__propagate_active_weight(iocg, active, inuse);
963 	commit_active_weights(iocg->ioc);
964 }
965 
966 static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
967 {
968 	struct ioc *ioc = iocg->ioc;
969 	int lvl;
970 	u32 hwa, hwi;
971 	int ioc_gen;
972 
973 	/* hot path - if uptodate, use cached */
974 	ioc_gen = atomic_read(&ioc->hweight_gen);
975 	if (ioc_gen == iocg->hweight_gen)
976 		goto out;
977 
978 	/*
979 	 * Paired with wmb in commit_active_weights().  If we saw the
980 	 * updated hweight_gen, all the weight updates from
981 	 * __propagate_active_weight() are visible too.
982 	 *
983 	 * We can race with weight updates during calculation and get it
984 	 * wrong.  However, hweight_gen would have changed and a future
985 	 * reader will recalculate and we're guaranteed to discard the
986 	 * wrong result soon.
987 	 */
988 	smp_rmb();
989 
990 	hwa = hwi = HWEIGHT_WHOLE;
991 	for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
992 		struct ioc_gq *parent = iocg->ancestors[lvl];
993 		struct ioc_gq *child = iocg->ancestors[lvl + 1];
994 		u32 active_sum = READ_ONCE(parent->child_active_sum);
995 		u32 inuse_sum = READ_ONCE(parent->child_inuse_sum);
996 		u32 active = READ_ONCE(child->active);
997 		u32 inuse = READ_ONCE(child->inuse);
998 
999 		/* we can race with deactivations and either may read as zero */
1000 		if (!active_sum || !inuse_sum)
1001 			continue;
1002 
1003 		active_sum = max(active, active_sum);
1004 		hwa = hwa * active / active_sum;	/* max 16bits * 10000 */
1005 
1006 		inuse_sum = max(inuse, inuse_sum);
1007 		hwi = hwi * inuse / inuse_sum;		/* max 16bits * 10000 */
1008 	}
1009 
1010 	iocg->hweight_active = max_t(u32, hwa, 1);
1011 	iocg->hweight_inuse = max_t(u32, hwi, 1);
1012 	iocg->hweight_gen = ioc_gen;
1013 out:
1014 	if (hw_activep)
1015 		*hw_activep = iocg->hweight_active;
1016 	if (hw_inusep)
1017 		*hw_inusep = iocg->hweight_inuse;
1018 }
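
/*
 * Walking the header comment's example for A0: hwa starts at
 * HWEIGHT_WHOLE, is scaled to 1/4 at the root level (A holds 100 of
 * 400) and to 1/8 at A's level (A0 holds 100 of 200), i.e. the 12.5%
 * share quoted there.
 */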
1019 
1020 static void weight_updated(struct ioc_gq *iocg)
1021 {
1022 	struct ioc *ioc = iocg->ioc;
1023 	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1024 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1025 	u32 weight;
1026 
1027 	lockdep_assert_held(&ioc->lock);
1028 
1029 	weight = iocg->cfg_weight ?: iocc->dfl_weight;
1030 	if (weight != iocg->weight && iocg->active)
1031 		propagate_active_weight(iocg, weight,
1032 			DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight));
1033 	iocg->weight = weight;
1034 }
1035 
1036 static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1037 {
1038 	struct ioc *ioc = iocg->ioc;
1039 	u64 last_period, cur_period, max_period_delta;
1040 	u64 vtime, vmargin, vmin;
1041 	int i;
1042 
1043 	/*
1044 	 * If we seem to be already active, just update the stamp to tell the
1045 	 * timer that we're still active.  We don't mind occasional races.
1046 	 */
1047 	if (!list_empty(&iocg->active_list)) {
1048 		ioc_now(ioc, now);
1049 		cur_period = atomic64_read(&ioc->cur_period);
1050 		if (atomic64_read(&iocg->active_period) != cur_period)
1051 			atomic64_set(&iocg->active_period, cur_period);
1052 		return true;
1053 	}
1054 
1055 	/* racy check on internal node IOs, treat as root level IOs */
1056 	if (iocg->child_active_sum)
1057 		return false;
1058 
1059 	spin_lock_irq(&ioc->lock);
1060 
1061 	ioc_now(ioc, now);
1062 
1063 	/* update period */
1064 	cur_period = atomic64_read(&ioc->cur_period);
1065 	last_period = atomic64_read(&iocg->active_period);
1066 	atomic64_set(&iocg->active_period, cur_period);
1067 
1068 	/* already activated or breaking leaf-only constraint? */
1069 	if (!list_empty(&iocg->active_list))
1070 		goto succeed_unlock;
1071 	for (i = iocg->level - 1; i > 0; i--)
1072 		if (!list_empty(&iocg->ancestors[i]->active_list))
1073 			goto fail_unlock;
1074 
1075 	if (iocg->child_active_sum)
1076 		goto fail_unlock;
1077 
1078 	/*
1079 	 * vtime may wrap when vrate is raised substantially due to
1080 	 * underestimated IO costs.  Look at the period and ignore its
1081 	 * vtime if the iocg has been idle for too long.  Also, cap the
1082 	 * budget it can start with to the margin.
1083 	 */
1084 	max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
1085 	vtime = atomic64_read(&iocg->vtime);
1086 	vmargin = ioc->margin_us * now->vrate;
1087 	vmin = now->vnow - vmargin;
1088 
1089 	if (last_period + max_period_delta < cur_period ||
1090 	    time_before64(vtime, vmin)) {
1091 		atomic64_add(vmin - vtime, &iocg->vtime);
1092 		atomic64_add(vmin - vtime, &iocg->done_vtime);
1093 		vtime = vmin;
1094 	}
1095 
1096 	/*
1097 	 * Activate, propagate weight and start period timer if not
1098 	 * running.  Reset hweight_gen to avoid accidental match from
1099 	 * wrapping.
1100 	 */
1101 	iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1102 	list_add(&iocg->active_list, &ioc->active_iocgs);
1103 	propagate_active_weight(iocg, iocg->weight,
1104 				iocg->last_inuse ?: iocg->weight);
1105 
1106 	TRACE_IOCG_PATH(iocg_activate, iocg, now,
1107 			last_period, cur_period, vtime);
1108 
1109 	iocg->last_vtime = vtime;
1110 
1111 	if (ioc->running == IOC_IDLE) {
1112 		ioc->running = IOC_RUNNING;
1113 		ioc_start_period(ioc, now);
1114 	}
1115 
1116 succeed_unlock:
1117 	spin_unlock_irq(&ioc->lock);
1118 	return true;
1119 
1120 fail_unlock:
1121 	spin_unlock_irq(&ioc->lock);
1122 	return false;
1123 }
1124 
1125 static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1126 			int flags, void *key)
1127 {
1128 	struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1129 	struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1130 	u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1131 
1132 	ctx->vbudget -= cost;
1133 
1134 	if (ctx->vbudget < 0)
1135 		return -1;
1136 
1137 	iocg_commit_bio(ctx->iocg, wait->bio, cost);
1138 
1139 	/*
1140 	 * autoremove_wake_function() removes the wait entry only when it
1141 	 * actually changed the task state.  We want the wait always
1142 	 * removed.  Remove explicitly and use default_wake_function().
1143 	 */
1144 	list_del_init(&wq_entry->entry);
1145 	wait->committed = true;
1146 
1147 	default_wake_function(wq_entry, mode, flags, key);
1148 	return 0;
1149 }
1150 
1151 static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
1152 {
1153 	struct ioc *ioc = iocg->ioc;
1154 	struct iocg_wake_ctx ctx = { .iocg = iocg };
1155 	u64 margin_ns = (u64)(ioc->period_us *
1156 			      WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
1157 	u64 vdebt, vshortage, expires, oexpires;
1158 	s64 vbudget;
1159 	u32 hw_inuse;
1160 
1161 	lockdep_assert_held(&iocg->waitq.lock);
1162 
1163 	current_hweight(iocg, NULL, &hw_inuse);
1164 	vbudget = now->vnow - atomic64_read(&iocg->vtime);
1165 
1166 	/* pay off debt */
1167 	vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1168 	if (vdebt && vbudget > 0) {
1169 		u64 delta = min_t(u64, vbudget, vdebt);
1170 		u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
1171 				    iocg->abs_vdebt);
1172 
1173 		atomic64_add(delta, &iocg->vtime);
1174 		atomic64_add(delta, &iocg->done_vtime);
1175 		iocg->abs_vdebt -= abs_delta;
1176 	}
1177 
1178 	/*
1179 	 * Wake up the ones which are due and see how much vtime we'll need
1180 	 * for the next one.
1181 	 */
1182 	ctx.hw_inuse = hw_inuse;
1183 	ctx.vbudget = vbudget - vdebt;
1184 	__wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1185 	if (!waitqueue_active(&iocg->waitq))
1186 		return;
1187 	if (WARN_ON_ONCE(ctx.vbudget >= 0))
1188 		return;
1189 
1190 	/* determine next wakeup, add a quarter margin to guarantee chunking */
1191 	vshortage = -ctx.vbudget;
1192 	expires = now->now_ns +
1193 		DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
1194 	expires += margin_ns / 4;
1195 
1196 	/* if already active and close enough, don't bother */
1197 	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1198 	if (hrtimer_is_queued(&iocg->waitq_timer) &&
1199 	    abs(oexpires - expires) <= margin_ns / 4)
1200 		return;
1201 
1202 	hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1203 			       margin_ns / 4, HRTIMER_MODE_ABS);
1204 }
1205 
1206 static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1207 {
1208 	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1209 	struct ioc_now now;
1210 	unsigned long flags;
1211 
1212 	ioc_now(iocg->ioc, &now);
1213 
1214 	spin_lock_irqsave(&iocg->waitq.lock, flags);
1215 	iocg_kick_waitq(iocg, &now);
1216 	spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1217 
1218 	return HRTIMER_NORESTART;
1219 }
1220 
1221 static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost)
1222 {
1223 	struct ioc *ioc = iocg->ioc;
1224 	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1225 	u64 vtime = atomic64_read(&iocg->vtime);
1226 	u64 vmargin = ioc->margin_us * now->vrate;
1227 	u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
1228 	u64 expires, oexpires;
1229 	u32 hw_inuse;
1230 
1231 	lockdep_assert_held(&iocg->waitq.lock);
1232 
1233 	/* debt-adjust vtime */
1234 	current_hweight(iocg, NULL, &hw_inuse);
1235 	vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1236 
1237 	/*
1238 	 * Clear or maintain depending on the overage. Non-zero vdebt is what
1239 	 * guarantees that @iocg is online and future iocg_kick_delay() will
1240 	 * clear use_delay. Don't leave it on when there's no vdebt.
1241 	 */
1242 	if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
1243 		blkcg_clear_delay(blkg);
1244 		return false;
1245 	}
1246 	if (!atomic_read(&blkg->use_delay) &&
1247 	    time_before_eq64(vtime, now->vnow + vmargin))
1248 		return false;
1249 
1250 	/* use delay */
1251 	if (cost) {
1252 		u64 cost_ns = DIV64_U64_ROUND_UP(cost * NSEC_PER_USEC,
1253 						 now->vrate);
1254 		blkcg_add_delay(blkg, now->now_ns, cost_ns);
1255 	}
1256 	blkcg_use_delay(blkg);
1257 
1258 	expires = now->now_ns + DIV64_U64_ROUND_UP(vtime - now->vnow,
1259 						   now->vrate) * NSEC_PER_USEC;
1260 
1261 	/* if already active and close enough, don't bother */
1262 	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
1263 	if (hrtimer_is_queued(&iocg->delay_timer) &&
1264 	    abs(oexpires - expires) <= margin_ns / 4)
1265 		return true;
1266 
1267 	hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
1268 			       margin_ns / 4, HRTIMER_MODE_ABS);
1269 	return true;
1270 }
1271 
1272 static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
1273 {
1274 	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
1275 	struct ioc_now now;
1276 	unsigned long flags;
1277 
1278 	spin_lock_irqsave(&iocg->waitq.lock, flags);
1279 	ioc_now(iocg->ioc, &now);
1280 	iocg_kick_delay(iocg, &now, 0);
1281 	spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1282 
1283 	return HRTIMER_NORESTART;
1284 }
1285 
1286 static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1287 {
1288 	u32 nr_met[2] = { };
1289 	u32 nr_missed[2] = { };
1290 	u64 rq_wait_ns = 0;
1291 	int cpu, rw;
1292 
1293 	for_each_online_cpu(cpu) {
1294 		struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1295 		u64 this_rq_wait_ns;
1296 
1297 		for (rw = READ; rw <= WRITE; rw++) {
1298 			u32 this_met = READ_ONCE(stat->missed[rw].nr_met);
1299 			u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed);
1300 
1301 			nr_met[rw] += this_met - stat->missed[rw].last_met;
1302 			nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1303 			stat->missed[rw].last_met = this_met;
1304 			stat->missed[rw].last_missed = this_missed;
1305 		}
1306 
1307 		this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns);
1308 		rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1309 		stat->last_rq_wait_ns = this_rq_wait_ns;
1310 	}
1311 
1312 	for (rw = READ; rw <= WRITE; rw++) {
1313 		if (nr_met[rw] + nr_missed[rw])
1314 			missed_ppm_ar[rw] =
1315 				DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1316 						   nr_met[rw] + nr_missed[rw]);
1317 		else
1318 			missed_ppm_ar[rw] = 0;
1319 	}
1320 
1321 	*rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1322 				   ioc->period_us * NSEC_PER_USEC);
1323 }
1324 
1325 /* was iocg idle this period? */
1326 static bool iocg_is_idle(struct ioc_gq *iocg)
1327 {
1328 	struct ioc *ioc = iocg->ioc;
1329 
1330 	/* did something get issued this period? */
1331 	if (atomic64_read(&iocg->active_period) ==
1332 	    atomic64_read(&ioc->cur_period))
1333 		return false;
1334 
1335 	/* is something in flight? */
1336 	if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
1337 		return false;
1338 
1339 	return true;
1340 }
1341 
1342 /* returns usage with margin added if surplus is large enough */
1343 static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
1344 {
1345 	/* add margin */
1346 	usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
1347 	usage += SURPLUS_SCALE_ABS;
1348 
1349 	/* don't bother if the surplus is too small */
1350 	if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
1351 		return 0;
1352 
1353 	return usage;
1354 }
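
/*
 * Example: a usage of 20% against an hw_inuse of 50% scales to
 * 20% * 1.25 + 2% = 27%; since 27% + 3% is still below 50%, 27% is
 * returned and the iocg is treated as having donatable surplus.
 */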
1355 
1356 static void ioc_timer_fn(struct timer_list *timer)
1357 {
1358 	struct ioc *ioc = container_of(timer, struct ioc, timer);
1359 	struct ioc_gq *iocg, *tiocg;
1360 	struct ioc_now now;
1361 	int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
1362 	u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
1363 	u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
1364 	u32 missed_ppm[2], rq_wait_pct;
1365 	u64 period_vtime;
1366 	int prev_busy_level, i;
1367 
1368 	/* how were the latencies during the period? */
1369 	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
1370 
1371 	/* take care of active iocgs */
1372 	spin_lock_irq(&ioc->lock);
1373 
1374 	ioc_now(ioc, &now);
1375 
1376 	period_vtime = now.vnow - ioc->period_at_vtime;
1377 	if (WARN_ON_ONCE(!period_vtime)) {
1378 		spin_unlock_irq(&ioc->lock);
1379 		return;
1380 	}
1381 
1382 	/*
1383 	 * Waiters determine the sleep durations based on the vrate they
1384 	 * saw at the time of sleep.  If vrate has increased, some waiters
1385 	 * could be sleeping for too long.  Wake up tardy waiters which
1386 	 * should have woken up in the last period and expire idle iocgs.
1387 	 */
1388 	list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
1389 		if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1390 		    !iocg_is_idle(iocg))
1391 			continue;
1392 
1393 		spin_lock(&iocg->waitq.lock);
1394 
1395 		if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
1396 			/* might be oversleeping vtime / hweight changes, kick */
1397 			iocg_kick_waitq(iocg, &now);
1398 			iocg_kick_delay(iocg, &now, 0);
1399 		} else if (iocg_is_idle(iocg)) {
1400 			/* no waiter and idle, deactivate */
1401 			iocg->last_inuse = iocg->inuse;
1402 			__propagate_active_weight(iocg, 0, 0);
1403 			list_del_init(&iocg->active_list);
1404 		}
1405 
1406 		spin_unlock(&iocg->waitq.lock);
1407 	}
1408 	commit_active_weights(ioc);
1409 
1410 	/* calc usages and see whether some weights need to be moved around */
1411 	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1412 		u64 vdone, vtime, vusage, vmargin, vmin;
1413 		u32 hw_active, hw_inuse, usage;
1414 
1415 		/*
1416 		 * Collect unused and wind vtime closer to vnow to prevent
1417 		 * iocgs from accumulating a large amount of budget.
1418 		 */
1419 		vdone = atomic64_read(&iocg->done_vtime);
1420 		vtime = atomic64_read(&iocg->vtime);
1421 		current_hweight(iocg, &hw_active, &hw_inuse);
1422 
1423 		/*
1424 		 * Latency QoS detection doesn't account for IOs which are
1425 		 * in-flight for longer than a period.  Detect them by
1426 		 * comparing vdone against period start.  If lagging behind
1427 		 * IOs from past periods, don't increase vrate.
1428 		 */
1429 		if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
1430 		    !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
1431 		    time_after64(vtime, vdone) &&
1432 		    time_after64(vtime, now.vnow -
1433 				 MAX_LAGGING_PERIODS * period_vtime) &&
1434 		    time_before64(vdone, now.vnow - period_vtime))
1435 			nr_lagging++;
1436 
1437 		if (waitqueue_active(&iocg->waitq))
1438 			vusage = now.vnow - iocg->last_vtime;
1439 		else if (time_before64(iocg->last_vtime, vtime))
1440 			vusage = vtime - iocg->last_vtime;
1441 		else
1442 			vusage = 0;
1443 
1444 		iocg->last_vtime += vusage;
1445 		/*
1446 		 * Factor in in-flight vtime into vusage to avoid
1447 		 * high-latency completions appearing as idle.  This should
1448 		 * be done after the above ->last_vtime adjustment.
1449 		 */
1450 		vusage = max(vusage, vtime - vdone);
1451 
1452 		/* calculate hweight based usage ratio and record */
1453 		if (vusage) {
1454 			usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
1455 						   period_vtime);
1456 			iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
1457 			iocg->usages[iocg->usage_idx] = usage;
1458 		} else {
1459 			usage = 0;
1460 		}
1461 
1462 		/* see whether there's surplus vtime */
1463 		vmargin = ioc->margin_us * now.vrate;
1464 		vmin = now.vnow - vmargin;
1465 
1466 		iocg->has_surplus = false;
1467 
1468 		if (!waitqueue_active(&iocg->waitq) &&
1469 		    time_before64(vtime, vmin)) {
1470 			u64 delta = vmin - vtime;
1471 
1472 			/* throw away surplus vtime */
1473 			atomic64_add(delta, &iocg->vtime);
1474 			atomic64_add(delta, &iocg->done_vtime);
1475 			iocg->last_vtime += delta;
1476 			/* if usage is sufficiently low, maybe it can donate */
1477 			if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
1478 				iocg->has_surplus = true;
1479 				nr_surpluses++;
1480 			}
1481 		} else if (hw_inuse < hw_active) {
1482 			u32 new_hwi, new_inuse;
1483 
1484 			/* was donating but might need to take back some */
1485 			if (waitqueue_active(&iocg->waitq)) {
1486 				new_hwi = hw_active;
1487 			} else {
1488 				new_hwi = max(hw_inuse,
1489 					      usage * SURPLUS_SCALE_PCT / 100 +
1490 					      SURPLUS_SCALE_ABS);
1491 			}
1492 
1493 			new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
1494 					      hw_inuse);
1495 			new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
1496 
1497 			if (new_inuse > iocg->inuse) {
1498 				TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
1499 						iocg->inuse, new_inuse,
1500 						hw_inuse, new_hwi);
1501 				__propagate_active_weight(iocg, iocg->weight,
1502 							  new_inuse);
1503 			}
1504 		} else {
1505 			/* genuinely out of vtime */
1506 			nr_shortages++;
1507 		}
1508 	}
1509 
1510 	if (!nr_shortages || !nr_surpluses)
1511 		goto skip_surplus_transfers;
1512 
1513 	/* there are both shortages and surpluses, transfer surpluses */
1514 	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1515 		u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
1516 		int nr_valid = 0;
1517 
1518 		if (!iocg->has_surplus)
1519 			continue;
1520 
1521 		/* base the decision on max historical usage */
1522 		for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
1523 			if (iocg->usages[i]) {
1524 				usage = max(usage, iocg->usages[i]);
1525 				nr_valid++;
1526 			}
1527 		}
1528 		if (nr_valid < MIN_VALID_USAGES)
1529 			continue;
1530 
1531 		current_hweight(iocg, &hw_active, &hw_inuse);
1532 		new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
1533 		if (!new_hwi)
1534 			continue;
1535 
1536 		new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
1537 					       hw_inuse);
1538 		if (new_inuse < iocg->inuse) {
1539 			TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
1540 					iocg->inuse, new_inuse,
1541 					hw_inuse, new_hwi);
1542 			__propagate_active_weight(iocg, iocg->weight, new_inuse);
1543 		}
1544 	}
1545 skip_surplus_transfers:
1546 	commit_active_weights(ioc);
1547 
1548 	/*
1549 	 * If q is getting clogged or we're missing too much, we're issuing
1550 	 * too much IO and should lower vtime rate.  If we're not missing
1551 	 * and experiencing shortages but not surpluses, we're too stingy
1552 	 * and should increase vtime rate.
1553 	 */
1554 	prev_busy_level = ioc->busy_level;
1555 	if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
1556 	    missed_ppm[READ] > ppm_rthr ||
1557 	    missed_ppm[WRITE] > ppm_wthr) {
1558 		/* clearly missing QoS targets, slow down vrate */
1559 		ioc->busy_level = max(ioc->busy_level, 0);
1560 		ioc->busy_level++;
1561 	} else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
1562 		   missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
1563 		   missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
1564 		/* QoS targets are being met with >25% margin */
1565 		if (nr_shortages) {
1566 			/*
1567 			 * We're throttling while the device has spare
1568 			 * capacity.  If vrate was being slowed down, stop.
1569 			 */
1570 			ioc->busy_level = min(ioc->busy_level, 0);
1571 
1572 			/*
1573 			 * If there are IOs spanning multiple periods, wait
1574 			 * them out before pushing the device harder.  If
1575 			 * there are surpluses, let redistribution work it
1576 			 * out first.
1577 			 */
1578 			if (!nr_lagging && !nr_surpluses)
1579 				ioc->busy_level--;
1580 		} else {
1581 			/*
1582 			 * Nobody is being throttled and the users aren't
1583 			 * issuing enough IOs to saturate the device.  We
1584 			 * simply don't know how close the device is to
1585 			 * saturation.  Coast.
1586 			 */
1587 			ioc->busy_level = 0;
1588 		}
1589 	} else {
1590 		/* inside the hysteresis margin, we're good */
1591 		ioc->busy_level = 0;
1592 	}
1593 
1594 	ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
1595 
1596 	if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
1597 		u64 vrate = atomic64_read(&ioc->vtime_rate);
1598 		u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
1599 
1600 		/* rq_wait signal is always reliable, ignore user vrate_min */
1601 		if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
1602 			vrate_min = VRATE_MIN;
1603 
1604 		/*
1605 		 * If vrate is out of bounds, apply clamp gradually as the
1606 		 * bounds can change abruptly.  Otherwise, apply busy_level
1607 		 * based adjustment.
1608 		 */
1609 		if (vrate < vrate_min) {
1610 			vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
1611 					  100);
1612 			vrate = min(vrate, vrate_min);
1613 		} else if (vrate > vrate_max) {
1614 			vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
1615 					  100);
1616 			vrate = max(vrate, vrate_max);
1617 		} else {
1618 			int idx = min_t(int, abs(ioc->busy_level),
1619 					ARRAY_SIZE(vrate_adj_pct) - 1);
1620 			u32 adj_pct = vrate_adj_pct[idx];
1621 
1622 			if (ioc->busy_level > 0)
1623 				adj_pct = 100 - adj_pct;
1624 			else
1625 				adj_pct = 100 + adj_pct;
1626 
1627 			vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1628 				      vrate_min, vrate_max);
1629 		}
1630 
1631 		trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
1632 					   nr_lagging, nr_shortages,
1633 					   nr_surpluses);
1634 
1635 		atomic64_set(&ioc->vtime_rate, vrate);
1636 		ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
1637 			ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
1638 	} else if (ioc->busy_level != prev_busy_level || nr_lagging) {
1639 		trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
1640 					   missed_ppm, rq_wait_pct, nr_lagging,
1641 					   nr_shortages, nr_surpluses);
1642 	}
1643 
1644 	ioc_refresh_params(ioc, false);
1645 
1646 	/*
1647 	 * This period is done.  Move onto the next one.  If nothing's
1648 	 * going on with the device, stop the timer.
1649 	 */
1650 	atomic64_inc(&ioc->cur_period);
1651 
1652 	if (ioc->running != IOC_STOP) {
1653 		if (!list_empty(&ioc->active_iocgs)) {
1654 			ioc_start_period(ioc, &now);
1655 		} else {
1656 			ioc->busy_level = 0;
1657 			ioc->running = IOC_IDLE;
1658 		}
1659 	}
1660 
1661 	spin_unlock_irq(&ioc->lock);
1662 }
1663 
1664 static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
1665 				    bool is_merge, u64 *costp)
1666 {
1667 	struct ioc *ioc = iocg->ioc;
1668 	u64 coef_seqio, coef_randio, coef_page;
1669 	u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
1670 	u64 seek_pages = 0;
1671 	u64 cost = 0;
1672 
1673 	switch (bio_op(bio)) {
1674 	case REQ_OP_READ:
1675 		coef_seqio	= ioc->params.lcoefs[LCOEF_RSEQIO];
1676 		coef_randio	= ioc->params.lcoefs[LCOEF_RRANDIO];
1677 		coef_page	= ioc->params.lcoefs[LCOEF_RPAGE];
1678 		break;
1679 	case REQ_OP_WRITE:
1680 		coef_seqio	= ioc->params.lcoefs[LCOEF_WSEQIO];
1681 		coef_randio	= ioc->params.lcoefs[LCOEF_WRANDIO];
1682 		coef_page	= ioc->params.lcoefs[LCOEF_WPAGE];
1683 		break;
1684 	default:
1685 		goto out;
1686 	}
1687 
1688 	if (iocg->cursor) {
1689 		seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
1690 		seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
1691 	}
1692 
1693 	if (!is_merge) {
1694 		if (seek_pages > LCOEF_RANDIO_PAGES) {
1695 			cost += coef_randio;
1696 		} else {
1697 			cost += coef_seqio;
1698 		}
1699 	}
1700 	cost += pages * coef_page;
1701 out:
1702 	*costp = cost;
1703 }
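/*
 * Worked example with made-up coefficients: a 64-page read landing within
 * LCOEF_RANDIO_PAGES of the previous IO's end sector costs
 * coef_seqio + 64 * coef_page, while the same read issued far from the
 * cursor costs coef_randio + 64 * coef_page.  Merged bios skip the per-IO
 * base cost and only pay the per-page component.
 */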
1704 
1705 static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
1706 {
1707 	u64 cost;
1708 
1709 	calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
1710 	return cost;
1711 }
1712 
1713 static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
1714 {
1715 	struct blkcg_gq *blkg = bio->bi_blkg;
1716 	struct ioc *ioc = rqos_to_ioc(rqos);
1717 	struct ioc_gq *iocg = blkg_to_iocg(blkg);
1718 	struct ioc_now now;
1719 	struct iocg_wait wait;
1720 	u32 hw_active, hw_inuse;
1721 	u64 abs_cost, cost, vtime;
1722 
1723 	/* bypass IOs if disabled or for root cgroup */
1724 	if (!ioc->enabled || !iocg->level)
1725 		return;
1726 
1727 	/* always activate so that even 0 cost IOs get protected to some level */
1728 	if (!iocg_activate(iocg, &now))
1729 		return;
1730 
1731 	/* calculate the absolute vtime cost */
1732 	abs_cost = calc_vtime_cost(bio, iocg, false);
1733 	if (!abs_cost)
1734 		return;
1735 
1736 	iocg->cursor = bio_end_sector(bio);
1737 
1738 	vtime = atomic64_read(&iocg->vtime);
1739 	current_hweight(iocg, &hw_active, &hw_inuse);
1740 
1741 	if (hw_inuse < hw_active &&
1742 	    time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
1743 		TRACE_IOCG_PATH(inuse_reset, iocg, &now,
1744 				iocg->inuse, iocg->weight, hw_inuse, hw_active);
1745 		spin_lock_irq(&ioc->lock);
1746 		propagate_active_weight(iocg, iocg->weight, iocg->weight);
1747 		spin_unlock_irq(&ioc->lock);
1748 		current_hweight(iocg, &hw_active, &hw_inuse);
1749 	}
1750 
1751 	cost = abs_cost_to_cost(abs_cost, hw_inuse);
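	/*
	 * abs_cost is in device time while cost is in this cgroup's vtime;
	 * the conversion scales inversely with hw_inuse.  As an illustrative
	 * example, at an hw_inuse of 50% an abs_cost worth 1ms is charged
	 * as 2ms against the cgroup's vtime budget.
	 */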
1752 
1753 	/*
1754 	 * If no one's waiting and within budget, issue right away.  The
1755 	 * tests are racy but the races aren't systemic - we only miss once
1756 	 * in a while which is fine.
1757 	 */
1758 	if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1759 	    time_before_eq64(vtime + cost, now.vnow)) {
1760 		iocg_commit_bio(iocg, bio, cost);
1761 		return;
1762 	}
1763 
1764 	/*
1765 	 * We activated above but w/o any synchronization. Deactivation is
1766 	 * synchronized with waitq.lock and we won't get deactivated as long
1767 	 * as we're waiting or have debt, so we're good if we're activated
1768 	 * here. In the unlikely case that we aren't, just issue the IO.
1769 	 */
1770 	spin_lock_irq(&iocg->waitq.lock);
1771 
1772 	if (unlikely(list_empty(&iocg->active_list))) {
1773 		spin_unlock_irq(&iocg->waitq.lock);
1774 		iocg_commit_bio(iocg, bio, cost);
1775 		return;
1776 	}
1777 
1778 	/*
1779 	 * We're over budget. If @bio has to be issued regardless, remember
1780 	 * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
1781 	 * off the debt before waking more IOs.
1782 	 *
1783 	 * This way, the debt is continuously paid off each period with the
1784 	 * actual budget available to the cgroup. If we just wound vtime, we
1785 	 * would incorrectly use the current hw_inuse for the entire amount
1786 	 * which, for example, can lead to the cgroup staying blocked for a
1787 	 * long time even with substantially raised hw_inuse.
1788 	 *
1789 	 * An iocg with vdebt should stay online so that the timer can keep
1790 	 * deducting its vdebt and [de]activating the use_delay mechanism
1791 	 * accordingly. We don't want to race against the timer trying to
1792 	 * clear them and leave @iocg inactive w/ dangling use_delay heavily
1793 	 * penalizing the cgroup and its descendants.
1794 	 */
1795 	if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
1796 		iocg->abs_vdebt += abs_cost;
1797 		if (iocg_kick_delay(iocg, &now, cost))
1798 			blkcg_schedule_throttle(rqos->q,
1799 					(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
1800 		spin_unlock_irq(&iocg->waitq.lock);
1801 		return;
1802 	}
1803 
1804 	/*
1805 	 * Append self to the waitq and schedule the wakeup timer if we're
1806 	 * the first waiter.  The timer duration is calculated based on the
1807 	 * current vrate.  vtime and hweight changes can make it too short
1808 	 * or too long.  Each wait entry records the absolute cost it's
1809 	 * waiting for to allow re-evaluation using a custom wait entry.
1810 	 *
1811 	 * If too short, the timer simply reschedules itself.  If too long,
1812 	 * the period timer will notice and trigger wakeups.
1813 	 *
1814 	 * All waiters are on iocg->waitq and the wait states are
1815 	 * synchronized using waitq.lock.
1816 	 */
1817 	init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
1818 	wait.wait.private = current;
1819 	wait.bio = bio;
1820 	wait.abs_cost = abs_cost;
1821 	wait.committed = false;	/* will be set true by waker */
1822 
1823 	__add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
1824 	iocg_kick_waitq(iocg, &now);
1825 
1826 	spin_unlock_irq(&iocg->waitq.lock);
1827 
1828 	while (true) {
1829 		set_current_state(TASK_UNINTERRUPTIBLE);
1830 		if (wait.committed)
1831 			break;
1832 		io_schedule();
1833 	}
1834 
1835 	/* waker already committed us, proceed */
1836 	finish_wait(&iocg->waitq, &wait.wait);
1837 }
1838 
1839 static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
1840 			   struct bio *bio)
1841 {
1842 	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1843 	struct ioc *ioc = iocg->ioc;
1844 	sector_t bio_end = bio_end_sector(bio);
1845 	struct ioc_now now;
1846 	u32 hw_inuse;
1847 	u64 abs_cost, cost;
1848 	unsigned long flags;
1849 
1850 	/* bypass if disabled or for root cgroup */
1851 	if (!ioc->enabled || !iocg->level)
1852 		return;
1853 
1854 	abs_cost = calc_vtime_cost(bio, iocg, true);
1855 	if (!abs_cost)
1856 		return;
1857 
1858 	ioc_now(ioc, &now);
1859 	current_hweight(iocg, NULL, &hw_inuse);
1860 	cost = abs_cost_to_cost(abs_cost, hw_inuse);
1861 
1862 	/* update cursor if backmerging into the request at the cursor */
1863 	if (blk_rq_pos(rq) < bio_end &&
1864 	    blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
1865 		iocg->cursor = bio_end;
1866 
1867 	/*
1868 	 * Charge if there's enough vtime budget and the existing request has
1869 	 * cost assigned.
1870 	 */
1871 	if (rq->bio && rq->bio->bi_iocost_cost &&
1872 	    time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
1873 		iocg_commit_bio(iocg, bio, cost);
1874 		return;
1875 	}
1876 
1877 	/*
1878 	 * Otherwise, account it as debt if @iocg is online, which it should
1879 	 * be for the vast majority of cases. See debt handling in
1880 	 * ioc_rqos_throttle() for details.
1881 	 */
1882 	spin_lock_irqsave(&iocg->waitq.lock, flags);
1883 	if (likely(!list_empty(&iocg->active_list))) {
1884 		iocg->abs_vdebt += abs_cost;
1885 		iocg_kick_delay(iocg, &now, cost);
1886 	} else {
1887 		iocg_commit_bio(iocg, bio, cost);
1888 	}
1889 	spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1890 }
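/*
 * Note that a back-merge at the cursor keeps iocg->cursor pointing at the
 * combined end sector, so a follow-up IO starting there is still billed
 * at the cheaper sequential rate by the cost model above.
 */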
1891 
1892 static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
1893 {
1894 	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1895 
1896 	if (iocg && bio->bi_iocost_cost)
1897 		atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
1898 }
1899 
1900 static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
1901 {
1902 	struct ioc *ioc = rqos_to_ioc(rqos);
1903 	u64 on_q_ns, rq_wait_ns;
1904 	int pidx, rw;
1905 
1906 	if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
1907 		return;
1908 
1909 	switch (req_op(rq) & REQ_OP_MASK) {
1910 	case REQ_OP_READ:
1911 		pidx = QOS_RLAT;
1912 		rw = READ;
1913 		break;
1914 	case REQ_OP_WRITE:
1915 		pidx = QOS_WLAT;
1916 		rw = WRITE;
1917 		break;
1918 	default:
1919 		return;
1920 	}
1921 
1922 	on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
1923 	rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
1924 
1925 	if (on_q_ns <= ioc->params.qos[pidx] * NSEC_PER_USEC)
1926 		this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
1927 	else
1928 		this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
1929 
1930 	this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns);
1931 }
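/*
 * Example (illustrative QoS latency target): with rlat=5000, a read
 * completing 3ms after request allocation bumps nr_met while one taking
 * 8ms bumps nr_missed; the period timer folds these per-cpu counters into
 * the missed_ppm[] figures consumed above.
 */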
1932 
1933 static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
1934 {
1935 	struct ioc *ioc = rqos_to_ioc(rqos);
1936 
1937 	spin_lock_irq(&ioc->lock);
1938 	ioc_refresh_params(ioc, false);
1939 	spin_unlock_irq(&ioc->lock);
1940 }
1941 
1942 static void ioc_rqos_exit(struct rq_qos *rqos)
1943 {
1944 	struct ioc *ioc = rqos_to_ioc(rqos);
1945 
1946 	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
1947 
1948 	spin_lock_irq(&ioc->lock);
1949 	ioc->running = IOC_STOP;
1950 	spin_unlock_irq(&ioc->lock);
1951 
1952 	del_timer_sync(&ioc->timer);
1953 	free_percpu(ioc->pcpu_stat);
1954 	kfree(ioc);
1955 }
1956 
1957 static struct rq_qos_ops ioc_rqos_ops = {
1958 	.throttle = ioc_rqos_throttle,
1959 	.merge = ioc_rqos_merge,
1960 	.done_bio = ioc_rqos_done_bio,
1961 	.done = ioc_rqos_done,
1962 	.queue_depth_changed = ioc_rqos_queue_depth_changed,
1963 	.exit = ioc_rqos_exit,
1964 };
1965 
1966 static int blk_iocost_init(struct request_queue *q)
1967 {
1968 	struct ioc *ioc;
1969 	struct rq_qos *rqos;
1970 	int ret;
1971 
1972 	ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
1973 	if (!ioc)
1974 		return -ENOMEM;
1975 
1976 	ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
1977 	if (!ioc->pcpu_stat) {
1978 		kfree(ioc);
1979 		return -ENOMEM;
1980 	}
1981 
1982 	rqos = &ioc->rqos;
1983 	rqos->id = RQ_QOS_COST;
1984 	rqos->ops = &ioc_rqos_ops;
1985 	rqos->q = q;
1986 
1987 	spin_lock_init(&ioc->lock);
1988 	timer_setup(&ioc->timer, ioc_timer_fn, 0);
1989 	INIT_LIST_HEAD(&ioc->active_iocgs);
1990 
1991 	ioc->running = IOC_IDLE;
1992 	atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
1993 	seqcount_init(&ioc->period_seqcount);
1994 	ioc->period_at = ktime_to_us(ktime_get());
1995 	atomic64_set(&ioc->cur_period, 0);
1996 	atomic_set(&ioc->hweight_gen, 0);
1997 
1998 	spin_lock_irq(&ioc->lock);
1999 	ioc->autop_idx = AUTOP_INVALID;
2000 	ioc_refresh_params(ioc, true);
2001 	spin_unlock_irq(&ioc->lock);
2002 
2003 	rq_qos_add(q, rqos);
2004 	ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
2005 	if (ret) {
2006 		rq_qos_del(q, rqos);
2007 		free_percpu(ioc->pcpu_stat);
2008 		kfree(ioc);
2009 		return ret;
2010 	}
2011 	return 0;
2012 }
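/*
 * Note: blk_iocost_init() is invoked lazily from the io.cost.qos and
 * io.cost.model write handlers below when the queue does not have an ioc
 * yet; ioc->enabled starts out false, so nothing is throttled until
 * "enable=1" is written to io.cost.qos.
 */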
2013 
2014 static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2015 {
2016 	struct ioc_cgrp *iocc;
2017 
2018 	iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
2019 	if (!iocc)
2020 		return NULL;
2021 
2022 	iocc->dfl_weight = CGROUP_WEIGHT_DFL;
2023 	return &iocc->cpd;
2024 }
2025 
2026 static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2027 {
2028 	kfree(container_of(cpd, struct ioc_cgrp, cpd));
2029 }
2030 
2031 static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2032 					     struct blkcg *blkcg)
2033 {
2034 	int levels = blkcg->css.cgroup->level + 1;
2035 	struct ioc_gq *iocg;
2036 
2037 	iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]),
2038 			    gfp, q->node);
2039 	if (!iocg)
2040 		return NULL;
2041 
2042 	return &iocg->pd;
2043 }
2044 
2045 static void ioc_pd_init(struct blkg_policy_data *pd)
2046 {
2047 	struct ioc_gq *iocg = pd_to_iocg(pd);
2048 	struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2049 	struct ioc *ioc = q_to_ioc(blkg->q);
2050 	struct ioc_now now;
2051 	struct blkcg_gq *tblkg;
2052 	unsigned long flags;
2053 
2054 	ioc_now(ioc, &now);
2055 
2056 	iocg->ioc = ioc;
2057 	atomic64_set(&iocg->vtime, now.vnow);
2058 	atomic64_set(&iocg->done_vtime, now.vnow);
2059 	atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2060 	INIT_LIST_HEAD(&iocg->active_list);
2061 	iocg->hweight_active = HWEIGHT_WHOLE;
2062 	iocg->hweight_inuse = HWEIGHT_WHOLE;
2063 
2064 	init_waitqueue_head(&iocg->waitq);
2065 	hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2066 	iocg->waitq_timer.function = iocg_waitq_timer_fn;
2067 	hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2068 	iocg->delay_timer.function = iocg_delay_timer_fn;
2069 
2070 	iocg->level = blkg->blkcg->css.cgroup->level;
2071 
2072 	for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2073 		struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2074 		iocg->ancestors[tiocg->level] = tiocg;
2075 	}
2076 
2077 	spin_lock_irqsave(&ioc->lock, flags);
2078 	weight_updated(iocg);
2079 	spin_unlock_irqrestore(&ioc->lock, flags);
2080 }
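/*
 * Illustrative hierarchy: for a cgroup at /A/A0, level is 2 and
 * ancestors[] ends up as { root's iocg, A's iocg, this iocg }, the chain
 * walked when computing the flattened hierarchical weights.
 */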
2081 
2082 static void ioc_pd_free(struct blkg_policy_data *pd)
2083 {
2084 	struct ioc_gq *iocg = pd_to_iocg(pd);
2085 	struct ioc *ioc = iocg->ioc;
2086 	unsigned long flags;
2087 
2088 	if (ioc) {
2089 		spin_lock_irqsave(&ioc->lock, flags);
2090 		if (!list_empty(&iocg->active_list)) {
2091 			propagate_active_weight(iocg, 0, 0);
2092 			list_del_init(&iocg->active_list);
2093 		}
2094 		spin_unlock_irqrestore(&ioc->lock, flags);
2095 
2096 		hrtimer_cancel(&iocg->waitq_timer);
2097 		hrtimer_cancel(&iocg->delay_timer);
2098 	}
2099 	kfree(iocg);
2100 }
2101 
2102 static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2103 			     int off)
2104 {
2105 	const char *dname = blkg_dev_name(pd->blkg);
2106 	struct ioc_gq *iocg = pd_to_iocg(pd);
2107 
2108 	if (dname && iocg->cfg_weight)
2109 		seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight);
2110 	return 0;
2111 }
2112 
2113 
2114 static int ioc_weight_show(struct seq_file *sf, void *v)
2115 {
2116 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2117 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2118 
2119 	seq_printf(sf, "default %u\n", iocc->dfl_weight);
2120 	blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
2121 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2122 	return 0;
2123 }
2124 
2125 static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
2126 				size_t nbytes, loff_t off)
2127 {
2128 	struct blkcg *blkcg = css_to_blkcg(of_css(of));
2129 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2130 	struct blkg_conf_ctx ctx;
2131 	struct ioc_gq *iocg;
2132 	u32 v;
2133 	int ret;
2134 
2135 	if (!strchr(buf, ':')) {
2136 		struct blkcg_gq *blkg;
2137 
2138 		if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
2139 			return -EINVAL;
2140 
2141 		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2142 			return -EINVAL;
2143 
2144 		spin_lock(&blkcg->lock);
2145 		iocc->dfl_weight = v;
2146 		hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
2147 			struct ioc_gq *iocg = blkg_to_iocg(blkg);
2148 
2149 			if (iocg) {
2150 				spin_lock_irq(&iocg->ioc->lock);
2151 				weight_updated(iocg);
2152 				spin_unlock_irq(&iocg->ioc->lock);
2153 			}
2154 		}
2155 		spin_unlock(&blkcg->lock);
2156 
2157 		return nbytes;
2158 	}
2159 
2160 	ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
2161 	if (ret)
2162 		return ret;
2163 
2164 	iocg = blkg_to_iocg(ctx.blkg);
2165 
2166 	if (!strncmp(ctx.body, "default", 7)) {
2167 		v = 0;
2168 	} else {
2169 		if (!sscanf(ctx.body, "%u", &v))
2170 			goto einval;
2171 		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2172 			goto einval;
2173 	}
2174 
2175 	spin_lock(&iocg->ioc->lock);
2176 	iocg->cfg_weight = v;
2177 	weight_updated(iocg);
2178 	spin_unlock(&iocg->ioc->lock);
2179 
2180 	blkg_conf_finish(&ctx);
2181 	return nbytes;
2182 
2183 einval:
2184 	blkg_conf_finish(&ctx);
2185 	return -EINVAL;
2186 }
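/*
 * Example writes to a cgroup's io.weight interface (8:16 is a placeholder
 * device number):
 *
 *   # echo "default 200" > io.weight	(set this cgroup's default weight)
 *   # echo "8:16 50" > io.weight	(override the weight for one device)
 *   # echo "8:16 default" > io.weight	(drop the per-device override)
 */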
2187 
2188 static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2189 			  int off)
2190 {
2191 	const char *dname = blkg_dev_name(pd->blkg);
2192 	struct ioc *ioc = pd_to_iocg(pd)->ioc;
2193 
2194 	if (!dname)
2195 		return 0;
2196 
2197 	seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
2198 		   dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
2199 		   ioc->params.qos[QOS_RPPM] / 10000,
2200 		   ioc->params.qos[QOS_RPPM] % 10000 / 100,
2201 		   ioc->params.qos[QOS_RLAT],
2202 		   ioc->params.qos[QOS_WPPM] / 10000,
2203 		   ioc->params.qos[QOS_WPPM] % 10000 / 100,
2204 		   ioc->params.qos[QOS_WLAT],
2205 		   ioc->params.qos[QOS_MIN] / 10000,
2206 		   ioc->params.qos[QOS_MIN] % 10000 / 100,
2207 		   ioc->params.qos[QOS_MAX] / 10000,
2208 		   ioc->params.qos[QOS_MAX] % 10000 / 100);
2209 	return 0;
2210 }
2211 
2212 static int ioc_qos_show(struct seq_file *sf, void *v)
2213 {
2214 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2215 
2216 	blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
2217 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2218 	return 0;
2219 }
2220 
2221 static const match_table_t qos_ctrl_tokens = {
2222 	{ QOS_ENABLE,		"enable=%u"	},
2223 	{ QOS_CTRL,		"ctrl=%s"	},
2224 	{ NR_QOS_CTRL_PARAMS,	NULL		},
2225 };
2226 
2227 static const match_table_t qos_tokens = {
2228 	{ QOS_RPPM,		"rpct=%s"	},
2229 	{ QOS_RLAT,		"rlat=%u"	},
2230 	{ QOS_WPPM,		"wpct=%s"	},
2231 	{ QOS_WLAT,		"wlat=%u"	},
2232 	{ QOS_MIN,		"min=%s"	},
2233 	{ QOS_MAX,		"max=%s"	},
2234 	{ NR_QOS_PARAMS,	NULL		},
2235 };
2236 
2237 static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
2238 			     size_t nbytes, loff_t off)
2239 {
2240 	struct gendisk *disk;
2241 	struct ioc *ioc;
2242 	u32 qos[NR_QOS_PARAMS];
2243 	bool enable, user;
2244 	char *p;
2245 	int ret;
2246 
2247 	disk = blkcg_conf_get_disk(&input);
2248 	if (IS_ERR(disk))
2249 		return PTR_ERR(disk);
2250 
2251 	ioc = q_to_ioc(disk->queue);
2252 	if (!ioc) {
2253 		ret = blk_iocost_init(disk->queue);
2254 		if (ret)
2255 			goto err;
2256 		ioc = q_to_ioc(disk->queue);
2257 	}
2258 
2259 	spin_lock_irq(&ioc->lock);
2260 	memcpy(qos, ioc->params.qos, sizeof(qos));
2261 	enable = ioc->enabled;
2262 	user = ioc->user_qos_params;
2263 	spin_unlock_irq(&ioc->lock);
2264 
2265 	while ((p = strsep(&input, " \t\n"))) {
2266 		substring_t args[MAX_OPT_ARGS];
2267 		char buf[32];
2268 		int tok;
2269 		s64 v;
2270 
2271 		if (!*p)
2272 			continue;
2273 
2274 		switch (match_token(p, qos_ctrl_tokens, args)) {
2275 		case QOS_ENABLE:
2276 			match_u64(&args[0], &v);
2277 			enable = v;
2278 			continue;
2279 		case QOS_CTRL:
2280 			match_strlcpy(buf, &args[0], sizeof(buf));
2281 			if (!strcmp(buf, "auto"))
2282 				user = false;
2283 			else if (!strcmp(buf, "user"))
2284 				user = true;
2285 			else
2286 				goto einval;
2287 			continue;
2288 		}
2289 
2290 		tok = match_token(p, qos_tokens, args);
2291 		switch (tok) {
2292 		case QOS_RPPM:
2293 		case QOS_WPPM:
2294 			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2295 			    sizeof(buf))
2296 				goto einval;
2297 			if (cgroup_parse_float(buf, 2, &v))
2298 				goto einval;
2299 			if (v < 0 || v > 10000)
2300 				goto einval;
2301 			qos[tok] = v * 100;
2302 			break;
2303 		case QOS_RLAT:
2304 		case QOS_WLAT:
2305 			if (match_u64(&args[0], &v))
2306 				goto einval;
2307 			qos[tok] = v;
2308 			break;
2309 		case QOS_MIN:
2310 		case QOS_MAX:
2311 			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2312 			    sizeof(buf))
2313 				goto einval;
2314 			if (cgroup_parse_float(buf, 2, &v))
2315 				goto einval;
2316 			if (v < 0)
2317 				goto einval;
2318 			qos[tok] = clamp_t(s64, v * 100,
2319 					   VRATE_MIN_PPM, VRATE_MAX_PPM);
2320 			break;
2321 		default:
2322 			goto einval;
2323 		}
2324 		user = true;
2325 	}
2326 
2327 	if (qos[QOS_MIN] > qos[QOS_MAX])
2328 		goto einval;
2329 
2330 	spin_lock_irq(&ioc->lock);
2331 
2332 	if (enable) {
2333 		blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2334 		ioc->enabled = true;
2335 	} else {
2336 		blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2337 		ioc->enabled = false;
2338 	}
2339 
2340 	if (user) {
2341 		memcpy(ioc->params.qos, qos, sizeof(qos));
2342 		ioc->user_qos_params = true;
2343 	} else {
2344 		ioc->user_qos_params = false;
2345 	}
2346 
2347 	ioc_refresh_params(ioc, true);
2348 	spin_unlock_irq(&ioc->lock);
2349 
2350 	put_disk_and_module(disk);
2351 	return nbytes;
2352 einval:
2353 	ret = -EINVAL;
2354 err:
2355 	put_disk_and_module(disk);
2356 	return ret;
2357 }
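/*
 * Example write to the root cgroup's io.cost.qos interface (device number
 * and values are placeholders):
 *
 *   # echo "8:16 enable=1 ctrl=user rpct=95.00 rlat=10000 wpct=95.00 \
 *       wlat=20000 min=50.00 max=150.00" > io.cost.qos
 *
 * i.e. treat the device as missing its target when more than 5% of reads
 * take longer than 10ms or more than 5% of writes take longer than 20ms,
 * and keep vrate within 50-150% of the cost model's baseline.
 */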
2358 
2359 static u64 ioc_cost_model_prfill(struct seq_file *sf,
2360 				 struct blkg_policy_data *pd, int off)
2361 {
2362 	const char *dname = blkg_dev_name(pd->blkg);
2363 	struct ioc *ioc = pd_to_iocg(pd)->ioc;
2364 	u64 *u = ioc->params.i_lcoefs;
2365 
2366 	if (!dname)
2367 		return 0;
2368 
2369 	seq_printf(sf, "%s ctrl=%s model=linear "
2370 		   "rbps=%llu rseqiops=%llu rrandiops=%llu "
2371 		   "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
2372 		   dname, ioc->user_cost_model ? "user" : "auto",
2373 		   u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
2374 		   u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
2375 	return 0;
2376 }
2377 
2378 static int ioc_cost_model_show(struct seq_file *sf, void *v)
2379 {
2380 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2381 
2382 	blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
2383 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2384 	return 0;
2385 }
2386 
2387 static const match_table_t cost_ctrl_tokens = {
2388 	{ COST_CTRL,		"ctrl=%s"	},
2389 	{ COST_MODEL,		"model=%s"	},
2390 	{ NR_COST_CTRL_PARAMS,	NULL		},
2391 };
2392 
2393 static const match_table_t i_lcoef_tokens = {
2394 	{ I_LCOEF_RBPS,		"rbps=%u"	},
2395 	{ I_LCOEF_RSEQIOPS,	"rseqiops=%u"	},
2396 	{ I_LCOEF_RRANDIOPS,	"rrandiops=%u"	},
2397 	{ I_LCOEF_WBPS,		"wbps=%u"	},
2398 	{ I_LCOEF_WSEQIOPS,	"wseqiops=%u"	},
2399 	{ I_LCOEF_WRANDIOPS,	"wrandiops=%u"	},
2400 	{ NR_I_LCOEFS,		NULL		},
2401 };
2402 
2403 static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
2404 				    size_t nbytes, loff_t off)
2405 {
2406 	struct gendisk *disk;
2407 	struct ioc *ioc;
2408 	u64 u[NR_I_LCOEFS];
2409 	bool user;
2410 	char *p;
2411 	int ret;
2412 
2413 	disk = blkcg_conf_get_disk(&input);
2414 	if (IS_ERR(disk))
2415 		return PTR_ERR(disk);
2416 
2417 	ioc = q_to_ioc(disk->queue);
2418 	if (!ioc) {
2419 		ret = blk_iocost_init(disk->queue);
2420 		if (ret)
2421 			goto err;
2422 		ioc = q_to_ioc(disk->queue);
2423 	}
2424 
2425 	spin_lock_irq(&ioc->lock);
2426 	memcpy(u, ioc->params.i_lcoefs, sizeof(u));
2427 	user = ioc->user_cost_model;
2428 	spin_unlock_irq(&ioc->lock);
2429 
2430 	while ((p = strsep(&input, " \t\n"))) {
2431 		substring_t args[MAX_OPT_ARGS];
2432 		char buf[32];
2433 		int tok;
2434 		u64 v;
2435 
2436 		if (!*p)
2437 			continue;
2438 
2439 		switch (match_token(p, cost_ctrl_tokens, args)) {
2440 		case COST_CTRL:
2441 			match_strlcpy(buf, &args[0], sizeof(buf));
2442 			if (!strcmp(buf, "auto"))
2443 				user = false;
2444 			else if (!strcmp(buf, "user"))
2445 				user = true;
2446 			else
2447 				goto einval;
2448 			continue;
2449 		case COST_MODEL:
2450 			match_strlcpy(buf, &args[0], sizeof(buf));
2451 			if (strcmp(buf, "linear"))
2452 				goto einval;
2453 			continue;
2454 		}
2455 
2456 		tok = match_token(p, i_lcoef_tokens, args);
2457 		if (tok == NR_I_LCOEFS)
2458 			goto einval;
2459 		if (match_u64(&args[0], &v))
2460 			goto einval;
2461 		u[tok] = v;
2462 		user = true;
2463 	}
2464 
2465 	spin_lock_irq(&ioc->lock);
2466 	if (user) {
2467 		memcpy(ioc->params.i_lcoefs, u, sizeof(u));
2468 		ioc->user_cost_model = true;
2469 	} else {
2470 		ioc->user_cost_model = false;
2471 	}
2472 	ioc_refresh_params(ioc, true);
2473 	spin_unlock_irq(&ioc->lock);
2474 
2475 	put_disk_and_module(disk);
2476 	return nbytes;
2477 
2478 einval:
2479 	ret = -EINVAL;
2480 err:
2481 	put_disk_and_module(disk);
2482 	return ret;
2483 }
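/*
 * Example write to the root cgroup's io.cost.model interface (device
 * number and coefficients are placeholders):
 *
 *   # echo "8:16 ctrl=user model=linear rbps=174000000 rseqiops=41000 \
 *       rrandiops=370 wbps=178000000 wseqiops=42000 wrandiops=378" \
 *       > io.cost.model
 *
 * The per-IO and per-page linear coefficients used by
 * calc_vtime_cost_builtin() are derived from these device-wide numbers
 * when the parameters are refreshed.
 */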
2484 
2485 static struct cftype ioc_files[] = {
2486 	{
2487 		.name = "weight",
2488 		.flags = CFTYPE_NOT_ON_ROOT,
2489 		.seq_show = ioc_weight_show,
2490 		.write = ioc_weight_write,
2491 	},
2492 	{
2493 		.name = "cost.qos",
2494 		.flags = CFTYPE_ONLY_ON_ROOT,
2495 		.seq_show = ioc_qos_show,
2496 		.write = ioc_qos_write,
2497 	},
2498 	{
2499 		.name = "cost.model",
2500 		.flags = CFTYPE_ONLY_ON_ROOT,
2501 		.seq_show = ioc_cost_model_show,
2502 		.write = ioc_cost_model_write,
2503 	},
2504 	{}
2505 };
2506 
2507 static struct blkcg_policy blkcg_policy_iocost = {
2508 	.dfl_cftypes	= ioc_files,
2509 	.cpd_alloc_fn	= ioc_cpd_alloc,
2510 	.cpd_free_fn	= ioc_cpd_free,
2511 	.pd_alloc_fn	= ioc_pd_alloc,
2512 	.pd_init_fn	= ioc_pd_init,
2513 	.pd_free_fn	= ioc_pd_free,
2514 };
2515 
2516 static int __init ioc_init(void)
2517 {
2518 	return blkcg_policy_register(&blkcg_policy_iocost);
2519 }
2520 
2521 static void __exit ioc_exit(void)
2522 {
2523 	return blkcg_policy_unregister(&blkcg_policy_iocost);
2524 }
2525 
2526 module_init(ioc_init);
2527 module_exit(ioc_exit);
2528