1 /* SPDX-License-Identifier: GPL-2.0
2 *
3 * IO cost model based controller.
4 *
5 * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
6 * Copyright (C) 2019 Andy Newell <newella@fb.com>
7 * Copyright (C) 2019 Facebook
8 *
9 * One challenge of controlling IO resources is the lack of a trivially
10 * observable cost metric. This is distinguished from CPU and memory where
11 * wallclock time and the number of bytes can serve as accurate enough
12 * approximations.
13 *
14 * Bandwidth and iops are the most commonly used metrics for IO devices but
15 * depending on the type and specifics of the device, different IO patterns
16 * easily lead to multiple orders of magnitude variations rendering them
17 * useless for the purpose of IO capacity distribution. While on-device
18 * time, with a lot of crutches, could serve as a useful approximation for
19 * non-queued rotational devices, this is no longer viable with modern
20 * devices, even the rotational ones.
21 *
22 * While there is no cost metric we can trivially observe, it isn't a
23 * complete mystery. For example, on a rotational device, seek cost
24 * dominates while a contiguous transfer contributes a smaller amount
25 * proportional to the size. If we can characterize at least the relative
26 * costs of these different types of IOs, it should be possible to
27 * implement a reasonable work-conserving proportional IO resource
28 * distribution.
29 *
30 * 1. IO Cost Model
31 *
32 * IO cost model estimates the cost of an IO given its basic parameters and
33 * history (e.g. the end sector of the last IO). The cost is measured in
34 * device time. If a given IO is estimated to cost 10ms, the device should
35 * be able to process ~100 of those IOs in a second.
36 *
37 * Currently, there's only one builtin cost model - linear. Each IO is
38 * classified as sequential or random and given a base cost accordingly.
39 * On top of that, a size cost proportional to the length of the IO is
40 * added. While simple, this model captures the operational
41 * characteristics of a wide variety of devices well enough. Default
42 * parameters for several different classes of devices are provided and the
43 * parameters can be configured from userspace via
44 * /sys/fs/cgroup/io.cost.model.
45 *
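 * As a rough illustration using the builtin HDD parameters defined later
 * in this file (rbps=174019176, rseqiops=41708, rrandiops=370), the model
 * charges
 *
 *	cost =~ (seq or rand base cost) + nr_pages * per-page cost
 *
 * so a single 4k random read costs about 1/370s =~ 2.7ms while a 64k
 * sequential read costs about 16 * 4k / rbps =~ 0.4ms.
 *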
46 * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
47 * device-specific coefficients.
48 *
52 * 2. Control Strategy
53 *
54 * The device virtual time (vtime) is used as the primary control metric.
55 * The control strategy is composed of the following three parts.
56 *
57 * 2-1. Vtime Distribution
58 *
59 * When a cgroup becomes active in terms of IOs, its hierarchical share is
60 * calculated. Please consider the following hierarchy where the numbers
61 * inside parentheses denote the configured weights.
62 *
63 * root
64 * / \
65 * A (w:100) B (w:300)
66 * / \
67 * A0 (w:100) A1 (w:100)
68 *
69 * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
70 * of equal weight, each gets 50% share. If then B starts issuing IOs, B
71 * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
72 * 12.5% each. The distribution mechanism only cares about these flattened
73 * shares. They're called hweights (hierarchical weights) and always add
74 * up to 1 (HWEIGHT_WHOLE).
75 *
76 * A given cgroup's vtime runs slower in inverse proportion to its hweight.
77 * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
78 * against the device vtime - an IO which takes 10ms on the underlying
79 * device is considered to take 80ms on A0.
80 *
81 * This constitutes the basis of IO capacity distribution. Each cgroup's
82 * vtime is running at a rate determined by its hweight. A cgroup tracks
83 * the vtime consumed by past IOs and can issue a new IO iff doing so
84 * wouldn't outrun the current device vtime. Otherwise, the IO is
85 * suspended until the vtime has progressed enough to cover it.
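 *
 * Concretely, the charge for an IO is scaled by the inverse of the
 * cgroup's hweight - see abs_cost_to_cost() below: an IO costing 1ms of
 * device time consumes 4ms of vtime budget in a cgroup whose
 * hweight_inuse is 25%.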
86 *
87 * 2-2. Vrate Adjustment
88 *
89 * It's unrealistic to expect the cost model to be perfect. There are too
90 * many devices and even on the same device the overall performance
91 * fluctuates depending on numerous factors such as IO mixture and device
92 * internal garbage collection. The controller needs to adapt dynamically.
93 *
94 * This is achieved by adjusting the overall IO rate according to how busy
95 * the device is. If the device becomes overloaded, we're sending down too
96 * many IOs and should generally slow down. If there are waiting issuers
97 * but the device isn't saturated, we're issuing too few and should
98 * generally speed up.
99 *
100 * To slow down, we lower the vrate - the rate at which the device vtime
101 * passes compared to the wall clock. For example, if the vtime is running
102 * at the vrate of 75%, all cgroups added up would only be able to issue
103 * 750ms worth of IOs per second, and vice-versa for speeding up.
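 *
 * The adjustment happens once per timer period - see ioc_timer_fn():
 * vrate_adj_pct[] maps the accumulated busy_level to a per-period step
 * and the result is clamped between the configured QOS_MIN and QOS_MAX
 * (VRATE_MIN_PPM and VRATE_MAX_PPM, i.e. 1% - 10000%, by default).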
104 *
105 * Device busyness is determined using two criteria - rq wait and
106 * completion latencies.
107 *
108 * When a device gets saturated, the on-device and then the request queues
109 * fill up and a bio which is ready to be issued has to wait for a request
110 * to become available. When this delay becomes noticeable, it's a clear
111 * indication that the device is saturated and we lower the vrate. This
112 * saturation signal is fairly conservative as it only triggers when both
113 * hardware and software queues are filled up, and is used as the default
114 * busy signal.
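 *
 * The default threshold is RQ_WAIT_BUSY_PCT - the device is considered
 * busy when the aggregate rq wait time in a period exceeds 5% of the
 * period length (see ioc_lat_stat() and ioc_timer_fn()).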
115 *
116 * As devices can have deep queues and be unfair in how the queued commands
117 * are executed, solely depending on rq wait may not result in satisfactory
118 * control quality. For a better control quality, completion latency QoS
119 * parameters can be configured so that the device is considered saturated
120 * if N'th percentile completion latency rises above the set point.
121 *
122 * The completion latency requirements are a function of both the
123 * underlying device characteristics and the desired IO latency quality of
124 * service. There is an inherent trade-off - the tighter the latency QoS,
125 * the higher the bandwidth lossage. Latency QoS is disabled by default
126 * and can be set through /sys/fs/cgroup/io.cost.qos.
127 *
128 * 2-3. Work Conservation
129 *
130 * Imagine two cgroups A and B with equal weights. A is issuing a small IO
131 * periodically while B is sending out enough parallel IOs to saturate the
132 * device on its own. Let's say A's usage amounts to 100ms worth of IO
133 * cost per second, i.e., 10% of the device capacity. The naive
134 * distribution of half and half would lead to 60% utilization of the
135 * device, a significant reduction in the total amount of work done
136 * compared to free-for-all competition. This is too high a cost to pay
137 * for IO control.
138 *
139 * To conserve the total amount of work done, we keep track of how much
140 * each active cgroup is actually using and yield part of its weight if
141 * there are other cgroups which can make use of it. In the above case,
142 * A's weight will be lowered so that it hovers above the actual usage and
143 * B would be able to use the rest.
144 *
145 * As we don't want to penalize a cgroup for donating its weight, the
146 * surplus weight adjustment factors in a margin and has an immediate
147 * snapback mechanism in case the cgroup needs more IO vtime for itself.
148 *
149 * Note that adjusting down surplus weights has the same effects as
150 * accelerating vtime for other cgroups and work conservation can also be
151 * implemented by adjusting vrate dynamically. However, squaring away who
152 * can donate and who should take back how much requires hweight
153 * propagation anyway, which makes it easier to implement and understand
154 * as a separate mechanism.
155 *
156 * 3. Monitoring
157 *
158 * Instead of debugfs or other clumsy monitoring mechanisms, this
159 * controller uses a drgn based monitoring script -
160 * tools/cgroup/iocost_monitor.py. For details on drgn, please see
161 * https://github.com/osandov/drgn. The output looks like the following.
162 *
163 * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
164 * active weight hweight% inflt% dbt delay usages%
165 * test/a * 50/ 50 33.33/ 33.33 27.65 2 0*041 033:033:033
166 * test/b * 100/ 100 66.67/ 66.67 17.56 0 0*000 066:079:077
167 *
168 * - per : Timer period
169 * - cur_per : Internal wall and device vtime clock
170 * - vrate : Device virtual time rate against wall clock
171 * - weight : Surplus-adjusted and configured weights
172 * - hweight : Surplus-adjusted and configured hierarchical weights
173 * - inflt : The percentage of in-flight IO cost at the end of last period
174 * - del_ms : Deferred issuer delay induction level and duration
175 * - usages : Usage history
176 */
177
178 #include <linux/kernel.h>
179 #include <linux/module.h>
180 #include <linux/timer.h>
181 #include <linux/time64.h>
182 #include <linux/parser.h>
183 #include <linux/sched/signal.h>
184 #include <linux/blk-cgroup.h>
185 #include "blk-rq-qos.h"
186 #include "blk-stat.h"
187 #include "blk-wbt.h"
188
189 #ifdef CONFIG_TRACEPOINTS
190
191 /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
192 #define TRACE_IOCG_PATH_LEN 1024
193 static DEFINE_SPINLOCK(trace_iocg_path_lock);
194 static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
195
196 #define TRACE_IOCG_PATH(type, iocg, ...) \
197 do { \
198 unsigned long flags; \
199 if (trace_iocost_##type##_enabled()) { \
200 spin_lock_irqsave(&trace_iocg_path_lock, flags); \
201 cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
202 trace_iocg_path, TRACE_IOCG_PATH_LEN); \
203 trace_iocost_##type(iocg, trace_iocg_path, \
204 ##__VA_ARGS__); \
205 spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \
206 } \
207 } while (0)
208
209 #else /* CONFIG_TRACEPOINTS */
210 #define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0)
211 #endif /* CONFIG_TRACEPOINTS */
212
213 enum {
214 MILLION = 1000000,
215
216 /* timer period is calculated from latency requirements, bound it */
217 MIN_PERIOD = USEC_PER_MSEC,
218 MAX_PERIOD = USEC_PER_SEC,
219
220 /*
221 * A cgroup's vtime can run 50% behind the device vtime, which
222 * serves as its IO credit buffer. Surplus weight adjustment is
223 * immediately canceled if the vtime margin runs below 10%.
224 */
225 MARGIN_PCT = 50,
226 INUSE_MARGIN_PCT = 10,
227
228 /* Have some play in waitq timer operations */
229 WAITQ_TIMER_MARGIN_PCT = 5,
230
231 /*
232 * vtime can wrap well within a reasonable uptime when vrate is
233 * consistently raised. Don't trust recorded cgroup vtime if the
234 * period counter indicates that it's older than 5mins.
235 */
236 VTIME_VALID_DUR = 300 * USEC_PER_SEC,
237
238 /*
239 * Remember the past three non-zero usages and use the max for
240 * surplus calculation. Three slots guarantee that we remember one
241 * full period usage from the last active stretch even after
242 * partial deactivation and re-activation periods. Don't start
243 * giving away weight before collecting two data points to prevent
244 * hweight adjustments based on one partial activation period.
245 */
246 NR_USAGE_SLOTS = 3,
247 MIN_VALID_USAGES = 2,
248
249 /* 1/64k is granular enough and can easily be handled w/ u32 */
250 HWEIGHT_WHOLE = 1 << 16,
251 };
252
253 enum {
254 /*
255 * As vtime is used to calculate the cost of each IO, it needs to
256 * be fairly high precision. For example, it should be able to
257 * represent the cost of a single page worth of discard with
258 * sufficient accuracy. At the same time, it should be able to
259 * represent reasonably long enough durations to be useful and
260 * convenient during operation.
261 *
262 * 1s worth of vtime is 2^37. This gives us both sub-nanosecond
263 * granularity and days of wrap-around time even at extreme vrates.
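 * (2^37 per second is roughly 137 vtime ticks per nanosecond.)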
264 */
265 VTIME_PER_SEC_SHIFT = 37,
266 VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT,
267 VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC,
268
269 /* bound vrate adjustments within two orders of magnitude */
270 VRATE_MIN_PPM = 10000, /* 1% */
271 VRATE_MAX_PPM = 100000000, /* 10000% */
272
273 VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
274 VRATE_CLAMP_ADJ_PCT = 4,
275
276 /* switch iff the conditions are met for longer than this */
277 AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
278 };
279
280 enum {
281 /* if IOs end up waiting for requests, issue less */
282 RQ_WAIT_BUSY_PCT = 5,
283
284 /* unbusy hysteresis */
285 UNBUSY_THR_PCT = 75,
286
287 /* don't let cmds which take a very long time pin lagging for too long */
288 MAX_LAGGING_PERIODS = 10,
289
290 /*
291 * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
292 * donate the surplus.
293 */
294 SURPLUS_SCALE_PCT = 125, /* * 125% */
295 SURPLUS_SCALE_ABS = HWEIGHT_WHOLE / 50, /* + 2% */
296 SURPLUS_MIN_ADJ_DELTA = HWEIGHT_WHOLE / 33, /* 3% */
297
298 /*
299 * Count IO size in 4k pages. The 12bit shift helps keep the
300 * size-proportional components of the cost calculation at a similar
301 * number of digits to the per-IO cost components.
302 */
303 IOC_PAGE_SHIFT = 12,
304 IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT,
305 IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
306
307 /* if further apart than 16M, consider randio for the linear model */
308 LCOEF_RANDIO_PAGES = 4096,
309 };
310
311 enum ioc_running {
312 IOC_IDLE,
313 IOC_RUNNING,
314 IOC_STOP,
315 };
316
317 /* io.cost.qos controls including per-dev enable of the whole controller */
318 enum {
319 QOS_ENABLE,
320 QOS_CTRL,
321 NR_QOS_CTRL_PARAMS,
322 };
323
324 /* io.cost.qos params */
325 enum {
326 QOS_RPPM,
327 QOS_RLAT,
328 QOS_WPPM,
329 QOS_WLAT,
330 QOS_MIN,
331 QOS_MAX,
332 NR_QOS_PARAMS,
333 };
334
335 /* io.cost.model controls */
336 enum {
337 COST_CTRL,
338 COST_MODEL,
339 NR_COST_CTRL_PARAMS,
340 };
341
342 /* builtin linear cost model coefficients */
343 enum {
344 I_LCOEF_RBPS,
345 I_LCOEF_RSEQIOPS,
346 I_LCOEF_RRANDIOPS,
347 I_LCOEF_WBPS,
348 I_LCOEF_WSEQIOPS,
349 I_LCOEF_WRANDIOPS,
350 NR_I_LCOEFS,
351 };
352
353 enum {
354 LCOEF_RPAGE,
355 LCOEF_RSEQIO,
356 LCOEF_RRANDIO,
357 LCOEF_WPAGE,
358 LCOEF_WSEQIO,
359 LCOEF_WRANDIO,
360 NR_LCOEFS,
361 };
362
363 enum {
364 AUTOP_INVALID,
365 AUTOP_HDD,
366 AUTOP_SSD_QD1,
367 AUTOP_SSD_DFL,
368 AUTOP_SSD_FAST,
369 };
370
371 struct ioc_gq;
372
373 struct ioc_params {
374 u32 qos[NR_QOS_PARAMS];
375 u64 i_lcoefs[NR_I_LCOEFS];
376 u64 lcoefs[NR_LCOEFS];
377 u32 too_fast_vrate_pct;
378 u32 too_slow_vrate_pct;
379 };
380
381 struct ioc_missed {
382 u32 nr_met;
383 u32 nr_missed;
384 u32 last_met;
385 u32 last_missed;
386 };
387
388 struct ioc_pcpu_stat {
389 struct ioc_missed missed[2];
390
391 u64 rq_wait_ns;
392 u64 last_rq_wait_ns;
393 };
394
395 /* per device */
396 struct ioc {
397 struct rq_qos rqos;
398
399 bool enabled;
400
401 struct ioc_params params;
402 u32 period_us;
403 u32 margin_us;
404 u64 vrate_min;
405 u64 vrate_max;
406
407 spinlock_t lock;
408 struct timer_list timer;
409 struct list_head active_iocgs; /* active cgroups */
410 struct ioc_pcpu_stat __percpu *pcpu_stat;
411
412 enum ioc_running running;
413 atomic64_t vtime_rate;
414
415 seqcount_t period_seqcount;
416 u32 period_at; /* wallclock starttime */
417 u64 period_at_vtime; /* vtime starttime */
418
419 atomic64_t cur_period; /* inc'd each period */
420 int busy_level; /* saturation history */
421
422 u64 inuse_margin_vtime;
423 bool weights_updated;
424 atomic_t hweight_gen; /* for lazy hweights */
425
426 u64 autop_too_fast_at;
427 u64 autop_too_slow_at;
428 int autop_idx;
429 bool user_qos_params:1;
430 bool user_cost_model:1;
431 };
432
433 /* per device-cgroup pair */
434 struct ioc_gq {
435 struct blkg_policy_data pd;
436 struct ioc *ioc;
437
438 /*
439 * An iocg can get its weight from two sources - an explicit
440 * per-device-cgroup configuration or the default weight of the
441 * cgroup. `cfg_weight` is the explicit per-device-cgroup
442 * configuration. `weight` is the effective weight considering both
443 * sources.
444 *
445 * When an idle cgroup becomes active its `active` goes from 0 to
446 * `weight`. `inuse` is the surplus adjusted active weight.
447 * `active` and `inuse` are used to calculate `hweight_active` and
448 * `hweight_inuse`.
449 *
450 * `last_inuse` remembers `inuse` while an iocg is idle to persist
451 * surplus adjustments.
452 */
453 u32 cfg_weight;
454 u32 weight;
455 u32 active;
456 u32 inuse;
457 u32 last_inuse;
458
459 sector_t cursor; /* to detect randio */
460
461 /*
462 * `vtime` is this iocg's vtime cursor which progresses as IOs are
463 * issued. If lagging behind device vtime, the delta represents
464 * the currently available IO budget. If running ahead, the
465 * overage.
466 *
467 * `done_vtime` is the same but progressed on completion rather
468 * than issue. The delta behind `vtime` represents the cost of
469 * currently in-flight IOs.
470 *
471 * `last_vtime` is used to remember `vtime` at the end of the last
472 * period to calculate utilization.
473 */
474 atomic64_t vtime;
475 atomic64_t done_vtime;
476 u64 abs_vdebt;
477 u64 last_vtime;
478
479 /*
480 * The period this iocg was last active in. Used for deactivation
481 * and invalidating `vtime`.
482 */
483 atomic64_t active_period;
484 struct list_head active_list;
485
486 /* see __propagate_active_weight() and current_hweight() for details */
487 u64 child_active_sum;
488 u64 child_inuse_sum;
489 int hweight_gen;
490 u32 hweight_active;
491 u32 hweight_inuse;
492 bool has_surplus;
493
494 struct wait_queue_head waitq;
495 struct hrtimer waitq_timer;
496 struct hrtimer delay_timer;
497
498 /* usage is recorded as fractions of HWEIGHT_WHOLE */
499 int usage_idx;
500 u32 usages[NR_USAGE_SLOTS];
501
502 /* this iocg's depth in the hierarchy and ancestors including self */
503 int level;
504 struct ioc_gq *ancestors[];
505 };
506
507 /* per cgroup */
508 struct ioc_cgrp {
509 struct blkcg_policy_data cpd;
510 unsigned int dfl_weight;
511 };
512
513 struct ioc_now {
514 u64 now_ns;
515 u32 now;
516 u64 vnow;
517 u64 vrate;
518 };
519
520 struct iocg_wait {
521 struct wait_queue_entry wait;
522 struct bio *bio;
523 u64 abs_cost;
524 bool committed;
525 };
526
527 struct iocg_wake_ctx {
528 struct ioc_gq *iocg;
529 u32 hw_inuse;
530 s64 vbudget;
531 };
532
533 static const struct ioc_params autop[] = {
534 [AUTOP_HDD] = {
535 .qos = {
536 [QOS_RLAT] = 250000, /* 250ms */
537 [QOS_WLAT] = 250000,
538 [QOS_MIN] = VRATE_MIN_PPM,
539 [QOS_MAX] = VRATE_MAX_PPM,
540 },
541 .i_lcoefs = {
542 [I_LCOEF_RBPS] = 174019176,
543 [I_LCOEF_RSEQIOPS] = 41708,
544 [I_LCOEF_RRANDIOPS] = 370,
545 [I_LCOEF_WBPS] = 178075866,
546 [I_LCOEF_WSEQIOPS] = 42705,
547 [I_LCOEF_WRANDIOPS] = 378,
548 },
549 },
550 [AUTOP_SSD_QD1] = {
551 .qos = {
552 [QOS_RLAT] = 25000, /* 25ms */
553 [QOS_WLAT] = 25000,
554 [QOS_MIN] = VRATE_MIN_PPM,
555 [QOS_MAX] = VRATE_MAX_PPM,
556 },
557 .i_lcoefs = {
558 [I_LCOEF_RBPS] = 245855193,
559 [I_LCOEF_RSEQIOPS] = 61575,
560 [I_LCOEF_RRANDIOPS] = 6946,
561 [I_LCOEF_WBPS] = 141365009,
562 [I_LCOEF_WSEQIOPS] = 33716,
563 [I_LCOEF_WRANDIOPS] = 26796,
564 },
565 },
566 [AUTOP_SSD_DFL] = {
567 .qos = {
568 [QOS_RLAT] = 25000, /* 25ms */
569 [QOS_WLAT] = 25000,
570 [QOS_MIN] = VRATE_MIN_PPM,
571 [QOS_MAX] = VRATE_MAX_PPM,
572 },
573 .i_lcoefs = {
574 [I_LCOEF_RBPS] = 488636629,
575 [I_LCOEF_RSEQIOPS] = 8932,
576 [I_LCOEF_RRANDIOPS] = 8518,
577 [I_LCOEF_WBPS] = 427891549,
578 [I_LCOEF_WSEQIOPS] = 28755,
579 [I_LCOEF_WRANDIOPS] = 21940,
580 },
581 .too_fast_vrate_pct = 500,
582 },
583 [AUTOP_SSD_FAST] = {
584 .qos = {
585 [QOS_RLAT] = 5000, /* 5ms */
586 [QOS_WLAT] = 5000,
587 [QOS_MIN] = VRATE_MIN_PPM,
588 [QOS_MAX] = VRATE_MAX_PPM,
589 },
590 .i_lcoefs = {
591 [I_LCOEF_RBPS] = 3102524156LLU,
592 [I_LCOEF_RSEQIOPS] = 724816,
593 [I_LCOEF_RRANDIOPS] = 778122,
594 [I_LCOEF_WBPS] = 1742780862LLU,
595 [I_LCOEF_WSEQIOPS] = 425702,
596 [I_LCOEF_WRANDIOPS] = 443193,
597 },
598 .too_slow_vrate_pct = 10,
599 },
600 };
601
602 /*
603 * vrate adjust percentages indexed by ioc->busy_level. We adjust up on
604 * vtime credit shortage and down on device saturation.
605 */
606 static u32 vrate_adj_pct[] =
607 { 0, 0, 0, 0,
608 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
609 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
610 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
611
612 static struct blkcg_policy blkcg_policy_iocost;
613
614 /* accessors and helpers */
615 static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
616 {
617 return container_of(rqos, struct ioc, rqos);
618 }
619
620 static struct ioc *q_to_ioc(struct request_queue *q)
621 {
622 return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
623 }
624
625 static const char *q_name(struct request_queue *q)
626 {
627 if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
628 return kobject_name(q->kobj.parent);
629 else
630 return "<unknown>";
631 }
632
633 static const char __maybe_unused *ioc_name(struct ioc *ioc)
634 {
635 return q_name(ioc->rqos.q);
636 }
637
638 static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
639 {
640 return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
641 }
642
643 static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
644 {
645 return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
646 }
647
648 static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
649 {
650 return pd_to_blkg(&iocg->pd);
651 }
652
653 static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
654 {
655 return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
656 struct ioc_cgrp, cpd);
657 }
658
659 /*
660 * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical
661 * weight, the more expensive each IO. Must round up.
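 * For example, at an hw_inuse of HWEIGHT_WHOLE / 2 (50%), each IO costs
 * twice its absolute cost.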
662 */
663 static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
664 {
665 return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
666 }
667
668 /*
669 * The inverse of abs_cost_to_cost(). Must round up.
670 */
671 static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
672 {
673 return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
674 }
675
676 static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
677 {
678 bio->bi_iocost_cost = cost;
679 atomic64_add(cost, &iocg->vtime);
680 }
681
682 #define CREATE_TRACE_POINTS
683 #include <trace/events/iocost.h>
684
685 /* latency QoS params changed, update period_us and all the dependent params */
686 static void ioc_refresh_period_us(struct ioc *ioc)
687 {
688 u32 ppm, lat, multi, period_us;
689
690 lockdep_assert_held(&ioc->lock);
691
692 /* pick the higher latency target */
693 if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
694 ppm = ioc->params.qos[QOS_RPPM];
695 lat = ioc->params.qos[QOS_RLAT];
696 } else {
697 ppm = ioc->params.qos[QOS_WPPM];
698 lat = ioc->params.qos[QOS_WLAT];
699 }
700
701 /*
702 * We want the period to be long enough to contain a healthy number
703 * of IOs while short enough for granular control. Define it as a
704 * multiple of the latency target. Ideally, the multiplier should
705 * be scaled according to the percentile so that it would nominally
706 * contain a certain number of requests. Let's be simpler and
707 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
708 */
709 if (ppm)
710 multi = max_t(u32, (MILLION - ppm) / 50000, 2);
711 else
712 multi = 2;
713 period_us = multi * lat;
714 period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
715
716 /* calculate dependent params */
717 ioc->period_us = period_us;
718 ioc->margin_us = period_us * MARGIN_PCT / 100;
719 ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
720 period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
721 }
722
723 static int ioc_autop_idx(struct ioc *ioc)
724 {
725 int idx = ioc->autop_idx;
726 const struct ioc_params *p = &autop[idx];
727 u32 vrate_pct;
728 u64 now_ns;
729
730 /* rotational? */
731 if (!blk_queue_nonrot(ioc->rqos.q))
732 return AUTOP_HDD;
733
734 /* handle SATA SSDs w/ broken NCQ */
735 if (blk_queue_depth(ioc->rqos.q) == 1)
736 return AUTOP_SSD_QD1;
737
738 /* use one of the normal ssd sets */
739 if (idx < AUTOP_SSD_DFL)
740 return AUTOP_SSD_DFL;
741
742 /* if user is overriding anything, maintain what was there */
743 if (ioc->user_qos_params || ioc->user_cost_model)
744 return idx;
745
746 /* step up/down based on the vrate */
747 vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
748 VTIME_PER_USEC);
749 now_ns = ktime_get_ns();
750
751 if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
752 if (!ioc->autop_too_fast_at)
753 ioc->autop_too_fast_at = now_ns;
754 if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
755 return idx + 1;
756 } else {
757 ioc->autop_too_fast_at = 0;
758 }
759
760 if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
761 if (!ioc->autop_too_slow_at)
762 ioc->autop_too_slow_at = now_ns;
763 if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
764 return idx - 1;
765 } else {
766 ioc->autop_too_slow_at = 0;
767 }
768
769 return idx;
770 }
771
772 /*
773 * Take the following as input
774 *
775 * @bps maximum sequential throughput
776 * @seqiops maximum sequential 4k iops
777 * @randiops maximum random 4k iops
778 *
779 * and calculate the linear model cost coefficients.
780 *
781 * *@page per-page cost 1s / (@bps / 4096)
782 * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0)
783 * *@randio base cost of a rand IO max((1s / @randiops) - *@page, 0)
784 */
785 static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
786 u64 *page, u64 *seqio, u64 *randio)
787 {
788 u64 v;
789
790 *page = *seqio = *randio = 0;
791
792 if (bps) {
793 u64 bps_pages = DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE);
794
795 if (bps_pages)
796 *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, bps_pages);
797 else
798 *page = 1;
799 }
800
801 if (seqiops) {
802 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
803 if (v > *page)
804 *seqio = v - *page;
805 }
806
807 if (randiops) {
808 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
809 if (v > *page)
810 *randio = v - *page;
811 }
812 }
813
814 static void ioc_refresh_lcoefs(struct ioc *ioc)
815 {
816 u64 *u = ioc->params.i_lcoefs;
817 u64 *c = ioc->params.lcoefs;
818
819 calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
820 &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
821 calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
822 &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
823 }
824
825 static bool ioc_refresh_params(struct ioc *ioc, bool force)
826 {
827 const struct ioc_params *p;
828 int idx;
829
830 lockdep_assert_held(&ioc->lock);
831
832 idx = ioc_autop_idx(ioc);
833 p = &autop[idx];
834
835 if (idx == ioc->autop_idx && !force)
836 return false;
837
838 if (idx != ioc->autop_idx)
839 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
840
841 ioc->autop_idx = idx;
842 ioc->autop_too_fast_at = 0;
843 ioc->autop_too_slow_at = 0;
844
845 if (!ioc->user_qos_params)
846 memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
847 if (!ioc->user_cost_model)
848 memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
849
850 ioc_refresh_period_us(ioc);
851 ioc_refresh_lcoefs(ioc);
852
853 ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
854 VTIME_PER_USEC, MILLION);
855 ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
856 VTIME_PER_USEC, MILLION);
857
858 return true;
859 }
860
861 /* take a snapshot of the current [v]time and vrate */
862 static void ioc_now(struct ioc *ioc, struct ioc_now *now)
863 {
864 unsigned seq;
865
866 now->now_ns = ktime_get();
867 now->now = ktime_to_us(now->now_ns);
868 now->vrate = atomic64_read(&ioc->vtime_rate);
869
870 /*
871 * The current vtime is
872 *
873 * vtime at period start + (wallclock time since the start) * vrate
874 *
875 * As a consistent snapshot of `period_at_vtime` and `period_at` is
876 * needed, they're seqcount protected.
877 */
878 do {
879 seq = read_seqcount_begin(&ioc->period_seqcount);
880 now->vnow = ioc->period_at_vtime +
881 (now->now - ioc->period_at) * now->vrate;
882 } while (read_seqcount_retry(&ioc->period_seqcount, seq));
883 }
884
885 static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
886 {
887 lockdep_assert_held(&ioc->lock);
888 WARN_ON_ONCE(ioc->running != IOC_RUNNING);
889
890 write_seqcount_begin(&ioc->period_seqcount);
891 ioc->period_at = now->now;
892 ioc->period_at_vtime = now->vnow;
893 write_seqcount_end(&ioc->period_seqcount);
894
895 ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
896 add_timer(&ioc->timer);
897 }
898
899 /*
900 * Update @iocg's `active` and `inuse` to @active and @inuse, update level
901 * weight sums and propagate upwards accordingly.
902 */
903 static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
904 {
905 struct ioc *ioc = iocg->ioc;
906 int lvl;
907
908 lockdep_assert_held(&ioc->lock);
909
910 inuse = min(active, inuse);
911
912 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
913 struct ioc_gq *parent = iocg->ancestors[lvl];
914 struct ioc_gq *child = iocg->ancestors[lvl + 1];
915 u32 parent_active = 0, parent_inuse = 0;
916
917 /* update the level sums */
918 parent->child_active_sum += (s32)(active - child->active);
919 parent->child_inuse_sum += (s32)(inuse - child->inuse);
920 /* apply the updates */
921 child->active = active;
922 child->inuse = inuse;
923
924 /*
925 * The delta between the inuse and active sums indicates that
926 * that much weight is being given away. Parent's inuse
927 * and active should reflect the ratio.
928 */
929 if (parent->child_active_sum) {
930 parent_active = parent->weight;
931 parent_inuse = DIV64_U64_ROUND_UP(
932 parent_active * parent->child_inuse_sum,
933 parent->child_active_sum);
934 }
935
936 /* do we need to keep walking up? */
937 if (parent_active == parent->active &&
938 parent_inuse == parent->inuse)
939 break;
940
941 active = parent_active;
942 inuse = parent_inuse;
943 }
944
945 ioc->weights_updated = true;
946 }
947
948 static void commit_active_weights(struct ioc *ioc)
949 {
950 lockdep_assert_held(&ioc->lock);
951
952 if (ioc->weights_updated) {
953 /* paired with rmb in current_hweight(), see there */
954 smp_wmb();
955 atomic_inc(&ioc->hweight_gen);
956 ioc->weights_updated = false;
957 }
958 }
959
960 static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
961 {
962 __propagate_active_weight(iocg, active, inuse);
963 commit_active_weights(iocg->ioc);
964 }
965
966 static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
967 {
968 struct ioc *ioc = iocg->ioc;
969 int lvl;
970 u32 hwa, hwi;
971 int ioc_gen;
972
973 /* hot path - if uptodate, use cached */
974 ioc_gen = atomic_read(&ioc->hweight_gen);
975 if (ioc_gen == iocg->hweight_gen)
976 goto out;
977
978 /*
979 * Paired with wmb in commit_active_weights(). If we saw the
980 * updated hweight_gen, all the weight updates from
981 * __propagate_active_weight() are visible too.
982 *
983 * We can race with weight updates during calculation and get it
984 * wrong. However, hweight_gen would have changed and a future
985 * reader will recalculate and we're guaranteed to discard the
986 * wrong result soon.
987 */
988 smp_rmb();
989
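	/*
	 * Walk down from the root towards @iocg.  At each level, scale hwa
	 * and hwi by the child's share of the parent's child_active_sum and
	 * child_inuse_sum respectively.
	 */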
990 hwa = hwi = HWEIGHT_WHOLE;
991 for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
992 struct ioc_gq *parent = iocg->ancestors[lvl];
993 struct ioc_gq *child = iocg->ancestors[lvl + 1];
994 u32 active_sum = READ_ONCE(parent->child_active_sum);
995 u32 inuse_sum = READ_ONCE(parent->child_inuse_sum);
996 u32 active = READ_ONCE(child->active);
997 u32 inuse = READ_ONCE(child->inuse);
998
999 /* we can race with deactivations and either may read as zero */
1000 if (!active_sum || !inuse_sum)
1001 continue;
1002
1003 active_sum = max(active, active_sum);
1004 hwa = hwa * active / active_sum; /* max 16bits * 10000 */
1005
1006 inuse_sum = max(inuse, inuse_sum);
1007 hwi = hwi * inuse / inuse_sum; /* max 16bits * 10000 */
1008 }
1009
1010 iocg->hweight_active = max_t(u32, hwa, 1);
1011 iocg->hweight_inuse = max_t(u32, hwi, 1);
1012 iocg->hweight_gen = ioc_gen;
1013 out:
1014 if (hw_activep)
1015 *hw_activep = iocg->hweight_active;
1016 if (hw_inusep)
1017 *hw_inusep = iocg->hweight_inuse;
1018 }
1019
1020 static void weight_updated(struct ioc_gq *iocg)
1021 {
1022 struct ioc *ioc = iocg->ioc;
1023 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1024 struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1025 u32 weight;
1026
1027 lockdep_assert_held(&ioc->lock);
1028
1029 weight = iocg->cfg_weight ?: iocc->dfl_weight;
1030 if (weight != iocg->weight && iocg->active)
1031 propagate_active_weight(iocg, weight,
1032 DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight));
1033 iocg->weight = weight;
1034 }
1035
1036 static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1037 {
1038 struct ioc *ioc = iocg->ioc;
1039 u64 last_period, cur_period, max_period_delta;
1040 u64 vtime, vmargin, vmin;
1041 int i;
1042
1043 /*
1044 * If we seem to be already active, just update the stamp to tell the
1045 * timer that we're still active. We don't mind occasional races.
1046 */
1047 if (!list_empty(&iocg->active_list)) {
1048 ioc_now(ioc, now);
1049 cur_period = atomic64_read(&ioc->cur_period);
1050 if (atomic64_read(&iocg->active_period) != cur_period)
1051 atomic64_set(&iocg->active_period, cur_period);
1052 return true;
1053 }
1054
1055 /* racy check on internal node IOs, treat as root level IOs */
1056 if (iocg->child_active_sum)
1057 return false;
1058
1059 spin_lock_irq(&ioc->lock);
1060
1061 ioc_now(ioc, now);
1062
1063 /* update period */
1064 cur_period = atomic64_read(&ioc->cur_period);
1065 last_period = atomic64_read(&iocg->active_period);
1066 atomic64_set(&iocg->active_period, cur_period);
1067
1068 /* already activated or breaking leaf-only constraint? */
1069 if (!list_empty(&iocg->active_list))
1070 goto succeed_unlock;
1071 for (i = iocg->level - 1; i > 0; i--)
1072 if (!list_empty(&iocg->ancestors[i]->active_list))
1073 goto fail_unlock;
1074
1075 if (iocg->child_active_sum)
1076 goto fail_unlock;
1077
1078 /*
1079 * vtime may wrap when vrate is raised substantially due to
1080 * underestimated IO costs. Look at the period and ignore its
1081 * vtime if the iocg has been idle for too long. Also, cap the
1082 * budget it can start with to the margin.
1083 */
1084 max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
1085 vtime = atomic64_read(&iocg->vtime);
1086 vmargin = ioc->margin_us * now->vrate;
1087 vmin = now->vnow - vmargin;
1088
1089 if (last_period + max_period_delta < cur_period ||
1090 time_before64(vtime, vmin)) {
1091 atomic64_add(vmin - vtime, &iocg->vtime);
1092 atomic64_add(vmin - vtime, &iocg->done_vtime);
1093 vtime = vmin;
1094 }
1095
1096 /*
1097 * Activate, propagate weight and start period timer if not
1098 * running. Reset hweight_gen to avoid accidental match from
1099 * wrapping.
1100 */
1101 iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1102 list_add(&iocg->active_list, &ioc->active_iocgs);
1103 propagate_active_weight(iocg, iocg->weight,
1104 iocg->last_inuse ?: iocg->weight);
1105
1106 TRACE_IOCG_PATH(iocg_activate, iocg, now,
1107 last_period, cur_period, vtime);
1108
1109 iocg->last_vtime = vtime;
1110
1111 if (ioc->running == IOC_IDLE) {
1112 ioc->running = IOC_RUNNING;
1113 ioc_start_period(ioc, now);
1114 }
1115
1116 succeed_unlock:
1117 spin_unlock_irq(&ioc->lock);
1118 return true;
1119
1120 fail_unlock:
1121 spin_unlock_irq(&ioc->lock);
1122 return false;
1123 }
1124
1125 static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1126 int flags, void *key)
1127 {
1128 struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1129 struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1130 u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1131
1132 ctx->vbudget -= cost;
1133
1134 if (ctx->vbudget < 0)
1135 return -1;
1136
1137 iocg_commit_bio(ctx->iocg, wait->bio, cost);
1138
1139 /*
1140 * autoremove_wake_function() removes the wait entry only when it
1141 * actually changed the task state. We want the wait always
1142 * removed. Remove explicitly and use default_wake_function().
1143 */
1144 list_del_init(&wq_entry->entry);
1145 wait->committed = true;
1146
1147 default_wake_function(wq_entry, mode, flags, key);
1148 return 0;
1149 }
1150
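/*
 * Pay off any vdebt using the vtime budget which has accrued since the
 * last kick, wake up the waiters whose costs fit in the remainder, and,
 * if waiters remain, arm waitq_timer for when the budget will cover the
 * next one.
 */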
1151 static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
1152 {
1153 struct ioc *ioc = iocg->ioc;
1154 struct iocg_wake_ctx ctx = { .iocg = iocg };
1155 u64 margin_ns = (u64)(ioc->period_us *
1156 WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
1157 u64 vdebt, vshortage, expires, oexpires;
1158 s64 vbudget;
1159 u32 hw_inuse;
1160
1161 lockdep_assert_held(&iocg->waitq.lock);
1162
1163 current_hweight(iocg, NULL, &hw_inuse);
1164 vbudget = now->vnow - atomic64_read(&iocg->vtime);
1165
1166 /* pay off debt */
1167 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1168 if (vdebt && vbudget > 0) {
1169 u64 delta = min_t(u64, vbudget, vdebt);
1170 u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
1171 iocg->abs_vdebt);
1172
1173 atomic64_add(delta, &iocg->vtime);
1174 atomic64_add(delta, &iocg->done_vtime);
1175 iocg->abs_vdebt -= abs_delta;
1176 }
1177
1178 /*
1179 * Wake up the ones which are due and see how much vtime we'll need
1180 * for the next one.
1181 */
1182 ctx.hw_inuse = hw_inuse;
1183 ctx.vbudget = vbudget - vdebt;
1184 __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1185 if (!waitqueue_active(&iocg->waitq))
1186 return;
1187 if (WARN_ON_ONCE(ctx.vbudget >= 0))
1188 return;
1189
1190 /* determine next wakeup, add a quarter margin to guarantee chunking */
1191 vshortage = -ctx.vbudget;
1192 expires = now->now_ns +
1193 DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
1194 expires += margin_ns / 4;
1195
1196 /* if already active and close enough, don't bother */
1197 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1198 if (hrtimer_is_queued(&iocg->waitq_timer) &&
1199 abs(oexpires - expires) <= margin_ns / 4)
1200 return;
1201
1202 hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1203 margin_ns / 4, HRTIMER_MODE_ABS);
1204 }
1205
1206 static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1207 {
1208 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1209 struct ioc_now now;
1210 unsigned long flags;
1211
1212 ioc_now(iocg->ioc, &now);
1213
1214 spin_lock_irqsave(&iocg->waitq.lock, flags);
1215 iocg_kick_waitq(iocg, &now);
1216 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1217
1218 return HRTIMER_NORESTART;
1219 }
1220
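/*
 * Maintain blk-cgroup's use_delay mechanism according to how far @iocg's
 * debt-adjusted vtime is running ahead of the device.  Returns %true if
 * issuer delay is left engaged.
 */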
1221 static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost)
1222 {
1223 struct ioc *ioc = iocg->ioc;
1224 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1225 u64 vtime = atomic64_read(&iocg->vtime);
1226 u64 vmargin = ioc->margin_us * now->vrate;
1227 u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
1228 u64 expires, oexpires;
1229 u32 hw_inuse;
1230
1231 lockdep_assert_held(&iocg->waitq.lock);
1232
1233 /* debt-adjust vtime */
1234 current_hweight(iocg, NULL, &hw_inuse);
1235 vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1236
1237 /*
1238 * Clear or maintain depending on the overage. Non-zero vdebt is what
1239 * guarantees that @iocg is online and future iocg_kick_delay() will
1240 * clear use_delay. Don't leave it on when there's no vdebt.
1241 */
1242 if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
1243 blkcg_clear_delay(blkg);
1244 return false;
1245 }
1246 if (!atomic_read(&blkg->use_delay) &&
1247 time_before_eq64(vtime, now->vnow + vmargin))
1248 return false;
1249
1250 /* use delay */
1251 if (cost) {
1252 u64 cost_ns = DIV64_U64_ROUND_UP(cost * NSEC_PER_USEC,
1253 now->vrate);
1254 blkcg_add_delay(blkg, now->now_ns, cost_ns);
1255 }
1256 blkcg_use_delay(blkg);
1257
1258 expires = now->now_ns + DIV64_U64_ROUND_UP(vtime - now->vnow,
1259 now->vrate) * NSEC_PER_USEC;
1260
1261 /* if already active and close enough, don't bother */
1262 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
1263 if (hrtimer_is_queued(&iocg->delay_timer) &&
1264 abs(oexpires - expires) <= margin_ns / 4)
1265 return true;
1266
1267 hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
1268 margin_ns / 4, HRTIMER_MODE_ABS);
1269 return true;
1270 }
1271
1272 static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
1273 {
1274 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
1275 struct ioc_now now;
1276 unsigned long flags;
1277
1278 spin_lock_irqsave(&iocg->waitq.lock, flags);
1279 ioc_now(iocg->ioc, &now);
1280 iocg_kick_delay(iocg, &now, 0);
1281 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1282
1283 return HRTIMER_NORESTART;
1284 }
1285
1286 static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1287 {
1288 u32 nr_met[2] = { };
1289 u32 nr_missed[2] = { };
1290 u64 rq_wait_ns = 0;
1291 int cpu, rw;
1292
1293 for_each_online_cpu(cpu) {
1294 struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1295 u64 this_rq_wait_ns;
1296
1297 for (rw = READ; rw <= WRITE; rw++) {
1298 u32 this_met = READ_ONCE(stat->missed[rw].nr_met);
1299 u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed);
1300
1301 nr_met[rw] += this_met - stat->missed[rw].last_met;
1302 nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1303 stat->missed[rw].last_met = this_met;
1304 stat->missed[rw].last_missed = this_missed;
1305 }
1306
1307 this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns);
1308 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1309 stat->last_rq_wait_ns = this_rq_wait_ns;
1310 }
1311
1312 for (rw = READ; rw <= WRITE; rw++) {
1313 if (nr_met[rw] + nr_missed[rw])
1314 missed_ppm_ar[rw] =
1315 DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1316 nr_met[rw] + nr_missed[rw]);
1317 else
1318 missed_ppm_ar[rw] = 0;
1319 }
1320
1321 *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1322 ioc->period_us * NSEC_PER_USEC);
1323 }
1324
1325 /* was iocg idle this period? */
1326 static bool iocg_is_idle(struct ioc_gq *iocg)
1327 {
1328 struct ioc *ioc = iocg->ioc;
1329
1330 /* did something get issued this period? */
1331 if (atomic64_read(&iocg->active_period) ==
1332 atomic64_read(&ioc->cur_period))
1333 return false;
1334
1335 /* is something in flight? */
1336 if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
1337 return false;
1338
1339 return true;
1340 }
1341
1342 /* returns usage with margin added if surplus is large enough */
1343 static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
1344 {
1345 /* add margin */
1346 usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
1347 usage += SURPLUS_SCALE_ABS;
1348
1349 /* don't bother if the surplus is too small */
1350 if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
1351 return 0;
1352
1353 return usage;
1354 }
1355
1356 static void ioc_timer_fn(struct timer_list *timer)
1357 {
1358 struct ioc *ioc = container_of(timer, struct ioc, timer);
1359 struct ioc_gq *iocg, *tiocg;
1360 struct ioc_now now;
1361 int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
1362 u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
1363 u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
1364 u32 missed_ppm[2], rq_wait_pct;
1365 u64 period_vtime;
1366 int prev_busy_level, i;
1367
1368 /* how were the latencies during the period? */
1369 ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
1370
1371 /* take care of active iocgs */
1372 spin_lock_irq(&ioc->lock);
1373
1374 ioc_now(ioc, &now);
1375
1376 period_vtime = now.vnow - ioc->period_at_vtime;
1377 if (WARN_ON_ONCE(!period_vtime)) {
1378 spin_unlock_irq(&ioc->lock);
1379 return;
1380 }
1381
1382 /*
1383 * Waiters determine the sleep durations based on the vrate they
1384 * saw at the time of sleep. If vrate has increased, some waiters
1385 * could be sleeping for too long. Wake up tardy waiters which
1386 * should have woken up in the last period and expire idle iocgs.
1387 */
1388 list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
1389 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1390 !iocg_is_idle(iocg))
1391 continue;
1392
1393 spin_lock(&iocg->waitq.lock);
1394
1395 if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
1396 /* might be oversleeping vtime / hweight changes, kick */
1397 iocg_kick_waitq(iocg, &now);
1398 iocg_kick_delay(iocg, &now, 0);
1399 } else if (iocg_is_idle(iocg)) {
1400 /* no waiter and idle, deactivate */
1401 iocg->last_inuse = iocg->inuse;
1402 __propagate_active_weight(iocg, 0, 0);
1403 list_del_init(&iocg->active_list);
1404 }
1405
1406 spin_unlock(&iocg->waitq.lock);
1407 }
1408 commit_active_weights(ioc);
1409
1410 /* calc usages and see whether some weights need to be moved around */
1411 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1412 u64 vdone, vtime, vusage, vmargin, vmin;
1413 u32 hw_active, hw_inuse, usage;
1414
1415 /*
1416 * Collect unused and wind vtime closer to vnow to prevent
1417 * iocgs from accumulating a large amount of budget.
1418 */
1419 vdone = atomic64_read(&iocg->done_vtime);
1420 vtime = atomic64_read(&iocg->vtime);
1421 current_hweight(iocg, &hw_active, &hw_inuse);
1422
1423 /*
1424 * Latency QoS detection doesn't account for IOs which are
1425 * in-flight for longer than a period. Detect them by
1426 * comparing vdone against period start. If lagging behind
1427 * IOs from past periods, don't increase vrate.
1428 */
1429 if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
1430 !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
1431 time_after64(vtime, vdone) &&
1432 time_after64(vtime, now.vnow -
1433 MAX_LAGGING_PERIODS * period_vtime) &&
1434 time_before64(vdone, now.vnow - period_vtime))
1435 nr_lagging++;
1436
1437 if (waitqueue_active(&iocg->waitq))
1438 vusage = now.vnow - iocg->last_vtime;
1439 else if (time_before64(iocg->last_vtime, vtime))
1440 vusage = vtime - iocg->last_vtime;
1441 else
1442 vusage = 0;
1443
1444 iocg->last_vtime += vusage;
1445 /*
1446 * Factor in in-flight vtime into vusage to avoid
1447 * high-latency completions appearing as idle. This should
1448 * be done after the above ->last_vtime adjustment.
1449 */
1450 vusage = max(vusage, vtime - vdone);
1451
1452 /* calculate hweight based usage ratio and record */
1453 if (vusage) {
1454 usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
1455 period_vtime);
1456 iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
1457 iocg->usages[iocg->usage_idx] = usage;
1458 } else {
1459 usage = 0;
1460 }
1461
1462 /* see whether there's surplus vtime */
1463 vmargin = ioc->margin_us * now.vrate;
1464 vmin = now.vnow - vmargin;
1465
1466 iocg->has_surplus = false;
1467
1468 if (!waitqueue_active(&iocg->waitq) &&
1469 time_before64(vtime, vmin)) {
1470 u64 delta = vmin - vtime;
1471
1472 /* throw away surplus vtime */
1473 atomic64_add(delta, &iocg->vtime);
1474 atomic64_add(delta, &iocg->done_vtime);
1475 iocg->last_vtime += delta;
1476 /* if usage is sufficiently low, maybe it can donate */
1477 if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
1478 iocg->has_surplus = true;
1479 nr_surpluses++;
1480 }
1481 } else if (hw_inuse < hw_active) {
1482 u32 new_hwi, new_inuse;
1483
1484 /* was donating but might need to take back some */
1485 if (waitqueue_active(&iocg->waitq)) {
1486 new_hwi = hw_active;
1487 } else {
1488 new_hwi = max(hw_inuse,
1489 usage * SURPLUS_SCALE_PCT / 100 +
1490 SURPLUS_SCALE_ABS);
1491 }
1492
1493 new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
1494 hw_inuse);
1495 new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
1496
1497 if (new_inuse > iocg->inuse) {
1498 TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
1499 iocg->inuse, new_inuse,
1500 hw_inuse, new_hwi);
1501 __propagate_active_weight(iocg, iocg->weight,
1502 new_inuse);
1503 }
1504 } else {
1505 /* genuinely out of vtime */
1506 nr_shortages++;
1507 }
1508 }
1509
1510 if (!nr_shortages || !nr_surpluses)
1511 goto skip_surplus_transfers;
1512
1513 /* there are both shortages and surpluses, transfer surpluses */
1514 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1515 u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
1516 int nr_valid = 0;
1517
1518 if (!iocg->has_surplus)
1519 continue;
1520
1521 /* base the decision on max historical usage */
1522 for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
1523 if (iocg->usages[i]) {
1524 usage = max(usage, iocg->usages[i]);
1525 nr_valid++;
1526 }
1527 }
1528 if (nr_valid < MIN_VALID_USAGES)
1529 continue;
1530
1531 current_hweight(iocg, &hw_active, &hw_inuse);
1532 new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
1533 if (!new_hwi)
1534 continue;
1535
1536 new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
1537 hw_inuse);
1538 if (new_inuse < iocg->inuse) {
1539 TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
1540 iocg->inuse, new_inuse,
1541 hw_inuse, new_hwi);
1542 __propagate_active_weight(iocg, iocg->weight, new_inuse);
1543 }
1544 }
1545 skip_surplus_transfers:
1546 commit_active_weights(ioc);
1547
1548 /*
1549 * If q is getting clogged or we're missing too much, we're issuing
1550 * too much IO and should lower vtime rate. If we're not missing
1551 * and experiencing shortages but not surpluses, we're too stingy
1552 * and should increase vtime rate.
1553 */
1554 prev_busy_level = ioc->busy_level;
1555 if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
1556 missed_ppm[READ] > ppm_rthr ||
1557 missed_ppm[WRITE] > ppm_wthr) {
1558 /* clearly missing QoS targets, slow down vrate */
1559 ioc->busy_level = max(ioc->busy_level, 0);
1560 ioc->busy_level++;
1561 } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
1562 missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
1563 missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
1564 /* QoS targets are being met with >25% margin */
1565 if (nr_shortages) {
1566 /*
1567 * We're throttling while the device has spare
1568 * capacity. If vrate was being slowed down, stop.
1569 */
1570 ioc->busy_level = min(ioc->busy_level, 0);
1571
1572 /*
1573 * If there are IOs spanning multiple periods, wait
1574 * them out before pushing the device harder. If
1575 * there are surpluses, let redistribution work it
1576 * out first.
1577 */
1578 if (!nr_lagging && !nr_surpluses)
1579 ioc->busy_level--;
1580 } else {
1581 /*
1582 * Nobody is being throttled and the users aren't
1583 * issuing enough IOs to saturate the device. We
1584 * simply don't know how close the device is to
1585 * saturation. Coast.
1586 */
1587 ioc->busy_level = 0;
1588 }
1589 } else {
1590 /* inside the hysteresis margin, we're good */
1591 ioc->busy_level = 0;
1592 }
1593
1594 ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
1595
1596 if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
1597 u64 vrate = atomic64_read(&ioc->vtime_rate);
1598 u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
1599
1600 /* rq_wait signal is always reliable, ignore user vrate_min */
1601 if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
1602 vrate_min = VRATE_MIN;
1603
1604 /*
1605 * If vrate is out of bounds, apply clamp gradually as the
1606 * bounds can change abruptly. Otherwise, apply busy_level
1607 * based adjustment.
1608 */
1609 if (vrate < vrate_min) {
1610 vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
1611 100);
1612 vrate = min(vrate, vrate_min);
1613 } else if (vrate > vrate_max) {
1614 vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
1615 100);
1616 vrate = max(vrate, vrate_max);
1617 } else {
1618 int idx = min_t(int, abs(ioc->busy_level),
1619 ARRAY_SIZE(vrate_adj_pct) - 1);
1620 u32 adj_pct = vrate_adj_pct[idx];
1621
1622 if (ioc->busy_level > 0)
1623 adj_pct = 100 - adj_pct;
1624 else
1625 adj_pct = 100 + adj_pct;
1626
1627 vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1628 vrate_min, vrate_max);
1629 }
1630
1631 trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
1632 nr_lagging, nr_shortages,
1633 nr_surpluses);
1634
1635 atomic64_set(&ioc->vtime_rate, vrate);
1636 ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
1637 ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
1638 } else if (ioc->busy_level != prev_busy_level || nr_lagging) {
1639 trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
1640 missed_ppm, rq_wait_pct, nr_lagging,
1641 nr_shortages, nr_surpluses);
1642 }
1643
1644 ioc_refresh_params(ioc, false);
1645
1646 /*
1647 * This period is done. Move onto the next one. If nothing's
1648 * going on with the device, stop the timer.
1649 */
1650 atomic64_inc(&ioc->cur_period);
1651
1652 if (ioc->running != IOC_STOP) {
1653 if (!list_empty(&ioc->active_iocgs)) {
1654 ioc_start_period(ioc, &now);
1655 } else {
1656 ioc->busy_level = 0;
1657 ioc->running = IOC_IDLE;
1658 }
1659 }
1660
1661 spin_unlock_irq(&ioc->lock);
1662 }
1663
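/*
 * Builtin linear cost model.  A non-merge IO is charged a sequential or
 * random base cost depending on how far it is from the previous IO's end
 * sector (iocg->cursor) and every IO adds a per-page component
 * proportional to its size.
 */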
1664 static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
1665 bool is_merge, u64 *costp)
1666 {
1667 struct ioc *ioc = iocg->ioc;
1668 u64 coef_seqio, coef_randio, coef_page;
1669 u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
1670 u64 seek_pages = 0;
1671 u64 cost = 0;
1672
1673 switch (bio_op(bio)) {
1674 case REQ_OP_READ:
1675 coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
1676 coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO];
1677 coef_page = ioc->params.lcoefs[LCOEF_RPAGE];
1678 break;
1679 case REQ_OP_WRITE:
1680 coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO];
1681 coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO];
1682 coef_page = ioc->params.lcoefs[LCOEF_WPAGE];
1683 break;
1684 default:
1685 goto out;
1686 }
1687
1688 if (iocg->cursor) {
1689 seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
1690 seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
1691 }
1692
1693 if (!is_merge) {
1694 if (seek_pages > LCOEF_RANDIO_PAGES) {
1695 cost += coef_randio;
1696 } else {
1697 cost += coef_seqio;
1698 }
1699 }
1700 cost += pages * coef_page;
1701 out:
1702 *costp = cost;
1703 }
1704
1705 static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
1706 {
1707 u64 cost;
1708
1709 calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
1710 return cost;
1711 }
1712
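/*
 * rq-qos throttle hook.  Charge @bio's cost to its iocg and, if the iocg
 * is over budget, either wait on iocg->waitq or, for bios which must be
 * issued regardless, record the cost as vdebt and engage the issuer
 * delay mechanism.
 */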
1713 static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
1714 {
1715 struct blkcg_gq *blkg = bio->bi_blkg;
1716 struct ioc *ioc = rqos_to_ioc(rqos);
1717 struct ioc_gq *iocg = blkg_to_iocg(blkg);
1718 struct ioc_now now;
1719 struct iocg_wait wait;
1720 u32 hw_active, hw_inuse;
1721 u64 abs_cost, cost, vtime;
1722
1723 /* bypass IOs if disabled or for root cgroup */
1724 if (!ioc->enabled || !iocg->level)
1725 return;
1726
1727 /* always activate so that even 0 cost IOs get protected to some level */
1728 if (!iocg_activate(iocg, &now))
1729 return;
1730
1731 /* calculate the absolute vtime cost */
1732 abs_cost = calc_vtime_cost(bio, iocg, false);
1733 if (!abs_cost)
1734 return;
1735
1736 iocg->cursor = bio_end_sector(bio);
1737
1738 vtime = atomic64_read(&iocg->vtime);
1739 current_hweight(iocg, &hw_active, &hw_inuse);
1740
1741 if (hw_inuse < hw_active &&
1742 time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
1743 TRACE_IOCG_PATH(inuse_reset, iocg, &now,
1744 iocg->inuse, iocg->weight, hw_inuse, hw_active);
1745 spin_lock_irq(&ioc->lock);
1746 propagate_active_weight(iocg, iocg->weight, iocg->weight);
1747 spin_unlock_irq(&ioc->lock);
1748 current_hweight(iocg, &hw_active, &hw_inuse);
1749 }
1750
1751 cost = abs_cost_to_cost(abs_cost, hw_inuse);
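	/*
	 * Illustration: abs_cost is in device vtime while @cost is scaled to
	 * this cgroup's share.  e.g. with hw_inuse at a quarter of
	 * HWEIGHT_WHOLE, a 1ms abs_cost consumes 4ms of the cgroup's local
	 * vtime budget.
	 */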
1752
1753 /*
1754 	 * If no one's waiting and we're within budget, issue right away. The
1755 	 * tests are racy but the races aren't systemic - we only miss once
1756 	 * in a while, which is fine.
1757 */
1758 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1759 time_before_eq64(vtime + cost, now.vnow)) {
1760 iocg_commit_bio(iocg, bio, cost);
1761 return;
1762 }
1763
1764 /*
1765 * We activated above but w/o any synchronization. Deactivation is
1766 * synchronized with waitq.lock and we won't get deactivated as long
1767 	 * as we're waiting or have debt, so we're good if we're activated
1768 * here. In the unlikely case that we aren't, just issue the IO.
1769 */
1770 spin_lock_irq(&iocg->waitq.lock);
1771
1772 if (unlikely(list_empty(&iocg->active_list))) {
1773 spin_unlock_irq(&iocg->waitq.lock);
1774 iocg_commit_bio(iocg, bio, cost);
1775 return;
1776 }
1777
1778 /*
1779 * We're over budget. If @bio has to be issued regardless, remember
1780 * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
1781 * off the debt before waking more IOs.
1782 *
1783 * This way, the debt is continuously paid off each period with the
1784 * actual budget available to the cgroup. If we just wound vtime, we
1785 * would incorrectly use the current hw_inuse for the entire amount
1786 * which, for example, can lead to the cgroup staying blocked for a
1787 * long time even with substantially raised hw_inuse.
1788 *
1789 * An iocg with vdebt should stay online so that the timer can keep
1790 	 * deducting its vdebt and [de]activating the use_delay mechanism
1791 * accordingly. We don't want to race against the timer trying to
1792 * clear them and leave @iocg inactive w/ dangling use_delay heavily
1793 * penalizing the cgroup and its descendants.
1794 */
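	/*
	 * Example (for illustration only): suppose hw_inuse is currently 1%
	 * and a must-issue swap-out bio costs 1ms of device time.  Converting
	 * it at today's rate would wind this cgroup's vtime by 100ms in one
	 * go and could keep it blocked long after its share is raised.
	 * Recording 1ms as abs_vdebt instead lets each period convert and pay
	 * down the debt at whatever hw_inuse is in effect then.
	 */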
1795 if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
1796 iocg->abs_vdebt += abs_cost;
1797 if (iocg_kick_delay(iocg, &now, cost))
1798 blkcg_schedule_throttle(rqos->q,
1799 (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
1800 spin_unlock_irq(&iocg->waitq.lock);
1801 return;
1802 }
1803
1804 /*
1805 * Append self to the waitq and schedule the wakeup timer if we're
1806 * the first waiter. The timer duration is calculated based on the
1807 * current vrate. vtime and hweight changes can make it too short
1808 * or too long. Each wait entry records the absolute cost it's
1809 * waiting for to allow re-evaluation using a custom wait entry.
1810 *
1811 * If too short, the timer simply reschedules itself. If too long,
1812 * the period timer will notice and trigger wakeups.
1813 *
1814 * All waiters are on iocg->waitq and the wait states are
1815 * synchronized using waitq.lock.
1816 */
1817 init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
1818 wait.wait.private = current;
1819 wait.bio = bio;
1820 wait.abs_cost = abs_cost;
1821 wait.committed = false; /* will be set true by waker */
1822
1823 __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
1824 iocg_kick_waitq(iocg, &now);
1825
1826 spin_unlock_irq(&iocg->waitq.lock);
1827
1828 while (true) {
1829 set_current_state(TASK_UNINTERRUPTIBLE);
1830 if (wait.committed)
1831 break;
1832 io_schedule();
1833 }
1834
1835 /* waker already committed us, proceed */
1836 finish_wait(&iocg->waitq, &wait.wait);
1837 }
1838
1839 static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
1840 struct bio *bio)
1841 {
1842 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1843 struct ioc *ioc = iocg->ioc;
1844 sector_t bio_end = bio_end_sector(bio);
1845 struct ioc_now now;
1846 u32 hw_inuse;
1847 u64 abs_cost, cost;
1848 unsigned long flags;
1849
1850 /* bypass if disabled or for root cgroup */
1851 if (!ioc->enabled || !iocg->level)
1852 return;
1853
1854 abs_cost = calc_vtime_cost(bio, iocg, true);
1855 if (!abs_cost)
1856 return;
1857
1858 ioc_now(ioc, &now);
1859 current_hweight(iocg, NULL, &hw_inuse);
1860 cost = abs_cost_to_cost(abs_cost, hw_inuse);
1861
1862 /* update cursor if backmerging into the request at the cursor */
1863 if (blk_rq_pos(rq) < bio_end &&
1864 blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
1865 iocg->cursor = bio_end;
1866
1867 /*
1868 * Charge if there's enough vtime budget and the existing request has
1869 * cost assigned.
1870 */
1871 if (rq->bio && rq->bio->bi_iocost_cost &&
1872 time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
1873 iocg_commit_bio(iocg, bio, cost);
1874 return;
1875 }
1876
1877 /*
1878 * Otherwise, account it as debt if @iocg is online, which it should
1879 * be for the vast majority of cases. See debt handling in
1880 * ioc_rqos_throttle() for details.
1881 */
1882 spin_lock_irqsave(&iocg->waitq.lock, flags);
1883 if (likely(!list_empty(&iocg->active_list))) {
1884 iocg->abs_vdebt += abs_cost;
1885 iocg_kick_delay(iocg, &now, cost);
1886 } else {
1887 iocg_commit_bio(iocg, bio, cost);
1888 }
1889 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1890 }
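/*
 * Illustration of the merge path above: because calc_vtime_cost() is called
 * with is_merge == true, a merged bio is only charged the size-proportional
 * part of the linear model; the per-IO seek cost was already paid by the
 * request it merges into.  If the cgroup is out of vtime budget at that
 * point, the charge is parked in abs_vdebt instead of waiting, and the
 * period timer pays it off later.
 */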
1891
1892 static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
1893 {
1894 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1895
1896 if (iocg && bio->bi_iocost_cost)
1897 atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
1898 }
1899
1900 static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
1901 {
1902 struct ioc *ioc = rqos_to_ioc(rqos);
1903 u64 on_q_ns, rq_wait_ns;
1904 int pidx, rw;
1905
1906 if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
1907 return;
1908
1909 switch (req_op(rq) & REQ_OP_MASK) {
1910 case REQ_OP_READ:
1911 pidx = QOS_RLAT;
1912 rw = READ;
1913 break;
1914 case REQ_OP_WRITE:
1915 pidx = QOS_WLAT;
1916 rw = WRITE;
1917 break;
1918 default:
1919 return;
1920 }
1921
1922 on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
1923 rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
1924
1925 if (on_q_ns <= ioc->params.qos[pidx] * NSEC_PER_USEC)
1926 this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
1927 else
1928 this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
1929
1930 this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns);
1931 }
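/*
 * Example of the completion-side bookkeeping above: with rlat=5000 in
 * io.cost.qos, a read whose total on-queue time (completion minus
 * allocation) is 3ms counts as met while one taking 8ms counts as missed.
 * The per-cpu met/missed counts and the accumulated rq_wait_ns are what the
 * period timer folds into missed_ppm[] and rq_wait_pct when deciding how to
 * move vrate.
 */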
1932
1933 static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
1934 {
1935 struct ioc *ioc = rqos_to_ioc(rqos);
1936
1937 spin_lock_irq(&ioc->lock);
1938 ioc_refresh_params(ioc, false);
1939 spin_unlock_irq(&ioc->lock);
1940 }
1941
1942 static void ioc_rqos_exit(struct rq_qos *rqos)
1943 {
1944 struct ioc *ioc = rqos_to_ioc(rqos);
1945
1946 blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
1947
1948 spin_lock_irq(&ioc->lock);
1949 ioc->running = IOC_STOP;
1950 spin_unlock_irq(&ioc->lock);
1951
1952 del_timer_sync(&ioc->timer);
1953 free_percpu(ioc->pcpu_stat);
1954 kfree(ioc);
1955 }
1956
1957 static struct rq_qos_ops ioc_rqos_ops = {
1958 .throttle = ioc_rqos_throttle,
1959 .merge = ioc_rqos_merge,
1960 .done_bio = ioc_rqos_done_bio,
1961 .done = ioc_rqos_done,
1962 .queue_depth_changed = ioc_rqos_queue_depth_changed,
1963 .exit = ioc_rqos_exit,
1964 };
1965
1966 static int blk_iocost_init(struct request_queue *q)
1967 {
1968 struct ioc *ioc;
1969 struct rq_qos *rqos;
1970 int ret;
1971
1972 ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
1973 if (!ioc)
1974 return -ENOMEM;
1975
1976 ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
1977 if (!ioc->pcpu_stat) {
1978 kfree(ioc);
1979 return -ENOMEM;
1980 }
1981
1982 rqos = &ioc->rqos;
1983 rqos->id = RQ_QOS_COST;
1984 rqos->ops = &ioc_rqos_ops;
1985 rqos->q = q;
1986
1987 spin_lock_init(&ioc->lock);
1988 timer_setup(&ioc->timer, ioc_timer_fn, 0);
1989 INIT_LIST_HEAD(&ioc->active_iocgs);
1990
1991 ioc->running = IOC_IDLE;
1992 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
1993 seqcount_init(&ioc->period_seqcount);
1994 ioc->period_at = ktime_to_us(ktime_get());
1995 atomic64_set(&ioc->cur_period, 0);
1996 atomic_set(&ioc->hweight_gen, 0);
1997
1998 spin_lock_irq(&ioc->lock);
1999 ioc->autop_idx = AUTOP_INVALID;
2000 ioc_refresh_params(ioc, true);
2001 spin_unlock_irq(&ioc->lock);
2002
2003 rq_qos_add(q, rqos);
2004 ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
2005 if (ret) {
2006 rq_qos_del(q, rqos);
2007 free_percpu(ioc->pcpu_stat);
2008 kfree(ioc);
2009 return ret;
2010 }
2011 return 0;
2012 }
2013
2014 static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2015 {
2016 struct ioc_cgrp *iocc;
2017
2018 iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
2019 if (!iocc)
2020 return NULL;
2021
2022 iocc->dfl_weight = CGROUP_WEIGHT_DFL;
2023 return &iocc->cpd;
2024 }
2025
2026 static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2027 {
2028 kfree(container_of(cpd, struct ioc_cgrp, cpd));
2029 }
2030
2031 static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2032 struct blkcg *blkcg)
2033 {
2034 int levels = blkcg->css.cgroup->level + 1;
2035 struct ioc_gq *iocg;
2036
2037 iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]),
2038 gfp, q->node);
2039 if (!iocg)
2040 return NULL;
2041
2042 return &iocg->pd;
2043 }
2044
2045 static void ioc_pd_init(struct blkg_policy_data *pd)
2046 {
2047 struct ioc_gq *iocg = pd_to_iocg(pd);
2048 struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2049 struct ioc *ioc = q_to_ioc(blkg->q);
2050 struct ioc_now now;
2051 struct blkcg_gq *tblkg;
2052 unsigned long flags;
2053
2054 ioc_now(ioc, &now);
2055
2056 iocg->ioc = ioc;
2057 atomic64_set(&iocg->vtime, now.vnow);
2058 atomic64_set(&iocg->done_vtime, now.vnow);
2059 atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2060 INIT_LIST_HEAD(&iocg->active_list);
2061 iocg->hweight_active = HWEIGHT_WHOLE;
2062 iocg->hweight_inuse = HWEIGHT_WHOLE;
2063
2064 init_waitqueue_head(&iocg->waitq);
2065 hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2066 iocg->waitq_timer.function = iocg_waitq_timer_fn;
2067 hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2068 iocg->delay_timer.function = iocg_delay_timer_fn;
2069
2070 iocg->level = blkg->blkcg->css.cgroup->level;
2071
2072 for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2073 struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2074 iocg->ancestors[tiocg->level] = tiocg;
2075 }
2076
2077 spin_lock_irqsave(&ioc->lock, flags);
2078 weight_updated(iocg);
2079 spin_unlock_irqrestore(&ioc->lock, flags);
2080 }
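/*
 * ancestors[] layout for illustration (cgroup names are hypothetical): for
 * a cgroup at root/workload/app (level 2), ioc_pd_alloc() sized the array
 * for three entries and the loop above fills ancestors[0] with the root
 * iocg, ancestors[1] with workload's and ancestors[2] with the cgroup's
 * own, indexed by each ancestor's level.  This lets hweight calculation and
 * weight propagation walk the path without chasing parent pointers.
 */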
2081
2082 static void ioc_pd_free(struct blkg_policy_data *pd)
2083 {
2084 struct ioc_gq *iocg = pd_to_iocg(pd);
2085 struct ioc *ioc = iocg->ioc;
2086 unsigned long flags;
2087
2088 if (ioc) {
2089 spin_lock_irqsave(&ioc->lock, flags);
2090 if (!list_empty(&iocg->active_list)) {
2091 propagate_active_weight(iocg, 0, 0);
2092 list_del_init(&iocg->active_list);
2093 }
2094 spin_unlock_irqrestore(&ioc->lock, flags);
2095
2096 hrtimer_cancel(&iocg->waitq_timer);
2097 hrtimer_cancel(&iocg->delay_timer);
2098 }
2099 kfree(iocg);
2100 }
2101
2102 static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2103 int off)
2104 {
2105 const char *dname = blkg_dev_name(pd->blkg);
2106 struct ioc_gq *iocg = pd_to_iocg(pd);
2107
2108 if (dname && iocg->cfg_weight)
2109 seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight);
2110 return 0;
2111 }
2112
2113
2114 static int ioc_weight_show(struct seq_file *sf, void *v)
2115 {
2116 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2117 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2118
2119 seq_printf(sf, "default %u\n", iocc->dfl_weight);
2120 blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
2121 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2122 return 0;
2123 }
2124
2125 static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
2126 size_t nbytes, loff_t off)
2127 {
2128 struct blkcg *blkcg = css_to_blkcg(of_css(of));
2129 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2130 struct blkg_conf_ctx ctx;
2131 struct ioc_gq *iocg;
2132 u32 v;
2133 int ret;
2134
2135 if (!strchr(buf, ':')) {
2136 struct blkcg_gq *blkg;
2137
2138 if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
2139 return -EINVAL;
2140
2141 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2142 return -EINVAL;
2143
2144 spin_lock(&blkcg->lock);
2145 iocc->dfl_weight = v;
2146 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
2147 struct ioc_gq *iocg = blkg_to_iocg(blkg);
2148
2149 if (iocg) {
2150 spin_lock_irq(&iocg->ioc->lock);
2151 weight_updated(iocg);
2152 spin_unlock_irq(&iocg->ioc->lock);
2153 }
2154 }
2155 spin_unlock(&blkcg->lock);
2156
2157 return nbytes;
2158 }
2159
2160 ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
2161 if (ret)
2162 return ret;
2163
2164 iocg = blkg_to_iocg(ctx.blkg);
2165
2166 if (!strncmp(ctx.body, "default", 7)) {
2167 v = 0;
2168 } else {
2169 if (!sscanf(ctx.body, "%u", &v))
2170 goto einval;
2171 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2172 goto einval;
2173 }
2174
2175 spin_lock(&iocg->ioc->lock);
2176 iocg->cfg_weight = v;
2177 weight_updated(iocg);
2178 spin_unlock(&iocg->ioc->lock);
2179
2180 blkg_conf_finish(&ctx);
2181 return nbytes;
2182
2183 einval:
2184 blkg_conf_finish(&ctx);
2185 return -EINVAL;
2186 }
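/*
 * Example io.weight usage (the "8:16" device is illustrative):
 *
 *	# cat io.weight
 *	default 100
 *	8:16 200
 *
 *	# echo 300 > io.weight			- set the cgroup default
 *	# echo "8:16 50" > io.weight		- set a per-device weight
 *	# echo "8:16 default" > io.weight	- revert to the default
 *
 * Weights must be between CGROUP_WEIGHT_MIN and CGROUP_WEIGHT_MAX; writes
 * without a "MAJ:MIN" prefix update dfl_weight for the whole cgroup.
 */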
2187
2188 static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2189 int off)
2190 {
2191 const char *dname = blkg_dev_name(pd->blkg);
2192 struct ioc *ioc = pd_to_iocg(pd)->ioc;
2193
2194 if (!dname)
2195 return 0;
2196
2197 seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
2198 dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
2199 ioc->params.qos[QOS_RPPM] / 10000,
2200 ioc->params.qos[QOS_RPPM] % 10000 / 100,
2201 ioc->params.qos[QOS_RLAT],
2202 ioc->params.qos[QOS_WPPM] / 10000,
2203 ioc->params.qos[QOS_WPPM] % 10000 / 100,
2204 ioc->params.qos[QOS_WLAT],
2205 ioc->params.qos[QOS_MIN] / 10000,
2206 ioc->params.qos[QOS_MIN] % 10000 / 100,
2207 ioc->params.qos[QOS_MAX] / 10000,
2208 ioc->params.qos[QOS_MAX] % 10000 / 100);
2209 return 0;
2210 }
2211
2212 static int ioc_qos_show(struct seq_file *sf, void *v)
2213 {
2214 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2215
2216 blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
2217 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2218 return 0;
2219 }
2220
2221 static const match_table_t qos_ctrl_tokens = {
2222 { QOS_ENABLE, "enable=%u" },
2223 { QOS_CTRL, "ctrl=%s" },
2224 { NR_QOS_CTRL_PARAMS, NULL },
2225 };
2226
2227 static const match_table_t qos_tokens = {
2228 { QOS_RPPM, "rpct=%s" },
2229 { QOS_RLAT, "rlat=%u" },
2230 { QOS_WPPM, "wpct=%s" },
2231 { QOS_WLAT, "wlat=%u" },
2232 { QOS_MIN, "min=%s" },
2233 { QOS_MAX, "max=%s" },
2234 { NR_QOS_PARAMS, NULL },
2235 };
2236
2237 static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
2238 size_t nbytes, loff_t off)
2239 {
2240 struct gendisk *disk;
2241 struct ioc *ioc;
2242 u32 qos[NR_QOS_PARAMS];
2243 bool enable, user;
2244 char *p;
2245 int ret;
2246
2247 disk = blkcg_conf_get_disk(&input);
2248 if (IS_ERR(disk))
2249 return PTR_ERR(disk);
2250
2251 ioc = q_to_ioc(disk->queue);
2252 if (!ioc) {
2253 ret = blk_iocost_init(disk->queue);
2254 if (ret)
2255 goto err;
2256 ioc = q_to_ioc(disk->queue);
2257 }
2258
2259 spin_lock_irq(&ioc->lock);
2260 memcpy(qos, ioc->params.qos, sizeof(qos));
2261 enable = ioc->enabled;
2262 user = ioc->user_qos_params;
2263 spin_unlock_irq(&ioc->lock);
2264
2265 while ((p = strsep(&input, " \t\n"))) {
2266 substring_t args[MAX_OPT_ARGS];
2267 char buf[32];
2268 int tok;
2269 s64 v;
2270
2271 if (!*p)
2272 continue;
2273
2274 switch (match_token(p, qos_ctrl_tokens, args)) {
2275 case QOS_ENABLE:
2276 match_u64(&args[0], &v);
2277 enable = v;
2278 continue;
2279 case QOS_CTRL:
2280 match_strlcpy(buf, &args[0], sizeof(buf));
2281 if (!strcmp(buf, "auto"))
2282 user = false;
2283 else if (!strcmp(buf, "user"))
2284 user = true;
2285 else
2286 goto einval;
2287 continue;
2288 }
2289
2290 tok = match_token(p, qos_tokens, args);
2291 switch (tok) {
2292 case QOS_RPPM:
2293 case QOS_WPPM:
2294 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2295 sizeof(buf))
2296 goto einval;
2297 if (cgroup_parse_float(buf, 2, &v))
2298 goto einval;
2299 if (v < 0 || v > 10000)
2300 goto einval;
2301 qos[tok] = v * 100;
2302 break;
2303 case QOS_RLAT:
2304 case QOS_WLAT:
2305 if (match_u64(&args[0], &v))
2306 goto einval;
2307 qos[tok] = v;
2308 break;
2309 case QOS_MIN:
2310 case QOS_MAX:
2311 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2312 sizeof(buf))
2313 goto einval;
2314 if (cgroup_parse_float(buf, 2, &v))
2315 goto einval;
2316 if (v < 0)
2317 goto einval;
2318 qos[tok] = clamp_t(s64, v * 100,
2319 VRATE_MIN_PPM, VRATE_MAX_PPM);
2320 break;
2321 default:
2322 goto einval;
2323 }
2324 user = true;
2325 }
2326
2327 if (qos[QOS_MIN] > qos[QOS_MAX])
2328 goto einval;
2329
2330 spin_lock_irq(&ioc->lock);
2331
2332 if (enable) {
2333 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2334 ioc->enabled = true;
2335 } else {
2336 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2337 ioc->enabled = false;
2338 }
2339
2340 if (user) {
2341 memcpy(ioc->params.qos, qos, sizeof(qos));
2342 ioc->user_qos_params = true;
2343 } else {
2344 ioc->user_qos_params = false;
2345 }
2346
2347 ioc_refresh_params(ioc, true);
2348 spin_unlock_irq(&ioc->lock);
2349
2350 put_disk_and_module(disk);
2351 return nbytes;
2352 einval:
2353 ret = -EINVAL;
2354 err:
2355 put_disk_and_module(disk);
2356 return ret;
2357 }
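/*
 * Example io.cost.qos line (written to the root cgroup; the device and
 * numbers are illustrative):
 *
 *	8:16 enable=1 ctrl=user rpct=95.00 rlat=10000 wpct=95.00 wlat=20000 min=50.00 max=150.00
 *
 * rpct/wpct are read/write latency percentiles with two decimal places,
 * rlat/wlat the matching targets in microseconds, and min/max bound vrate
 * as percentages.  Setting any parameter switches ctrl to "user";
 * "ctrl=auto" reverts to the built-in per-device-class defaults.
 */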
2358
2359 static u64 ioc_cost_model_prfill(struct seq_file *sf,
2360 struct blkg_policy_data *pd, int off)
2361 {
2362 const char *dname = blkg_dev_name(pd->blkg);
2363 struct ioc *ioc = pd_to_iocg(pd)->ioc;
2364 u64 *u = ioc->params.i_lcoefs;
2365
2366 if (!dname)
2367 return 0;
2368
2369 seq_printf(sf, "%s ctrl=%s model=linear "
2370 "rbps=%llu rseqiops=%llu rrandiops=%llu "
2371 "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
2372 dname, ioc->user_cost_model ? "user" : "auto",
2373 u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
2374 u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
2375 return 0;
2376 }
2377
2378 static int ioc_cost_model_show(struct seq_file *sf, void *v)
2379 {
2380 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2381
2382 blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
2383 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2384 return 0;
2385 }
2386
2387 static const match_table_t cost_ctrl_tokens = {
2388 { COST_CTRL, "ctrl=%s" },
2389 { COST_MODEL, "model=%s" },
2390 { NR_COST_CTRL_PARAMS, NULL },
2391 };
2392
2393 static const match_table_t i_lcoef_tokens = {
2394 { I_LCOEF_RBPS, "rbps=%u" },
2395 { I_LCOEF_RSEQIOPS, "rseqiops=%u" },
2396 { I_LCOEF_RRANDIOPS, "rrandiops=%u" },
2397 { I_LCOEF_WBPS, "wbps=%u" },
2398 { I_LCOEF_WSEQIOPS, "wseqiops=%u" },
2399 { I_LCOEF_WRANDIOPS, "wrandiops=%u" },
2400 { NR_I_LCOEFS, NULL },
2401 };
2402
2403 static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
2404 size_t nbytes, loff_t off)
2405 {
2406 struct gendisk *disk;
2407 struct ioc *ioc;
2408 u64 u[NR_I_LCOEFS];
2409 bool user;
2410 char *p;
2411 int ret;
2412
2413 disk = blkcg_conf_get_disk(&input);
2414 if (IS_ERR(disk))
2415 return PTR_ERR(disk);
2416
2417 ioc = q_to_ioc(disk->queue);
2418 if (!ioc) {
2419 ret = blk_iocost_init(disk->queue);
2420 if (ret)
2421 goto err;
2422 ioc = q_to_ioc(disk->queue);
2423 }
2424
2425 spin_lock_irq(&ioc->lock);
2426 memcpy(u, ioc->params.i_lcoefs, sizeof(u));
2427 user = ioc->user_cost_model;
2428 spin_unlock_irq(&ioc->lock);
2429
2430 while ((p = strsep(&input, " \t\n"))) {
2431 substring_t args[MAX_OPT_ARGS];
2432 char buf[32];
2433 int tok;
2434 u64 v;
2435
2436 if (!*p)
2437 continue;
2438
2439 switch (match_token(p, cost_ctrl_tokens, args)) {
2440 case COST_CTRL:
2441 match_strlcpy(buf, &args[0], sizeof(buf));
2442 if (!strcmp(buf, "auto"))
2443 user = false;
2444 else if (!strcmp(buf, "user"))
2445 user = true;
2446 else
2447 goto einval;
2448 continue;
2449 case COST_MODEL:
2450 match_strlcpy(buf, &args[0], sizeof(buf));
2451 if (strcmp(buf, "linear"))
2452 goto einval;
2453 continue;
2454 }
2455
2456 tok = match_token(p, i_lcoef_tokens, args);
2457 if (tok == NR_I_LCOEFS)
2458 goto einval;
2459 if (match_u64(&args[0], &v))
2460 goto einval;
2461 u[tok] = v;
2462 user = true;
2463 }
2464
2465 spin_lock_irq(&ioc->lock);
2466 if (user) {
2467 memcpy(ioc->params.i_lcoefs, u, sizeof(u));
2468 ioc->user_cost_model = true;
2469 } else {
2470 ioc->user_cost_model = false;
2471 }
2472 ioc_refresh_params(ioc, true);
2473 spin_unlock_irq(&ioc->lock);
2474
2475 put_disk_and_module(disk);
2476 return nbytes;
2477
2478 einval:
2479 ret = -EINVAL;
2480 err:
2481 put_disk_and_module(disk);
2482 return ret;
2483 }
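/*
 * Example io.cost.model line (throughput numbers are illustrative):
 *
 *	8:16 ctrl=user model=linear rbps=500000000 rseqiops=50000 rrandiops=8000 wbps=400000000 wseqiops=40000 wrandiops=6000
 *
 * The six coefficients describe the device's raw read/write bandwidth and
 * sequential/random IOPS; ioc_refresh_params() converts them into the
 * per-IO and per-page vtime coefficients used by calc_vtime_cost_builtin().
 * "ctrl=auto" drops back to the built-in parameters, and "linear" is
 * currently the only accepted model.
 */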
2484
2485 static struct cftype ioc_files[] = {
2486 {
2487 .name = "weight",
2488 .flags = CFTYPE_NOT_ON_ROOT,
2489 .seq_show = ioc_weight_show,
2490 .write = ioc_weight_write,
2491 },
2492 {
2493 .name = "cost.qos",
2494 .flags = CFTYPE_ONLY_ON_ROOT,
2495 .seq_show = ioc_qos_show,
2496 .write = ioc_qos_write,
2497 },
2498 {
2499 .name = "cost.model",
2500 .flags = CFTYPE_ONLY_ON_ROOT,
2501 .seq_show = ioc_cost_model_show,
2502 .write = ioc_cost_model_write,
2503 },
2504 {}
2505 };
2506
2507 static struct blkcg_policy blkcg_policy_iocost = {
2508 .dfl_cftypes = ioc_files,
2509 .cpd_alloc_fn = ioc_cpd_alloc,
2510 .cpd_free_fn = ioc_cpd_free,
2511 .pd_alloc_fn = ioc_pd_alloc,
2512 .pd_init_fn = ioc_pd_init,
2513 .pd_free_fn = ioc_pd_free,
2514 };
2515
2516 static int __init ioc_init(void)
2517 {
2518 return blkcg_policy_register(&blkcg_policy_iocost);
2519 }
2520
2521 static void __exit ioc_exit(void)
2522 {
2523 return blkcg_policy_unregister(&blkcg_policy_iocost);
2524 }
2525
2526 module_init(ioc_init);
2527 module_exit(ioc_exit);
2528