1 /*
2 * Copyright © 2020 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_eu.h"
25 #include "brw_fs.h"
26 #include "brw_cfg.h"
27
28 using namespace brw;
29
30 namespace {
/**
 * Enumeration representing the various asynchronous units that can run
 * computations in parallel on behalf of a shader thread.
 *
 * Each unit gets its own ready-time and utilization slot in struct state,
 * so keep EU_NUM_UNITS as the count of real units and EU_UNIT_NULL as the
 * out-of-range sentinel aliasing it.
 */
enum intel_eu_unit {
   /** EU front-end. */
   EU_UNIT_FE,
   /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
   EU_UNIT_FPU,
   /** Extended Math unit (AKA FPU1 on Gfx8-11, part of the EU on Gfx6+). */
   EU_UNIT_EM,
   /** Sampler shared function. */
   EU_UNIT_SAMPLER,
   /** Pixel Interpolator shared function. */
   EU_UNIT_PI,
   /** Unified Return Buffer shared function. */
   EU_UNIT_URB,
   /** Data Port Data Cache shared function. */
   EU_UNIT_DP_DC,
   /** Data Port Render Cache shared function. */
   EU_UNIT_DP_RC,
   /** Data Port Constant Cache shared function. */
   EU_UNIT_DP_CC,
   /** Message Gateway shared function. */
   EU_UNIT_GATEWAY,
   /** Thread Spawner shared function. */
   EU_UNIT_SPAWNER,
   /* EU_UNIT_VME, */
   /* EU_UNIT_CRE, */
   /** Number of asynchronous units currently tracked. */
   EU_NUM_UNITS,
   /** Dummy unit for instructions that don't consume runtime from the above. */
   EU_UNIT_NULL = EU_NUM_UNITS
};
65
/**
 * Enumeration representing a computation result another computation can
 * potentially depend on.
 *
 * Each enumerator is the base index of a contiguous range of dependency
 * IDs (e.g. EU_DEPENDENCY_ID_GRF0 + i is GRF number i), so the values and
 * their relative spacing are load-bearing — helpers below do arithmetic
 * like "id - EU_DEPENDENCY_ID_ACCUM0" against them.
 */
enum intel_eu_dependency_id {
   /* Register part of the GRF. */
   EU_DEPENDENCY_ID_GRF0 = 0,
   /* Address register part of the ARF. */
   EU_DEPENDENCY_ID_ADDR0 = EU_DEPENDENCY_ID_GRF0 + XE2_MAX_GRF,
   /* Accumulator register part of the ARF. */
   EU_DEPENDENCY_ID_ACCUM0 = EU_DEPENDENCY_ID_ADDR0 + 1,
   /* Flag register part of the ARF. */
   EU_DEPENDENCY_ID_FLAG0 = EU_DEPENDENCY_ID_ACCUM0 + 12,
   /* SBID token write completion. Only used on Gfx12+. */
   EU_DEPENDENCY_ID_SBID_WR0 = EU_DEPENDENCY_ID_FLAG0 + 8,
   /* SBID token read completion. Only used on Gfx12+. */
   EU_DEPENDENCY_ID_SBID_RD0 = EU_DEPENDENCY_ID_SBID_WR0 + 32,
   /* Number of computation dependencies currently tracked. */
   EU_NUM_DEPENDENCY_IDS = EU_DEPENDENCY_ID_SBID_RD0 + 32
};
86
/**
 * State of our modeling of the program execution.
 *
 * All times are absolute clock counts since the start of the modeled
 * program; the constructor zero-initializes every ready time and
 * utilization counter and starts with full weight.
 */
struct state {
   state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
   /**
    * Time at which a given unit will be ready to execute the next
    * computation, in clock units.
    */
   unsigned unit_ready[EU_NUM_UNITS];
   /**
    * Time at which an instruction dependent on a given dependency ID will
    * be ready to execute, in clock units.
    */
   unsigned dep_ready[EU_NUM_DEPENDENCY_IDS];
   /**
    * Aggregated utilization of a given unit excluding idle cycles,
    * in clock units.
    */
   float unit_busy[EU_NUM_UNITS];
   /**
    * Factor of the overhead of a computation accounted for in the
    * aggregated utilization calculation.
    */
   float weight;
};
113
/**
 * Information derived from an IR instruction used to compute performance
 * estimates. Allows the timing calculation to work on both FS and VEC4
 * instructions.
 *
 * All sizes (sd, sx, ss, sc) are expressed in GRF register units so they
 * can feed directly into the linear timing model of calculate_desc().
 */
struct instruction_info {
   instruction_info(const struct brw_isa_info *isa, const fs_inst *inst) :
      isa(isa), devinfo(isa->devinfo), op(inst->opcode),
      td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
      tx(get_exec_type(inst)), sx(0), ss(0),
      sc(has_bank_conflict(isa, inst) ? sd : 0),
      desc(inst->desc), sfid(inst->sfid)
   {
      /* We typically want the maximum source size, except for split send
       * messages which require the total size.
       */
      if (inst->opcode == SHADER_OPCODE_SEND) {
         /* Split sends: payload lives in sources 2 and 3, sum them. */
         ss = DIV_ROUND_UP(inst->size_read(2), REG_SIZE) +
              DIV_ROUND_UP(inst->size_read(3), REG_SIZE);
      } else {
         for (unsigned i = 0; i < inst->sources; i++)
            ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
      }

      /* Convert the execution size to GRF units. */
      sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);

      /* 32x32 integer multiplication has half the usual ALU throughput.
       * Treat it as double-precision.
       */
      if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
          !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
          type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
         tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);

      /* rcount only affects DPAS latency selection; zero otherwise. */
      rcount = inst->opcode == BRW_OPCODE_DPAS ? inst->rcount : 0;
   }

   /** ISA encoding information */
   const struct brw_isa_info *isa;
   /** Device information. */
   const struct intel_device_info *devinfo;
   /** Instruction opcode. */
   opcode op;
   /** Destination type. */
   brw_reg_type td;
   /** Destination size in GRF units. */
   unsigned sd;
   /** Execution type. */
   brw_reg_type tx;
   /** Execution size in GRF units. */
   unsigned sx;
   /** Source size. */
   unsigned ss;
   /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
   unsigned sc;
   /** Send message descriptor. */
   uint32_t desc;
   /** Send message shared function ID. */
   uint8_t sfid;
   /** Repeat count for DPAS instructions. */
   uint8_t rcount;
};
177
/**
 * Timing information of an instruction used to estimate the performance of
 * the program.
 *
 * Overheads (df, db) model throughput; latencies (ls, ld, la, lf) model
 * when a given result becomes visible to dependent instructions.
 */
struct perf_desc {
   perf_desc(enum intel_eu_unit u, int df, int db,
             int ls, int ld, int la, int lf) :
      u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}

   /**
    * Back-end unit its runtime shall be accounted to, in addition to the
    * EU front-end which is always assumed to be involved.
    */
   enum intel_eu_unit u;
   /**
    * Overhead cycles from the time that the EU front-end starts executing
    * the instruction until it's ready to execute the next instruction.
    */
   int df;
   /**
    * Overhead cycles from the time that the back-end starts executing the
    * instruction until it's ready to execute the next instruction.
    */
   int db;
   /**
    * Latency cycles from the time that the back-end starts executing the
    * instruction until its sources have been read from the register file.
    */
   int ls;
   /**
    * Latency cycles from the time that the back-end starts executing the
    * instruction until its regular destination has been written to the
    * register file.
    */
   int ld;
   /**
    * Latency cycles from the time that the back-end starts executing the
    * instruction until its accumulator destination has been written to the
    * ARF file.
    *
    * Note that this is an approximation of the real behavior of
    * accumulating instructions in the hardware: Instead of modeling a pair
    * of back-to-back accumulating instructions as a first computation with
    * latency equal to ld followed by another computation with a
    * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
    * model the stall as if it occurred at the top of the pipeline, with
    * the latency of the accumulator computation offset accordingly.
    */
   int la;
   /**
    * Latency cycles from the time that the back-end starts executing the
    * instruction until its flag destination has been written to the ARF
    * file.
    */
   int lf;
};
234
235 /**
236 * Compute the timing information of an instruction based on any relevant
237 * information from the IR and a number of parameters specifying a linear
238 * approximation: Parameter X_Y specifies the derivative of timing X
239 * relative to info field Y, while X_1 specifies the independent term of
240 * the approximation of timing X.
241 */
242 perf_desc
calculate_desc(const instruction_info & info,enum intel_eu_unit u,int df_1,int df_sd,int df_sc,int db_1,int db_sx,int ls_1,int ld_1,int la_1,int lf_1,int l_ss,int l_sd)243 calculate_desc(const instruction_info &info, enum intel_eu_unit u,
244 int df_1, int df_sd, int df_sc,
245 int db_1, int db_sx,
246 int ls_1, int ld_1, int la_1, int lf_1,
247 int l_ss, int l_sd)
248 {
249 return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
250 db_1 + db_sx * int(info.sx),
251 ls_1 + l_ss * int(info.ss),
252 ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
253 la_1, lf_1);
254 }
255
/**
 * Compute the timing information of an instruction based on any relevant
 * information from the IR and a number of linear approximation parameters
 * hard-coded for each IR instruction.
 *
 * Most timing parameters are obtained from the multivariate linear
 * regression of a sample of empirical timings measured using the tm0
 * register (as can be done today by using the shader_time debugging
 * option). The Gfx4-5 math timings are obtained from BSpec Volume 5c.3
 * "Shared Functions - Extended Math", Section 3.2 "Performance".
 * Parameters marked XXX shall be considered low-quality, they're possibly
 * high variance or completely guessed in cases where experimental data was
 * unavailable.
 */
const perf_desc
instruction_desc(const instruction_info &info)
{
   const struct intel_device_info *devinfo = info.devinfo;

   switch (info.op) {
   /* Simple single-pipe ALU instructions. */
   case BRW_OPCODE_SYNC:
   case BRW_OPCODE_SEL:
   case BRW_OPCODE_NOT:
   case BRW_OPCODE_AND:
   case BRW_OPCODE_OR:
   case BRW_OPCODE_XOR:
   case BRW_OPCODE_SHR:
   case BRW_OPCODE_SHL:
   case BRW_OPCODE_ASR:
   case BRW_OPCODE_CMPN:
   case BRW_OPCODE_BFREV:
   case BRW_OPCODE_BFI1:
   case BRW_OPCODE_AVG:
   case BRW_OPCODE_FRC:
   case BRW_OPCODE_RNDU:
   case BRW_OPCODE_RNDD:
   case BRW_OPCODE_RNDE:
   case BRW_OPCODE_RNDZ:
   case BRW_OPCODE_MAC:
   case BRW_OPCODE_MACH:
   case BRW_OPCODE_LZD:
   case BRW_OPCODE_FBH:
   case BRW_OPCODE_FBL:
   case BRW_OPCODE_CBIT:
   case BRW_OPCODE_ADDC:
   case BRW_OPCODE_ROR:
   case BRW_OPCODE_ROL:
   case BRW_OPCODE_SUBB:
   case BRW_OPCODE_SAD2:
   case BRW_OPCODE_SADA2:
   case BRW_OPCODE_LINE:
   case BRW_OPCODE_NOP:
   case SHADER_OPCODE_CLUSTER_BROADCAST:
   case SHADER_OPCODE_SCRATCH_HEADER:
   case FS_OPCODE_DDX_COARSE:
   case FS_OPCODE_DDX_FINE:
   case FS_OPCODE_DDY_COARSE:
   case FS_OPCODE_PIXEL_X:
   case FS_OPCODE_PIXEL_Y:
   case SHADER_OPCODE_READ_SR_REG:
      if (devinfo->ver >= 11) {
         return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
                               0, 10, 6 /* XXX */, 14, 0, 0);
      } else {
         /* Pre-Gfx11: double-sized types take twice the back-end cycles. */
         if (type_sz(info.tx) > 4)
            return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
                                  0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
                                  0, 8, 4, 12, 0, 0);
      }

   case BRW_OPCODE_MOV:
   case BRW_OPCODE_CMP:
   case BRW_OPCODE_ADD:
   case BRW_OPCODE_ADD3:
   case BRW_OPCODE_MUL:
   case SHADER_OPCODE_MOV_RELOC_IMM:
      if (devinfo->ver >= 11) {
         return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
                               0, 10, 6, 14, 0, 0);
      } else {
         if (type_sz(info.tx) > 4)
            return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
                                  0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
                                  0, 8, 4, 12, 0, 0);
      }

   /* Three-source ALU instructions pay the bank-conflict penalty (df_sc). */
   case BRW_OPCODE_BFE:
   case BRW_OPCODE_BFI2:
   case BRW_OPCODE_CSEL:
      if (devinfo->ver >= 11)
         return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
                               0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
      else
         return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
                               0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);

   case BRW_OPCODE_MAD:
      if (devinfo->ver >= 11) {
         return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
                               0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
      } else {
         if (type_sz(info.tx) > 4)
            return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
                                  0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
                                  0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
      }

   case BRW_OPCODE_DP4:
   case BRW_OPCODE_DPH:
   case BRW_OPCODE_DP3:
   case BRW_OPCODE_DP2:
      return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
                            0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);

   case BRW_OPCODE_DP4A:
      if (devinfo->ver >= 12)
         return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
                               0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
      else
         abort();

   case BRW_OPCODE_DPAS: {
      unsigned ld;

      /* Destination latency scales with the systolic repeat count. */
      switch (info.rcount) {
      case 1:
         ld = 21;
         break;
      case 2:
         ld = 22;
         break;
      case 8:
      default:
         ld = 32;
         break;
      }

      /* DPAS cannot write the accumulator or the flags, so pass UINT_MAX
       * for la and lf.
       */
      if (devinfo->verx10 >= 125)
         return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
                               0, ld, UINT_MAX, UINT_MAX, 0, 0);
      else
         abort();
   }

   /* Extended Math unit transcendentals. */
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 4,
                            0, 16, 0, 0, 0, 0);

   case SHADER_OPCODE_POW:
      return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 8,
                            0, 24, 0, 0, 0, 0);

   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 26, 0,
                            0, 28 /* XXX */, 0, 0, 0, 0);

   /* Control-flow instructions only consume front-end cycles. */
   case BRW_OPCODE_DO:
      return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0);

   case BRW_OPCODE_IF:
   case BRW_OPCODE_ELSE:
   case BRW_OPCODE_ENDIF:
   case BRW_OPCODE_WHILE:
   case BRW_OPCODE_BREAK:
   case BRW_OPCODE_CONTINUE:
   case BRW_OPCODE_HALT:
      return calculate_desc(info, EU_UNIT_NULL, 8, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0);

   case FS_OPCODE_LINTERP:
      return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
                            0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);

   case BRW_OPCODE_LRP:
      return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
                            0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);

   case FS_OPCODE_PACK_HALF_2x16_SPLIT:
      if (devinfo->ver >= 11)
         return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
                               0, 10 /* XXX */, 6 /* XXX */,
                               14 /* XXX */, 0, 0);
      else
         return calculate_desc(info, EU_UNIT_FPU, 16, 6, 0, 0, 6,
                               0, 8 /* XXX */, 4 /* XXX */,
                               12 /* XXX */, 0, 0);

   case SHADER_OPCODE_MOV_INDIRECT:
      if (devinfo->ver >= 11)
         return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
                               0, 10 /* XXX */, 6 /* XXX */,
                               14 /* XXX */, 0, 0);
      else
         return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
                               0, 8 /* XXX */, 4 /* XXX */,
                               12 /* XXX */, 0, 0);

   case SHADER_OPCODE_BROADCAST:
      if (devinfo->ver >= 11)
         return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0, 4, 0,
                               0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
      else
         return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
                               0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);

   case SHADER_OPCODE_FIND_LIVE_CHANNEL:
   case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
   case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
      if (devinfo->ver >= 11)
         return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
                               0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
      else
         return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
                               0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);

   case SHADER_OPCODE_RND_MODE:
   case SHADER_OPCODE_FLOAT_CONTROL_MODE:
      if (devinfo->ver >= 11)
         return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0,
                               4 /* XXX */, 0,
                               0, 0, 0, 0, 0, 0);
      else
         return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0,
                               4 /* XXX */, 0,
                               0, 0, 0, 0, 0, 0);

   case SHADER_OPCODE_SHUFFLE:
      if (devinfo->ver >= 11)
         return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0,
                               44 /* XXX */, 0,
                               0, 10 /* XXX */, 6 /* XXX */,
                               14 /* XXX */, 0, 0);
      else
         return calculate_desc(info, EU_UNIT_FPU, 42 /* XXX */, 0, 0,
                               42 /* XXX */, 0,
                               0, 8 /* XXX */, 4 /* XXX */,
                               12 /* XXX */, 0, 0);

   case SHADER_OPCODE_SEL_EXEC:
      if (devinfo->ver >= 11)
         return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0,
                               0, 4 /* XXX */,
                               0, 10 /* XXX */, 6 /* XXX */,
                               14 /* XXX */, 0, 0);
      else
         return calculate_desc(info, EU_UNIT_FPU, 8 /* XXX */, 4 /* XXX */, 0,
                               0, 4 /* XXX */,
                               0, 8 /* XXX */, 4 /* XXX */,
                               12 /* XXX */, 0, 0);

   case SHADER_OPCODE_QUAD_SWIZZLE:
      if (devinfo->ver >= 11)
         return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
                               0, 8 /* XXX */,
                               0, 10 /* XXX */, 6 /* XXX */,
                               14 /* XXX */, 0, 0);
      else
         return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
                               0, 8 /* XXX */,
                               0, 8 /* XXX */, 4 /* XXX */,
                               12 /* XXX */, 0, 0);

   case FS_OPCODE_DDY_FINE:
      if (devinfo->ver >= 11)
         return calculate_desc(info, EU_UNIT_FPU, 0, 14, 0, 0, 4,
                               0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
      else
         return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
                               0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);

   case FS_OPCODE_LOAD_LIVE_CHANNELS:
      if (devinfo->ver >= 11)
         return calculate_desc(info, EU_UNIT_FPU, 2 /* XXX */, 0, 0,
                               2 /* XXX */, 0,
                               0, 0, 0, 10 /* XXX */, 0, 0);
      else
         return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
                               0, 2 /* XXX */,
                               0, 0, 0, 8 /* XXX */, 0, 0);

   /* Sampler shared-function messages. */
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_LZ:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXL_LZ:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_CMS_W:
   case SHADER_OPCODE_TXF_UMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
   case SHADER_OPCODE_GET_BUFFER_SIZE:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_BIAS:
   case SHADER_OPCODE_TG4_EXPLICIT_LOD:
   case SHADER_OPCODE_TG4_IMPLICIT_LOD:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_TG4_OFFSET_LOD:
   case SHADER_OPCODE_TG4_OFFSET_BIAS:
   case SHADER_OPCODE_SAMPLEINFO:
      return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16 /* XXX */,
                            8 /* XXX */, 750 /* XXX */, 0, 0,
                            2 /* XXX */, 0);

   case SHADER_OPCODE_MEMORY_FENCE:
   case SHADER_OPCODE_INTERLOCK:
      /* Accounted to the data-port unit matching the fence target. */
      switch (info.sfid) {
      case GFX6_SFID_DATAPORT_RENDER_CACHE:
         return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 30 /* XXX */, 0,
                               10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);

      case BRW_SFID_URB:
      case GFX7_SFID_DATAPORT_DATA_CACHE:
      case GFX12_SFID_SLM:
      case GFX12_SFID_TGM:
      case GFX12_SFID_UGM:
      case HSW_SFID_DATAPORT_DATA_CACHE_1:
         return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 30 /* XXX */, 0,
                               10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);

      default:
         abort();
      }

   case FS_OPCODE_FB_READ:
      return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 0, 450 /* XXX */,
                            10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);

   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
      return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
                            10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);

   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
      return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
                            0, 90 /* XXX */, 0, 0, 0, 0);

   case SHADER_OPCODE_BARRIER:
      return calculate_desc(info, EU_UNIT_GATEWAY, 90 /* XXX */, 0, 0,
                            0 /* XXX */, 0,
                            0, 0, 0, 0, 0, 0);

   case CS_OPCODE_CS_TERMINATE:
      return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0,
                            10 /* XXX */, 0, 0, 0, 0, 0);

   case SHADER_OPCODE_SEND:
      /* Logical sends: dispatch by target shared function and message type. */
      switch (info.sfid) {
      case GFX6_SFID_DATAPORT_CONSTANT_CACHE:
         /* See FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */
         return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
                               10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
      case GFX6_SFID_DATAPORT_RENDER_CACHE:
         switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
         case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP:
            return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
                                  30 /* XXX */, 450 /* XXX */,
                                  10 /* XXX */, 100 /* XXX */,
                                  0, 0, 0, 400 /* XXX */);
         default:
            return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
                                  0, 450 /* XXX */,
                                  10 /* XXX */, 300 /* XXX */, 0, 0,
                                  0, 0);
         }
      case BRW_SFID_SAMPLER: {
         return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16,
                               8, 750, 0, 0, 2, 0);
      }
      case GFX7_SFID_DATAPORT_DATA_CACHE:
      case HSW_SFID_DATAPORT_DATA_CACHE_1:
         switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
         case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
         case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
         case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
         case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
            return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
                                  30 /* XXX */, 400 /* XXX */,
                                  10 /* XXX */, 100 /* XXX */, 0, 0,
                                  0, 400 /* XXX */);

         default:
            return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
                                  0, 20 /* XXX */,
                                  10 /* XXX */, 100 /* XXX */, 0, 0,
                                  0, 0);
         }

      case GFX7_SFID_PIXEL_INTERPOLATOR:
         return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
                               0, 90 /* XXX */, 0, 0, 0, 0);

      case GFX12_SFID_UGM:
      case GFX12_SFID_TGM:
      case GFX12_SFID_SLM:
         /* LSC messages: split plain loads/stores from atomics/fences. */
         switch (lsc_msg_desc_opcode(devinfo, info.desc)) {
         case LSC_OP_LOAD:
         case LSC_OP_STORE:
         case LSC_OP_LOAD_CMASK:
         case LSC_OP_STORE_CMASK:
            return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
                                  0, 20 /* XXX */,
                                  10 /* XXX */, 100 /* XXX */, 0, 0,
                                  0, 0);

         case LSC_OP_FENCE:
         case LSC_OP_ATOMIC_INC:
         case LSC_OP_ATOMIC_DEC:
         case LSC_OP_ATOMIC_LOAD:
         case LSC_OP_ATOMIC_STORE:
         case LSC_OP_ATOMIC_ADD:
         case LSC_OP_ATOMIC_SUB:
         case LSC_OP_ATOMIC_MIN:
         case LSC_OP_ATOMIC_MAX:
         case LSC_OP_ATOMIC_UMIN:
         case LSC_OP_ATOMIC_UMAX:
         case LSC_OP_ATOMIC_CMPXCHG:
         case LSC_OP_ATOMIC_FADD:
         case LSC_OP_ATOMIC_FSUB:
         case LSC_OP_ATOMIC_FMIN:
         case LSC_OP_ATOMIC_FMAX:
         case LSC_OP_ATOMIC_FCMPXCHG:
         case LSC_OP_ATOMIC_AND:
         case LSC_OP_ATOMIC_OR:
         case LSC_OP_ATOMIC_XOR:
            return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
                                  30 /* XXX */, 400 /* XXX */,
                                  10 /* XXX */, 100 /* XXX */, 0, 0,
                                  0, 400 /* XXX */);
         default:
            abort();
         }

      case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH:
      case GEN_RT_SFID_RAY_TRACE_ACCELERATOR:
         return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0,
                               10 /* XXX */, 0, 0, 0, 0, 0);

      case BRW_SFID_URB:
         return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
                               32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);

      default:
         abort();
      }

   /* Pseudo-instructions that emit no code. */
   case SHADER_OPCODE_UNDEF:
   case SHADER_OPCODE_HALT_TARGET:
   case FS_OPCODE_SCHEDULING_FENCE:
      return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
                            0, 0, 0, 0, 0, 0);

   default:
      abort();
   }
}
732
733 /**
734 * Model the performance behavior of a stall on the specified dependency
735 * ID.
736 */
737 void
stall_on_dependency(state & st,enum intel_eu_dependency_id id)738 stall_on_dependency(state &st, enum intel_eu_dependency_id id)
739 {
740 if (id < ARRAY_SIZE(st.dep_ready))
741 st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
742 st.dep_ready[id]);
743 }
744
745 /**
746 * Model the performance behavior of the front-end and back-end while
747 * executing an instruction with the specified timing information, assuming
748 * all dependencies are already clear.
749 */
750 void
execute_instruction(state & st,const perf_desc & perf)751 execute_instruction(state &st, const perf_desc &perf)
752 {
753 /* Compute the time at which the front-end will be ready to execute the
754 * next instruction.
755 */
756 st.unit_ready[EU_UNIT_FE] += perf.df;
757
758 if (perf.u < EU_NUM_UNITS) {
759 /* Wait for the back-end to be ready to execute this instruction. */
760 st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
761 st.unit_ready[perf.u]);
762
763 /* Compute the time at which the back-end will be ready to execute
764 * the next instruction, and update the back-end utilization.
765 */
766 st.unit_ready[perf.u] = st.unit_ready[EU_UNIT_FE] + perf.db;
767 st.unit_busy[perf.u] += perf.db * st.weight;
768 }
769 }
770
771 /**
772 * Model the performance behavior of a read dependency provided by an
773 * instruction.
774 */
775 void
mark_read_dependency(state & st,const perf_desc & perf,enum intel_eu_dependency_id id)776 mark_read_dependency(state &st, const perf_desc &perf,
777 enum intel_eu_dependency_id id)
778 {
779 if (id < ARRAY_SIZE(st.dep_ready))
780 st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ls;
781 }
782
783 /**
784 * Model the performance behavior of a write dependency provided by an
785 * instruction.
786 */
787 void
mark_write_dependency(state & st,const perf_desc & perf,enum intel_eu_dependency_id id)788 mark_write_dependency(state &st, const perf_desc &perf,
789 enum intel_eu_dependency_id id)
790 {
791 if (id >= EU_DEPENDENCY_ID_ACCUM0 && id < EU_DEPENDENCY_ID_FLAG0)
792 st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.la;
793 else if (id >= EU_DEPENDENCY_ID_FLAG0 && id < EU_DEPENDENCY_ID_SBID_WR0)
794 st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.lf;
795 else if (id < ARRAY_SIZE(st.dep_ready))
796 st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ld;
797 }
798
799 /**
800 * Return the dependency ID of a backend_reg, offset by \p delta GRFs.
801 */
802 enum intel_eu_dependency_id
reg_dependency_id(const intel_device_info * devinfo,const backend_reg & r,const int delta)803 reg_dependency_id(const intel_device_info *devinfo, const backend_reg &r,
804 const int delta)
805 {
806 if (r.file == VGRF) {
807 const unsigned i = r.nr + r.offset / REG_SIZE + delta;
808 assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_GRF0);
809 return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
810
811 } else if (r.file == FIXED_GRF) {
812 const unsigned i = r.nr + delta;
813 assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_GRF0);
814 return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
815
816 } else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS &&
817 r.nr < BRW_ARF_ACCUMULATOR) {
818 assert(delta == 0);
819 return EU_DEPENDENCY_ID_ADDR0;
820
821 } else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR &&
822 r.nr < BRW_ARF_FLAG) {
823 const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta;
824 assert(i < EU_DEPENDENCY_ID_FLAG0 - EU_DEPENDENCY_ID_ACCUM0);
825 return intel_eu_dependency_id(EU_DEPENDENCY_ID_ACCUM0 + i);
826
827 } else {
828 return EU_NUM_DEPENDENCY_IDS;
829 }
830 }
831
832 /**
833 * Return the dependency ID of flag register starting at offset \p i.
834 */
835 enum intel_eu_dependency_id
flag_dependency_id(unsigned i)836 flag_dependency_id(unsigned i)
837 {
838 assert(i < EU_DEPENDENCY_ID_SBID_WR0 - EU_DEPENDENCY_ID_FLAG0);
839 return intel_eu_dependency_id(EU_DEPENDENCY_ID_FLAG0 + i);
840 }
841
842 /**
843 * Return the dependency ID corresponding to the SBID read completion
844 * condition of a Gfx12+ SWSB.
845 */
846 enum intel_eu_dependency_id
tgl_swsb_rd_dependency_id(tgl_swsb swsb)847 tgl_swsb_rd_dependency_id(tgl_swsb swsb)
848 {
849 if (swsb.mode) {
850 assert(swsb.sbid <
851 EU_NUM_DEPENDENCY_IDS - EU_DEPENDENCY_ID_SBID_RD0);
852 return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_RD0 + swsb.sbid);
853 } else {
854 return EU_NUM_DEPENDENCY_IDS;
855 }
856 }
857
858 /**
859 * Return the dependency ID corresponding to the SBID write completion
860 * condition of a Gfx12+ SWSB.
861 */
862 enum intel_eu_dependency_id
tgl_swsb_wr_dependency_id(tgl_swsb swsb)863 tgl_swsb_wr_dependency_id(tgl_swsb swsb)
864 {
865 if (swsb.mode) {
866 assert(swsb.sbid <
867 EU_DEPENDENCY_ID_SBID_RD0 - EU_DEPENDENCY_ID_SBID_WR0);
868 return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_WR0 + swsb.sbid);
869 } else {
870 return EU_NUM_DEPENDENCY_IDS;
871 }
872 }
873
874 /**
875 * Return the implicit accumulator register accessed by channel \p i of the
876 * instruction.
877 */
878 unsigned
accum_reg_of_channel(const intel_device_info * devinfo,const backend_instruction * inst,brw_reg_type tx,unsigned i)879 accum_reg_of_channel(const intel_device_info *devinfo,
880 const backend_instruction *inst,
881 brw_reg_type tx, unsigned i)
882 {
883 assert(inst->reads_accumulator_implicitly() ||
884 inst->writes_accumulator_implicitly(devinfo));
885 const unsigned offset = (inst->group + i) * type_sz(tx) *
886 (brw_reg_type_is_floating_point(tx) ? 1 : 2);
887 return offset / (reg_unit(devinfo) * REG_SIZE) % 2;
888 }
889
/**
 * Model the performance behavior of an FS back-end instruction.
 *
 * The order of operations mirrors the hardware pipeline being modeled:
 * first stall the front-end on every outstanding read/write/SBID
 * dependency, then execute the instruction, then record the new
 * availability times of everything the instruction produces.
 */
void
issue_fs_inst(state &st, const struct brw_isa_info *isa,
              const backend_instruction *be_inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;
   const fs_inst *inst = static_cast<const fs_inst *>(be_inst);
   const instruction_info info(isa, inst);
   const perf_desc perf = instruction_desc(info);

   /* Stall on any source dependencies. */
   for (unsigned i = 0; i < inst->sources; i++) {
      for (unsigned j = 0; j < regs_read(inst, i); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, inst->src[i], j));
   }

   if (inst->reads_accumulator_implicitly()) {
      /* Stall on every accumulator register touched by any channel. */
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
   }

   if (const unsigned mask = inst->flags_read(devinfo)) {
      for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
         if (mask & (1 << i))
            stall_on_dependency(st, flag_dependency_id(i));
      }
   }

   /* Stall on any write dependencies. */
   if (!inst->no_dd_check) {
      if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, inst->dst, j));
      }

      if (inst->writes_accumulator_implicitly(devinfo)) {
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
      }

      if (const unsigned mask = inst->flags_written(devinfo)) {
         for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
            if (mask & (1 << i))
               stall_on_dependency(st, flag_dependency_id(i));
         }
      }
   }

   /* Stall on any SBID dependencies. */
   if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST))
      stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched));
   else if (inst->sched.mode & TGL_SBID_SRC)
      stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched));

   /* Execute the instruction. */
   execute_instruction(st, perf);

   /* Mark any source dependencies. */
   if (inst->is_send_from_grf()) {
      /* Payload registers of a send stay busy until the message sources
       * have been read (perf.ls).
       */
      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->is_payload(i)) {
            for (unsigned j = 0; j < regs_read(inst, i); j++)
               mark_read_dependency(
                  st, perf, reg_dependency_id(devinfo, inst->src[i], j));
         }
      }
   }

   /* Mark any destination dependencies. */
   if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
      for (unsigned j = 0; j < regs_written(inst); j++) {
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, inst->dst, j));
      }
   }

   if (inst->writes_accumulator_implicitly(devinfo)) {
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, brw_acc_reg(8), j));
   }

   if (const unsigned mask = inst->flags_written(devinfo)) {
      for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
         if (mask & (1 << i))
            mark_write_dependency(st, perf, flag_dependency_id(i));
      }
   }

   /* Mark any SBID dependencies. */
   if (inst->sched.mode & TGL_SBID_SET) {
      mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched));
      mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched));
   }
}
997
998 /**
999 * Calculate the maximum possible throughput of the program compatible with
1000 * the cycle-count utilization estimated for each asynchronous unit, in
1001 * threads-per-cycle units.
1002 */
1003 float
calculate_thread_throughput(const state & st,float busy)1004 calculate_thread_throughput(const state &st, float busy)
1005 {
1006 for (unsigned i = 0; i < EU_NUM_UNITS; i++)
1007 busy = MAX2(busy, st.unit_busy[i]);
1008
1009 return 1.0 / busy;
1010 }
1011
/**
 * Estimate the performance of the specified shader.
 *
 * Walks every instruction of \p s's CFG through \p issue_instruction,
 * accumulating weighted front-end cycle counts into \p p's per-block and
 * whole-program latency figures, and deriving an overall throughput estimate.
 */
void
calculate_performance(performance &p, const backend_shader *s,
                      void (*issue_instruction)(
                         state &, const struct brw_isa_info *,
                         const backend_instruction *),
                      unsigned dispatch_width)
{
   /* XXX - Note that the previous version of this code used worst-case
    *       scenario estimation of branching divergence for SIMD32 shaders,
    *       but this heuristic was removed to improve performance in common
    *       scenarios. Wider shader variants are less optimal when divergence
    *       is high, e.g. when application renders complex scene on a small
    *       surface. It is assumed that such renders are short, so their
    *       time doesn't matter and when it comes to the overall performance,
    *       they are dominated by more optimal larger renders.
    *
    *       It's possible that we could do better with divergence analysis
    *       by isolating branches which are 100% uniform.
    *
    *       Plumbing the trip counts from NIR loop analysis would allow us
    *       to do a better job regarding the loop weights.
    *
    *       In the meantime use values that roughly match the control flow
    *       weights used elsewhere in the compiler back-end.
    *
    *       Note that we provide slightly more pessimistic weights on
    *       Gfx12+ for SIMD32, since the effective warp size on that
    *       platform is 2x the SIMD width due to EU fusion, which increases
    *       the likelihood of divergent control flow in comparison to
    *       previous generations, giving narrower SIMD modes a performance
    *       advantage in several test-cases with non-uniform discard jumps.
    */
   const float discard_weight = (dispatch_width > 16 || s->devinfo->ver < 12 ?
                                 1.0 : 0.5);
   const float loop_weight = 10;
   /* Number of HALT instructions seen so far; also doubles as the "inside a
    * discarded region" flag checked at the HALT_TARGET below.
    */
   unsigned halt_count = 0;
   /* Weighted front-end cycles accumulated across the whole program. */
   unsigned elapsed = 0;
   state st;

   foreach_block(block, s->cfg) {
      const unsigned elapsed0 = elapsed;

      foreach_inst_in_block(backend_instruction, inst, block) {
         /* Front-end ready time before issuing; the delta after issue is
          * this instruction's contribution.
          */
         const unsigned clock0 = st.unit_ready[EU_UNIT_FE];

         issue_instruction(st, &s->compiler->isa, inst);

         /* Leaving the discarded region: undo the discard weighting applied
          * at the first HALT (hence the halt_count check).
          */
         if (inst->opcode == SHADER_OPCODE_HALT_TARGET && halt_count)
            st.weight /= discard_weight;

         elapsed += (st.unit_ready[EU_UNIT_FE] - clock0) * st.weight;

         /* Adjust the weight AFTER accounting this instruction, so DO/WHILE
          * themselves are counted at the surrounding weight while the loop
          * body between them is scaled by loop_weight.  Only the first HALT
          * applies the discard weight (note the post-increment).
          */
         if (inst->opcode == BRW_OPCODE_DO)
            st.weight *= loop_weight;
         else if (inst->opcode == BRW_OPCODE_WHILE)
            st.weight /= loop_weight;
         else if (inst->opcode == BRW_OPCODE_HALT && !halt_count++)
            st.weight *= discard_weight;
      }

      p.block_latency[block->num] = elapsed - elapsed0;
   }

   p.latency = elapsed;
   p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
}
1081 }
1082
/* Run the FS performance model over the visitor's CFG and record the
 * results.  block_latency holds one entry per CFG block, filled in by
 * calculate_performance() for every block.
 */
brw::performance::performance(const fs_visitor *v) :
   block_latency(new unsigned[v->cfg->num_blocks])
{
   calculate_performance(*this, v, issue_fs_inst, v->dispatch_width);
}
1088
/* Release the per-block latency array allocated by the constructor. */
brw::performance::~performance()
{
   delete[] block_latency;
}
1093