/*
 * Copyright © 2020 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_cfg.h"

using namespace brw;

namespace {
   /**
    * Enumeration representing the various asynchronous units that can run
    * computations in parallel on behalf of a shader thread.
    */
   enum intel_eu_unit {
      /** EU front-end. */
      EU_UNIT_FE,
      /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
      EU_UNIT_FPU,
      /** Extended Math unit (AKA FPU1 on Gfx8-11, part of the EU on Gfx6+). */
      EU_UNIT_EM,
      /** Sampler shared function. */
      EU_UNIT_SAMPLER,
      /** Pixel Interpolator shared function. */
      EU_UNIT_PI,
      /** Unified Return Buffer shared function. */
      EU_UNIT_URB,
      /** Data Port Data Cache shared function. */
      EU_UNIT_DP_DC,
      /** Data Port Render Cache shared function. */
      EU_UNIT_DP_RC,
      /** Data Port Constant Cache shared function. */
      EU_UNIT_DP_CC,
      /** Message Gateway shared function. */
      EU_UNIT_GATEWAY,
      /** Thread Spawner shared function. */
      EU_UNIT_SPAWNER,
      /* EU_UNIT_VME, */
      /* EU_UNIT_CRE, */
      /** Number of asynchronous units currently tracked. */
      EU_NUM_UNITS,
      /** Dummy unit for instructions that don't consume runtime from the above. */
      EU_UNIT_NULL = EU_NUM_UNITS
   };
65
66 /**
67 * Enumeration representing a computation result another computation can
68 * potentially depend on.
69 */
70 enum intel_eu_dependency_id {
71 /* Register part of the GRF. */
72 EU_DEPENDENCY_ID_GRF0 = 0,
73 /* Address register part of the ARF. */
74 EU_DEPENDENCY_ID_ADDR0 = EU_DEPENDENCY_ID_GRF0 + XE3_MAX_GRF,
75 /* Accumulator register part of the ARF. */
76 EU_DEPENDENCY_ID_ACCUM0 = EU_DEPENDENCY_ID_ADDR0 + 1,
77 /* Flag register part of the ARF. */
78 EU_DEPENDENCY_ID_FLAG0 = EU_DEPENDENCY_ID_ACCUM0 + 12,
79 /* SBID token write completion. Only used on Gfx12+. */
80 EU_DEPENDENCY_ID_SBID_WR0 = EU_DEPENDENCY_ID_FLAG0 + 8,
81 /* SBID token read completion. Only used on Gfx12+. */
82 EU_DEPENDENCY_ID_SBID_RD0 = EU_DEPENDENCY_ID_SBID_WR0 + 32,
83 /* Number of computation dependencies currently tracked. */
84 EU_NUM_DEPENDENCY_IDS = EU_DEPENDENCY_ID_SBID_RD0 + 32
85 };
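
   /* The dependency IDs above form a single flat space: the GRFs come
    * first, followed by the address register, 12 accumulator registers,
    * 8 flag subregisters, 32 SBID write-completion tokens and 32 SBID
    * read-completion tokens.  E.g. GRF r10 maps to
    * EU_DEPENDENCY_ID_GRF0 + 10, and SBID token 3 tracked for write
    * completion maps to EU_DEPENDENCY_ID_SBID_WR0 + 3.
    */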

   /**
    * State of our modeling of the program execution.
    */
   struct state {
      state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
      /**
       * Time at which a given unit will be ready to execute the next
       * computation, in clock units.
       */
      unsigned unit_ready[EU_NUM_UNITS];
      /**
       * Time at which an instruction dependent on a given dependency ID will
       * be ready to execute, in clock units.
       */
      unsigned dep_ready[EU_NUM_DEPENDENCY_IDS];
      /**
       * Aggregated utilization of a given unit excluding idle cycles,
       * in clock units.
       */
      float unit_busy[EU_NUM_UNITS];
      /**
       * Factor of the overhead of a computation accounted for in the
       * aggregated utilization calculation.
       */
      float weight;
   };

   /**
    * Information derived from an IR instruction used to compute performance
    * estimates.  Allows the timing calculation to work on both FS and VEC4
    * instructions.
    */
   struct instruction_info {
      instruction_info(const struct brw_isa_info *isa, const fs_inst *inst) :
         isa(isa), devinfo(isa->devinfo), op(inst->opcode),
         td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
         tx(get_exec_type(inst)), sx(0), ss(0),
         sc(has_bank_conflict(isa, inst) ? sd : 0),
         desc(inst->desc), sfid(inst->sfid)
      {
         /* We typically want the maximum source size, except for split send
          * messages which require the total size.
          */
         if (inst->opcode == SHADER_OPCODE_SEND) {
            ss = DIV_ROUND_UP(inst->size_read(devinfo, 2), REG_SIZE) +
                 DIV_ROUND_UP(inst->size_read(devinfo, 3), REG_SIZE);
         } else if (inst->opcode == SHADER_OPCODE_SEND_GATHER) {
            ss = inst->mlen;
            /* If the message hasn't been lowered yet, count the sources. */
            if (!ss) {
               for (int i = 3; i < inst->sources; i++)
                  ss += DIV_ROUND_UP(inst->size_read(devinfo, i), REG_SIZE);
            }
         } else {
            for (unsigned i = 0; i < inst->sources; i++)
               ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(devinfo, i), REG_SIZE));
         }

         /* Convert the execution size to GRF units. */
         sx = DIV_ROUND_UP(inst->exec_size * brw_type_size_bytes(tx), REG_SIZE);

         /* 32x32 integer multiplication has half the usual ALU throughput.
          * Treat it as double-precision.
          */
         if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
             !brw_type_is_float(tx) && brw_type_size_bytes(tx) == 4 &&
             brw_type_size_bytes(inst->src[0].type) == brw_type_size_bytes(inst->src[1].type))
            tx = brw_int_type(8, tx == BRW_TYPE_D);

         rcount = inst->opcode == BRW_OPCODE_DPAS ? inst->rcount : 0;
      }
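
      /* E.g. a split SEND whose payloads in src[2] and src[3] span 2 and
       * 4 GRFs respectively gets the total, ss = 6, while a 3-source ALU
       * instruction reading three 2-GRF operands gets the maximum, ss = 2.
       */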

      /** ISA encoding information */
      const struct brw_isa_info *isa;
      /** Device information. */
      const struct intel_device_info *devinfo;
      /** Instruction opcode. */
      opcode op;
      /** Destination type. */
      brw_reg_type td;
      /** Destination size in GRF units. */
      unsigned sd;
      /** Execution type. */
      brw_reg_type tx;
      /** Execution size in GRF units. */
      unsigned sx;
      /** Source size. */
      unsigned ss;
      /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
      unsigned sc;
      /** Send message descriptor. */
      uint32_t desc;
      /** Send message shared function ID. */
      uint8_t sfid;
      /** Repeat count for DPAS instructions. */
      uint8_t rcount;
   };

   /**
    * Timing information of an instruction used to estimate the performance of
    * the program.
    */
   struct perf_desc {
      perf_desc(enum intel_eu_unit u, int df, int db,
                int ls, int ld, int la, int lf) :
         u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}

      /**
       * Back-end unit the instruction's runtime shall be accounted to, in
       * addition to the EU front-end, which is always assumed to be involved.
       */
      enum intel_eu_unit u;
      /**
       * Overhead cycles from the time that the EU front-end starts executing
       * the instruction until it's ready to execute the next instruction.
       */
      int df;
      /**
       * Overhead cycles from the time that the back-end starts executing the
       * instruction until it's ready to execute the next instruction.
       */
      int db;
      /**
       * Latency cycles from the time that the back-end starts executing the
       * instruction until its sources have been read from the register file.
       */
      int ls;
      /**
       * Latency cycles from the time that the back-end starts executing the
       * instruction until its regular destination has been written to the
       * register file.
       */
      int ld;
      /**
       * Latency cycles from the time that the back-end starts executing the
       * instruction until its accumulator destination has been written to the
       * ARF file.
       *
       * Note that this is an approximation of the real behavior of
       * accumulating instructions in the hardware: Instead of modeling a pair
       * of back-to-back accumulating instructions as a first computation with
       * latency equal to ld followed by another computation with a
       * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
       * model the stall as if it occurred at the top of the pipeline, with
       * the latency of the accumulator computation offset accordingly.
       */
      int la;
      /**
       * Latency cycles from the time that the back-end starts executing the
       * instruction until its flag destination has been written to the ARF
       * file.
       */
      int lf;
   };

   /**
    * Compute the timing information of an instruction based on any relevant
    * information from the IR and a number of parameters specifying a linear
    * approximation: Parameter X_Y specifies the derivative of timing X
    * relative to info field Y, while X_1 specifies the independent term of
    * the approximation of timing X.
    */
   perf_desc
   calculate_desc(const instruction_info &info, enum intel_eu_unit u,
                  int df_1, int df_sd, int df_sc,
                  int db_1, int db_sx,
                  int ls_1, int ld_1, int la_1, int lf_1,
                  int l_ss, int l_sd)
   {
      return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
                       db_1 + db_sx * int(info.sx),
                       ls_1 + l_ss * int(info.ss),
                       ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
                       la_1, lf_1);
   }
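
   /* As an illustration of the linear model above, a SIMD16 float ADD on
    * Gfx11 is described below by calculate_desc(info, EU_UNIT_FPU, 0, 2, 0,
    * 0, 2, 0, 10, 6, 14, 0, 0).  With sd = sx = ss = 2 GRFs and no bank
    * conflict (sc = 0) that evaluates to df = 0 + 2 * 2 = 4 front-end
    * cycles, db = 0 + 2 * 2 = 4 back-end cycles, and source, destination,
    * accumulator and flag latencies of ls = 0, ld = 10, la = 6 and lf = 14
    * cycles respectively.
    */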

   /**
    * Compute the timing information of an instruction based on any relevant
    * information from the IR and a number of linear approximation parameters
    * hard-coded for each IR instruction.
    *
    * Most timing parameters are obtained from the multivariate linear
    * regression of a sample of empirical timings measured using the tm0
    * register (as can be done today by using the shader_time debugging
    * option).  The Gfx4-5 math timings are obtained from BSpec Volume 5c.3
    * "Shared Functions - Extended Math", Section 3.2 "Performance".
    * Parameters marked XXX shall be considered low-quality: they're possibly
    * high-variance, or guessed outright in cases where experimental data was
    * unavailable.
    */
   const perf_desc
   instruction_desc(const instruction_info &info)
   {
      const struct intel_device_info *devinfo = info.devinfo;

      switch (info.op) {
      case BRW_OPCODE_SYNC:
      case BRW_OPCODE_SEL:
      case BRW_OPCODE_NOT:
      case BRW_OPCODE_AND:
      case BRW_OPCODE_OR:
      case BRW_OPCODE_XOR:
      case BRW_OPCODE_SHR:
      case BRW_OPCODE_SHL:
      case BRW_OPCODE_ASR:
      case BRW_OPCODE_CMPN:
      case BRW_OPCODE_BFREV:
      case BRW_OPCODE_BFI1:
      case BRW_OPCODE_AVG:
      case BRW_OPCODE_FRC:
      case BRW_OPCODE_RNDU:
      case BRW_OPCODE_RNDD:
      case BRW_OPCODE_RNDE:
      case BRW_OPCODE_RNDZ:
      case BRW_OPCODE_MAC:
      case BRW_OPCODE_MACH:
      case BRW_OPCODE_LZD:
      case BRW_OPCODE_FBH:
      case BRW_OPCODE_FBL:
      case BRW_OPCODE_CBIT:
      case BRW_OPCODE_ADDC:
      case BRW_OPCODE_ROR:
      case BRW_OPCODE_ROL:
      case BRW_OPCODE_SUBB:
      case BRW_OPCODE_LINE:
      case BRW_OPCODE_NOP:
      case SHADER_OPCODE_CLUSTER_BROADCAST:
      case SHADER_OPCODE_SCRATCH_HEADER:
      case FS_OPCODE_DDX_COARSE:
      case FS_OPCODE_DDX_FINE:
      case FS_OPCODE_DDY_COARSE:
      case FS_OPCODE_PIXEL_X:
      case FS_OPCODE_PIXEL_Y:
         if (devinfo->ver >= 11) {
            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
                                  0, 10, 6 /* XXX */, 14, 0, 0);
         } else {
            if (brw_type_size_bytes(info.tx) > 4)
               return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
                                     0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
            else
               return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
                                     0, 8, 4, 12, 0, 0);
         }

      case BRW_OPCODE_MOV:
      case BRW_OPCODE_CMP:
      case BRW_OPCODE_ADD:
      case BRW_OPCODE_ADD3:
      case BRW_OPCODE_MUL:
      case SHADER_OPCODE_MOV_RELOC_IMM:
         if (devinfo->ver >= 11) {
            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
                                  0, 10, 6, 14, 0, 0);
         } else {
            if (brw_type_size_bytes(info.tx) > 4)
               return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
                                     0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
            else
               return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
                                     0, 8, 4, 12, 0, 0);
         }

      case BRW_OPCODE_BFE:
      case BRW_OPCODE_BFI2:
      case BRW_OPCODE_CSEL:
         if (devinfo->ver >= 11)
            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
                                  0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);

      case BRW_OPCODE_MAD:
         if (devinfo->ver >= 11) {
            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
         } else {
            if (brw_type_size_bytes(info.tx) > 4)
               return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
                                     0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
            else
               return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
                                     0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
         }

      case BRW_OPCODE_DP4:
      case BRW_OPCODE_DPH:
      case BRW_OPCODE_DP3:
      case BRW_OPCODE_DP2:
         return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
                               0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);

      case BRW_OPCODE_DP4A:
         if (devinfo->ver >= 12)
            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
         else
            abort();

      case BRW_OPCODE_DPAS: {
         unsigned ld;

         switch (info.rcount) {
         case 1:
            ld = 21;
            break;
         case 2:
            ld = 22;
            break;
         case 8:
         default:
            ld = 32;
            break;
         }

         /* DPAS cannot write the accumulator or the flags, so pass UINT_MAX
          * for la and lf.
          */
         if (devinfo->verx10 >= 125)
            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
                                  0, ld, UINT_MAX, UINT_MAX, 0, 0);
         else
            abort();
      }

      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
         return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 4,
                               0, 16, 0, 0, 0, 0);

      case SHADER_OPCODE_POW:
         return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 8,
                               0, 24, 0, 0, 0, 0);

      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
         return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 26, 0,
                               0, 28 /* XXX */, 0, 0, 0, 0);

      case BRW_OPCODE_DO:
         return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
                               0, 0, 0, 0, 0, 0);

      case BRW_OPCODE_IF:
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
      case BRW_OPCODE_BREAK:
      case BRW_OPCODE_CONTINUE:
      case BRW_OPCODE_HALT:
         return calculate_desc(info, EU_UNIT_NULL, 8, 0, 0, 0, 0,
                               0, 0, 0, 0, 0, 0);

      case BRW_OPCODE_PLN:
         return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
                               0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);

      case BRW_OPCODE_LRP:
         return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
                               0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);

      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
         if (devinfo->ver >= 11)
            return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
                                  0, 10 /* XXX */, 6 /* XXX */,
                                  14 /* XXX */, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 16, 6, 0, 0, 6,
                                  0, 8 /* XXX */, 4 /* XXX */,
                                  12 /* XXX */, 0, 0);

      case SHADER_OPCODE_READ_ARCH_REG:
         if (devinfo->ver >= 12) {
            return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
                                  0, 10, 6 /* XXX */, 14, 0, 0);
         } else {
            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
                                  0, 8, 4, 12, 0, 0);
         }

      case SHADER_OPCODE_MOV_INDIRECT:
         if (devinfo->ver >= 11)
            return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
                                  0, 10 /* XXX */, 6 /* XXX */,
                                  14 /* XXX */, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
                                  0, 8 /* XXX */, 4 /* XXX */,
                                  12 /* XXX */, 0, 0);

      case SHADER_OPCODE_BROADCAST:
         if (devinfo->ver >= 11)
            return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0, 4, 0,
                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
                                  0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);

      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
      case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
      case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
         if (devinfo->ver >= 11)
            return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
                                  0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);

      case SHADER_OPCODE_RND_MODE:
      case SHADER_OPCODE_FLOAT_CONTROL_MODE:
         if (devinfo->ver >= 11)
            return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0,
                                  4 /* XXX */, 0,
                                  0, 0, 0, 0, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0,
                                  4 /* XXX */, 0,
                                  0, 0, 0, 0, 0, 0);

      case SHADER_OPCODE_SHUFFLE:
         if (devinfo->ver >= 11)
            return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0,
                                  44 /* XXX */, 0,
                                  0, 10 /* XXX */, 6 /* XXX */,
                                  14 /* XXX */, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 42 /* XXX */, 0, 0,
                                  42 /* XXX */, 0,
                                  0, 8 /* XXX */, 4 /* XXX */,
                                  12 /* XXX */, 0, 0);

      case SHADER_OPCODE_SEL_EXEC:
         if (devinfo->ver >= 11)
            return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0,
                                  0, 4 /* XXX */,
                                  0, 10 /* XXX */, 6 /* XXX */,
                                  14 /* XXX */, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 8 /* XXX */, 4 /* XXX */, 0,
                                  0, 4 /* XXX */,
                                  0, 8 /* XXX */, 4 /* XXX */,
                                  12 /* XXX */, 0, 0);

      case SHADER_OPCODE_QUAD_SWIZZLE:
         if (devinfo->ver >= 11)
            return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
                                  0, 8 /* XXX */,
                                  0, 10 /* XXX */, 6 /* XXX */,
                                  14 /* XXX */, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
                                  0, 8 /* XXX */,
                                  0, 8 /* XXX */, 4 /* XXX */,
                                  12 /* XXX */, 0, 0);

      case FS_OPCODE_DDY_FINE:
         if (devinfo->ver >= 11)
            return calculate_desc(info, EU_UNIT_FPU, 0, 14, 0, 0, 4,
                                  0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
                                  0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);

      case FS_OPCODE_LOAD_LIVE_CHANNELS:
         if (devinfo->ver >= 11)
            return calculate_desc(info, EU_UNIT_FPU, 2 /* XXX */, 0, 0,
                                  2 /* XXX */, 0,
                                  0, 0, 0, 10 /* XXX */, 0, 0);
         else
            return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
                                  0, 2 /* XXX */,
                                  0, 0, 0, 8 /* XXX */, 0, 0);

      case SHADER_OPCODE_GET_BUFFER_SIZE:
         return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16 /* XXX */,
                               8 /* XXX */, 750 /* XXX */, 0, 0,
                               2 /* XXX */, 0);

      case SHADER_OPCODE_MEMORY_FENCE:
      case SHADER_OPCODE_INTERLOCK:
         switch (info.sfid) {
         case GFX6_SFID_DATAPORT_RENDER_CACHE:
            return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 30 /* XXX */, 0,
                                  10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);

         case BRW_SFID_URB:
         case GFX7_SFID_DATAPORT_DATA_CACHE:
         case GFX12_SFID_SLM:
         case GFX12_SFID_TGM:
         case GFX12_SFID_UGM:
         case HSW_SFID_DATAPORT_DATA_CACHE_1:
            return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 30 /* XXX */, 0,
                                  10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);

         default:
            abort();
         }

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
         return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
                               10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);

      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
         return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
                               0, 90 /* XXX */, 0, 0, 0, 0);

      case SHADER_OPCODE_BARRIER:
         return calculate_desc(info, EU_UNIT_GATEWAY, 90 /* XXX */, 0, 0,
                               0 /* XXX */, 0,
                               0, 0, 0, 0, 0, 0);

      case SHADER_OPCODE_SEND:
      case SHADER_OPCODE_SEND_GATHER:
         switch (info.sfid) {
         case GFX6_SFID_DATAPORT_CONSTANT_CACHE:
            /* See FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */
            return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
                                  10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
         case GFX6_SFID_DATAPORT_RENDER_CACHE:
            switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
            case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP:
               return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
                                     30 /* XXX */, 450 /* XXX */,
                                     10 /* XXX */, 100 /* XXX */,
                                     0, 0, 0, 400 /* XXX */);
            default:
               return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
                                     0, 450 /* XXX */,
                                     10 /* XXX */, 300 /* XXX */, 0, 0,
                                     0, 0);
            }
         case BRW_SFID_SAMPLER: {
            return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16,
                                  8, 750, 0, 0, 2, 0);
         }
         case GFX7_SFID_DATAPORT_DATA_CACHE:
         case HSW_SFID_DATAPORT_DATA_CACHE_1:
            switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
            case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
            case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
            case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
            case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
               return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
                                     30 /* XXX */, 400 /* XXX */,
                                     10 /* XXX */, 100 /* XXX */, 0, 0,
                                     0, 400 /* XXX */);

            default:
               return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
                                     0, 20 /* XXX */,
                                     10 /* XXX */, 100 /* XXX */, 0, 0,
                                     0, 0);
            }

         case GFX7_SFID_PIXEL_INTERPOLATOR:
            return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
                                  0, 90 /* XXX */, 0, 0, 0, 0);

         case GFX12_SFID_UGM:
         case GFX12_SFID_TGM:
         case GFX12_SFID_SLM:
            switch (lsc_msg_desc_opcode(devinfo, info.desc)) {
            case LSC_OP_LOAD:
            case LSC_OP_STORE:
            case LSC_OP_LOAD_CMASK:
            case LSC_OP_STORE_CMASK:
               return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
                                     0, 20 /* XXX */,
                                     10 /* XXX */, 100 /* XXX */, 0, 0,
                                     0, 0);

            case LSC_OP_FENCE:
            case LSC_OP_ATOMIC_INC:
            case LSC_OP_ATOMIC_DEC:
            case LSC_OP_ATOMIC_LOAD:
            case LSC_OP_ATOMIC_STORE:
            case LSC_OP_ATOMIC_ADD:
            case LSC_OP_ATOMIC_SUB:
            case LSC_OP_ATOMIC_MIN:
            case LSC_OP_ATOMIC_MAX:
            case LSC_OP_ATOMIC_UMIN:
            case LSC_OP_ATOMIC_UMAX:
            case LSC_OP_ATOMIC_CMPXCHG:
            case LSC_OP_ATOMIC_FADD:
            case LSC_OP_ATOMIC_FSUB:
            case LSC_OP_ATOMIC_FMIN:
            case LSC_OP_ATOMIC_FMAX:
            case LSC_OP_ATOMIC_FCMPXCHG:
            case LSC_OP_ATOMIC_AND:
            case LSC_OP_ATOMIC_OR:
            case LSC_OP_ATOMIC_XOR:
               return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
                                     30 /* XXX */, 400 /* XXX */,
                                     10 /* XXX */, 100 /* XXX */, 0, 0,
                                     0, 400 /* XXX */);
            default:
               abort();
            }

         case BRW_SFID_MESSAGE_GATEWAY:
         case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH: /* or THREAD_SPAWNER */
         case GEN_RT_SFID_RAY_TRACE_ACCELERATOR:
            return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0,
                                  10 /* XXX */, 0, 0, 0, 0, 0);

         case BRW_SFID_URB:
            return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
                                  32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);

         default:
            abort();
         }

      case SHADER_OPCODE_UNDEF:
      case SHADER_OPCODE_HALT_TARGET:
      case FS_OPCODE_SCHEDULING_FENCE:
         return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
                               0, 0, 0, 0, 0, 0);

      default:
         abort();
      }
   }

   /**
    * Model the performance behavior of a stall on the specified dependency
    * ID.
    */
   void
   stall_on_dependency(state &st, enum intel_eu_dependency_id id)
   {
      if (id < ARRAY_SIZE(st.dep_ready))
         st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
                                          st.dep_ready[id]);
   }
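
   /* E.g. if an earlier SEND is expected to have written its destination
    * by cycle 120 while the front-end would otherwise have been ready at
    * cycle 100, a dependent instruction is modeled as issuing at cycle
    * 120.
    */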

   /**
    * Model the performance behavior of the front-end and back-end while
    * executing an instruction with the specified timing information, assuming
    * all dependencies are already clear.
    */
   void
   execute_instruction(state &st, const perf_desc &perf)
   {
      /* Compute the time at which the front-end will be ready to execute the
       * next instruction.
       */
      st.unit_ready[EU_UNIT_FE] += perf.df;

      if (perf.u < EU_NUM_UNITS) {
         /* Wait for the back-end to be ready to execute this instruction. */
         st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
                                          st.unit_ready[perf.u]);

         /* Compute the time at which the back-end will be ready to execute
          * the next instruction, and update the back-end utilization.
          */
         st.unit_ready[perf.u] = st.unit_ready[EU_UNIT_FE] + perf.db;
         st.unit_busy[perf.u] += perf.db * st.weight;
      }
   }
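
   /* As a worked example, take two back-to-back SIMD16 ADDs on Gfx11
    * (df = db = 4) with both units initially ready at cycle 0: the first
    * ADD advances the front-end to cycle 4 and occupies the FPU until
    * cycle 8, so the second ADD's front-end issue completes at cycle 8
    * just as the FPU becomes ready again, keeping the back-end saturated.
    */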

   /**
    * Model the performance behavior of a read dependency provided by an
    * instruction.
    */
   void
   mark_read_dependency(state &st, const perf_desc &perf,
                        enum intel_eu_dependency_id id)
   {
      if (id < ARRAY_SIZE(st.dep_ready))
         st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ls;
   }

   /**
    * Model the performance behavior of a write dependency provided by an
    * instruction.
    */
   void
   mark_write_dependency(state &st, const perf_desc &perf,
                         enum intel_eu_dependency_id id)
   {
      if (id >= EU_DEPENDENCY_ID_ACCUM0 && id < EU_DEPENDENCY_ID_FLAG0)
         st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.la;
      else if (id >= EU_DEPENDENCY_ID_FLAG0 && id < EU_DEPENDENCY_ID_SBID_WR0)
         st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.lf;
      else if (id < ARRAY_SIZE(st.dep_ready))
         st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ld;
   }
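
   /* Which latency applies is determined by where the destination falls in
    * the dependency ID space: accumulator destinations become ready after
    * la cycles, flags after lf, and everything else (GRF, address register
    * and SBID write tokens) after ld.
    */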

   /**
    * Return the dependency ID of a brw_reg, offset by \p delta GRFs.
    */
   enum intel_eu_dependency_id
   reg_dependency_id(const intel_device_info *devinfo, const brw_reg &r,
                     const int delta)
   {
      if (r.file == VGRF) {
         const unsigned i = r.nr + r.offset / REG_SIZE + delta;
         assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_GRF0);
         return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);

      } else if (r.file == FIXED_GRF) {
         const unsigned i = r.nr + delta;
         assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_GRF0);
         return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);

      } else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS &&
                 r.nr < BRW_ARF_ACCUMULATOR) {
         assert(delta == 0);
         return EU_DEPENDENCY_ID_ADDR0;

      } else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR &&
                 r.nr < BRW_ARF_FLAG) {
         const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta;
         assert(i < EU_DEPENDENCY_ID_FLAG0 - EU_DEPENDENCY_ID_ACCUM0);
         return intel_eu_dependency_id(EU_DEPENDENCY_ID_ACCUM0 + i);

      } else {
         return EU_NUM_DEPENDENCY_IDS;
      }
   }

   /**
    * Return the dependency ID of the flag register starting at offset \p i.
    */
   enum intel_eu_dependency_id
   flag_dependency_id(unsigned i)
   {
      assert(i < EU_DEPENDENCY_ID_SBID_WR0 - EU_DEPENDENCY_ID_FLAG0);
      return intel_eu_dependency_id(EU_DEPENDENCY_ID_FLAG0 + i);
   }

   /**
    * Return the dependency ID corresponding to the SBID read completion
    * condition of a Gfx12+ SWSB.
    */
   enum intel_eu_dependency_id
   tgl_swsb_rd_dependency_id(tgl_swsb swsb)
   {
      if (swsb.mode) {
         assert(swsb.sbid <
                EU_NUM_DEPENDENCY_IDS - EU_DEPENDENCY_ID_SBID_RD0);
         return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_RD0 + swsb.sbid);
      } else {
         return EU_NUM_DEPENDENCY_IDS;
      }
   }

   /**
    * Return the dependency ID corresponding to the SBID write completion
    * condition of a Gfx12+ SWSB.
    */
   enum intel_eu_dependency_id
   tgl_swsb_wr_dependency_id(tgl_swsb swsb)
   {
      if (swsb.mode) {
         assert(swsb.sbid <
                EU_DEPENDENCY_ID_SBID_RD0 - EU_DEPENDENCY_ID_SBID_WR0);
         return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_WR0 + swsb.sbid);
      } else {
         return EU_NUM_DEPENDENCY_IDS;
      }
   }

   /**
    * Return the implicit accumulator register accessed by channel \p i of the
    * instruction.
    */
   unsigned
   accum_reg_of_channel(const intel_device_info *devinfo,
                        const fs_inst *inst,
                        brw_reg_type tx, unsigned i)
   {
      assert(inst->reads_accumulator_implicitly() ||
             inst->writes_accumulator_implicitly(devinfo));
      const unsigned offset = (inst->group + i) * brw_type_size_bytes(tx) *
         (brw_type_is_float(tx) ? 1 : 2);
      return offset / (reg_unit(devinfo) * REG_SIZE) % 2;
   }
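
   /* E.g. for a SIMD16 integer instruction with group == 0 and a 4-byte
    * execution type, channel 15 lands at byte offset 15 * 4 * 2 == 120,
    * which on a platform with 32-byte GRFs selects accumulator 1, so the
    * instruction is modeled as touching both acc0 and acc1.
    */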

   /**
    * Model the performance behavior of an FS back-end instruction.
    */
   void
   issue_inst(state &st, const struct brw_isa_info *isa,
              const fs_inst *inst)
   {
      const struct intel_device_info *devinfo = isa->devinfo;
      const instruction_info info(isa, inst);
      const perf_desc perf = instruction_desc(info);

      /* Stall on any source dependencies. */
      for (unsigned i = 0; i < inst->sources; i++) {
         for (unsigned j = 0; j < regs_read(devinfo, inst, i); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, inst->src[i], j));
      }

      if (inst->reads_accumulator_implicitly()) {
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
      }

      if (const unsigned mask = inst->flags_read(devinfo)) {
         for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
            if (mask & (1 << i))
               stall_on_dependency(st, flag_dependency_id(i));
         }
      }

      /* Stall on any write dependencies. */
      if (!inst->no_dd_check) {
         if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
            for (unsigned j = 0; j < regs_written(inst); j++)
               stall_on_dependency(
                  st, reg_dependency_id(devinfo, inst->dst, j));
         }

         if (inst->writes_accumulator_implicitly(devinfo)) {
            for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
                 j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                           inst->exec_size - 1); j++)
               stall_on_dependency(
                  st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
         }

         if (const unsigned mask = inst->flags_written(devinfo)) {
            for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
               if (mask & (1 << i))
                  stall_on_dependency(st, flag_dependency_id(i));
            }
         }
      }

      /* Stall on any SBID dependencies. */
      if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST))
         stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched));
      else if (inst->sched.mode & TGL_SBID_SRC)
         stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched));

      /* Execute the instruction. */
      execute_instruction(st, perf);

      /* Mark any source dependencies. */
      if (inst->is_send_from_grf()) {
         for (unsigned i = 0; i < inst->sources; i++) {
            if (inst->is_payload(i)) {
               for (unsigned j = 0; j < regs_read(devinfo, inst, i); j++)
                  mark_read_dependency(
                     st, perf, reg_dependency_id(devinfo, inst->src[i], j));
            }
         }
      }

      /* Mark any destination dependencies. */
      if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++) {
            mark_write_dependency(st, perf,
                                  reg_dependency_id(devinfo, inst->dst, j));
         }
      }

      if (inst->writes_accumulator_implicitly(devinfo)) {
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            mark_write_dependency(st, perf,
                                  reg_dependency_id(devinfo, brw_acc_reg(8), j));
      }

      if (const unsigned mask = inst->flags_written(devinfo)) {
         for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
            if (mask & (1 << i))
               mark_write_dependency(st, perf, flag_dependency_id(i));
         }
      }

      /* Mark any SBID dependencies. */
      if (inst->sched.mode & TGL_SBID_SET) {
         mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched));
         mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched));
      }
   }

   /**
    * Calculate the maximum possible throughput of the program compatible with
    * the cycle-count utilization estimated for each asynchronous unit, in
    * threads-per-cycle units.
    */
   float
   calculate_thread_throughput(const state &st, float busy)
   {
      for (unsigned i = 0; i < EU_NUM_UNITS; i++)
         busy = MAX2(busy, st.unit_busy[i]);

      return 1.0 / busy;
   }
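
   /* E.g. if the weighted front-end latency of a thread adds up to 2000
    * cycles but the sampler accumulated 4000 busy cycles on its behalf,
    * the sampler is the bottleneck and the estimated throughput is
    * 1 / 4000 threads per cycle.
    */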

   /**
    * Estimate the performance of the specified shader.
    */
   void
   calculate_performance(performance &p, const fs_visitor *s,
                         unsigned dispatch_width)
   {
      /* XXX - Note that the previous version of this code used worst-case
       *       scenario estimation of branching divergence for SIMD32 shaders,
       *       but this heuristic was removed to improve performance in common
       *       scenarios.  Wider shader variants are less optimal when
       *       divergence is high, e.g. when an application renders a complex
       *       scene on a small surface.  It is assumed that such renders are
       *       short, so their time doesn't matter, and the overall
       *       performance is dominated by the more optimal larger renders.
       *
       *       It's possible that we could do better with divergence analysis
       *       by isolating branches which are 100% uniform.
       *
       *       Plumbing the trip counts from NIR loop analysis would allow us
       *       to do a better job regarding the loop weights.
       *
       *       In the meantime use values that roughly match the control flow
       *       weights used elsewhere in the compiler back-end.
       *
       *       Note that we provide slightly more pessimistic weights on
       *       Gfx12.x for SIMD32, since the effective warp size on that
       *       platform is 2x the SIMD width due to EU fusion, which increases
       *       the likelihood of divergent control flow in comparison to
       *       previous generations, giving narrower SIMD modes a performance
       *       advantage in several test-cases with non-uniform discard jumps.
       *       EU fusion has been removed on Xe2+, so its divergence behavior
       *       is expected to be closer to pre-Gfx12 platforms.
       */
      const float discard_weight = (dispatch_width > 16 || s->devinfo->ver != 12 ?
                                    1.0 : 0.5);
      const float loop_weight = 10;
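      /* With these weights, instructions inside a loop body count 10x
       * towards the estimate (100x when nested two levels deep, and so on,
       * since each DO below multiplies st.weight by loop_weight and the
       * matching WHILE divides it back out), while instructions between a
       * non-uniform HALT and its HALT_TARGET count half on Gfx12 at SIMD16
       * and narrower.
       */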
      unsigned halt_count = 0;
      unsigned elapsed = 0;
      state st;

      foreach_block(block, s->cfg) {
         const unsigned elapsed0 = elapsed;

         foreach_inst_in_block(fs_inst, inst, block) {
            const unsigned clock0 = st.unit_ready[EU_UNIT_FE];

            issue_inst(st, &s->compiler->isa, inst);

            if (inst->opcode == SHADER_OPCODE_HALT_TARGET && halt_count)
               st.weight /= discard_weight;

            elapsed += (st.unit_ready[EU_UNIT_FE] - clock0) * st.weight;

            if (inst->opcode == BRW_OPCODE_DO)
               st.weight *= loop_weight;
            else if (inst->opcode == BRW_OPCODE_WHILE)
               st.weight /= loop_weight;
            else if (inst->opcode == BRW_OPCODE_HALT && !halt_count++)
               st.weight *= discard_weight;
         }

         p.block_latency[block->num] = elapsed - elapsed0;
      }

      p.latency = elapsed;
      p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
   }
}

brw::performance::performance(const fs_visitor *v) :
   block_latency(new unsigned[v->cfg->num_blocks])
{
   calculate_performance(*this, v, v->dispatch_width);
}

brw::performance::~performance()
{
   delete[] block_latency;
}