1 /*
2 * Copyright © 2020 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_eu.h"
25 #include "brw_fs.h"
26 #include "brw_vec4.h"
27 #include "brw_cfg.h"
28
29 using namespace brw;
30
31 namespace {
/**
 * Enumeration representing the various asynchronous units that can run
 * computations in parallel on behalf of a shader thread.
 *
 * Each unit gets its own ready-time and utilization slot in struct state,
 * so the model can account for computations overlapping across units.
 */
enum unit {
   /** EU front-end. */
   unit_fe,
   /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
   unit_fpu,
   /** Extended Math unit (AKA FPU1 on Gfx8-11, part of the EU on Gfx6+). */
   unit_em,
   /** Sampler shared function. */
   unit_sampler,
   /** Pixel Interpolator shared function. */
   unit_pi,
   /** Unified Return Buffer shared function. */
   unit_urb,
   /** Data Port Data Cache shared function. */
   unit_dp_dc,
   /** Data Port Render Cache shared function. */
   unit_dp_rc,
   /** Data Port Constant Cache shared function. */
   unit_dp_cc,
   /** Message Gateway shared function. */
   unit_gateway,
   /** Thread Spawner shared function. */
   unit_spawner,
   /* Shared functions not currently modeled: */
   /* unit_vme, */
   /* unit_cre, */
   /** Number of asynchronous units currently tracked. */
   num_units,
   /** Dummy unit for instructions that don't consume runtime from the above. */
   unit_null = num_units
};
66
/**
 * Enumeration representing a computation result another computation can
 * potentially depend on.
 *
 * The values form a flat index space: each register (or token) class is
 * assigned a contiguous range of IDs starting at its *_0 entry, so a
 * specific register is addressed as e.g. dependency_id_grf0 + reg_nr.
 */
enum dependency_id {
   /* Register part of the GRF. */
   dependency_id_grf0 = 0,
   /* Register part of the MRF.  Only used on Gfx4-6 (24 registers). */
   dependency_id_mrf0 = dependency_id_grf0 + BRW_MAX_GRF,
   /* Address register part of the ARF (a single register). */
   dependency_id_addr0 = dependency_id_mrf0 + 24,
   /* Accumulator register part of the ARF (12 registers). */
   dependency_id_accum0 = dependency_id_addr0 + 1,
   /* Flag register part of the ARF (8 flag subregisters). */
   dependency_id_flag0 = dependency_id_accum0 + 12,
   /* SBID token write completion.  Only used on Gfx12+ (16 tokens). */
   dependency_id_sbid_wr0 = dependency_id_flag0 + 8,
   /* SBID token read completion.  Only used on Gfx12+ (16 tokens). */
   dependency_id_sbid_rd0 = dependency_id_sbid_wr0 + 16,
   /* Number of computation dependencies currently tracked. */
   num_dependency_ids = dependency_id_sbid_rd0 + 16
};
89
90 /**
91 * State of our modeling of the program execution.
92 */
93 struct state {
state__anon63ed41e20111::state94 state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
95 /**
96 * Time at which a given unit will be ready to execute the next
97 * computation, in clock units.
98 */
99 unsigned unit_ready[num_units];
100 /**
101 * Time at which an instruction dependent on a given dependency ID will
102 * be ready to execute, in clock units.
103 */
104 unsigned dep_ready[num_dependency_ids];
105 /**
106 * Aggregated utilization of a given unit excluding idle cycles,
107 * in clock units.
108 */
109 float unit_busy[num_units];
110 /**
111 * Factor of the overhead of a computation accounted for in the
112 * aggregated utilization calculation.
113 */
114 float weight;
115 };
116
117 /**
118 * Information derived from an IR instruction used to compute performance
119 * estimates. Allows the timing calculation to work on both FS and VEC4
120 * instructions.
121 */
122 struct instruction_info {
instruction_info__anon63ed41e20111::instruction_info123 instruction_info(const intel_device_info *devinfo, const fs_inst *inst) :
124 devinfo(devinfo), op(inst->opcode),
125 td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
126 tx(get_exec_type(inst)), sx(0), ss(0),
127 sc(has_bank_conflict(devinfo, inst) ? sd : 0),
128 desc(inst->desc), sfid(inst->sfid)
129 {
130 /* We typically want the maximum source size, except for split send
131 * messages which require the total size.
132 */
133 if (inst->opcode == SHADER_OPCODE_SEND) {
134 ss = DIV_ROUND_UP(inst->size_read(2), REG_SIZE) +
135 DIV_ROUND_UP(inst->size_read(3), REG_SIZE);
136 } else {
137 for (unsigned i = 0; i < inst->sources; i++)
138 ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
139 }
140
141 /* Convert the execution size to GRF units. */
142 sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
143
144 /* 32x32 integer multiplication has half the usual ALU throughput.
145 * Treat it as double-precision.
146 */
147 if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
148 !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
149 type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
150 tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
151 }
152
instruction_info__anon63ed41e20111::instruction_info153 instruction_info(const intel_device_info *devinfo,
154 const vec4_instruction *inst) :
155 devinfo(devinfo), op(inst->opcode),
156 td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
157 tx(get_exec_type(inst)), sx(0), ss(0), sc(0),
158 desc(inst->desc), sfid(inst->sfid)
159 {
160 /* Compute the maximum source size. */
161 for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++)
162 ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
163
164 /* Convert the execution size to GRF units. */
165 sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
166
167 /* 32x32 integer multiplication has half the usual ALU throughput.
168 * Treat it as double-precision.
169 */
170 if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
171 !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
172 type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
173 tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
174 }
175
176 /** Device information. */
177 const struct intel_device_info *devinfo;
178 /** Instruction opcode. */
179 opcode op;
180 /** Destination type. */
181 brw_reg_type td;
182 /** Destination size in GRF units. */
183 unsigned sd;
184 /** Execution type. */
185 brw_reg_type tx;
186 /** Execution size in GRF units. */
187 unsigned sx;
188 /** Source size. */
189 unsigned ss;
190 /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
191 unsigned sc;
192 /** Send message descriptor. */
193 uint32_t desc;
194 /** Send message shared function ID. */
195 uint8_t sfid;
196 };
197
198 /**
199 * Timing information of an instruction used to estimate the performance of
200 * the program.
201 */
202 struct perf_desc {
perf_desc__anon63ed41e20111::perf_desc203 perf_desc(unit u, int df, int db, int ls, int ld, int la, int lf) :
204 u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}
205
206 /**
207 * Back-end unit its runtime shall be accounted to, in addition to the
208 * EU front-end which is always assumed to be involved.
209 */
210 unit u;
211 /**
212 * Overhead cycles from the time that the EU front-end starts executing
213 * the instruction until it's ready to execute the next instruction.
214 */
215 int df;
216 /**
217 * Overhead cycles from the time that the back-end starts executing the
218 * instruction until it's ready to execute the next instruction.
219 */
220 int db;
221 /**
222 * Latency cycles from the time that the back-end starts executing the
223 * instruction until its sources have been read from the register file.
224 */
225 int ls;
226 /**
227 * Latency cycles from the time that the back-end starts executing the
228 * instruction until its regular destination has been written to the
229 * register file.
230 */
231 int ld;
232 /**
233 * Latency cycles from the time that the back-end starts executing the
234 * instruction until its accumulator destination has been written to the
235 * ARF file.
236 *
237 * Note that this is an approximation of the real behavior of
238 * accumulating instructions in the hardware: Instead of modeling a pair
239 * of back-to-back accumulating instructions as a first computation with
240 * latency equal to ld followed by another computation with a
241 * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
242 * model the stall as if it occurred at the top of the pipeline, with
243 * the latency of the accumulator computation offset accordingly.
244 */
245 int la;
246 /**
247 * Latency cycles from the time that the back-end starts executing the
248 * instruction until its flag destination has been written to the ARF
249 * file.
250 */
251 int lf;
252 };
253
254 /**
255 * Compute the timing information of an instruction based on any relevant
256 * information from the IR and a number of parameters specifying a linear
257 * approximation: Parameter X_Y specifies the derivative of timing X
258 * relative to info field Y, while X_1 specifies the independent term of
259 * the approximation of timing X.
260 */
261 perf_desc
calculate_desc(const instruction_info & info,unit u,int df_1,int df_sd,int df_sc,int db_1,int db_sx,int ls_1,int ld_1,int la_1,int lf_1,int l_ss,int l_sd)262 calculate_desc(const instruction_info &info, unit u,
263 int df_1, int df_sd, int df_sc,
264 int db_1, int db_sx,
265 int ls_1, int ld_1, int la_1, int lf_1,
266 int l_ss, int l_sd)
267 {
268 return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
269 db_1 + db_sx * int(info.sx),
270 ls_1 + l_ss * int(info.ss),
271 ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
272 la_1, lf_1);
273 }
274
275 /**
276 * Compute the timing information of an instruction based on any relevant
277 * information from the IR and a number of linear approximation parameters
278 * hard-coded for each IR instruction.
279 *
280 * Most timing parameters are obtained from the multivariate linear
281 * regression of a sample of empirical timings measured using the tm0
282 * register (as can be done today by using the shader_time debugging
283 * option). The Gfx4-5 math timings are obtained from BSpec Volume 5c.3
284 * "Shared Functions - Extended Math", Section 3.2 "Performance".
285 * Parameters marked XXX shall be considered low-quality, they're possibly
286 * high variance or completely guessed in cases where experimental data was
287 * unavailable.
288 */
289 const perf_desc
instruction_desc(const instruction_info & info)290 instruction_desc(const instruction_info &info)
291 {
292 const struct intel_device_info *devinfo = info.devinfo;
293
294 switch (info.op) {
295 case BRW_OPCODE_SYNC:
296 case BRW_OPCODE_SEL:
297 case BRW_OPCODE_NOT:
298 case BRW_OPCODE_AND:
299 case BRW_OPCODE_OR:
300 case BRW_OPCODE_XOR:
301 case BRW_OPCODE_SHR:
302 case BRW_OPCODE_SHL:
303 case BRW_OPCODE_DIM:
304 case BRW_OPCODE_ASR:
305 case BRW_OPCODE_CMPN:
306 case BRW_OPCODE_F16TO32:
307 case BRW_OPCODE_BFREV:
308 case BRW_OPCODE_BFI1:
309 case BRW_OPCODE_AVG:
310 case BRW_OPCODE_FRC:
311 case BRW_OPCODE_RNDU:
312 case BRW_OPCODE_RNDD:
313 case BRW_OPCODE_RNDE:
314 case BRW_OPCODE_RNDZ:
315 case BRW_OPCODE_MAC:
316 case BRW_OPCODE_MACH:
317 case BRW_OPCODE_LZD:
318 case BRW_OPCODE_FBH:
319 case BRW_OPCODE_FBL:
320 case BRW_OPCODE_CBIT:
321 case BRW_OPCODE_ADDC:
322 case BRW_OPCODE_ROR:
323 case BRW_OPCODE_ROL:
324 case BRW_OPCODE_SUBB:
325 case BRW_OPCODE_SAD2:
326 case BRW_OPCODE_SADA2:
327 case BRW_OPCODE_LINE:
328 case BRW_OPCODE_NOP:
329 case SHADER_OPCODE_CLUSTER_BROADCAST:
330 case SHADER_OPCODE_SCRATCH_HEADER:
331 case FS_OPCODE_DDX_COARSE:
332 case FS_OPCODE_DDX_FINE:
333 case FS_OPCODE_DDY_COARSE:
334 case FS_OPCODE_PIXEL_X:
335 case FS_OPCODE_PIXEL_Y:
336 case FS_OPCODE_SET_SAMPLE_ID:
337 case VEC4_OPCODE_MOV_BYTES:
338 case VEC4_OPCODE_UNPACK_UNIFORM:
339 case VEC4_OPCODE_DOUBLE_TO_F32:
340 case VEC4_OPCODE_DOUBLE_TO_D32:
341 case VEC4_OPCODE_DOUBLE_TO_U32:
342 case VEC4_OPCODE_TO_DOUBLE:
343 case VEC4_OPCODE_PICK_LOW_32BIT:
344 case VEC4_OPCODE_PICK_HIGH_32BIT:
345 case VEC4_OPCODE_SET_LOW_32BIT:
346 case VEC4_OPCODE_SET_HIGH_32BIT:
347 case VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
348 case GS_OPCODE_SET_DWORD_2:
349 case GS_OPCODE_SET_WRITE_OFFSET:
350 case GS_OPCODE_SET_VERTEX_COUNT:
351 case GS_OPCODE_PREPARE_CHANNEL_MASKS:
352 case GS_OPCODE_SET_CHANNEL_MASKS:
353 case GS_OPCODE_GET_INSTANCE_ID:
354 case GS_OPCODE_SET_PRIMITIVE_ID:
355 case GS_OPCODE_SVB_SET_DST_INDEX:
356 case TCS_OPCODE_SRC0_010_IS_ZERO:
357 case TCS_OPCODE_GET_PRIMITIVE_ID:
358 case TES_OPCODE_GET_PRIMITIVE_ID:
359 case SHADER_OPCODE_GET_DSS_ID:
360 if (devinfo->ver >= 11) {
361 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
362 0, 10, 6 /* XXX */, 14, 0, 0);
363 } else if (devinfo->ver >= 8) {
364 if (type_sz(info.tx) > 4)
365 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
366 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
367 else
368 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
369 0, 8, 4, 12, 0, 0);
370 } else if (devinfo->is_haswell) {
371 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
372 0, 10, 6 /* XXX */, 16, 0, 0);
373 } else {
374 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
375 0, 12, 8 /* XXX */, 18, 0, 0);
376 }
377
378 case BRW_OPCODE_MOV:
379 case BRW_OPCODE_CMP:
380 case BRW_OPCODE_ADD:
381 case BRW_OPCODE_ADD3:
382 case BRW_OPCODE_MUL:
383 case SHADER_OPCODE_MOV_RELOC_IMM:
384 case VEC4_OPCODE_MOV_FOR_SCRATCH:
385 if (devinfo->ver >= 11) {
386 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
387 0, 10, 6, 14, 0, 0);
388 } else if (devinfo->ver >= 8) {
389 if (type_sz(info.tx) > 4)
390 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
391 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
392 else
393 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
394 0, 8, 4, 12, 0, 0);
395 } else if (devinfo->is_haswell) {
396 if (info.tx == BRW_REGISTER_TYPE_F)
397 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
398 0, 12, 8 /* XXX */, 18, 0, 0);
399 else
400 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
401 0, 10, 6 /* XXX */, 16, 0, 0);
402 } else if (devinfo->ver >= 7) {
403 if (info.tx == BRW_REGISTER_TYPE_F)
404 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
405 0, 14, 10 /* XXX */, 20, 0, 0);
406 else
407 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
408 0, 12, 8 /* XXX */, 18, 0, 0);
409 } else {
410 return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 0,
411 0, 2 /* XXX */,
412 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
413 0, 0);
414 }
415
416 case BRW_OPCODE_BFE:
417 case BRW_OPCODE_BFI2:
418 case BRW_OPCODE_CSEL:
419 if (devinfo->ver >= 11)
420 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
421 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
422 else if (devinfo->ver >= 8)
423 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
424 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
425 else if (devinfo->is_haswell)
426 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
427 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
428 else if (devinfo->ver >= 7)
429 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
430 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
431 else
432 abort();
433
434 case BRW_OPCODE_MAD:
435 if (devinfo->ver >= 11) {
436 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
437 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
438 } else if (devinfo->ver >= 8) {
439 if (type_sz(info.tx) > 4)
440 return calculate_desc(info, unit_fpu, 0, 4, 1, 0, 4,
441 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
442 else
443 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
444 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
445 } else if (devinfo->is_haswell) {
446 if (info.tx == BRW_REGISTER_TYPE_F)
447 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
448 0, 12, 8 /* XXX */, 18, 0, 0);
449 else
450 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
451 0, 10, 6 /* XXX */, 16, 0, 0);
452 } else if (devinfo->ver >= 7) {
453 if (info.tx == BRW_REGISTER_TYPE_F)
454 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
455 0, 14, 10 /* XXX */, 20, 0, 0);
456 else
457 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
458 0, 12, 8 /* XXX */, 18, 0, 0);
459 } else if (devinfo->ver >= 6) {
460 return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 1 /* XXX */,
461 0, 2 /* XXX */,
462 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
463 0, 0);
464 } else {
465 abort();
466 }
467
468 case BRW_OPCODE_F32TO16:
469 if (devinfo->ver >= 11)
470 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
471 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
472 else if (devinfo->ver >= 8)
473 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
474 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
475 else if (devinfo->is_haswell)
476 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
477 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
478 else if (devinfo->ver >= 7)
479 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
480 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
481 else
482 abort();
483
484 case BRW_OPCODE_DP4:
485 case BRW_OPCODE_DPH:
486 case BRW_OPCODE_DP3:
487 case BRW_OPCODE_DP2:
488 if (devinfo->ver >= 8)
489 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
490 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
491 else if (devinfo->is_haswell)
492 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
493 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
494 else
495 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
496 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
497
498 case BRW_OPCODE_DP4A:
499 if (devinfo->ver >= 12)
500 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
501 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
502 else
503 abort();
504
505 case SHADER_OPCODE_RCP:
506 case SHADER_OPCODE_RSQ:
507 case SHADER_OPCODE_SQRT:
508 case SHADER_OPCODE_EXP2:
509 case SHADER_OPCODE_LOG2:
510 case SHADER_OPCODE_SIN:
511 case SHADER_OPCODE_COS:
512 case SHADER_OPCODE_POW:
513 case SHADER_OPCODE_INT_QUOTIENT:
514 case SHADER_OPCODE_INT_REMAINDER:
515 if (devinfo->ver >= 6) {
516 switch (info.op) {
517 case SHADER_OPCODE_RCP:
518 case SHADER_OPCODE_RSQ:
519 case SHADER_OPCODE_SQRT:
520 case SHADER_OPCODE_EXP2:
521 case SHADER_OPCODE_LOG2:
522 case SHADER_OPCODE_SIN:
523 case SHADER_OPCODE_COS:
524 if (devinfo->ver >= 8)
525 return calculate_desc(info, unit_em, -2, 4, 0, 0, 4,
526 0, 16, 0, 0, 0, 0);
527 else if (devinfo->is_haswell)
528 return calculate_desc(info, unit_em, 0, 2, 0, 0, 2,
529 0, 12, 0, 0, 0, 0);
530 else
531 return calculate_desc(info, unit_em, 0, 2, 0, 0, 2,
532 0, 14, 0, 0, 0, 0);
533
534 case SHADER_OPCODE_POW:
535 if (devinfo->ver >= 8)
536 return calculate_desc(info, unit_em, -2, 4, 0, 0, 8,
537 0, 24, 0, 0, 0, 0);
538 else if (devinfo->is_haswell)
539 return calculate_desc(info, unit_em, 0, 2, 0, 0, 4,
540 0, 20, 0, 0, 0, 0);
541 else
542 return calculate_desc(info, unit_em, 0, 2, 0, 0, 4,
543 0, 22, 0, 0, 0, 0);
544
545 case SHADER_OPCODE_INT_QUOTIENT:
546 case SHADER_OPCODE_INT_REMAINDER:
547 return calculate_desc(info, unit_em, 2, 0, 0, 26, 0,
548 0, 28 /* XXX */, 0, 0, 0, 0);
549
550 default:
551 abort();
552 }
553 } else {
554 switch (info.op) {
555 case SHADER_OPCODE_RCP:
556 return calculate_desc(info, unit_em, 2, 0, 0, 0, 8,
557 0, 22, 0, 0, 0, 8);
558
559 case SHADER_OPCODE_RSQ:
560 return calculate_desc(info, unit_em, 2, 0, 0, 0, 16,
561 0, 44, 0, 0, 0, 8);
562
563 case SHADER_OPCODE_INT_QUOTIENT:
564 case SHADER_OPCODE_SQRT:
565 case SHADER_OPCODE_LOG2:
566 return calculate_desc(info, unit_em, 2, 0, 0, 0, 24,
567 0, 66, 0, 0, 0, 8);
568
569 case SHADER_OPCODE_INT_REMAINDER:
570 case SHADER_OPCODE_EXP2:
571 return calculate_desc(info, unit_em, 2, 0, 0, 0, 32,
572 0, 88, 0, 0, 0, 8);
573
574 case SHADER_OPCODE_SIN:
575 case SHADER_OPCODE_COS:
576 return calculate_desc(info, unit_em, 2, 0, 0, 0, 48,
577 0, 132, 0, 0, 0, 8);
578
579 case SHADER_OPCODE_POW:
580 return calculate_desc(info, unit_em, 2, 0, 0, 0, 64,
581 0, 176, 0, 0, 0, 8);
582
583 default:
584 abort();
585 }
586 }
587
588 case BRW_OPCODE_DO:
589 if (devinfo->ver >= 6)
590 return calculate_desc(info, unit_null, 0, 0, 0, 0, 0,
591 0, 0, 0, 0, 0, 0);
592 else
593 return calculate_desc(info, unit_null, 2 /* XXX */, 0, 0, 0, 0,
594 0, 0, 0, 0, 0, 0);
595
596 case BRW_OPCODE_IF:
597 case BRW_OPCODE_ELSE:
598 case BRW_OPCODE_ENDIF:
599 case BRW_OPCODE_WHILE:
600 case BRW_OPCODE_BREAK:
601 case BRW_OPCODE_CONTINUE:
602 case BRW_OPCODE_HALT:
603 if (devinfo->ver >= 8)
604 return calculate_desc(info, unit_null, 8, 0, 0, 0, 0,
605 0, 0, 0, 0, 0, 0);
606 else if (devinfo->is_haswell)
607 return calculate_desc(info, unit_null, 6, 0, 0, 0, 0,
608 0, 0, 0, 0, 0, 0);
609 else
610 return calculate_desc(info, unit_null, 2, 0, 0, 0, 0,
611 0, 0, 0, 0, 0, 0);
612
613 case FS_OPCODE_LINTERP:
614 if (devinfo->ver >= 8)
615 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
616 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
617 else if (devinfo->is_haswell)
618 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
619 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
620 else
621 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
622 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
623
624 case BRW_OPCODE_LRP:
625 if (devinfo->ver >= 8)
626 return calculate_desc(info, unit_fpu, 0, 4, 1, 0, 4,
627 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
628 else if (devinfo->is_haswell)
629 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
630 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
631 else if (devinfo->ver >= 6)
632 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
633 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
634 else
635 abort();
636
637 case FS_OPCODE_PACK_HALF_2x16_SPLIT:
638 if (devinfo->ver >= 11)
639 return calculate_desc(info, unit_fpu, 20, 6, 0, 0, 6,
640 0, 10 /* XXX */, 6 /* XXX */,
641 14 /* XXX */, 0, 0);
642 else if (devinfo->ver >= 8)
643 return calculate_desc(info, unit_fpu, 16, 6, 0, 0, 6,
644 0, 8 /* XXX */, 4 /* XXX */,
645 12 /* XXX */, 0, 0);
646 else if (devinfo->is_haswell)
647 return calculate_desc(info, unit_fpu, 20, 6, 0, 0, 6,
648 0, 10 /* XXX */, 6 /* XXX */,
649 16 /* XXX */, 0, 0);
650 else if (devinfo->ver >= 7)
651 return calculate_desc(info, unit_fpu, 24, 6, 0, 0, 6,
652 0, 12 /* XXX */, 8 /* XXX */,
653 18 /* XXX */, 0, 0);
654 else
655 abort();
656
657 case SHADER_OPCODE_MOV_INDIRECT:
658 if (devinfo->ver >= 11)
659 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
660 0, 10 /* XXX */, 6 /* XXX */,
661 14 /* XXX */, 0, 0);
662 else if (devinfo->ver >= 8)
663 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
664 0, 8 /* XXX */, 4 /* XXX */,
665 12 /* XXX */, 0, 0);
666 else if (devinfo->is_haswell)
667 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
668 0, 10 /* XXX */, 6 /* XXX */,
669 16 /* XXX */, 0, 0);
670 else
671 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
672 0, 12 /* XXX */, 8 /* XXX */,
673 18 /* XXX */, 0, 0);
674
675 case SHADER_OPCODE_BROADCAST:
676 if (devinfo->ver >= 11)
677 return calculate_desc(info, unit_fpu, 20 /* XXX */, 0, 0, 4, 0,
678 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
679 else if (devinfo->ver >= 8)
680 return calculate_desc(info, unit_fpu, 18, 0, 0, 4, 0,
681 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
682 else if (devinfo->is_haswell)
683 return calculate_desc(info, unit_fpu, 18, 0, 0, 4, 0,
684 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
685 else if (devinfo->ver >= 7)
686 return calculate_desc(info, unit_fpu, 20, 0, 0, 4, 0,
687 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
688 else
689 abort();
690
691 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
692 if (devinfo->ver >= 11)
693 return calculate_desc(info, unit_fpu, 2, 0, 0, 2, 0,
694 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
695 else if (devinfo->ver >= 8)
696 return calculate_desc(info, unit_fpu, 2, 0, 0, 2, 0,
697 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
698 else if (devinfo->is_haswell)
699 return calculate_desc(info, unit_fpu, 36, 0, 0, 6, 0,
700 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
701 else if (devinfo->ver >= 7)
702 return calculate_desc(info, unit_fpu, 40, 0, 0, 6, 0,
703 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
704 else
705 abort();
706
707 case SHADER_OPCODE_RND_MODE:
708 case SHADER_OPCODE_FLOAT_CONTROL_MODE:
709 if (devinfo->ver >= 11)
710 return calculate_desc(info, unit_fpu, 24 /* XXX */, 0, 0,
711 4 /* XXX */, 0,
712 0, 0, 0, 0, 0, 0);
713 else if (devinfo->ver >= 8)
714 return calculate_desc(info, unit_fpu, 20 /* XXX */, 0, 0,
715 4 /* XXX */, 0,
716 0, 0, 0, 0, 0, 0);
717 else if (devinfo->is_haswell)
718 return calculate_desc(info, unit_fpu, 24 /* XXX */, 0, 0,
719 4 /* XXX */, 0,
720 0, 0, 0, 0, 0, 0);
721 else if (devinfo->ver >= 6)
722 return calculate_desc(info, unit_fpu, 28 /* XXX */, 0, 0,
723 4 /* XXX */, 0,
724 0, 0, 0, 0, 0, 0);
725 else
726 abort();
727
728 case SHADER_OPCODE_SHUFFLE:
729 if (devinfo->ver >= 11)
730 return calculate_desc(info, unit_fpu, 44 /* XXX */, 0, 0,
731 44 /* XXX */, 0,
732 0, 10 /* XXX */, 6 /* XXX */,
733 14 /* XXX */, 0, 0);
734 else if (devinfo->ver >= 8)
735 return calculate_desc(info, unit_fpu, 42 /* XXX */, 0, 0,
736 42 /* XXX */, 0,
737 0, 8 /* XXX */, 4 /* XXX */,
738 12 /* XXX */, 0, 0);
739 else if (devinfo->is_haswell)
740 return calculate_desc(info, unit_fpu, 0, 44 /* XXX */, 0,
741 0, 44 /* XXX */,
742 0, 10 /* XXX */, 6 /* XXX */,
743 16 /* XXX */, 0, 0);
744 else if (devinfo->ver >= 6)
745 return calculate_desc(info, unit_fpu, 0, 46 /* XXX */, 0,
746 0, 46 /* XXX */,
747 0, 12 /* XXX */, 8 /* XXX */,
748 18 /* XXX */, 0, 0);
749 else
750 abort();
751
752 case SHADER_OPCODE_SEL_EXEC:
753 if (devinfo->ver >= 11)
754 return calculate_desc(info, unit_fpu, 10 /* XXX */, 4 /* XXX */, 0,
755 0, 4 /* XXX */,
756 0, 10 /* XXX */, 6 /* XXX */,
757 14 /* XXX */, 0, 0);
758 else if (devinfo->ver >= 8)
759 return calculate_desc(info, unit_fpu, 8 /* XXX */, 4 /* XXX */, 0,
760 0, 4 /* XXX */,
761 0, 8 /* XXX */, 4 /* XXX */,
762 12 /* XXX */, 0, 0);
763 else if (devinfo->is_haswell)
764 return calculate_desc(info, unit_fpu, 10 /* XXX */, 4 /* XXX */, 0,
765 0, 4 /* XXX */,
766 0, 10 /* XXX */, 6 /* XXX */,
767 16 /* XXX */, 0, 0);
768 else
769 return calculate_desc(info, unit_fpu, 12 /* XXX */, 4 /* XXX */, 0,
770 0, 4 /* XXX */,
771 0, 12 /* XXX */, 8 /* XXX */,
772 18 /* XXX */, 0, 0);
773
774 case SHADER_OPCODE_QUAD_SWIZZLE:
775 if (devinfo->ver >= 11)
776 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
777 0, 8 /* XXX */,
778 0, 10 /* XXX */, 6 /* XXX */,
779 14 /* XXX */, 0, 0);
780 else if (devinfo->ver >= 8)
781 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
782 0, 8 /* XXX */,
783 0, 8 /* XXX */, 4 /* XXX */,
784 12 /* XXX */, 0, 0);
785 else if (devinfo->is_haswell)
786 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
787 0, 8 /* XXX */,
788 0, 10 /* XXX */, 6 /* XXX */,
789 16 /* XXX */, 0, 0);
790 else
791 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
792 0, 8 /* XXX */,
793 0, 12 /* XXX */, 8 /* XXX */,
794 18 /* XXX */, 0, 0);
795
796 case FS_OPCODE_DDY_FINE:
797 if (devinfo->ver >= 11)
798 return calculate_desc(info, unit_fpu, 0, 14, 0, 0, 4,
799 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
800 else if (devinfo->ver >= 8)
801 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
802 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
803 else if (devinfo->is_haswell)
804 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
805 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
806 else
807 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
808 0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0);
809
810 case FS_OPCODE_LOAD_LIVE_CHANNELS:
811 if (devinfo->ver >= 11)
812 return calculate_desc(info, unit_fpu, 2 /* XXX */, 0, 0,
813 2 /* XXX */, 0,
814 0, 0, 0, 10 /* XXX */, 0, 0);
815 else if (devinfo->ver >= 8)
816 return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 0,
817 0, 2 /* XXX */,
818 0, 0, 0, 8 /* XXX */, 0, 0);
819 else
820 abort();
821
822 case VEC4_OPCODE_PACK_BYTES:
823 if (devinfo->ver >= 8)
824 return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
825 4 /* XXX */, 0,
826 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
827 0, 0);
828 else if (devinfo->is_haswell)
829 return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
830 4 /* XXX */, 0,
831 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
832 0, 0);
833 else
834 return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
835 4 /* XXX */, 0,
836 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
837 0, 0);
838
839 case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
840 case TCS_OPCODE_GET_INSTANCE_ID:
841 case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
842 case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
843 case TES_OPCODE_CREATE_INPUT_READ_HEADER:
844 if (devinfo->ver >= 8)
845 return calculate_desc(info, unit_fpu, 22 /* XXX */, 0, 0,
846 6 /* XXX */, 0,
847 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
848 0, 0);
849 else if (devinfo->is_haswell)
850 return calculate_desc(info, unit_fpu, 26 /* XXX */, 0, 0,
851 6 /* XXX */, 0,
852 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
853 0, 0);
854 else
855 return calculate_desc(info, unit_fpu, 30 /* XXX */, 0, 0,
856 6 /* XXX */, 0,
857 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
858 0, 0);
859
860 case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
861 case TCS_OPCODE_CREATE_BARRIER_HEADER:
862 if (devinfo->ver >= 8)
863 return calculate_desc(info, unit_fpu, 32 /* XXX */, 0, 0,
864 8 /* XXX */, 0,
865 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
866 0, 0);
867 else if (devinfo->is_haswell)
868 return calculate_desc(info, unit_fpu, 38 /* XXX */, 0, 0,
869 8 /* XXX */, 0,
870 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
871 0, 0);
872 else if (devinfo->ver >= 6)
873 return calculate_desc(info, unit_fpu, 44 /* XXX */, 0, 0,
874 8 /* XXX */, 0,
875 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
876 0, 0);
877 else
878 abort();
879
880 case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
881 if (devinfo->ver >= 8)
882 return calculate_desc(info, unit_fpu, 12 /* XXX */, 0, 0,
883 4 /* XXX */, 0,
884 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
885 0, 0);
886 else if (devinfo->is_haswell)
887 return calculate_desc(info, unit_fpu, 14 /* XXX */, 0, 0,
888 4 /* XXX */, 0,
889 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
890 0, 0);
891 else if (devinfo->ver >= 7)
892 return calculate_desc(info, unit_fpu, 16 /* XXX */, 0, 0,
893 4 /* XXX */, 0,
894 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
895 0, 0);
896 else
897 abort();
898
899 case SHADER_OPCODE_TEX:
900 case FS_OPCODE_TXB:
901 case SHADER_OPCODE_TXD:
902 case SHADER_OPCODE_TXF:
903 case SHADER_OPCODE_TXF_LZ:
904 case SHADER_OPCODE_TXL:
905 case SHADER_OPCODE_TXL_LZ:
906 case SHADER_OPCODE_TXF_CMS:
907 case SHADER_OPCODE_TXF_CMS_W:
908 case SHADER_OPCODE_TXF_UMS:
909 case SHADER_OPCODE_TXF_MCS:
910 case SHADER_OPCODE_TXS:
911 case SHADER_OPCODE_LOD:
912 case SHADER_OPCODE_GET_BUFFER_SIZE:
913 case SHADER_OPCODE_TG4:
914 case SHADER_OPCODE_TG4_OFFSET:
915 case SHADER_OPCODE_SAMPLEINFO:
916 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
917 return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16 /* XXX */,
918 8 /* XXX */, 750 /* XXX */, 0, 0,
919 2 /* XXX */, 0);
920
921 case SHADER_OPCODE_URB_READ_SIMD8:
922 case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
923 case SHADER_OPCODE_URB_WRITE_SIMD8:
924 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
925 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
926 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
927 case VEC4_OPCODE_URB_READ:
928 case VS_OPCODE_URB_WRITE:
929 case GS_OPCODE_URB_WRITE:
930 case GS_OPCODE_URB_WRITE_ALLOCATE:
931 case GS_OPCODE_THREAD_END:
932 case GS_OPCODE_FF_SYNC:
933 case TCS_OPCODE_URB_WRITE:
934 case TCS_OPCODE_RELEASE_INPUT:
935 case TCS_OPCODE_THREAD_END:
936 return calculate_desc(info, unit_urb, 2, 0, 0, 0, 6 /* XXX */,
937 32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
938
939 case SHADER_OPCODE_MEMORY_FENCE:
940 case SHADER_OPCODE_INTERLOCK:
941 switch (info.sfid) {
942 case GFX6_SFID_DATAPORT_RENDER_CACHE:
943 if (devinfo->ver >= 7)
944 return calculate_desc(info, unit_dp_rc, 2, 0, 0, 30 /* XXX */, 0,
945 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
946 else
947 abort();
948
949 case BRW_SFID_URB:
950 case GFX7_SFID_DATAPORT_DATA_CACHE:
951 case GFX12_SFID_SLM:
952 case GFX12_SFID_TGM:
953 case GFX12_SFID_UGM:
954 case HSW_SFID_DATAPORT_DATA_CACHE_1:
955 if (devinfo->ver >= 7)
956 return calculate_desc(info, unit_dp_dc, 2, 0, 0, 30 /* XXX */, 0,
957 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
958 else
959 abort();
960
961 default:
962 abort();
963 }
964
965 case SHADER_OPCODE_GFX4_SCRATCH_READ:
966 case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
967 case SHADER_OPCODE_GFX7_SCRATCH_READ:
968 return calculate_desc(info, unit_dp_dc, 2, 0, 0, 0, 8 /* XXX */,
969 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
970
971 case VEC4_OPCODE_UNTYPED_ATOMIC:
972 if (devinfo->ver >= 7)
973 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
974 30 /* XXX */, 400 /* XXX */,
975 10 /* XXX */, 100 /* XXX */, 0, 0,
976 0, 400 /* XXX */);
977 else
978 abort();
979
980 case VEC4_OPCODE_UNTYPED_SURFACE_READ:
981 case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
982 if (devinfo->ver >= 7)
983 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
984 0, 20 /* XXX */,
985 10 /* XXX */, 100 /* XXX */, 0, 0,
986 0, 0);
987 else
988 abort();
989
990 case FS_OPCODE_FB_WRITE:
991 case FS_OPCODE_FB_READ:
992 case FS_OPCODE_REP_FB_WRITE:
993 return calculate_desc(info, unit_dp_rc, 2, 0, 0, 0, 450 /* XXX */,
994 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
995
996 case GS_OPCODE_SVB_WRITE:
997 if (devinfo->ver >= 6)
998 return calculate_desc(info, unit_dp_rc, 2 /* XXX */, 0, 0,
999 0, 450 /* XXX */,
1000 10 /* XXX */, 300 /* XXX */, 0, 0,
1001 0, 0);
1002 else
1003 abort();
1004
1005 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1006 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
1007 return calculate_desc(info, unit_dp_cc, 2, 0, 0, 0, 16 /* XXX */,
1008 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
1009
1010 case VS_OPCODE_PULL_CONSTANT_LOAD:
1011 case VS_OPCODE_PULL_CONSTANT_LOAD_GFX7:
1012 return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16,
1013 8, 750, 0, 0, 2, 0);
1014
1015 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1016 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1017 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1018 if (devinfo->ver >= 7)
1019 return calculate_desc(info, unit_pi, 2, 0, 0, 14 /* XXX */, 0,
1020 0, 90 /* XXX */, 0, 0, 0, 0);
1021 else
1022 abort();
1023
1024 case SHADER_OPCODE_BARRIER:
1025 if (devinfo->ver >= 7)
1026 return calculate_desc(info, unit_gateway, 90 /* XXX */, 0, 0,
1027 0 /* XXX */, 0,
1028 0, 0, 0, 0, 0, 0);
1029 else
1030 abort();
1031
1032 case CS_OPCODE_CS_TERMINATE:
1033 if (devinfo->ver >= 7)
1034 return calculate_desc(info, unit_spawner, 2, 0, 0, 0 /* XXX */, 0,
1035 10 /* XXX */, 0, 0, 0, 0, 0);
1036 else
1037 abort();
1038
1039 case SHADER_OPCODE_SEND:
1040 switch (info.sfid) {
1041 case GFX6_SFID_DATAPORT_RENDER_CACHE:
1042 if (devinfo->ver >= 7) {
1043 switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1044 case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP:
1045 return calculate_desc(info, unit_dp_rc, 2, 0, 0,
1046 30 /* XXX */, 450 /* XXX */,
1047 10 /* XXX */, 100 /* XXX */,
1048 0, 0, 0, 400 /* XXX */);
1049 default:
1050 return calculate_desc(info, unit_dp_rc, 2, 0, 0,
1051 0, 450 /* XXX */,
1052 10 /* XXX */, 300 /* XXX */, 0, 0,
1053 0, 0);
1054 }
1055 } else if (devinfo->ver >= 6) {
1056 return calculate_desc(info, unit_dp_rc, 2 /* XXX */, 0, 0,
1057 0, 450 /* XXX */,
1058 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
1059 } else {
1060 abort();
1061 }
1062 case BRW_SFID_SAMPLER: {
1063 if (devinfo->ver >= 6)
1064 return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16,
1065 8, 750, 0, 0, 2, 0);
1066 else
1067 abort();
1068 }
1069 case GFX7_SFID_DATAPORT_DATA_CACHE:
1070 case HSW_SFID_DATAPORT_DATA_CACHE_1:
1071 if (devinfo->verx10 >= 75) {
1072 switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1073 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
1074 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
1075 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
1076 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
1077 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1078 30 /* XXX */, 400 /* XXX */,
1079 10 /* XXX */, 100 /* XXX */, 0, 0,
1080 0, 400 /* XXX */);
1081
1082 default:
1083 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1084 0, 20 /* XXX */,
1085 10 /* XXX */, 100 /* XXX */, 0, 0,
1086 0, 0);
1087 }
1088 } else if (devinfo->ver >= 7) {
1089 switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1090 case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
1091 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1092 30 /* XXX */, 400 /* XXX */,
1093 10 /* XXX */, 100 /* XXX */,
1094 0, 0, 0, 400 /* XXX */);
1095 default:
1096 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1097 0, 20 /* XXX */,
1098 10 /* XXX */, 100 /* XXX */, 0, 0,
1099 0, 0);
1100 }
1101 } else {
1102 abort();
1103 }
1104
1105 case GFX12_SFID_UGM:
1106 case GFX12_SFID_TGM:
1107 case GFX12_SFID_SLM:
1108 switch (lsc_msg_desc_opcode(devinfo, info.desc)) {
1109 case LSC_OP_LOAD:
1110 case LSC_OP_STORE:
1111 case LSC_OP_LOAD_CMASK:
1112 case LSC_OP_STORE_CMASK:
1113 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1114 0, 20 /* XXX */,
1115 10 /* XXX */, 100 /* XXX */, 0, 0,
1116 0, 0);
1117
1118 case LSC_OP_FENCE:
1119 case LSC_OP_ATOMIC_INC:
1120 case LSC_OP_ATOMIC_DEC:
1121 case LSC_OP_ATOMIC_LOAD:
1122 case LSC_OP_ATOMIC_STORE:
1123 case LSC_OP_ATOMIC_ADD:
1124 case LSC_OP_ATOMIC_SUB:
1125 case LSC_OP_ATOMIC_MIN:
1126 case LSC_OP_ATOMIC_MAX:
1127 case LSC_OP_ATOMIC_UMIN:
1128 case LSC_OP_ATOMIC_UMAX:
1129 case LSC_OP_ATOMIC_CMPXCHG:
1130 case LSC_OP_ATOMIC_FADD:
1131 case LSC_OP_ATOMIC_FSUB:
1132 case LSC_OP_ATOMIC_FMIN:
1133 case LSC_OP_ATOMIC_FMAX:
1134 case LSC_OP_ATOMIC_FCMPXCHG:
1135 case LSC_OP_ATOMIC_AND:
1136 case LSC_OP_ATOMIC_OR:
1137 case LSC_OP_ATOMIC_XOR:
1138 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1139 30 /* XXX */, 400 /* XXX */,
1140 10 /* XXX */, 100 /* XXX */, 0, 0,
1141 0, 400 /* XXX */);
1142 default:
1143 abort();
1144 }
1145
1146 case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH:
1147 case GEN_RT_SFID_RAY_TRACE_ACCELERATOR:
1148 return calculate_desc(info, unit_spawner, 2, 0, 0, 0 /* XXX */, 0,
1149 10 /* XXX */, 0, 0, 0, 0, 0);
1150
1151 default:
1152 abort();
1153 }
1154
1155 case SHADER_OPCODE_UNDEF:
1156 case SHADER_OPCODE_HALT_TARGET:
1157 case FS_OPCODE_SCHEDULING_FENCE:
1158 return calculate_desc(info, unit_null, 0, 0, 0, 0, 0,
1159 0, 0, 0, 0, 0, 0);
1160
1161 default:
1162 abort();
1163 }
1164 }
1165
1166 /**
1167 * Model the performance behavior of a stall on the specified dependency
1168 * ID.
1169 */
1170 void
stall_on_dependency(state & st,dependency_id id)1171 stall_on_dependency(state &st, dependency_id id)
1172 {
1173 if (id < ARRAY_SIZE(st.dep_ready))
1174 st.unit_ready[unit_fe] = MAX2(st.unit_ready[unit_fe],
1175 st.dep_ready[id]);
1176 }
1177
1178 /**
1179 * Model the performance behavior of the front-end and back-end while
1180 * executing an instruction with the specified timing information, assuming
1181 * all dependencies are already clear.
1182 */
1183 void
execute_instruction(state & st,const perf_desc & perf)1184 execute_instruction(state &st, const perf_desc &perf)
1185 {
1186 /* Compute the time at which the front-end will be ready to execute the
1187 * next instruction.
1188 */
1189 st.unit_ready[unit_fe] += perf.df;
1190
1191 if (perf.u < num_units) {
1192 /* Wait for the back-end to be ready to execute this instruction. */
1193 st.unit_ready[unit_fe] = MAX2(st.unit_ready[unit_fe],
1194 st.unit_ready[perf.u]);
1195
1196 /* Compute the time at which the back-end will be ready to execute
1197 * the next instruction, and update the back-end utilization.
1198 */
1199 st.unit_ready[perf.u] = st.unit_ready[unit_fe] + perf.db;
1200 st.unit_busy[perf.u] += perf.db * st.weight;
1201 }
1202 }
1203
1204 /**
1205 * Model the performance behavior of a read dependency provided by an
1206 * instruction.
1207 */
1208 void
mark_read_dependency(state & st,const perf_desc & perf,dependency_id id)1209 mark_read_dependency(state &st, const perf_desc &perf, dependency_id id)
1210 {
1211 if (id < ARRAY_SIZE(st.dep_ready))
1212 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.ls;
1213 }
1214
1215 /**
1216 * Model the performance behavior of a write dependency provided by an
1217 * instruction.
1218 */
1219 void
mark_write_dependency(state & st,const perf_desc & perf,dependency_id id)1220 mark_write_dependency(state &st, const perf_desc &perf, dependency_id id)
1221 {
1222 if (id >= dependency_id_accum0 && id < dependency_id_flag0)
1223 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.la;
1224 else if (id >= dependency_id_flag0 && id < dependency_id_sbid_wr0)
1225 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.lf;
1226 else if (id < ARRAY_SIZE(st.dep_ready))
1227 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.ld;
1228 }
1229
1230 /**
1231 * Return the dependency ID of a backend_reg, offset by \p delta GRFs.
1232 */
1233 dependency_id
reg_dependency_id(const intel_device_info * devinfo,const backend_reg & r,const int delta)1234 reg_dependency_id(const intel_device_info *devinfo, const backend_reg &r,
1235 const int delta)
1236 {
1237 if (r.file == VGRF) {
1238 const unsigned i = r.nr + r.offset / REG_SIZE + delta;
1239 assert(i < dependency_id_mrf0 - dependency_id_grf0);
1240 return dependency_id(dependency_id_grf0 + i);
1241
1242 } else if (r.file == FIXED_GRF) {
1243 const unsigned i = r.nr + delta;
1244 assert(i < dependency_id_mrf0 - dependency_id_grf0);
1245 return dependency_id(dependency_id_grf0 + i);
1246
1247 } else if (r.file == MRF && devinfo->ver >= 7) {
1248 const unsigned i = GFX7_MRF_HACK_START +
1249 r.nr + r.offset / REG_SIZE + delta;
1250 assert(i < dependency_id_mrf0 - dependency_id_grf0);
1251 return dependency_id(dependency_id_grf0 + i);
1252
1253 } else if (r.file == MRF && devinfo->ver < 7) {
1254 const unsigned i = (r.nr & ~BRW_MRF_COMPR4) +
1255 r.offset / REG_SIZE + delta;
1256 assert(i < dependency_id_addr0 - dependency_id_mrf0);
1257 return dependency_id(dependency_id_mrf0 + i);
1258
1259 } else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS &&
1260 r.nr < BRW_ARF_ACCUMULATOR) {
1261 assert(delta == 0);
1262 return dependency_id_addr0;
1263
1264 } else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR &&
1265 r.nr < BRW_ARF_FLAG) {
1266 const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta;
1267 assert(i < dependency_id_flag0 - dependency_id_accum0);
1268 return dependency_id(dependency_id_accum0 + i);
1269
1270 } else {
1271 return num_dependency_ids;
1272 }
1273 }
1274
1275 /**
1276 * Return the dependency ID of flag register starting at offset \p i.
1277 */
1278 dependency_id
flag_dependency_id(unsigned i)1279 flag_dependency_id(unsigned i)
1280 {
1281 assert(i < dependency_id_sbid_wr0 - dependency_id_flag0);
1282 return dependency_id(dependency_id_flag0 + i);
1283 }
1284
1285 /**
1286 * Return the dependency ID corresponding to the SBID read completion
1287 * condition of a Gfx12+ SWSB.
1288 */
1289 dependency_id
tgl_swsb_rd_dependency_id(tgl_swsb swsb)1290 tgl_swsb_rd_dependency_id(tgl_swsb swsb)
1291 {
1292 if (swsb.mode) {
1293 assert(swsb.sbid < num_dependency_ids - dependency_id_sbid_rd0);
1294 return dependency_id(dependency_id_sbid_rd0 + swsb.sbid);
1295 } else {
1296 return num_dependency_ids;
1297 }
1298 }
1299
1300 /**
1301 * Return the dependency ID corresponding to the SBID write completion
1302 * condition of a Gfx12+ SWSB.
1303 */
1304 dependency_id
tgl_swsb_wr_dependency_id(tgl_swsb swsb)1305 tgl_swsb_wr_dependency_id(tgl_swsb swsb)
1306 {
1307 if (swsb.mode) {
1308 assert(swsb.sbid < dependency_id_sbid_rd0 - dependency_id_sbid_wr0);
1309 return dependency_id(dependency_id_sbid_wr0 + swsb.sbid);
1310 } else {
1311 return num_dependency_ids;
1312 }
1313 }
1314
1315 /**
1316 * Return the implicit accumulator register accessed by channel \p i of the
1317 * instruction.
1318 */
1319 unsigned
accum_reg_of_channel(const intel_device_info * devinfo,const backend_instruction * inst,brw_reg_type tx,unsigned i)1320 accum_reg_of_channel(const intel_device_info *devinfo,
1321 const backend_instruction *inst,
1322 brw_reg_type tx, unsigned i)
1323 {
1324 assert(inst->reads_accumulator_implicitly() ||
1325 inst->writes_accumulator_implicitly(devinfo));
1326 const unsigned offset = (inst->group + i) * type_sz(tx) *
1327 (devinfo->ver < 7 || brw_reg_type_is_floating_point(tx) ? 1 : 2);
1328 return offset / REG_SIZE % 2;
1329 }
1330
/**
 * Model the performance behavior of an FS back-end instruction.
 *
 * Proceeds in three order-sensitive phases: stall the front-end on any
 * dependencies of the instruction that aren't ready yet, execute the
 * instruction, then publish the cycles at which the registers, flags and
 * SBIDs it touches will become available again.
 */
void
issue_fs_inst(state &st, const intel_device_info *devinfo,
              const backend_instruction *be_inst)
{
   const fs_inst *inst = static_cast<const fs_inst *>(be_inst);
   const instruction_info info(devinfo, inst);
   const perf_desc perf = instruction_desc(info);

   /* Stall on any source dependencies. */
   for (unsigned i = 0; i < inst->sources; i++) {
      for (unsigned j = 0; j < regs_read(inst, i); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, inst->src[i], j));
   }

   /* Instructions with an implicit accumulator source stall on every
    * accumulator register covered by their execution range.
    */
   if (inst->reads_accumulator_implicitly()) {
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
   }

   /* Pre-Gfx7-style sends read their payload from the MRF range. */
   if (is_send(inst) && inst->base_mrf != -1) {
      for (unsigned j = 0; j < inst->mlen; j++)
         stall_on_dependency(
            st, reg_dependency_id(
               devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
   }

   /* Stall on each flag sub-register this instruction reads. */
   if (const unsigned mask = inst->flags_read(devinfo)) {
      for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
         if (mask & (1 << i))
            stall_on_dependency(st, flag_dependency_id(i));
      }
   }

   /* Stall on any write dependencies, unless data-dependency checking was
    * explicitly disabled for this instruction (no_dd_check).
    */
   if (!inst->no_dd_check) {
      if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, inst->dst, j));
      }

      if (inst->writes_accumulator_implicitly(devinfo)) {
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
      }

      if (const unsigned mask = inst->flags_written(devinfo)) {
         for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
            if (mask & (1 << i))
               stall_on_dependency(st, flag_dependency_id(i));
         }
      }
   }

   /* Stall on any SBID dependencies (Gfx12+ software scoreboard). */
   if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST))
      stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched));
   else if (inst->sched.mode & TGL_SBID_SRC)
      stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched));

   /* Execute the instruction. */
   execute_instruction(st, perf);

   /* Mark any source dependencies: send payloads remain in use until the
    * shared function has read them.
    */
   if (inst->is_send_from_grf()) {
      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->is_payload(i)) {
            for (unsigned j = 0; j < regs_read(inst, i); j++)
               mark_read_dependency(
                  st, perf, reg_dependency_id(devinfo, inst->src[i], j));
         }
      }
   }

   if (is_send(inst) && inst->base_mrf != -1) {
      for (unsigned j = 0; j < inst->mlen; j++)
         mark_read_dependency(st, perf,
            reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
   }

   /* Mark any destination dependencies. */
   if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
      for (unsigned j = 0; j < regs_written(inst); j++) {
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, inst->dst, j));
      }
   }

   if (inst->writes_accumulator_implicitly(devinfo)) {
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, brw_acc_reg(8), j));
   }

   if (const unsigned mask = inst->flags_written(devinfo)) {
      for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
         if (mask & (1 << i))
            mark_write_dependency(st, perf, flag_dependency_id(i));
      }
   }

   /* Mark any SBID dependencies: SET allocates the SBID, which provides
    * both its read- and write-completion conditions.
    */
   if (inst->sched.mode & TGL_SBID_SET) {
      mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched));
      mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched));
   }
}
1450
/**
 * Model the performance behavior of a VEC4 back-end instruction.
 *
 * Mirrors issue_fs_inst() for the vec4 IR: stall on unresolved
 * dependencies, execute, then publish the completion cycles of written
 * registers and flags.  Differences from the FS path: vec4 sources are a
 * fixed-size array, the flag register is tracked as a single dependency,
 * and there is no Gfx12+ SBID scoreboard handling.
 */
void
issue_vec4_instruction(state &st, const intel_device_info *devinfo,
                       const backend_instruction *be_inst)
{
   const vec4_instruction *inst =
      static_cast<const vec4_instruction *>(be_inst);
   const instruction_info info(devinfo, inst);
   const perf_desc perf = instruction_desc(info);

   /* Stall on any source dependencies. */
   for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
      for (unsigned j = 0; j < regs_read(inst, i); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, inst->src[i], j));
   }

   if (inst->reads_accumulator_implicitly()) {
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
   }

   /* Message payloads live in the MRF range on this path. */
   if (inst->base_mrf != -1) {
      for (unsigned j = 0; j < inst->mlen; j++)
         stall_on_dependency(
            st, reg_dependency_id(
               devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
   }

   if (inst->reads_flag())
      stall_on_dependency(st, dependency_id_flag0);

   /* Stall on any write dependencies, unless data-dependency checking was
    * explicitly disabled for this instruction (no_dd_check).
    */
   if (!inst->no_dd_check) {
      if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, inst->dst, j));
      }

      if (inst->writes_accumulator_implicitly(devinfo)) {
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
      }

      if (inst->writes_flag(devinfo))
         stall_on_dependency(st, dependency_id_flag0);
   }

   /* Execute the instruction. */
   execute_instruction(st, perf);

   /* Mark any source dependencies: send payloads remain in use until the
    * shared function has read them.
    */
   if (inst->is_send_from_grf()) {
      for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
         for (unsigned j = 0; j < regs_read(inst, i); j++)
            mark_read_dependency(
               st, perf, reg_dependency_id(devinfo, inst->src[i], j));
      }
   }

   if (inst->base_mrf != -1) {
      for (unsigned j = 0; j < inst->mlen; j++)
         mark_read_dependency(st, perf,
            reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
   }

   /* Mark any destination dependencies. */
   if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
      for (unsigned j = 0; j < regs_written(inst); j++) {
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, inst->dst, j));
      }
   }

   if (inst->writes_accumulator_implicitly(devinfo)) {
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, brw_acc_reg(8), j));
   }

   if (inst->writes_flag(devinfo))
      mark_write_dependency(st, perf, dependency_id_flag0);
}
1545
1546 /**
1547 * Calculate the maximum possible throughput of the program compatible with
1548 * the cycle-count utilization estimated for each asynchronous unit, in
1549 * threads-per-cycle units.
1550 */
1551 float
calculate_thread_throughput(const state & st,float busy)1552 calculate_thread_throughput(const state &st, float busy)
1553 {
1554 for (unsigned i = 0; i < num_units; i++)
1555 busy = MAX2(busy, st.unit_busy[i]);
1556
1557 return 1.0 / busy;
1558 }
1559
/**
 * Estimate the performance of the specified shader.
 *
 * Walks the CFG in order, issuing each instruction through \p
 * issue_instruction while maintaining a probabilistic execution weight
 * (st.weight) for loop bodies and discard-divergent regions, and fills in
 * \p p with the per-block latencies, total latency and estimated
 * throughput.
 */
void
calculate_performance(performance &p, const backend_shader *s,
                      void (*issue_instruction)(
                         state &, const intel_device_info *,
                         const backend_instruction *),
                      unsigned dispatch_width)
{
   /* XXX - Note that the previous version of this code used worst-case
    *       scenario estimation of branching divergence for SIMD32 shaders,
    *       but this heuristic was removed to improve performance in common
    *       scenarios. Wider shader variants are less optimal when divergence
    *       is high, e.g. when application renders complex scene on a small
    *       surface. It is assumed that such renders are short, so their
    *       time doesn't matter and when it comes to the overall performance,
    *       they are dominated by more optimal larger renders.
    *
    *       It's possible that we could do better with divergence analysis
    *       by isolating branches which are 100% uniform.
    *
    *       Plumbing the trip counts from NIR loop analysis would allow us
    *       to do a better job regarding the loop weights.
    *
    *       In the meantime use values that roughly match the control flow
    *       weights used elsewhere in the compiler back-end.
    *
    *       Note that we provide slightly more pessimistic weights on
    *       Gfx12+ for SIMD32, since the effective warp size on that
    *       platform is 2x the SIMD width due to EU fusion, which increases
    *       the likelihood of divergent control flow in comparison to
    *       previous generations, giving narrower SIMD modes a performance
    *       advantage in several test-cases with non-uniform discard jumps.
    */
   const float discard_weight = (dispatch_width > 16 || s->devinfo->ver < 12 ?
                                 1.0 : 0.5);
   const float loop_weight = 10;
   unsigned halt_count = 0;
   unsigned elapsed = 0;
   state st;

   foreach_block(block, s->cfg) {
      const unsigned elapsed0 = elapsed;

      foreach_inst_in_block(backend_instruction, inst, block) {
         /* Front-end clock before this instruction, used to attribute its
          * cycles to the current block below.
          */
         const unsigned clock0 = st.unit_ready[unit_fe];

         issue_instruction(st, s->devinfo, inst);

         /* Past the halt merge point all channels are assumed to execute
          * again, so undo the discard weighting applied at the first HALT.
          */
         if (inst->opcode == SHADER_OPCODE_HALT_TARGET && halt_count)
            st.weight /= discard_weight;

         /* Accumulate front-end cycles scaled by the estimated execution
          * frequency of this instruction.
          */
         elapsed += (st.unit_ready[unit_fe] - clock0) * st.weight;

         /* Weight loop bodies by the assumed trip count, and the code
          * after the first HALT by the probability that not all channels
          * have jumped to the halt target.
          */
         if (inst->opcode == BRW_OPCODE_DO)
            st.weight *= loop_weight;
         else if (inst->opcode == BRW_OPCODE_WHILE)
            st.weight /= loop_weight;
         else if (inst->opcode == BRW_OPCODE_HALT && !halt_count++)
            st.weight *= discard_weight;
      }

      p.block_latency[block->num] = elapsed - elapsed0;
   }

   p.latency = elapsed;
   p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
}
1629 }
1630
/* Estimate the performance of an FS program at its dispatch width. */
brw::performance::performance(const fs_visitor *v) :
   block_latency(new unsigned[v->cfg->num_blocks])
{
   calculate_performance(*this, v, issue_fs_inst, v->dispatch_width);
}
1636
/* Estimate the performance of a VEC4 program; vec4 dispatch is always
 * 8-wide.
 */
brw::performance::performance(const vec4_visitor *v) :
   block_latency(new unsigned[v->cfg->num_blocks])
{
   calculate_performance(*this, v, issue_vec4_instruction, 8);
}
1642
/* Release the per-block latency array allocated by the constructors. */
brw::performance::~performance()
{
   delete[] block_latency;
}
1647