1 /*
2 * Copyright © 2020 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_eu.h"
25 #include "brw_fs.h"
26 #include "brw_vec4.h"
27 #include "brw_cfg.h"
28
29 using namespace brw;
30
31 namespace {
32 /**
33 * Enumeration representing the various asynchronous units that can run
34 * computations in parallel on behalf of a shader thread.
35 */
36 enum intel_eu_unit {
37 /** EU front-end. */
38 EU_UNIT_FE,
39 /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
40 EU_UNIT_FPU,
41 /** Extended Math unit (AKA FPU1 on Gfx8-11, part of the EU on Gfx6+). */
42 EU_UNIT_EM,
43 /** Sampler shared function. */
44 EU_UNIT_SAMPLER,
45 /** Pixel Interpolator shared function. */
46 EU_UNIT_PI,
47 /** Unified Return Buffer shared function. */
48 EU_UNIT_URB,
49 /** Data Port Data Cache shared function. */
50 EU_UNIT_DP_DC,
51 /** Data Port Render Cache shared function. */
52 EU_UNIT_DP_RC,
53 /** Data Port Constant Cache shared function. */
54 EU_UNIT_DP_CC,
55 /** Message Gateway shared function. */
56 EU_UNIT_GATEWAY,
57 /** Thread Spawner shared function. */
58 EU_UNIT_SPAWNER,
59 /* EU_UNIT_VME, */
60 /* EU_UNIT_CRE, */
61 /** Number of asynchronous units currently tracked. */
62 EU_NUM_UNITS,
63 /** Dummy unit for instructions that don't consume runtime from the above. */
64 EU_UNIT_NULL = EU_NUM_UNITS
65 };
66
67 /**
68 * Enumeration representing a computation result another computation can
69 * potentially depend on.
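*
* The ID ranges are laid out back to back, so e.g. a read of GRF r10 maps
* to EU_DEPENDENCY_ID_GRF0 + 10 and the first flag subregister maps to
* EU_DEPENDENCY_ID_FLAG0 (see reg_dependency_id() and flag_dependency_id()
* below for the exact mapping).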
70 */
71 enum intel_eu_dependency_id {
72 /* Register part of the GRF. */
73 EU_DEPENDENCY_ID_GRF0 = 0,
74 /* Register part of the MRF. Only used on Gfx4-6. */
75 EU_DEPENDENCY_ID_MRF0 = EU_DEPENDENCY_ID_GRF0 + BRW_MAX_GRF,
76 /* Address register part of the ARF. */
77 EU_DEPENDENCY_ID_ADDR0 = EU_DEPENDENCY_ID_MRF0 + 24,
78 /* Accumulator register part of the ARF. */
79 EU_DEPENDENCY_ID_ACCUM0 = EU_DEPENDENCY_ID_ADDR0 + 1,
80 /* Flag register part of the ARF. */
81 EU_DEPENDENCY_ID_FLAG0 = EU_DEPENDENCY_ID_ACCUM0 + 12,
82 /* SBID token write completion. Only used on Gfx12+. */
83 EU_DEPENDENCY_ID_SBID_WR0 = EU_DEPENDENCY_ID_FLAG0 + 8,
84 /* SBID token read completion. Only used on Gfx12+. */
85 EU_DEPENDENCY_ID_SBID_RD0 = EU_DEPENDENCY_ID_SBID_WR0 + 16,
86 /* Number of computation dependencies currently tracked. */
87 EU_NUM_DEPENDENCY_IDS = EU_DEPENDENCY_ID_SBID_RD0 + 16
88 };
89
90 /**
91 * State of our modeling of the program execution.
92 */
93 struct state {
94 state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
95 /**
96 * Time at which a given unit will be ready to execute the next
97 * computation, in clock units.
98 */
99 unsigned unit_ready[EU_NUM_UNITS];
100 /**
101 * Time at which an instruction dependent on a given dependency ID will
102 * be ready to execute, in clock units.
103 */
104 unsigned dep_ready[EU_NUM_DEPENDENCY_IDS];
105 /**
106 * Aggregated utilization of a given unit excluding idle cycles,
107 * in clock units.
108 */
109 float unit_busy[EU_NUM_UNITS];
110 /**
111 * Factor of the overhead of a computation accounted for in the
112 * aggregated utilization calculation.
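*
* E.g. calculate_performance() below multiplies this by loop_weight when
* entering a DO...WHILE loop and divides it back out on exit, so
* instructions inside loop bodies contribute proportionally more to the
* estimated unit utilization.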
113 */
114 float weight;
115 };
116
117 /**
118 * Information derived from an IR instruction used to compute performance
119 * estimates. Allows the timing calculation to work on both FS and VEC4
120 * instructions.
121 */
122 struct instruction_info {
123 instruction_info(const struct brw_isa_info *isa, const fs_inst *inst) :
124 isa(isa), devinfo(isa->devinfo), op(inst->opcode),
125 td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
126 tx(get_exec_type(inst)), sx(0), ss(0),
127 sc(has_bank_conflict(isa, inst) ? sd : 0),
128 desc(inst->desc), sfid(inst->sfid)
129 {
130 /* We typically want the maximum source size, except for split send
131 * messages which require the total size.
132 */
133 if (inst->opcode == SHADER_OPCODE_SEND) {
134 ss = DIV_ROUND_UP(inst->size_read(2), REG_SIZE) +
135 DIV_ROUND_UP(inst->size_read(3), REG_SIZE);
136 } else {
137 for (unsigned i = 0; i < inst->sources; i++)
138 ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
139 }
140
141 /* Convert the execution size to GRF units. */
142 sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
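/* (For instance, a SIMD16 instruction with a 32-bit execution type covers
* 16 * 4 = 64 bytes, i.e. two GRFs, assuming the usual 32-byte REG_SIZE.)
*/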
143
144 /* 32x32 integer multiplication has half the usual ALU throughput.
145 * Treat it as double-precision.
146 */
147 if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
148 !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
149 type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
150 tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
151 }
152
153 instruction_info(const struct brw_isa_info *isa,
154 const vec4_instruction *inst) :
155 isa(isa), devinfo(isa->devinfo), op(inst->opcode),
156 td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
157 tx(get_exec_type(inst)), sx(0), ss(0), sc(0),
158 desc(inst->desc), sfid(inst->sfid)
159 {
160 /* Compute the maximum source size. */
161 for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++)
162 ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
163
164 /* Convert the execution size to GRF units. */
165 sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
166
167 /* 32x32 integer multiplication has half the usual ALU throughput.
168 * Treat it as double-precision.
169 */
170 if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
171 !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
172 type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
173 tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
174 }
175
176 /** ISA encoding information */
177 const struct brw_isa_info *isa;
178 /** Device information. */
179 const struct intel_device_info *devinfo;
180 /** Instruction opcode. */
181 opcode op;
182 /** Destination type. */
183 brw_reg_type td;
184 /** Destination size in GRF units. */
185 unsigned sd;
186 /** Execution type. */
187 brw_reg_type tx;
188 /** Execution size in GRF units. */
189 unsigned sx;
190 /** Source size in GRF units. */
191 unsigned ss;
192 /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
193 unsigned sc;
194 /** Send message descriptor. */
195 uint32_t desc;
196 /** Send message shared function ID. */
197 uint8_t sfid;
198 };
199
200 /**
201 * Timing information of an instruction used to estimate the performance of
202 * the program.
203 */
204 struct perf_desc {
205 perf_desc(enum intel_eu_unit u, int df, int db,
206 int ls, int ld, int la, int lf) :
207 u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}
208
209 /**
210 * Back-end unit to which its runtime shall be accounted, in addition to
211 * the EU front-end, which is always assumed to be involved.
212 */
213 enum intel_eu_unit u;
214 /**
215 * Overhead cycles from the time that the EU front-end starts executing
216 * the instruction until it's ready to execute the next instruction.
217 */
218 int df;
219 /**
220 * Overhead cycles from the time that the back-end starts executing the
221 * instruction until it's ready to execute the next instruction.
222 */
223 int db;
224 /**
225 * Latency cycles from the time that the back-end starts executing the
226 * instruction until its sources have been read from the register file.
227 */
228 int ls;
229 /**
230 * Latency cycles from the time that the back-end starts executing the
231 * instruction until its regular destination has been written to the
232 * register file.
233 */
234 int ld;
235 /**
236 * Latency cycles from the time that the back-end starts executing the
237 * instruction until its accumulator destination has been written to the
238 * ARF file.
239 *
240 * Note that this is an approximation of the real behavior of
241 * accumulating instructions in the hardware: Instead of modeling a pair
242 * of back-to-back accumulating instructions as a first computation with
243 * latency equal to ld followed by another computation with a
244 * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
245 * model the stall as if it occurred at the top of the pipeline, with
246 * the latency of the accumulator computation offset accordingly.
247 */
248 int la;
249 /**
250 * Latency cycles from the time that the back-end starts executing the
251 * instruction until its flag destination has been written to the ARF
252 * file.
253 */
254 int lf;
255 };
256
257 /**
258 * Compute the timing information of an instruction based on any relevant
259 * information from the IR and a number of parameters specifying a linear
260 * approximation: Parameter X_Y specifies the derivative of timing X
261 * relative to info field Y, while X_1 specifies the independent term of
262 * the approximation of timing X.
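*
* As a worked example, a call with df_1 = 0, df_sd = 2, df_sc = 0 for an
* instruction whose destination spans two GRFs (info.sd == 2) yields a
* front-end overhead of df = 0 + 2 * 2 + 0 = 4 cycles, and likewise
* ld = ld_1 + l_ss * info.ss + l_sd * info.sd gives the destination latency.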
263 */
264 perf_desc
265 calculate_desc(const instruction_info &info, enum intel_eu_unit u,
266 int df_1, int df_sd, int df_sc,
267 int db_1, int db_sx,
268 int ls_1, int ld_1, int la_1, int lf_1,
269 int l_ss, int l_sd)
270 {
271 return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
272 db_1 + db_sx * int(info.sx),
273 ls_1 + l_ss * int(info.ss),
274 ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
275 la_1, lf_1);
276 }
277
278 /**
279 * Compute the timing information of an instruction based on any relevant
280 * information from the IR and a number of linear approximation parameters
281 * hard-coded for each IR instruction.
282 *
283 * Most timing parameters are obtained from the multivariate linear
284 * regression of a sample of empirical timings measured using the tm0
285 * register (as can be done today by using the shader_time debugging
286 * option). The Gfx4-5 math timings are obtained from BSpec Volume 5c.3
287 * "Shared Functions - Extended Math", Section 3.2 "Performance".
288 * Parameters marked XXX shall be considered low-quality: they're possibly
289 * high-variance or completely guessed in cases where experimental data was
290 * unavailable.
291 */
292 const perf_desc
293 instruction_desc(const instruction_info &info)
294 {
295 const struct intel_device_info *devinfo = info.devinfo;
296
297 switch (info.op) {
298 case BRW_OPCODE_SYNC:
299 case BRW_OPCODE_SEL:
300 case BRW_OPCODE_NOT:
301 case BRW_OPCODE_AND:
302 case BRW_OPCODE_OR:
303 case BRW_OPCODE_XOR:
304 case BRW_OPCODE_SHR:
305 case BRW_OPCODE_SHL:
306 case BRW_OPCODE_DIM:
307 case BRW_OPCODE_ASR:
308 case BRW_OPCODE_CMPN:
309 case BRW_OPCODE_F16TO32:
310 case BRW_OPCODE_BFREV:
311 case BRW_OPCODE_BFI1:
312 case BRW_OPCODE_AVG:
313 case BRW_OPCODE_FRC:
314 case BRW_OPCODE_RNDU:
315 case BRW_OPCODE_RNDD:
316 case BRW_OPCODE_RNDE:
317 case BRW_OPCODE_RNDZ:
318 case BRW_OPCODE_MAC:
319 case BRW_OPCODE_MACH:
320 case BRW_OPCODE_LZD:
321 case BRW_OPCODE_FBH:
322 case BRW_OPCODE_FBL:
323 case BRW_OPCODE_CBIT:
324 case BRW_OPCODE_ADDC:
325 case BRW_OPCODE_ROR:
326 case BRW_OPCODE_ROL:
327 case BRW_OPCODE_SUBB:
328 case BRW_OPCODE_SAD2:
329 case BRW_OPCODE_SADA2:
330 case BRW_OPCODE_LINE:
331 case BRW_OPCODE_NOP:
332 case SHADER_OPCODE_CLUSTER_BROADCAST:
333 case SHADER_OPCODE_SCRATCH_HEADER:
334 case FS_OPCODE_DDX_COARSE:
335 case FS_OPCODE_DDX_FINE:
336 case FS_OPCODE_DDY_COARSE:
337 case FS_OPCODE_PIXEL_X:
338 case FS_OPCODE_PIXEL_Y:
339 case FS_OPCODE_SET_SAMPLE_ID:
340 case VEC4_OPCODE_MOV_BYTES:
341 case VEC4_OPCODE_UNPACK_UNIFORM:
342 case VEC4_OPCODE_DOUBLE_TO_F32:
343 case VEC4_OPCODE_DOUBLE_TO_D32:
344 case VEC4_OPCODE_DOUBLE_TO_U32:
345 case VEC4_OPCODE_TO_DOUBLE:
346 case VEC4_OPCODE_PICK_LOW_32BIT:
347 case VEC4_OPCODE_PICK_HIGH_32BIT:
348 case VEC4_OPCODE_SET_LOW_32BIT:
349 case VEC4_OPCODE_SET_HIGH_32BIT:
350 case VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
351 case GS_OPCODE_SET_DWORD_2:
352 case GS_OPCODE_SET_WRITE_OFFSET:
353 case GS_OPCODE_SET_VERTEX_COUNT:
354 case GS_OPCODE_PREPARE_CHANNEL_MASKS:
355 case GS_OPCODE_SET_CHANNEL_MASKS:
356 case GS_OPCODE_GET_INSTANCE_ID:
357 case GS_OPCODE_SET_PRIMITIVE_ID:
358 case GS_OPCODE_SVB_SET_DST_INDEX:
359 case TCS_OPCODE_SRC0_010_IS_ZERO:
360 case TCS_OPCODE_GET_PRIMITIVE_ID:
361 case TES_OPCODE_GET_PRIMITIVE_ID:
362 case SHADER_OPCODE_READ_SR_REG:
363 if (devinfo->ver >= 11) {
364 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
365 0, 10, 6 /* XXX */, 14, 0, 0);
366 } else if (devinfo->ver >= 8) {
367 if (type_sz(info.tx) > 4)
368 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
369 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
370 else
371 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
372 0, 8, 4, 12, 0, 0);
373 } else if (devinfo->verx10 >= 75) {
374 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
375 0, 10, 6 /* XXX */, 16, 0, 0);
376 } else {
377 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
378 0, 12, 8 /* XXX */, 18, 0, 0);
379 }
380
381 case BRW_OPCODE_MOV:
382 case BRW_OPCODE_CMP:
383 case BRW_OPCODE_ADD:
384 case BRW_OPCODE_ADD3:
385 case BRW_OPCODE_MUL:
386 case SHADER_OPCODE_MOV_RELOC_IMM:
387 case VEC4_OPCODE_MOV_FOR_SCRATCH:
388 if (devinfo->ver >= 11) {
389 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
390 0, 10, 6, 14, 0, 0);
391 } else if (devinfo->ver >= 8) {
392 if (type_sz(info.tx) > 4)
393 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
394 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
395 else
396 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
397 0, 8, 4, 12, 0, 0);
398 } else if (devinfo->verx10 >= 75) {
399 if (info.tx == BRW_REGISTER_TYPE_F)
400 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
401 0, 12, 8 /* XXX */, 18, 0, 0);
402 else
403 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
404 0, 10, 6 /* XXX */, 16, 0, 0);
405 } else if (devinfo->ver >= 7) {
406 if (info.tx == BRW_REGISTER_TYPE_F)
407 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
408 0, 14, 10 /* XXX */, 20, 0, 0);
409 else
410 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
411 0, 12, 8 /* XXX */, 18, 0, 0);
412 } else {
413 return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
414 0, 2 /* XXX */,
415 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
416 0, 0);
417 }
418
419 case BRW_OPCODE_BFE:
420 case BRW_OPCODE_BFI2:
421 case BRW_OPCODE_CSEL:
422 if (devinfo->ver >= 11)
423 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
424 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
425 else if (devinfo->ver >= 8)
426 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
427 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
428 else if (devinfo->verx10 >= 75)
429 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
430 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
431 else if (devinfo->ver >= 7)
432 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
433 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
434 else
435 abort();
436
437 case BRW_OPCODE_MAD:
438 if (devinfo->ver >= 11) {
439 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
440 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
441 } else if (devinfo->ver >= 8) {
442 if (type_sz(info.tx) > 4)
443 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
444 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
445 else
446 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
447 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
448 } else if (devinfo->verx10 >= 75) {
449 if (info.tx == BRW_REGISTER_TYPE_F)
450 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
451 0, 12, 8 /* XXX */, 18, 0, 0);
452 else
453 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
454 0, 10, 6 /* XXX */, 16, 0, 0);
455 } else if (devinfo->ver >= 7) {
456 if (info.tx == BRW_REGISTER_TYPE_F)
457 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
458 0, 14, 10 /* XXX */, 20, 0, 0);
459 else
460 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
461 0, 12, 8 /* XXX */, 18, 0, 0);
462 } else if (devinfo->ver >= 6) {
463 return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 1 /* XXX */,
464 0, 2 /* XXX */,
465 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
466 0, 0);
467 } else {
468 abort();
469 }
470
471 case BRW_OPCODE_F32TO16:
472 if (devinfo->ver >= 11)
473 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
474 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
475 else if (devinfo->ver >= 8)
476 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
477 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
478 else if (devinfo->verx10 >= 75)
479 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
480 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
481 else if (devinfo->ver >= 7)
482 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
483 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
484 else
485 abort();
486
487 case BRW_OPCODE_DP4:
488 case BRW_OPCODE_DPH:
489 case BRW_OPCODE_DP3:
490 case BRW_OPCODE_DP2:
491 if (devinfo->ver >= 8)
492 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
493 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
494 else if (devinfo->verx10 >= 75)
495 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
496 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
497 else
498 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
499 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
500
501 case BRW_OPCODE_DP4A:
502 if (devinfo->ver >= 12)
503 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
504 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
505 else
506 abort();
507
508 case SHADER_OPCODE_RCP:
509 case SHADER_OPCODE_RSQ:
510 case SHADER_OPCODE_SQRT:
511 case SHADER_OPCODE_EXP2:
512 case SHADER_OPCODE_LOG2:
513 case SHADER_OPCODE_SIN:
514 case SHADER_OPCODE_COS:
515 case SHADER_OPCODE_POW:
516 case SHADER_OPCODE_INT_QUOTIENT:
517 case SHADER_OPCODE_INT_REMAINDER:
518 if (devinfo->ver >= 6) {
519 switch (info.op) {
520 case SHADER_OPCODE_RCP:
521 case SHADER_OPCODE_RSQ:
522 case SHADER_OPCODE_SQRT:
523 case SHADER_OPCODE_EXP2:
524 case SHADER_OPCODE_LOG2:
525 case SHADER_OPCODE_SIN:
526 case SHADER_OPCODE_COS:
527 if (devinfo->ver >= 8)
528 return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 4,
529 0, 16, 0, 0, 0, 0);
530 else if (devinfo->verx10 >= 75)
531 return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 2,
532 0, 12, 0, 0, 0, 0);
533 else
534 return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 2,
535 0, 14, 0, 0, 0, 0);
536
537 case SHADER_OPCODE_POW:
538 if (devinfo->ver >= 8)
539 return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 8,
540 0, 24, 0, 0, 0, 0);
541 else if (devinfo->verx10 >= 75)
542 return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 4,
543 0, 20, 0, 0, 0, 0);
544 else
545 return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 4,
546 0, 22, 0, 0, 0, 0);
547
548 case SHADER_OPCODE_INT_QUOTIENT:
549 case SHADER_OPCODE_INT_REMAINDER:
550 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 26, 0,
551 0, 28 /* XXX */, 0, 0, 0, 0);
552
553 default:
554 abort();
555 }
556 } else {
557 switch (info.op) {
558 case SHADER_OPCODE_RCP:
559 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 8,
560 0, 22, 0, 0, 0, 8);
561
562 case SHADER_OPCODE_RSQ:
563 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 16,
564 0, 44, 0, 0, 0, 8);
565
566 case SHADER_OPCODE_INT_QUOTIENT:
567 case SHADER_OPCODE_SQRT:
568 case SHADER_OPCODE_LOG2:
569 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 24,
570 0, 66, 0, 0, 0, 8);
571
572 case SHADER_OPCODE_INT_REMAINDER:
573 case SHADER_OPCODE_EXP2:
574 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 32,
575 0, 88, 0, 0, 0, 8);
576
577 case SHADER_OPCODE_SIN:
578 case SHADER_OPCODE_COS:
579 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 48,
580 0, 132, 0, 0, 0, 8);
581
582 case SHADER_OPCODE_POW:
583 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 64,
584 0, 176, 0, 0, 0, 8);
585
586 default:
587 abort();
588 }
589 }
590
591 case BRW_OPCODE_DO:
592 if (devinfo->ver >= 6)
593 return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
594 0, 0, 0, 0, 0, 0);
595 else
596 return calculate_desc(info, EU_UNIT_NULL, 2 /* XXX */, 0, 0, 0, 0,
597 0, 0, 0, 0, 0, 0);
598
599 case BRW_OPCODE_IF:
600 case BRW_OPCODE_ELSE:
601 case BRW_OPCODE_ENDIF:
602 case BRW_OPCODE_WHILE:
603 case BRW_OPCODE_BREAK:
604 case BRW_OPCODE_CONTINUE:
605 case BRW_OPCODE_HALT:
606 if (devinfo->ver >= 8)
607 return calculate_desc(info, EU_UNIT_NULL, 8, 0, 0, 0, 0,
608 0, 0, 0, 0, 0, 0);
609 else if (devinfo->verx10 >= 75)
610 return calculate_desc(info, EU_UNIT_NULL, 6, 0, 0, 0, 0,
611 0, 0, 0, 0, 0, 0);
612 else
613 return calculate_desc(info, EU_UNIT_NULL, 2, 0, 0, 0, 0,
614 0, 0, 0, 0, 0, 0);
615
616 case FS_OPCODE_LINTERP:
617 if (devinfo->ver >= 8)
618 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
619 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
620 else if (devinfo->verx10 >= 75)
621 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
622 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
623 else
624 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
625 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
626
627 case BRW_OPCODE_LRP:
628 if (devinfo->ver >= 8)
629 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
630 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
631 else if (devinfo->verx10 >= 75)
632 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
633 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
634 else if (devinfo->ver >= 6)
635 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
636 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
637 else
638 abort();
639
640 case FS_OPCODE_PACK_HALF_2x16_SPLIT:
641 if (devinfo->ver >= 11)
642 return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
643 0, 10 /* XXX */, 6 /* XXX */,
644 14 /* XXX */, 0, 0);
645 else if (devinfo->ver >= 8)
646 return calculate_desc(info, EU_UNIT_FPU, 16, 6, 0, 0, 6,
647 0, 8 /* XXX */, 4 /* XXX */,
648 12 /* XXX */, 0, 0);
649 else if (devinfo->verx10 >= 75)
650 return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
651 0, 10 /* XXX */, 6 /* XXX */,
652 16 /* XXX */, 0, 0);
653 else if (devinfo->ver >= 7)
654 return calculate_desc(info, EU_UNIT_FPU, 24, 6, 0, 0, 6,
655 0, 12 /* XXX */, 8 /* XXX */,
656 18 /* XXX */, 0, 0);
657 else
658 abort();
659
660 case SHADER_OPCODE_MOV_INDIRECT:
661 if (devinfo->ver >= 11)
662 return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
663 0, 10 /* XXX */, 6 /* XXX */,
664 14 /* XXX */, 0, 0);
665 else if (devinfo->ver >= 8)
666 return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
667 0, 8 /* XXX */, 4 /* XXX */,
668 12 /* XXX */, 0, 0);
669 else if (devinfo->verx10 >= 75)
670 return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
671 0, 10 /* XXX */, 6 /* XXX */,
672 16 /* XXX */, 0, 0);
673 else
674 return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
675 0, 12 /* XXX */, 8 /* XXX */,
676 18 /* XXX */, 0, 0);
677
678 case SHADER_OPCODE_BROADCAST:
679 if (devinfo->ver >= 11)
680 return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0, 4, 0,
681 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
682 else if (devinfo->ver >= 8)
683 return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
684 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
685 else if (devinfo->verx10 >= 75)
686 return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
687 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
688 else if (devinfo->ver >= 7)
689 return calculate_desc(info, EU_UNIT_FPU, 20, 0, 0, 4, 0,
690 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
691 else
692 abort();
693
694 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
695 case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
696 if (devinfo->ver >= 11)
697 return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
698 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
699 else if (devinfo->ver >= 8)
700 return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
701 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
702 else if (devinfo->verx10 >= 75)
703 return calculate_desc(info, EU_UNIT_FPU, 36, 0, 0, 6, 0,
704 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
705 else if (devinfo->ver >= 7)
706 return calculate_desc(info, EU_UNIT_FPU, 40, 0, 0, 6, 0,
707 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
708 else
709 abort();
710
711 case SHADER_OPCODE_RND_MODE:
712 case SHADER_OPCODE_FLOAT_CONTROL_MODE:
713 if (devinfo->ver >= 11)
714 return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0,
715 4 /* XXX */, 0,
716 0, 0, 0, 0, 0, 0);
717 else if (devinfo->ver >= 8)
718 return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0,
719 4 /* XXX */, 0,
720 0, 0, 0, 0, 0, 0);
721 else if (devinfo->verx10 >= 75)
722 return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0,
723 4 /* XXX */, 0,
724 0, 0, 0, 0, 0, 0);
725 else if (devinfo->ver >= 6)
726 return calculate_desc(info, EU_UNIT_FPU, 28 /* XXX */, 0, 0,
727 4 /* XXX */, 0,
728 0, 0, 0, 0, 0, 0);
729 else
730 abort();
731
732 case SHADER_OPCODE_SHUFFLE:
733 if (devinfo->ver >= 11)
734 return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0,
735 44 /* XXX */, 0,
736 0, 10 /* XXX */, 6 /* XXX */,
737 14 /* XXX */, 0, 0);
738 else if (devinfo->ver >= 8)
739 return calculate_desc(info, EU_UNIT_FPU, 42 /* XXX */, 0, 0,
740 42 /* XXX */, 0,
741 0, 8 /* XXX */, 4 /* XXX */,
742 12 /* XXX */, 0, 0);
743 else if (devinfo->verx10 >= 75)
744 return calculate_desc(info, EU_UNIT_FPU, 0, 44 /* XXX */, 0,
745 0, 44 /* XXX */,
746 0, 10 /* XXX */, 6 /* XXX */,
747 16 /* XXX */, 0, 0);
748 else if (devinfo->ver >= 6)
749 return calculate_desc(info, EU_UNIT_FPU, 0, 46 /* XXX */, 0,
750 0, 46 /* XXX */,
751 0, 12 /* XXX */, 8 /* XXX */,
752 18 /* XXX */, 0, 0);
753 else
754 abort();
755
756 case SHADER_OPCODE_SEL_EXEC:
757 if (devinfo->ver >= 11)
758 return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0,
759 0, 4 /* XXX */,
760 0, 10 /* XXX */, 6 /* XXX */,
761 14 /* XXX */, 0, 0);
762 else if (devinfo->ver >= 8)
763 return calculate_desc(info, EU_UNIT_FPU, 8 /* XXX */, 4 /* XXX */, 0,
764 0, 4 /* XXX */,
765 0, 8 /* XXX */, 4 /* XXX */,
766 12 /* XXX */, 0, 0);
767 else if (devinfo->verx10 >= 75)
768 return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0,
769 0, 4 /* XXX */,
770 0, 10 /* XXX */, 6 /* XXX */,
771 16 /* XXX */, 0, 0);
772 else
773 return calculate_desc(info, EU_UNIT_FPU, 12 /* XXX */, 4 /* XXX */, 0,
774 0, 4 /* XXX */,
775 0, 12 /* XXX */, 8 /* XXX */,
776 18 /* XXX */, 0, 0);
777
778 case SHADER_OPCODE_QUAD_SWIZZLE:
779 if (devinfo->ver >= 11)
780 return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
781 0, 8 /* XXX */,
782 0, 10 /* XXX */, 6 /* XXX */,
783 14 /* XXX */, 0, 0);
784 else if (devinfo->ver >= 8)
785 return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
786 0, 8 /* XXX */,
787 0, 8 /* XXX */, 4 /* XXX */,
788 12 /* XXX */, 0, 0);
789 else if (devinfo->verx10 >= 75)
790 return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
791 0, 8 /* XXX */,
792 0, 10 /* XXX */, 6 /* XXX */,
793 16 /* XXX */, 0, 0);
794 else
795 return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
796 0, 8 /* XXX */,
797 0, 12 /* XXX */, 8 /* XXX */,
798 18 /* XXX */, 0, 0);
799
800 case FS_OPCODE_DDY_FINE:
801 if (devinfo->ver >= 11)
802 return calculate_desc(info, EU_UNIT_FPU, 0, 14, 0, 0, 4,
803 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
804 else if (devinfo->ver >= 8)
805 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
806 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
807 else if (devinfo->verx10 >= 75)
808 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
809 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
810 else
811 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
812 0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0);
813
814 case FS_OPCODE_LOAD_LIVE_CHANNELS:
815 if (devinfo->ver >= 11)
816 return calculate_desc(info, EU_UNIT_FPU, 2 /* XXX */, 0, 0,
817 2 /* XXX */, 0,
818 0, 0, 0, 10 /* XXX */, 0, 0);
819 else if (devinfo->ver >= 8)
820 return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
821 0, 2 /* XXX */,
822 0, 0, 0, 8 /* XXX */, 0, 0);
823 else
824 abort();
825
826 case VEC4_OPCODE_PACK_BYTES:
827 if (devinfo->ver >= 8)
828 return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
829 4 /* XXX */, 0,
830 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
831 0, 0);
832 else if (devinfo->verx10 >= 75)
833 return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
834 4 /* XXX */, 0,
835 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
836 0, 0);
837 else
838 return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
839 4 /* XXX */, 0,
840 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
841 0, 0);
842
843 case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
844 case TCS_OPCODE_GET_INSTANCE_ID:
845 case VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS:
846 case VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
847 case TES_OPCODE_CREATE_INPUT_READ_HEADER:
848 if (devinfo->ver >= 8)
849 return calculate_desc(info, EU_UNIT_FPU, 22 /* XXX */, 0, 0,
850 6 /* XXX */, 0,
851 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
852 0, 0);
853 else if (devinfo->verx10 >= 75)
854 return calculate_desc(info, EU_UNIT_FPU, 26 /* XXX */, 0, 0,
855 6 /* XXX */, 0,
856 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
857 0, 0);
858 else
859 return calculate_desc(info, EU_UNIT_FPU, 30 /* XXX */, 0, 0,
860 6 /* XXX */, 0,
861 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
862 0, 0);
863
864 case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
865 case TCS_OPCODE_CREATE_BARRIER_HEADER:
866 if (devinfo->ver >= 8)
867 return calculate_desc(info, EU_UNIT_FPU, 32 /* XXX */, 0, 0,
868 8 /* XXX */, 0,
869 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
870 0, 0);
871 else if (devinfo->verx10 >= 75)
872 return calculate_desc(info, EU_UNIT_FPU, 38 /* XXX */, 0, 0,
873 8 /* XXX */, 0,
874 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
875 0, 0);
876 else if (devinfo->ver >= 6)
877 return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0,
878 8 /* XXX */, 0,
879 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
880 0, 0);
881 else
882 abort();
883
884 case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
885 if (devinfo->ver >= 8)
886 return calculate_desc(info, EU_UNIT_FPU, 12 /* XXX */, 0, 0,
887 4 /* XXX */, 0,
888 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
889 0, 0);
890 else if (devinfo->verx10 >= 75)
891 return calculate_desc(info, EU_UNIT_FPU, 14 /* XXX */, 0, 0,
892 4 /* XXX */, 0,
893 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
894 0, 0);
895 else if (devinfo->ver >= 7)
896 return calculate_desc(info, EU_UNIT_FPU, 16 /* XXX */, 0, 0,
897 4 /* XXX */, 0,
898 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
899 0, 0);
900 else
901 abort();
902
903 case SHADER_OPCODE_TEX:
904 case FS_OPCODE_TXB:
905 case SHADER_OPCODE_TXD:
906 case SHADER_OPCODE_TXF:
907 case SHADER_OPCODE_TXF_LZ:
908 case SHADER_OPCODE_TXL:
909 case SHADER_OPCODE_TXL_LZ:
910 case SHADER_OPCODE_TXF_CMS:
911 case SHADER_OPCODE_TXF_CMS_W:
912 case SHADER_OPCODE_TXF_UMS:
913 case SHADER_OPCODE_TXF_MCS:
914 case SHADER_OPCODE_TXS:
915 case SHADER_OPCODE_LOD:
916 case SHADER_OPCODE_GET_BUFFER_SIZE:
917 case SHADER_OPCODE_TG4:
918 case SHADER_OPCODE_TG4_OFFSET:
919 case SHADER_OPCODE_SAMPLEINFO:
920 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
921 return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16 /* XXX */,
922 8 /* XXX */, 750 /* XXX */, 0, 0,
923 2 /* XXX */, 0);
924
925 case VEC4_OPCODE_URB_READ:
926 case VEC4_VS_OPCODE_URB_WRITE:
927 case VEC4_GS_OPCODE_URB_WRITE:
928 case VEC4_GS_OPCODE_URB_WRITE_ALLOCATE:
929 case GS_OPCODE_THREAD_END:
930 case GS_OPCODE_FF_SYNC:
931 case VEC4_TCS_OPCODE_URB_WRITE:
932 case TCS_OPCODE_RELEASE_INPUT:
933 case TCS_OPCODE_THREAD_END:
934 return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
935 32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
936
937 case SHADER_OPCODE_MEMORY_FENCE:
938 case SHADER_OPCODE_INTERLOCK:
939 switch (info.sfid) {
940 case GFX6_SFID_DATAPORT_RENDER_CACHE:
941 if (devinfo->ver >= 7)
942 return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 30 /* XXX */, 0,
943 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
944 else
945 abort();
946
947 case BRW_SFID_URB:
948 case GFX7_SFID_DATAPORT_DATA_CACHE:
949 case GFX12_SFID_SLM:
950 case GFX12_SFID_TGM:
951 case GFX12_SFID_UGM:
952 case HSW_SFID_DATAPORT_DATA_CACHE_1:
953 if (devinfo->ver >= 7)
954 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 30 /* XXX */, 0,
955 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
956 else
957 abort();
958
959 default:
960 abort();
961 }
962
963 case SHADER_OPCODE_GFX4_SCRATCH_READ:
964 case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
965 case SHADER_OPCODE_GFX7_SCRATCH_READ:
966 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 0, 8 /* XXX */,
967 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
968
969 case VEC4_OPCODE_UNTYPED_ATOMIC:
970 if (devinfo->ver >= 7)
971 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
972 30 /* XXX */, 400 /* XXX */,
973 10 /* XXX */, 100 /* XXX */, 0, 0,
974 0, 400 /* XXX */);
975 else
976 abort();
977
978 case VEC4_OPCODE_UNTYPED_SURFACE_READ:
979 case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
980 if (devinfo->ver >= 7)
981 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
982 0, 20 /* XXX */,
983 10 /* XXX */, 100 /* XXX */, 0, 0,
984 0, 0);
985 else
986 abort();
987
988 case FS_OPCODE_FB_WRITE:
989 case FS_OPCODE_FB_READ:
990 case FS_OPCODE_REP_FB_WRITE:
991 return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 0, 450 /* XXX */,
992 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
993
994 case GS_OPCODE_SVB_WRITE:
995 if (devinfo->ver >= 6)
996 return calculate_desc(info, EU_UNIT_DP_RC, 2 /* XXX */, 0, 0,
997 0, 450 /* XXX */,
998 10 /* XXX */, 300 /* XXX */, 0, 0,
999 0, 0);
1000 else
1001 abort();
1002
1003 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1004 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
1005 return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
1006 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
1007
1008 case VS_OPCODE_PULL_CONSTANT_LOAD:
1009 case VS_OPCODE_PULL_CONSTANT_LOAD_GFX7:
1010 return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16,
1011 8, 750, 0, 0, 2, 0);
1012
1013 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1014 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1015 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1016 if (devinfo->ver >= 7)
1017 return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
1018 0, 90 /* XXX */, 0, 0, 0, 0);
1019 else
1020 abort();
1021
1022 case SHADER_OPCODE_BARRIER:
1023 if (devinfo->ver >= 7)
1024 return calculate_desc(info, EU_UNIT_GATEWAY, 90 /* XXX */, 0, 0,
1025 0 /* XXX */, 0,
1026 0, 0, 0, 0, 0, 0);
1027 else
1028 abort();
1029
1030 case CS_OPCODE_CS_TERMINATE:
1031 if (devinfo->ver >= 7)
1032 return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0,
1033 10 /* XXX */, 0, 0, 0, 0, 0);
1034 else
1035 abort();
1036
1037 case SHADER_OPCODE_SEND:
1038 switch (info.sfid) {
1039 case GFX6_SFID_DATAPORT_RENDER_CACHE:
1040 if (devinfo->ver >= 7) {
1041 switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1042 case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP:
1043 return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
1044 30 /* XXX */, 450 /* XXX */,
1045 10 /* XXX */, 100 /* XXX */,
1046 0, 0, 0, 400 /* XXX */);
1047 default:
1048 return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
1049 0, 450 /* XXX */,
1050 10 /* XXX */, 300 /* XXX */, 0, 0,
1051 0, 0);
1052 }
1053 } else if (devinfo->ver >= 6) {
1054 return calculate_desc(info, EU_UNIT_DP_RC, 2 /* XXX */, 0, 0,
1055 0, 450 /* XXX */,
1056 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
1057 } else {
1058 abort();
1059 }
1060 case BRW_SFID_SAMPLER: {
1061 if (devinfo->ver >= 6)
1062 return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16,
1063 8, 750, 0, 0, 2, 0);
1064 else
1065 abort();
1066 }
1067 case GFX7_SFID_DATAPORT_DATA_CACHE:
1068 case HSW_SFID_DATAPORT_DATA_CACHE_1:
1069 if (devinfo->verx10 >= 75) {
1070 switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1071 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
1072 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
1073 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
1074 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
1075 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1076 30 /* XXX */, 400 /* XXX */,
1077 10 /* XXX */, 100 /* XXX */, 0, 0,
1078 0, 400 /* XXX */);
1079
1080 default:
1081 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1082 0, 20 /* XXX */,
1083 10 /* XXX */, 100 /* XXX */, 0, 0,
1084 0, 0);
1085 }
1086 } else if (devinfo->ver >= 7) {
1087 switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1088 case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
1089 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1090 30 /* XXX */, 400 /* XXX */,
1091 10 /* XXX */, 100 /* XXX */,
1092 0, 0, 0, 400 /* XXX */);
1093 default:
1094 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1095 0, 20 /* XXX */,
1096 10 /* XXX */, 100 /* XXX */, 0, 0,
1097 0, 0);
1098 }
1099 } else {
1100 abort();
1101 }
1102
1103 case GFX12_SFID_UGM:
1104 case GFX12_SFID_TGM:
1105 case GFX12_SFID_SLM:
1106 switch (lsc_msg_desc_opcode(devinfo, info.desc)) {
1107 case LSC_OP_LOAD:
1108 case LSC_OP_STORE:
1109 case LSC_OP_LOAD_CMASK:
1110 case LSC_OP_STORE_CMASK:
1111 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1112 0, 20 /* XXX */,
1113 10 /* XXX */, 100 /* XXX */, 0, 0,
1114 0, 0);
1115
1116 case LSC_OP_FENCE:
1117 case LSC_OP_ATOMIC_INC:
1118 case LSC_OP_ATOMIC_DEC:
1119 case LSC_OP_ATOMIC_LOAD:
1120 case LSC_OP_ATOMIC_STORE:
1121 case LSC_OP_ATOMIC_ADD:
1122 case LSC_OP_ATOMIC_SUB:
1123 case LSC_OP_ATOMIC_MIN:
1124 case LSC_OP_ATOMIC_MAX:
1125 case LSC_OP_ATOMIC_UMIN:
1126 case LSC_OP_ATOMIC_UMAX:
1127 case LSC_OP_ATOMIC_CMPXCHG:
1128 case LSC_OP_ATOMIC_FADD:
1129 case LSC_OP_ATOMIC_FSUB:
1130 case LSC_OP_ATOMIC_FMIN:
1131 case LSC_OP_ATOMIC_FMAX:
1132 case LSC_OP_ATOMIC_FCMPXCHG:
1133 case LSC_OP_ATOMIC_AND:
1134 case LSC_OP_ATOMIC_OR:
1135 case LSC_OP_ATOMIC_XOR:
1136 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1137 30 /* XXX */, 400 /* XXX */,
1138 10 /* XXX */, 100 /* XXX */, 0, 0,
1139 0, 400 /* XXX */);
1140 default:
1141 abort();
1142 }
1143
1144 case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH:
1145 case GEN_RT_SFID_RAY_TRACE_ACCELERATOR:
1146 return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0,
1147 10 /* XXX */, 0, 0, 0, 0, 0);
1148
1149 case BRW_SFID_URB:
1150 return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
1151 32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
1152
1153 default:
1154 abort();
1155 }
1156
1157 case SHADER_OPCODE_UNDEF:
1158 case SHADER_OPCODE_HALT_TARGET:
1159 case FS_OPCODE_SCHEDULING_FENCE:
1160 return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
1161 0, 0, 0, 0, 0, 0);
1162
1163 default:
1164 abort();
1165 }
1166 }
1167
1168 /**
1169 * Model the performance behavior of a stall on the specified dependency
1170 * ID.
1171 */
1172 void
1173 stall_on_dependency(state &st, enum intel_eu_dependency_id id)
1174 {
1175 if (id < ARRAY_SIZE(st.dep_ready))
1176 st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
1177 st.dep_ready[id]);
1178 }
1179
1180 /**
1181 * Model the performance behavior of the front-end and back-end while
1182 * executing an instruction with the specified timing information, assuming
1183 * all dependencies are already clear.
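*
* A minimal trace, assuming perf.df == 2, perf.db == 4 and an idle
* back-end unit: the front-end ready time advances by 2 cycles, the
* unit's ready time becomes the new front-end time plus 4, and
* 4 * st.weight cycles are added to that unit's aggregated utilization.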
1184 */
1185 void
1186 execute_instruction(state &st, const perf_desc &perf)
1187 {
1188 /* Compute the time at which the front-end will be ready to execute the
1189 * next instruction.
1190 */
1191 st.unit_ready[EU_UNIT_FE] += perf.df;
1192
1193 if (perf.u < EU_NUM_UNITS) {
1194 /* Wait for the back-end to be ready to execute this instruction. */
1195 st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
1196 st.unit_ready[perf.u]);
1197
1198 /* Compute the time at which the back-end will be ready to execute
1199 * the next instruction, and update the back-end utilization.
1200 */
1201 st.unit_ready[perf.u] = st.unit_ready[EU_UNIT_FE] + perf.db;
1202 st.unit_busy[perf.u] += perf.db * st.weight;
1203 }
1204 }
1205
1206 /**
1207 * Model the performance behavior of a read dependency provided by an
1208 * instruction.
1209 */
1210 void
1211 mark_read_dependency(state &st, const perf_desc &perf,
1212 enum intel_eu_dependency_id id)
1213 {
1214 if (id < ARRAY_SIZE(st.dep_ready))
1215 st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ls;
1216 }
1217
1218 /**
1219 * Model the performance behavior of a write dependency provided by an
1220 * instruction.
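*
* Accumulator destinations become ready perf.la cycles after the
* front-end issue time, flag destinations after perf.lf cycles, and
* GRF/MRF/address destinations after perf.ld cycles.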
1221 */
1222 void
1223 mark_write_dependency(state &st, const perf_desc &perf,
1224 enum intel_eu_dependency_id id)
1225 {
1226 if (id >= EU_DEPENDENCY_ID_ACCUM0 && id < EU_DEPENDENCY_ID_FLAG0)
1227 st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.la;
1228 else if (id >= EU_DEPENDENCY_ID_FLAG0 && id < EU_DEPENDENCY_ID_SBID_WR0)
1229 st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.lf;
1230 else if (id < ARRAY_SIZE(st.dep_ready))
1231 st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ld;
1232 }
1233
1234 /**
1235 * Return the dependency ID of a backend_reg, offset by \p delta GRFs.
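*
* E.g. a FIXED_GRF reference to g5 with delta == 1 yields
* EU_DEPENDENCY_ID_GRF0 + 6, while MRF references on Gfx7+ are folded
* into the GRF range starting at GFX7_MRF_HACK_START.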
1236 */
1237 enum intel_eu_dependency_id
1238 reg_dependency_id(const intel_device_info *devinfo, const backend_reg &r,
1239 const int delta)
1240 {
1241 if (r.file == VGRF) {
1242 const unsigned i = r.nr + r.offset / REG_SIZE + delta;
1243 assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
1244 return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
1245
1246 } else if (r.file == FIXED_GRF) {
1247 const unsigned i = r.nr + delta;
1248 assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
1249 return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
1250
1251 } else if (r.file == MRF && devinfo->ver >= 7) {
1252 const unsigned i = GFX7_MRF_HACK_START +
1253 r.nr + r.offset / REG_SIZE + delta;
1254 assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
1255 return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
1256
1257 } else if (r.file == MRF && devinfo->ver < 7) {
1258 const unsigned i = (r.nr & ~BRW_MRF_COMPR4) +
1259 r.offset / REG_SIZE + delta;
1260 assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_MRF0);
1261 return intel_eu_dependency_id(EU_DEPENDENCY_ID_MRF0 + i);
1262
1263 } else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS &&
1264 r.nr < BRW_ARF_ACCUMULATOR) {
1265 assert(delta == 0);
1266 return EU_DEPENDENCY_ID_ADDR0;
1267
1268 } else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR &&
1269 r.nr < BRW_ARF_FLAG) {
1270 const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta;
1271 assert(i < EU_DEPENDENCY_ID_FLAG0 - EU_DEPENDENCY_ID_ACCUM0);
1272 return intel_eu_dependency_id(EU_DEPENDENCY_ID_ACCUM0 + i);
1273
1274 } else {
1275 return EU_NUM_DEPENDENCY_IDS;
1276 }
1277 }
1278
1279 /**
1280 * Return the dependency ID of the flag register starting at offset \p i.
1281 */
1282 enum intel_eu_dependency_id
1283 flag_dependency_id(unsigned i)
1284 {
1285 assert(i < EU_DEPENDENCY_ID_SBID_WR0 - EU_DEPENDENCY_ID_FLAG0);
1286 return intel_eu_dependency_id(EU_DEPENDENCY_ID_FLAG0 + i);
1287 }
1288
1289 /**
1290 * Return the dependency ID corresponding to the SBID read completion
1291 * condition of a Gfx12+ SWSB.
1292 */
1293 enum intel_eu_dependency_id
1294 tgl_swsb_rd_dependency_id(tgl_swsb swsb)
1295 {
1296 if (swsb.mode) {
1297 assert(swsb.sbid <
1298 EU_NUM_DEPENDENCY_IDS - EU_DEPENDENCY_ID_SBID_RD0);
1299 return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_RD0 + swsb.sbid);
1300 } else {
1301 return EU_NUM_DEPENDENCY_IDS;
1302 }
1303 }
1304
1305 /**
1306 * Return the dependency ID corresponding to the SBID write completion
1307 * condition of a Gfx12+ SWSB.
1308 */
1309 enum intel_eu_dependency_id
1310 tgl_swsb_wr_dependency_id(tgl_swsb swsb)
1311 {
1312 if (swsb.mode) {
1313 assert(swsb.sbid <
1314 EU_DEPENDENCY_ID_SBID_RD0 - EU_DEPENDENCY_ID_SBID_WR0);
1315 return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_WR0 + swsb.sbid);
1316 } else {
1317 return EU_NUM_DEPENDENCY_IDS;
1318 }
1319 }
1320
1321 /**
1322 * Return the implicit accumulator register accessed by channel \p i of the
1323 * instruction.
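*
* As a rough example, assuming a 32-byte REG_SIZE, channel 15 of a SIMD16
* float instruction with group == 0 lands at byte offset 60 and therefore
* maps to acc1 (60 / 32 % 2 == 1), while channels 0-7 map to acc0.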
1324 */
1325 unsigned
1326 accum_reg_of_channel(const intel_device_info *devinfo,
1327 const backend_instruction *inst,
1328 brw_reg_type tx, unsigned i)
1329 {
1330 assert(inst->reads_accumulator_implicitly() ||
1331 inst->writes_accumulator_implicitly(devinfo));
1332 const unsigned offset = (inst->group + i) * type_sz(tx) *
1333 (devinfo->ver < 7 || brw_reg_type_is_floating_point(tx) ? 1 : 2);
1334 return offset / REG_SIZE % 2;
1335 }
1336
1337 /**
1338 * Model the performance behavior of an FS back-end instruction.
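*
* The sequence is: stall the front-end on any unsatisfied source, write,
* flag and SBID dependencies, account for the instruction's front-end and
* back-end time, then record when the dependencies it produces will be
* satisfied in turn.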
1339 */
1340 void
1341 issue_fs_inst(state &st, const struct brw_isa_info *isa,
1342 const backend_instruction *be_inst)
1343 {
1344 const struct intel_device_info *devinfo = isa->devinfo;
1345 const fs_inst *inst = static_cast<const fs_inst *>(be_inst);
1346 const instruction_info info(isa, inst);
1347 const perf_desc perf = instruction_desc(info);
1348
1349 /* Stall on any source dependencies. */
1350 for (unsigned i = 0; i < inst->sources; i++) {
1351 for (unsigned j = 0; j < regs_read(inst, i); j++)
1352 stall_on_dependency(
1353 st, reg_dependency_id(devinfo, inst->src[i], j));
1354 }
1355
1356 if (inst->reads_accumulator_implicitly()) {
1357 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1358 j <= accum_reg_of_channel(devinfo, inst, info.tx,
1359 inst->exec_size - 1); j++)
1360 stall_on_dependency(
1361 st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1362 }
1363
1364 if (is_send(inst) && inst->base_mrf != -1) {
1365 for (unsigned j = 0; j < inst->mlen; j++)
1366 stall_on_dependency(
1367 st, reg_dependency_id(
1368 devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1369 }
1370
1371 if (const unsigned mask = inst->flags_read(devinfo)) {
1372 for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1373 if (mask & (1 << i))
1374 stall_on_dependency(st, flag_dependency_id(i));
1375 }
1376 }
1377
1378 /* Stall on any write dependencies. */
1379 if (!inst->no_dd_check) {
1380 if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1381 for (unsigned j = 0; j < regs_written(inst); j++)
1382 stall_on_dependency(
1383 st, reg_dependency_id(devinfo, inst->dst, j));
1384 }
1385
1386 if (inst->writes_accumulator_implicitly(devinfo)) {
1387 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1388 j <= accum_reg_of_channel(devinfo, inst, info.tx,
1389 inst->exec_size - 1); j++)
1390 stall_on_dependency(
1391 st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1392 }
1393
1394 if (const unsigned mask = inst->flags_written(devinfo)) {
1395 for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1396 if (mask & (1 << i))
1397 stall_on_dependency(st, flag_dependency_id(i));
1398 }
1399 }
1400 }
1401
1402 /* Stall on any SBID dependencies. */
1403 if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST))
1404 stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched));
1405 else if (inst->sched.mode & TGL_SBID_SRC)
1406 stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched));
1407
1408 /* Execute the instruction. */
1409 execute_instruction(st, perf);
1410
1411 /* Mark any source dependencies. */
1412 if (inst->is_send_from_grf()) {
1413 for (unsigned i = 0; i < inst->sources; i++) {
1414 if (inst->is_payload(i)) {
1415 for (unsigned j = 0; j < regs_read(inst, i); j++)
1416 mark_read_dependency(
1417 st, perf, reg_dependency_id(devinfo, inst->src[i], j));
1418 }
1419 }
1420 }
1421
1422 if (is_send(inst) && inst->base_mrf != -1) {
1423 for (unsigned j = 0; j < inst->mlen; j++)
1424 mark_read_dependency(st, perf,
1425 reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1426 }
1427
1428 /* Mark any destination dependencies. */
1429 if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1430 for (unsigned j = 0; j < regs_written(inst); j++) {
1431 mark_write_dependency(st, perf,
1432 reg_dependency_id(devinfo, inst->dst, j));
1433 }
1434 }
1435
1436 if (inst->writes_accumulator_implicitly(devinfo)) {
1437 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1438 j <= accum_reg_of_channel(devinfo, inst, info.tx,
1439 inst->exec_size - 1); j++)
1440 mark_write_dependency(st, perf,
1441 reg_dependency_id(devinfo, brw_acc_reg(8), j));
1442 }
1443
1444 if (const unsigned mask = inst->flags_written(devinfo)) {
1445 for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1446 if (mask & (1 << i))
1447 mark_write_dependency(st, perf, flag_dependency_id(i));
1448 }
1449 }
1450
1451 /* Mark any SBID dependencies. */
1452 if (inst->sched.mode & TGL_SBID_SET) {
1453 mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched));
1454 mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched));
1455 }
1456 }
1457
1458 /**
1459 * Model the performance behavior of a VEC4 back-end instruction.
1460 */
1461 void
1462 issue_vec4_instruction(state &st, const struct brw_isa_info *isa,
1463 const backend_instruction *be_inst)
1464 {
1465 const struct intel_device_info *devinfo = isa->devinfo;
1466 const vec4_instruction *inst =
1467 static_cast<const vec4_instruction *>(be_inst);
1468 const instruction_info info(isa, inst);
1469 const perf_desc perf = instruction_desc(info);
1470
1471 /* Stall on any source dependencies. */
1472 for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
1473 for (unsigned j = 0; j < regs_read(inst, i); j++)
1474 stall_on_dependency(
1475 st, reg_dependency_id(devinfo, inst->src[i], j));
1476 }
1477
1478 if (inst->reads_accumulator_implicitly()) {
1479 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1480 j <= accum_reg_of_channel(devinfo, inst, info.tx,
1481 inst->exec_size - 1); j++)
1482 stall_on_dependency(
1483 st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1484 }
1485
1486 if (inst->base_mrf != -1) {
1487 for (unsigned j = 0; j < inst->mlen; j++)
1488 stall_on_dependency(
1489 st, reg_dependency_id(
1490 devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1491 }
1492
1493 if (inst->reads_flag())
1494 stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0);
1495
1496 /* Stall on any write dependencies. */
1497 if (!inst->no_dd_check) {
1498 if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1499 for (unsigned j = 0; j < regs_written(inst); j++)
1500 stall_on_dependency(
1501 st, reg_dependency_id(devinfo, inst->dst, j));
1502 }
1503
1504 if (inst->writes_accumulator_implicitly(devinfo)) {
1505 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1506 j <= accum_reg_of_channel(devinfo, inst, info.tx,
1507 inst->exec_size - 1); j++)
1508 stall_on_dependency(
1509 st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1510 }
1511
1512 if (inst->writes_flag(devinfo))
1513 stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0);
1514 }
1515
1516 /* Execute the instruction. */
1517 execute_instruction(st, perf);
1518
1519 /* Mark any source dependencies. */
1520 if (inst->is_send_from_grf()) {
1521 for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
1522 for (unsigned j = 0; j < regs_read(inst, i); j++)
1523 mark_read_dependency(
1524 st, perf, reg_dependency_id(devinfo, inst->src[i], j));
1525 }
1526 }
1527
1528 if (inst->base_mrf != -1) {
1529 for (unsigned j = 0; j < inst->mlen; j++)
1530 mark_read_dependency(st, perf,
1531 reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1532 }
1533
1534 /* Mark any destination dependencies. */
1535 if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1536 for (unsigned j = 0; j < regs_written(inst); j++) {
1537 mark_write_dependency(st, perf,
1538 reg_dependency_id(devinfo, inst->dst, j));
1539 }
1540 }
1541
1542 if (inst->writes_accumulator_implicitly(devinfo)) {
1543 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1544 j <= accum_reg_of_channel(devinfo, inst, info.tx,
1545 inst->exec_size - 1); j++)
1546 mark_write_dependency(st, perf,
1547 reg_dependency_id(devinfo, brw_acc_reg(8), j));
1548 }
1549
1550 if (inst->writes_flag(devinfo))
1551 mark_write_dependency(st, perf, EU_DEPENDENCY_ID_FLAG0);
1552 }
1553
1554 /**
1555 * Calculate the maximum possible throughput of the program compatible with
1556 * the cycle-count utilization estimated for each asynchronous unit, in
1557 * threads-per-cycle units.
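*
* E.g. with an estimated latency of 400 cycles but 600 cycles of
* accumulated sampler utilization per thread, the result is 1 / 600
* threads per cycle, i.e. the shader is sampler-bound rather than
* latency-bound.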
1558 */
1559 float
1560 calculate_thread_throughput(const state &st, float busy)
1561 {
1562 for (unsigned i = 0; i < EU_NUM_UNITS; i++)
1563 busy = MAX2(busy, st.unit_busy[i]);
1564
1565 return 1.0 / busy;
1566 }
1567
1568 /**
1569 * Estimate the performance of the specified shader.
1570 */
1571 void
1572 calculate_performance(performance &p, const backend_shader *s,
1573 void (*issue_instruction)(
1574 state &, const struct brw_isa_info *,
1575 const backend_instruction *),
1576 unsigned dispatch_width)
1577 {
1578 /* XXX - Note that the previous version of this code used worst-case
1579 * scenario estimation of branching divergence for SIMD32 shaders,
1580 * but this heuristic was removed to improve performance in common
1581 * scenarios. Wider shader variants are less optimal when divergence
1582 * is high, e.g. when an application renders a complex scene on a small
1583 * surface. It is assumed that such renders are short, so their
1584 * time doesn't matter and when it comes to the overall performance,
1585 * they are dominated by more optimal larger renders.
1586 *
1587 * It's possible that we could do better with divergence analysis
1588 * by isolating branches which are 100% uniform.
1589 *
1590 * Plumbing the trip counts from NIR loop analysis would allow us
1591 * to do a better job regarding the loop weights.
1592 *
1593 * In the meantime use values that roughly match the control flow
1594 * weights used elsewhere in the compiler back-end.
1595 *
1596 * Note that we provide slightly more pessimistic weights on
1597 * Gfx12+ for SIMD32, since the effective warp size on that
1598 * platform is 2x the SIMD width due to EU fusion, which increases
1599 * the likelihood of divergent control flow in comparison to
1600 * previous generations, giving narrower SIMD modes a performance
1601 * advantage in several test-cases with non-uniform discard jumps.
1602 */
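/* Each basic block below contributes the front-end cycles it consumes,
* scaled by the current control-flow weight; the per-block totals feed
* p.block_latency[] and their running sum becomes the overall latency
* estimate.
*/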
1603 const float discard_weight = (dispatch_width > 16 || s->devinfo->ver < 12 ?
1604 1.0 : 0.5);
1605 const float loop_weight = 10;
1606 unsigned halt_count = 0;
1607 unsigned elapsed = 0;
1608 state st;
1609
1610 foreach_block(block, s->cfg) {
1611 const unsigned elapsed0 = elapsed;
1612
1613 foreach_inst_in_block(backend_instruction, inst, block) {
1614 const unsigned clock0 = st.unit_ready[EU_UNIT_FE];
1615
1616 issue_instruction(st, &s->compiler->isa, inst);
1617
1618 if (inst->opcode == SHADER_OPCODE_HALT_TARGET && halt_count)
1619 st.weight /= discard_weight;
1620
1621 elapsed += (st.unit_ready[EU_UNIT_FE] - clock0) * st.weight;
1622
1623 if (inst->opcode == BRW_OPCODE_DO)
1624 st.weight *= loop_weight;
1625 else if (inst->opcode == BRW_OPCODE_WHILE)
1626 st.weight /= loop_weight;
1627 else if (inst->opcode == BRW_OPCODE_HALT && !halt_count++)
1628 st.weight *= discard_weight;
1629 }
1630
1631 p.block_latency[block->num] = elapsed - elapsed0;
1632 }
1633
1634 p.latency = elapsed;
1635 p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
1636 }
1637 }
1638
1639 brw::performance::performance(const fs_visitor *v) :
1640 block_latency(new unsigned[v->cfg->num_blocks])
1641 {
1642 calculate_performance(*this, v, issue_fs_inst, v->dispatch_width);
1643 }
1644
1645 brw::performance::performance(const vec4_visitor *v) :
1646 block_latency(new unsigned[v->cfg->num_blocks])
1647 {
1648 calculate_performance(*this, v, issue_vec4_instruction, 8);
1649 }
1650
1651 brw::performance::~performance()
1652 {
1653 delete[] block_latency;
1654 }
1655