1 /*
2 * Copyright © 2020 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "elk_eu.h"
25 #include "elk_fs.h"
26 #include "elk_vec4.h"
27 #include "elk_cfg.h"
28
29 using namespace elk;
30
31 namespace {
32 /**
33 * Enumeration representing the various asynchronous units that can run
34 * computations in parallel on behalf of a shader thread.
35 */
/**
 * Asynchronous hardware units that may execute computations in parallel
 * on behalf of a shader thread.  Each instruction in the model is
 * accounted to exactly one of these (plus the always-involved front-end).
 */
enum intel_eu_unit {
   /** EU front-end. */
   EU_UNIT_FE,
   /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
   EU_UNIT_FPU,
   /** Extended Math unit (AKA FPU1 on Gfx8-11, part of the EU on Gfx6+). */
   EU_UNIT_EM,
   /** Sampler shared function. */
   EU_UNIT_SAMPLER,
   /** Pixel Interpolator shared function. */
   EU_UNIT_PI,
   /** Unified Return Buffer shared function. */
   EU_UNIT_URB,
   /** Data Port Data Cache shared function. */
   EU_UNIT_DP_DC,
   /** Data Port Render Cache shared function. */
   EU_UNIT_DP_RC,
   /** Data Port Constant Cache shared function. */
   EU_UNIT_DP_CC,
   /** Message Gateway shared function. */
   EU_UNIT_GATEWAY,
   /** Thread Spawner shared function. */
   EU_UNIT_SPAWNER,
   /* EU_UNIT_VME, */
   /* EU_UNIT_CRE, */
   /** Number of asynchronous units currently tracked. */
   EU_NUM_UNITS,
   /** Dummy unit for instructions that don't consume runtime from the above. */
   EU_UNIT_NULL = EU_NUM_UNITS
};
66
67 /**
68 * Enumeration representing a computation result another computation can
69 * potentially depend on.
70 */
71 enum intel_eu_dependency_id {
72 /* Register part of the GRF. */
73 EU_DEPENDENCY_ID_GRF0 = 0,
74 /* Register part of the MRF. Only used on Gfx4-6. */
75 EU_DEPENDENCY_ID_MRF0 = EU_DEPENDENCY_ID_GRF0 + XE2_MAX_GRF,
76 /* Address register part of the ARF. */
77 EU_DEPENDENCY_ID_ADDR0 = EU_DEPENDENCY_ID_MRF0 + 24,
78 /* Accumulator register part of the ARF. */
79 EU_DEPENDENCY_ID_ACCUM0 = EU_DEPENDENCY_ID_ADDR0 + 1,
80 /* Flag register part of the ARF. */
81 EU_DEPENDENCY_ID_FLAG0 = EU_DEPENDENCY_ID_ACCUM0 + 12,
82 /* SBID token write completion. Only used on Gfx12+. */
83 EU_DEPENDENCY_ID_SBID_WR0 = EU_DEPENDENCY_ID_FLAG0 + 8,
84 /* SBID token read completion. Only used on Gfx12+. */
85 EU_DEPENDENCY_ID_SBID_RD0 = EU_DEPENDENCY_ID_SBID_WR0 + 32,
86 /* Number of computation dependencies currently tracked. */
87 EU_NUM_DEPENDENCY_IDS = EU_DEPENDENCY_ID_SBID_RD0 + 32
88 };
89
90 /**
91 * State of our modeling of the program execution.
92 */
93 struct state {
state__anon789f65800111::state94 state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
95 /**
96 * Time at which a given unit will be ready to execute the next
97 * computation, in clock units.
98 */
99 unsigned unit_ready[EU_NUM_UNITS];
100 /**
101 * Time at which an instruction dependent on a given dependency ID will
102 * be ready to execute, in clock units.
103 */
104 unsigned dep_ready[EU_NUM_DEPENDENCY_IDS];
105 /**
106 * Aggregated utilization of a given unit excluding idle cycles,
107 * in clock units.
108 */
109 float unit_busy[EU_NUM_UNITS];
110 /**
111 * Factor of the overhead of a computation accounted for in the
112 * aggregated utilization calculation.
113 */
114 float weight;
115 };
116
117 /**
118 * Information derived from an IR instruction used to compute performance
119 * estimates. Allows the timing calculation to work on both FS and VEC4
120 * instructions.
121 */
122 struct instruction_info {
instruction_info__anon789f65800111::instruction_info123 instruction_info(const struct elk_isa_info *isa, const elk_fs_inst *inst) :
124 isa(isa), devinfo(isa->devinfo), op(inst->opcode),
125 td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
126 tx(get_exec_type(inst)), sx(0), ss(0),
127 sc(elk_has_bank_conflict(isa, inst) ? sd : 0),
128 desc(inst->desc), sfid(inst->sfid)
129 {
130 /* We typically want the maximum source size, except for split send
131 * messages which require the total size.
132 */
133 if (inst->opcode == ELK_SHADER_OPCODE_SEND) {
134 ss = DIV_ROUND_UP(inst->size_read(2), REG_SIZE) +
135 DIV_ROUND_UP(inst->size_read(3), REG_SIZE);
136 } else {
137 for (unsigned i = 0; i < inst->sources; i++)
138 ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
139 }
140
141 /* Convert the execution size to GRF units. */
142 sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
143
144 /* 32x32 integer multiplication has half the usual ALU throughput.
145 * Treat it as double-precision.
146 */
147 if ((inst->opcode == ELK_OPCODE_MUL || inst->opcode == ELK_OPCODE_MAD) &&
148 !elk_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
149 type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
150 tx = elk_int_type(8, tx == ELK_REGISTER_TYPE_D);
151
152 rcount = inst->opcode == ELK_OPCODE_DPAS ? inst->rcount : 0;
153 }
154
instruction_info__anon789f65800111::instruction_info155 instruction_info(const struct elk_isa_info *isa,
156 const vec4_instruction *inst) :
157 isa(isa), devinfo(isa->devinfo), op(inst->opcode),
158 td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
159 tx(get_exec_type(inst)), sx(0), ss(0), sc(0),
160 desc(inst->desc), sfid(inst->sfid), rcount(0)
161 {
162 /* Compute the maximum source size. */
163 for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++)
164 ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
165
166 /* Convert the execution size to GRF units. */
167 sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
168
169 /* 32x32 integer multiplication has half the usual ALU throughput.
170 * Treat it as double-precision.
171 */
172 if ((inst->opcode == ELK_OPCODE_MUL || inst->opcode == ELK_OPCODE_MAD) &&
173 !elk_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
174 type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
175 tx = elk_int_type(8, tx == ELK_REGISTER_TYPE_D);
176 }
177
178 /** ISA encoding information */
179 const struct elk_isa_info *isa;
180 /** Device information. */
181 const struct intel_device_info *devinfo;
182 /** Instruction opcode. */
183 elk_opcode op;
184 /** Destination type. */
185 elk_reg_type td;
186 /** Destination size in GRF units. */
187 unsigned sd;
188 /** Execution type. */
189 elk_reg_type tx;
190 /** Execution size in GRF units. */
191 unsigned sx;
192 /** Source size. */
193 unsigned ss;
194 /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
195 unsigned sc;
196 /** Send message descriptor. */
197 uint32_t desc;
198 /** Send message shared function ID. */
199 uint8_t sfid;
200 /** Repeat count for DPAS instructions. */
201 uint8_t rcount;
202 };
203
204 /**
205 * Timing information of an instruction used to estimate the performance of
206 * the program.
207 */
208 struct perf_desc {
perf_desc__anon789f65800111::perf_desc209 perf_desc(enum intel_eu_unit u, int df, int db,
210 int ls, int ld, int la, int lf) :
211 u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}
212
213 /**
214 * Back-end unit its runtime shall be accounted to, in addition to the
215 * EU front-end which is always assumed to be involved.
216 */
217 enum intel_eu_unit u;
218 /**
219 * Overhead cycles from the time that the EU front-end starts executing
220 * the instruction until it's ready to execute the next instruction.
221 */
222 int df;
223 /**
224 * Overhead cycles from the time that the back-end starts executing the
225 * instruction until it's ready to execute the next instruction.
226 */
227 int db;
228 /**
229 * Latency cycles from the time that the back-end starts executing the
230 * instruction until its sources have been read from the register file.
231 */
232 int ls;
233 /**
234 * Latency cycles from the time that the back-end starts executing the
235 * instruction until its regular destination has been written to the
236 * register file.
237 */
238 int ld;
239 /**
240 * Latency cycles from the time that the back-end starts executing the
241 * instruction until its accumulator destination has been written to the
242 * ARF file.
243 *
244 * Note that this is an approximation of the real behavior of
245 * accumulating instructions in the hardware: Instead of modeling a pair
246 * of back-to-back accumulating instructions as a first computation with
247 * latency equal to ld followed by another computation with a
248 * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
249 * model the stall as if it occurred at the top of the pipeline, with
250 * the latency of the accumulator computation offset accordingly.
251 */
252 int la;
253 /**
254 * Latency cycles from the time that the back-end starts executing the
255 * instruction until its flag destination has been written to the ARF
256 * file.
257 */
258 int lf;
259 };
260
261 /**
262 * Compute the timing information of an instruction based on any relevant
263 * information from the IR and a number of parameters specifying a linear
264 * approximation: Parameter X_Y specifies the derivative of timing X
265 * relative to info field Y, while X_1 specifies the independent term of
266 * the approximation of timing X.
267 */
268 perf_desc
calculate_desc(const instruction_info & info,enum intel_eu_unit u,int df_1,int df_sd,int df_sc,int db_1,int db_sx,int ls_1,int ld_1,int la_1,int lf_1,int l_ss,int l_sd)269 calculate_desc(const instruction_info &info, enum intel_eu_unit u,
270 int df_1, int df_sd, int df_sc,
271 int db_1, int db_sx,
272 int ls_1, int ld_1, int la_1, int lf_1,
273 int l_ss, int l_sd)
274 {
275 return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
276 db_1 + db_sx * int(info.sx),
277 ls_1 + l_ss * int(info.ss),
278 ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
279 la_1, lf_1);
280 }
281
282 /**
283 * Compute the timing information of an instruction based on any relevant
284 * information from the IR and a number of linear approximation parameters
285 * hard-coded for each IR instruction.
286 *
287 * Most timing parameters are obtained from the multivariate linear
288 * regression of a sample of empirical timings measured using the tm0
289 * register (as can be done today by using the shader_time debugging
290 * option). The Gfx4-5 math timings are obtained from BSpec Volume 5c.3
291 * "Shared Functions - Extended Math", Section 3.2 "Performance".
292 * Parameters marked XXX shall be considered low-quality, they're possibly
293 * high variance or completely guessed in cases where experimental data was
294 * unavailable.
295 */
296 const perf_desc
instruction_desc(const instruction_info & info)297 instruction_desc(const instruction_info &info)
298 {
299 const struct intel_device_info *devinfo = info.devinfo;
300
301 switch (info.op) {
302 case ELK_OPCODE_SYNC:
303 case ELK_OPCODE_SEL:
304 case ELK_OPCODE_NOT:
305 case ELK_OPCODE_AND:
306 case ELK_OPCODE_OR:
307 case ELK_OPCODE_XOR:
308 case ELK_OPCODE_SHR:
309 case ELK_OPCODE_SHL:
310 case ELK_OPCODE_DIM:
311 case ELK_OPCODE_ASR:
312 case ELK_OPCODE_CMPN:
313 case ELK_OPCODE_F16TO32:
314 case ELK_OPCODE_BFREV:
315 case ELK_OPCODE_BFI1:
316 case ELK_OPCODE_AVG:
317 case ELK_OPCODE_FRC:
318 case ELK_OPCODE_RNDU:
319 case ELK_OPCODE_RNDD:
320 case ELK_OPCODE_RNDE:
321 case ELK_OPCODE_RNDZ:
322 case ELK_OPCODE_MAC:
323 case ELK_OPCODE_MACH:
324 case ELK_OPCODE_LZD:
325 case ELK_OPCODE_FBH:
326 case ELK_OPCODE_FBL:
327 case ELK_OPCODE_CBIT:
328 case ELK_OPCODE_ADDC:
329 case ELK_OPCODE_ROR:
330 case ELK_OPCODE_ROL:
331 case ELK_OPCODE_SUBB:
332 case ELK_OPCODE_SAD2:
333 case ELK_OPCODE_SADA2:
334 case ELK_OPCODE_LINE:
335 case ELK_OPCODE_NOP:
336 case ELK_SHADER_OPCODE_CLUSTER_BROADCAST:
337 case ELK_SHADER_OPCODE_SCRATCH_HEADER:
338 case ELK_FS_OPCODE_DDX_COARSE:
339 case ELK_FS_OPCODE_DDX_FINE:
340 case ELK_FS_OPCODE_DDY_COARSE:
341 case ELK_FS_OPCODE_PIXEL_X:
342 case ELK_FS_OPCODE_PIXEL_Y:
343 case ELK_FS_OPCODE_SET_SAMPLE_ID:
344 case ELK_VEC4_OPCODE_MOV_BYTES:
345 case ELK_VEC4_OPCODE_UNPACK_UNIFORM:
346 case ELK_VEC4_OPCODE_DOUBLE_TO_F32:
347 case ELK_VEC4_OPCODE_DOUBLE_TO_D32:
348 case ELK_VEC4_OPCODE_DOUBLE_TO_U32:
349 case ELK_VEC4_OPCODE_TO_DOUBLE:
350 case ELK_VEC4_OPCODE_PICK_LOW_32BIT:
351 case ELK_VEC4_OPCODE_PICK_HIGH_32BIT:
352 case ELK_VEC4_OPCODE_SET_LOW_32BIT:
353 case ELK_VEC4_OPCODE_SET_HIGH_32BIT:
354 case ELK_VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
355 case ELK_GS_OPCODE_SET_DWORD_2:
356 case ELK_GS_OPCODE_SET_WRITE_OFFSET:
357 case ELK_GS_OPCODE_SET_VERTEX_COUNT:
358 case ELK_GS_OPCODE_PREPARE_CHANNEL_MASKS:
359 case ELK_GS_OPCODE_SET_CHANNEL_MASKS:
360 case ELK_GS_OPCODE_GET_INSTANCE_ID:
361 case ELK_GS_OPCODE_SET_PRIMITIVE_ID:
362 case ELK_GS_OPCODE_SVB_SET_DST_INDEX:
363 case ELK_TCS_OPCODE_SRC0_010_IS_ZERO:
364 case ELK_TCS_OPCODE_GET_PRIMITIVE_ID:
365 case ELK_TES_OPCODE_GET_PRIMITIVE_ID:
366 case ELK_SHADER_OPCODE_READ_SR_REG:
367 if (devinfo->ver >= 11) {
368 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
369 0, 10, 6 /* XXX */, 14, 0, 0);
370 } else if (devinfo->ver >= 8) {
371 if (type_sz(info.tx) > 4)
372 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
373 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
374 else
375 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
376 0, 8, 4, 12, 0, 0);
377 } else if (devinfo->verx10 >= 75) {
378 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
379 0, 10, 6 /* XXX */, 16, 0, 0);
380 } else {
381 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
382 0, 12, 8 /* XXX */, 18, 0, 0);
383 }
384
385 case ELK_OPCODE_MOV:
386 case ELK_OPCODE_CMP:
387 case ELK_OPCODE_ADD:
388 case ELK_OPCODE_ADD3:
389 case ELK_OPCODE_MUL:
390 case ELK_SHADER_OPCODE_MOV_RELOC_IMM:
391 case ELK_VEC4_OPCODE_MOV_FOR_SCRATCH:
392 if (devinfo->ver >= 11) {
393 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
394 0, 10, 6, 14, 0, 0);
395 } else if (devinfo->ver >= 8) {
396 if (type_sz(info.tx) > 4)
397 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
398 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
399 else
400 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
401 0, 8, 4, 12, 0, 0);
402 } else if (devinfo->verx10 >= 75) {
403 if (info.tx == ELK_REGISTER_TYPE_F)
404 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
405 0, 12, 8 /* XXX */, 18, 0, 0);
406 else
407 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
408 0, 10, 6 /* XXX */, 16, 0, 0);
409 } else if (devinfo->ver >= 7) {
410 if (info.tx == ELK_REGISTER_TYPE_F)
411 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
412 0, 14, 10 /* XXX */, 20, 0, 0);
413 else
414 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
415 0, 12, 8 /* XXX */, 18, 0, 0);
416 } else {
417 return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
418 0, 2 /* XXX */,
419 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
420 0, 0);
421 }
422
423 case ELK_OPCODE_BFE:
424 case ELK_OPCODE_BFI2:
425 case ELK_OPCODE_CSEL:
426 if (devinfo->ver >= 11)
427 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
428 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
429 else if (devinfo->ver >= 8)
430 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
431 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
432 else if (devinfo->verx10 >= 75)
433 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
434 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
435 else if (devinfo->ver >= 7)
436 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
437 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
438 else
439 abort();
440
441 case ELK_OPCODE_MAD:
442 if (devinfo->ver >= 11) {
443 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
444 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
445 } else if (devinfo->ver >= 8) {
446 if (type_sz(info.tx) > 4)
447 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
448 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
449 else
450 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
451 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
452 } else if (devinfo->verx10 >= 75) {
453 if (info.tx == ELK_REGISTER_TYPE_F)
454 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
455 0, 12, 8 /* XXX */, 18, 0, 0);
456 else
457 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
458 0, 10, 6 /* XXX */, 16, 0, 0);
459 } else if (devinfo->ver >= 7) {
460 if (info.tx == ELK_REGISTER_TYPE_F)
461 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
462 0, 14, 10 /* XXX */, 20, 0, 0);
463 else
464 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
465 0, 12, 8 /* XXX */, 18, 0, 0);
466 } else if (devinfo->ver >= 6) {
467 return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 1 /* XXX */,
468 0, 2 /* XXX */,
469 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
470 0, 0);
471 } else {
472 abort();
473 }
474
475 case ELK_OPCODE_F32TO16:
476 if (devinfo->ver >= 11)
477 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
478 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
479 else if (devinfo->ver >= 8)
480 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
481 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
482 else if (devinfo->verx10 >= 75)
483 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
484 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
485 else if (devinfo->ver >= 7)
486 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
487 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
488 else
489 abort();
490
491 case ELK_OPCODE_DP4:
492 case ELK_OPCODE_DPH:
493 case ELK_OPCODE_DP3:
494 case ELK_OPCODE_DP2:
495 if (devinfo->ver >= 8)
496 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
497 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
498 else if (devinfo->verx10 >= 75)
499 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
500 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
501 else
502 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
503 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
504
505 case ELK_OPCODE_DP4A:
506 if (devinfo->ver >= 12)
507 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
508 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
509 else
510 abort();
511
512 case ELK_OPCODE_DPAS: {
513 unsigned ld;
514
515 switch (info.rcount) {
516 case 1:
517 ld = 21;
518 break;
519 case 2:
520 ld = 22;
521 break;
522 case 8:
523 default:
524 ld = 32;
525 break;
526 }
527
528 /* DPAS cannot write the accumulator or the flags, so pass UINT_MAX
529 * for la and lf.
530 */
531 if (devinfo->verx10 >= 125)
532 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
533 0, ld, UINT_MAX, UINT_MAX, 0, 0);
534 else
535 abort();
536 }
537
538 case ELK_SHADER_OPCODE_RCP:
539 case ELK_SHADER_OPCODE_RSQ:
540 case ELK_SHADER_OPCODE_SQRT:
541 case ELK_SHADER_OPCODE_EXP2:
542 case ELK_SHADER_OPCODE_LOG2:
543 case ELK_SHADER_OPCODE_SIN:
544 case ELK_SHADER_OPCODE_COS:
545 case ELK_SHADER_OPCODE_POW:
546 case ELK_SHADER_OPCODE_INT_QUOTIENT:
547 case ELK_SHADER_OPCODE_INT_REMAINDER:
548 if (devinfo->ver >= 6) {
549 switch (info.op) {
550 case ELK_SHADER_OPCODE_RCP:
551 case ELK_SHADER_OPCODE_RSQ:
552 case ELK_SHADER_OPCODE_SQRT:
553 case ELK_SHADER_OPCODE_EXP2:
554 case ELK_SHADER_OPCODE_LOG2:
555 case ELK_SHADER_OPCODE_SIN:
556 case ELK_SHADER_OPCODE_COS:
557 if (devinfo->ver >= 8)
558 return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 4,
559 0, 16, 0, 0, 0, 0);
560 else if (devinfo->verx10 >= 75)
561 return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 2,
562 0, 12, 0, 0, 0, 0);
563 else
564 return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 2,
565 0, 14, 0, 0, 0, 0);
566
567 case ELK_SHADER_OPCODE_POW:
568 if (devinfo->ver >= 8)
569 return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 8,
570 0, 24, 0, 0, 0, 0);
571 else if (devinfo->verx10 >= 75)
572 return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 4,
573 0, 20, 0, 0, 0, 0);
574 else
575 return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 4,
576 0, 22, 0, 0, 0, 0);
577
578 case ELK_SHADER_OPCODE_INT_QUOTIENT:
579 case ELK_SHADER_OPCODE_INT_REMAINDER:
580 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 26, 0,
581 0, 28 /* XXX */, 0, 0, 0, 0);
582
583 default:
584 abort();
585 }
586 } else {
587 switch (info.op) {
588 case ELK_SHADER_OPCODE_RCP:
589 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 8,
590 0, 22, 0, 0, 0, 8);
591
592 case ELK_SHADER_OPCODE_RSQ:
593 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 16,
594 0, 44, 0, 0, 0, 8);
595
596 case ELK_SHADER_OPCODE_INT_QUOTIENT:
597 case ELK_SHADER_OPCODE_SQRT:
598 case ELK_SHADER_OPCODE_LOG2:
599 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 24,
600 0, 66, 0, 0, 0, 8);
601
602 case ELK_SHADER_OPCODE_INT_REMAINDER:
603 case ELK_SHADER_OPCODE_EXP2:
604 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 32,
605 0, 88, 0, 0, 0, 8);
606
607 case ELK_SHADER_OPCODE_SIN:
608 case ELK_SHADER_OPCODE_COS:
609 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 48,
610 0, 132, 0, 0, 0, 8);
611
612 case ELK_SHADER_OPCODE_POW:
613 return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 64,
614 0, 176, 0, 0, 0, 8);
615
616 default:
617 abort();
618 }
619 }
620
621 case ELK_OPCODE_DO:
622 if (devinfo->ver >= 6)
623 return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
624 0, 0, 0, 0, 0, 0);
625 else
626 return calculate_desc(info, EU_UNIT_NULL, 2 /* XXX */, 0, 0, 0, 0,
627 0, 0, 0, 0, 0, 0);
628
629 case ELK_OPCODE_IF:
630 case ELK_OPCODE_ELSE:
631 case ELK_OPCODE_ENDIF:
632 case ELK_OPCODE_WHILE:
633 case ELK_OPCODE_BREAK:
634 case ELK_OPCODE_CONTINUE:
635 case ELK_OPCODE_HALT:
636 if (devinfo->ver >= 8)
637 return calculate_desc(info, EU_UNIT_NULL, 8, 0, 0, 0, 0,
638 0, 0, 0, 0, 0, 0);
639 else if (devinfo->verx10 >= 75)
640 return calculate_desc(info, EU_UNIT_NULL, 6, 0, 0, 0, 0,
641 0, 0, 0, 0, 0, 0);
642 else
643 return calculate_desc(info, EU_UNIT_NULL, 2, 0, 0, 0, 0,
644 0, 0, 0, 0, 0, 0);
645
646 case ELK_FS_OPCODE_LINTERP:
647 if (devinfo->ver >= 8)
648 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
649 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
650 else if (devinfo->verx10 >= 75)
651 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
652 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
653 else
654 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
655 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
656
657 case ELK_OPCODE_LRP:
658 if (devinfo->ver >= 8)
659 return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
660 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
661 else if (devinfo->verx10 >= 75)
662 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
663 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
664 else if (devinfo->ver >= 6)
665 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
666 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
667 else
668 abort();
669
670 case ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT:
671 if (devinfo->ver >= 11)
672 return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
673 0, 10 /* XXX */, 6 /* XXX */,
674 14 /* XXX */, 0, 0);
675 else if (devinfo->ver >= 8)
676 return calculate_desc(info, EU_UNIT_FPU, 16, 6, 0, 0, 6,
677 0, 8 /* XXX */, 4 /* XXX */,
678 12 /* XXX */, 0, 0);
679 else if (devinfo->verx10 >= 75)
680 return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
681 0, 10 /* XXX */, 6 /* XXX */,
682 16 /* XXX */, 0, 0);
683 else if (devinfo->ver >= 7)
684 return calculate_desc(info, EU_UNIT_FPU, 24, 6, 0, 0, 6,
685 0, 12 /* XXX */, 8 /* XXX */,
686 18 /* XXX */, 0, 0);
687 else
688 abort();
689
690 case ELK_SHADER_OPCODE_MOV_INDIRECT:
691 if (devinfo->ver >= 11)
692 return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
693 0, 10 /* XXX */, 6 /* XXX */,
694 14 /* XXX */, 0, 0);
695 else if (devinfo->ver >= 8)
696 return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
697 0, 8 /* XXX */, 4 /* XXX */,
698 12 /* XXX */, 0, 0);
699 else if (devinfo->verx10 >= 75)
700 return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
701 0, 10 /* XXX */, 6 /* XXX */,
702 16 /* XXX */, 0, 0);
703 else
704 return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
705 0, 12 /* XXX */, 8 /* XXX */,
706 18 /* XXX */, 0, 0);
707
708 case ELK_SHADER_OPCODE_BROADCAST:
709 if (devinfo->ver >= 11)
710 return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0, 4, 0,
711 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
712 else if (devinfo->ver >= 8)
713 return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
714 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
715 else if (devinfo->verx10 >= 75)
716 return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
717 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
718 else if (devinfo->ver >= 7)
719 return calculate_desc(info, EU_UNIT_FPU, 20, 0, 0, 4, 0,
720 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
721 else
722 abort();
723
724 case ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL:
725 case ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
726 if (devinfo->ver >= 11)
727 return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
728 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
729 else if (devinfo->ver >= 8)
730 return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
731 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
732 else if (devinfo->verx10 >= 75)
733 return calculate_desc(info, EU_UNIT_FPU, 36, 0, 0, 6, 0,
734 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
735 else if (devinfo->ver >= 7)
736 return calculate_desc(info, EU_UNIT_FPU, 40, 0, 0, 6, 0,
737 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
738 else
739 abort();
740
741 case ELK_SHADER_OPCODE_RND_MODE:
742 case ELK_SHADER_OPCODE_FLOAT_CONTROL_MODE:
743 if (devinfo->ver >= 11)
744 return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0,
745 4 /* XXX */, 0,
746 0, 0, 0, 0, 0, 0);
747 else if (devinfo->ver >= 8)
748 return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0,
749 4 /* XXX */, 0,
750 0, 0, 0, 0, 0, 0);
751 else if (devinfo->verx10 >= 75)
752 return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0,
753 4 /* XXX */, 0,
754 0, 0, 0, 0, 0, 0);
755 else if (devinfo->ver >= 6)
756 return calculate_desc(info, EU_UNIT_FPU, 28 /* XXX */, 0, 0,
757 4 /* XXX */, 0,
758 0, 0, 0, 0, 0, 0);
759 else
760 abort();
761
762 case ELK_SHADER_OPCODE_SHUFFLE:
763 if (devinfo->ver >= 11)
764 return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0,
765 44 /* XXX */, 0,
766 0, 10 /* XXX */, 6 /* XXX */,
767 14 /* XXX */, 0, 0);
768 else if (devinfo->ver >= 8)
769 return calculate_desc(info, EU_UNIT_FPU, 42 /* XXX */, 0, 0,
770 42 /* XXX */, 0,
771 0, 8 /* XXX */, 4 /* XXX */,
772 12 /* XXX */, 0, 0);
773 else if (devinfo->verx10 >= 75)
774 return calculate_desc(info, EU_UNIT_FPU, 0, 44 /* XXX */, 0,
775 0, 44 /* XXX */,
776 0, 10 /* XXX */, 6 /* XXX */,
777 16 /* XXX */, 0, 0);
778 else if (devinfo->ver >= 6)
779 return calculate_desc(info, EU_UNIT_FPU, 0, 46 /* XXX */, 0,
780 0, 46 /* XXX */,
781 0, 12 /* XXX */, 8 /* XXX */,
782 18 /* XXX */, 0, 0);
783 else
784 abort();
785
786 case ELK_SHADER_OPCODE_SEL_EXEC:
787 if (devinfo->ver >= 11)
788 return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0,
789 0, 4 /* XXX */,
790 0, 10 /* XXX */, 6 /* XXX */,
791 14 /* XXX */, 0, 0);
792 else if (devinfo->ver >= 8)
793 return calculate_desc(info, EU_UNIT_FPU, 8 /* XXX */, 4 /* XXX */, 0,
794 0, 4 /* XXX */,
795 0, 8 /* XXX */, 4 /* XXX */,
796 12 /* XXX */, 0, 0);
797 else if (devinfo->verx10 >= 75)
798 return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0,
799 0, 4 /* XXX */,
800 0, 10 /* XXX */, 6 /* XXX */,
801 16 /* XXX */, 0, 0);
802 else
803 return calculate_desc(info, EU_UNIT_FPU, 12 /* XXX */, 4 /* XXX */, 0,
804 0, 4 /* XXX */,
805 0, 12 /* XXX */, 8 /* XXX */,
806 18 /* XXX */, 0, 0);
807
808 case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
809 if (devinfo->ver >= 11)
810 return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
811 0, 8 /* XXX */,
812 0, 10 /* XXX */, 6 /* XXX */,
813 14 /* XXX */, 0, 0);
814 else if (devinfo->ver >= 8)
815 return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
816 0, 8 /* XXX */,
817 0, 8 /* XXX */, 4 /* XXX */,
818 12 /* XXX */, 0, 0);
819 else if (devinfo->verx10 >= 75)
820 return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
821 0, 8 /* XXX */,
822 0, 10 /* XXX */, 6 /* XXX */,
823 16 /* XXX */, 0, 0);
824 else
825 return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
826 0, 8 /* XXX */,
827 0, 12 /* XXX */, 8 /* XXX */,
828 18 /* XXX */, 0, 0);
829
830 case ELK_FS_OPCODE_DDY_FINE:
831 if (devinfo->ver >= 11)
832 return calculate_desc(info, EU_UNIT_FPU, 0, 14, 0, 0, 4,
833 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
834 else if (devinfo->ver >= 8)
835 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
836 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
837 else if (devinfo->verx10 >= 75)
838 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
839 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
840 else
841 return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
842 0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0);
843
844 case ELK_FS_OPCODE_LOAD_LIVE_CHANNELS:
845 if (devinfo->ver >= 11)
846 return calculate_desc(info, EU_UNIT_FPU, 2 /* XXX */, 0, 0,
847 2 /* XXX */, 0,
848 0, 0, 0, 10 /* XXX */, 0, 0);
849 else if (devinfo->ver >= 8)
850 return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
851 0, 2 /* XXX */,
852 0, 0, 0, 8 /* XXX */, 0, 0);
853 else
854 abort();
855
856 case ELK_VEC4_OPCODE_PACK_BYTES:
857 if (devinfo->ver >= 8)
858 return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
859 4 /* XXX */, 0,
860 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
861 0, 0);
862 else if (devinfo->verx10 >= 75)
863 return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
864 4 /* XXX */, 0,
865 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
866 0, 0);
867 else
868 return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
869 4 /* XXX */, 0,
870 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
871 0, 0);
872
873 case ELK_VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
874 case ELK_TCS_OPCODE_GET_INSTANCE_ID:
875 case ELK_VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS:
876 case ELK_VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
877 case ELK_TES_OPCODE_CREATE_INPUT_READ_HEADER:
878 if (devinfo->ver >= 8)
879 return calculate_desc(info, EU_UNIT_FPU, 22 /* XXX */, 0, 0,
880 6 /* XXX */, 0,
881 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
882 0, 0);
883 else if (devinfo->verx10 >= 75)
884 return calculate_desc(info, EU_UNIT_FPU, 26 /* XXX */, 0, 0,
885 6 /* XXX */, 0,
886 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
887 0, 0);
888 else
889 return calculate_desc(info, EU_UNIT_FPU, 30 /* XXX */, 0, 0,
890 6 /* XXX */, 0,
891 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
892 0, 0);
893
894 case ELK_GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
895 case ELK_TCS_OPCODE_CREATE_BARRIER_HEADER:
896 if (devinfo->ver >= 8)
897 return calculate_desc(info, EU_UNIT_FPU, 32 /* XXX */, 0, 0,
898 8 /* XXX */, 0,
899 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
900 0, 0);
901 else if (devinfo->verx10 >= 75)
902 return calculate_desc(info, EU_UNIT_FPU, 38 /* XXX */, 0, 0,
903 8 /* XXX */, 0,
904 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
905 0, 0);
906 else if (devinfo->ver >= 6)
907 return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0,
908 8 /* XXX */, 0,
909 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
910 0, 0);
911 else
912 abort();
913
914 case ELK_TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
915 if (devinfo->ver >= 8)
916 return calculate_desc(info, EU_UNIT_FPU, 12 /* XXX */, 0, 0,
917 4 /* XXX */, 0,
918 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
919 0, 0);
920 else if (devinfo->verx10 >= 75)
921 return calculate_desc(info, EU_UNIT_FPU, 14 /* XXX */, 0, 0,
922 4 /* XXX */, 0,
923 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
924 0, 0);
925 else if (devinfo->ver >= 7)
926 return calculate_desc(info, EU_UNIT_FPU, 16 /* XXX */, 0, 0,
927 4 /* XXX */, 0,
928 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
929 0, 0);
930 else
931 abort();
932
933 case ELK_SHADER_OPCODE_TEX:
934 case ELK_FS_OPCODE_TXB:
935 case ELK_SHADER_OPCODE_TXD:
936 case ELK_SHADER_OPCODE_TXF:
937 case ELK_SHADER_OPCODE_TXF_LZ:
938 case ELK_SHADER_OPCODE_TXL:
939 case ELK_SHADER_OPCODE_TXL_LZ:
940 case ELK_SHADER_OPCODE_TXF_CMS:
941 case ELK_SHADER_OPCODE_TXF_CMS_W:
942 case ELK_SHADER_OPCODE_TXF_UMS:
943 case ELK_SHADER_OPCODE_TXF_MCS:
944 case ELK_SHADER_OPCODE_TXS:
945 case ELK_SHADER_OPCODE_LOD:
946 case ELK_SHADER_OPCODE_GET_BUFFER_SIZE:
947 case ELK_SHADER_OPCODE_TG4:
948 case ELK_SHADER_OPCODE_TG4_OFFSET:
949 case ELK_SHADER_OPCODE_SAMPLEINFO:
950 case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
951 return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16 /* XXX */,
952 8 /* XXX */, 750 /* XXX */, 0, 0,
953 2 /* XXX */, 0);
954
955 case ELK_VEC4_OPCODE_URB_READ:
956 case ELK_VEC4_VS_OPCODE_URB_WRITE:
957 case ELK_VEC4_GS_OPCODE_URB_WRITE:
958 case ELK_VEC4_GS_OPCODE_URB_WRITE_ALLOCATE:
959 case ELK_GS_OPCODE_THREAD_END:
960 case ELK_GS_OPCODE_FF_SYNC:
961 case ELK_VEC4_TCS_OPCODE_URB_WRITE:
962 case ELK_TCS_OPCODE_RELEASE_INPUT:
963 case ELK_TCS_OPCODE_THREAD_END:
964 return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
965 32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
966
967 case ELK_SHADER_OPCODE_MEMORY_FENCE:
968 case ELK_SHADER_OPCODE_INTERLOCK:
969 switch (info.sfid) {
970 case GFX6_SFID_DATAPORT_RENDER_CACHE:
971 if (devinfo->ver >= 7)
972 return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 30 /* XXX */, 0,
973 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
974 else
975 abort();
976
977 case ELK_SFID_URB:
978 case GFX7_SFID_DATAPORT_DATA_CACHE:
979 case GFX12_SFID_SLM:
980 case GFX12_SFID_TGM:
981 case GFX12_SFID_UGM:
982 case HSW_SFID_DATAPORT_DATA_CACHE_1:
983 if (devinfo->ver >= 7)
984 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 30 /* XXX */, 0,
985 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
986 else
987 abort();
988
989 default:
990 abort();
991 }
992
993 case ELK_SHADER_OPCODE_GFX4_SCRATCH_READ:
994 case ELK_SHADER_OPCODE_GFX4_SCRATCH_WRITE:
995 case ELK_SHADER_OPCODE_GFX7_SCRATCH_READ:
996 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 0, 8 /* XXX */,
997 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
998
999 case ELK_VEC4_OPCODE_UNTYPED_ATOMIC:
1000 if (devinfo->ver >= 7)
1001 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1002 30 /* XXX */, 400 /* XXX */,
1003 10 /* XXX */, 100 /* XXX */, 0, 0,
1004 0, 400 /* XXX */);
1005 else
1006 abort();
1007
1008 case ELK_VEC4_OPCODE_UNTYPED_SURFACE_READ:
1009 case ELK_VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
1010 if (devinfo->ver >= 7)
1011 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1012 0, 20 /* XXX */,
1013 10 /* XXX */, 100 /* XXX */, 0, 0,
1014 0, 0);
1015 else
1016 abort();
1017
1018 case ELK_FS_OPCODE_FB_WRITE:
1019 case ELK_FS_OPCODE_FB_READ:
1020 case ELK_FS_OPCODE_REP_FB_WRITE:
1021 return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 0, 450 /* XXX */,
1022 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
1023
1024 case ELK_GS_OPCODE_SVB_WRITE:
1025 if (devinfo->ver >= 6)
1026 return calculate_desc(info, EU_UNIT_DP_RC, 2 /* XXX */, 0, 0,
1027 0, 450 /* XXX */,
1028 10 /* XXX */, 300 /* XXX */, 0, 0,
1029 0, 0);
1030 else
1031 abort();
1032
1033 case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1034 return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
1035 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
1036
1037 case ELK_VS_OPCODE_PULL_CONSTANT_LOAD:
1038 case ELK_VS_OPCODE_PULL_CONSTANT_LOAD_GFX7:
1039 return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16,
1040 8, 750, 0, 0, 2, 0);
1041
1042 case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1043 case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1044 case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1045 if (devinfo->ver >= 7)
1046 return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
1047 0, 90 /* XXX */, 0, 0, 0, 0);
1048 else
1049 abort();
1050
1051 case ELK_SHADER_OPCODE_BARRIER:
1052 if (devinfo->ver >= 7)
1053 return calculate_desc(info, EU_UNIT_GATEWAY, 90 /* XXX */, 0, 0,
1054 0 /* XXX */, 0,
1055 0, 0, 0, 0, 0, 0);
1056 else
1057 abort();
1058
1059 case ELK_CS_OPCODE_CS_TERMINATE:
1060 if (devinfo->ver >= 7)
1061 return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0,
1062 10 /* XXX */, 0, 0, 0, 0, 0);
1063 else
1064 abort();
1065
1066 case ELK_SHADER_OPCODE_SEND:
1067 switch (info.sfid) {
1068 case GFX6_SFID_DATAPORT_CONSTANT_CACHE:
1069 if (devinfo->ver >= 7) {
1070 /* See ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */
1071 return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
1072 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
1073 } else {
1074 abort();
1075 }
1076 case GFX6_SFID_DATAPORT_RENDER_CACHE:
1077 if (devinfo->ver >= 7) {
1078 switch (elk_dp_desc_msg_type(devinfo, info.desc)) {
1079 case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP:
1080 return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
1081 30 /* XXX */, 450 /* XXX */,
1082 10 /* XXX */, 100 /* XXX */,
1083 0, 0, 0, 400 /* XXX */);
1084 default:
1085 return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
1086 0, 450 /* XXX */,
1087 10 /* XXX */, 300 /* XXX */, 0, 0,
1088 0, 0);
1089 }
1090 } else if (devinfo->ver >= 6) {
1091 return calculate_desc(info, EU_UNIT_DP_RC, 2 /* XXX */, 0, 0,
1092 0, 450 /* XXX */,
1093 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
1094 } else {
1095 abort();
1096 }
1097 case ELK_SFID_SAMPLER: {
1098 if (devinfo->ver >= 6)
1099 return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16,
1100 8, 750, 0, 0, 2, 0);
1101 else
1102 abort();
1103 }
1104 case GFX7_SFID_DATAPORT_DATA_CACHE:
1105 case HSW_SFID_DATAPORT_DATA_CACHE_1:
1106 if (devinfo->verx10 >= 75) {
1107 switch (elk_dp_desc_msg_type(devinfo, info.desc)) {
1108 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
1109 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
1110 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
1111 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
1112 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1113 30 /* XXX */, 400 /* XXX */,
1114 10 /* XXX */, 100 /* XXX */, 0, 0,
1115 0, 400 /* XXX */);
1116
1117 default:
1118 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1119 0, 20 /* XXX */,
1120 10 /* XXX */, 100 /* XXX */, 0, 0,
1121 0, 0);
1122 }
1123 } else if (devinfo->ver >= 7) {
1124 switch (elk_dp_desc_msg_type(devinfo, info.desc)) {
1125 case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
1126 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1127 30 /* XXX */, 400 /* XXX */,
1128 10 /* XXX */, 100 /* XXX */,
1129 0, 0, 0, 400 /* XXX */);
1130 default:
1131 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1132 0, 20 /* XXX */,
1133 10 /* XXX */, 100 /* XXX */, 0, 0,
1134 0, 0);
1135 }
1136 } else {
1137 abort();
1138 }
1139
1140 case GFX7_SFID_PIXEL_INTERPOLATOR:
1141 if (devinfo->ver >= 7)
1142 return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
1143 0, 90 /* XXX */, 0, 0, 0, 0);
1144 else
1145 abort();
1146
1147 case GFX12_SFID_UGM:
1148 case GFX12_SFID_TGM:
1149 case GFX12_SFID_SLM:
1150 switch (lsc_msg_desc_opcode(devinfo, info.desc)) {
1151 case LSC_OP_LOAD:
1152 case LSC_OP_STORE:
1153 case LSC_OP_LOAD_CMASK:
1154 case LSC_OP_STORE_CMASK:
1155 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1156 0, 20 /* XXX */,
1157 10 /* XXX */, 100 /* XXX */, 0, 0,
1158 0, 0);
1159
1160 case LSC_OP_FENCE:
1161 case LSC_OP_ATOMIC_INC:
1162 case LSC_OP_ATOMIC_DEC:
1163 case LSC_OP_ATOMIC_LOAD:
1164 case LSC_OP_ATOMIC_STORE:
1165 case LSC_OP_ATOMIC_ADD:
1166 case LSC_OP_ATOMIC_SUB:
1167 case LSC_OP_ATOMIC_MIN:
1168 case LSC_OP_ATOMIC_MAX:
1169 case LSC_OP_ATOMIC_UMIN:
1170 case LSC_OP_ATOMIC_UMAX:
1171 case LSC_OP_ATOMIC_CMPXCHG:
1172 case LSC_OP_ATOMIC_FADD:
1173 case LSC_OP_ATOMIC_FSUB:
1174 case LSC_OP_ATOMIC_FMIN:
1175 case LSC_OP_ATOMIC_FMAX:
1176 case LSC_OP_ATOMIC_FCMPXCHG:
1177 case LSC_OP_ATOMIC_AND:
1178 case LSC_OP_ATOMIC_OR:
1179 case LSC_OP_ATOMIC_XOR:
1180 return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1181 30 /* XXX */, 400 /* XXX */,
1182 10 /* XXX */, 100 /* XXX */, 0, 0,
1183 0, 400 /* XXX */);
1184 default:
1185 abort();
1186 }
1187
1188 case ELK_SFID_URB:
1189 return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
1190 32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
1191
1192 default:
1193 abort();
1194 }
1195
1196 case ELK_SHADER_OPCODE_UNDEF:
1197 case ELK_SHADER_OPCODE_HALT_TARGET:
1198 case ELK_FS_OPCODE_SCHEDULING_FENCE:
1199 return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
1200 0, 0, 0, 0, 0, 0);
1201
1202 default:
1203 abort();
1204 }
1205 }
1206
1207 /**
1208 * Model the performance behavior of a stall on the specified dependency
1209 * ID.
1210 */
1211 void
stall_on_dependency(state & st,enum intel_eu_dependency_id id)1212 stall_on_dependency(state &st, enum intel_eu_dependency_id id)
1213 {
1214 if (id < ARRAY_SIZE(st.dep_ready))
1215 st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
1216 st.dep_ready[id]);
1217 }
1218
1219 /**
1220 * Model the performance behavior of the front-end and back-end while
1221 * executing an instruction with the specified timing information, assuming
1222 * all dependencies are already clear.
1223 */
1224 void
execute_instruction(state & st,const perf_desc & perf)1225 execute_instruction(state &st, const perf_desc &perf)
1226 {
1227 /* Compute the time at which the front-end will be ready to execute the
1228 * next instruction.
1229 */
1230 st.unit_ready[EU_UNIT_FE] += perf.df;
1231
1232 if (perf.u < EU_NUM_UNITS) {
1233 /* Wait for the back-end to be ready to execute this instruction. */
1234 st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
1235 st.unit_ready[perf.u]);
1236
1237 /* Compute the time at which the back-end will be ready to execute
1238 * the next instruction, and update the back-end utilization.
1239 */
1240 st.unit_ready[perf.u] = st.unit_ready[EU_UNIT_FE] + perf.db;
1241 st.unit_busy[perf.u] += perf.db * st.weight;
1242 }
1243 }
1244
1245 /**
1246 * Model the performance behavior of a read dependency provided by an
1247 * instruction.
1248 */
1249 void
mark_read_dependency(state & st,const perf_desc & perf,enum intel_eu_dependency_id id)1250 mark_read_dependency(state &st, const perf_desc &perf,
1251 enum intel_eu_dependency_id id)
1252 {
1253 if (id < ARRAY_SIZE(st.dep_ready))
1254 st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ls;
1255 }
1256
1257 /**
1258 * Model the performance behavior of a write dependency provided by an
1259 * instruction.
1260 */
1261 void
mark_write_dependency(state & st,const perf_desc & perf,enum intel_eu_dependency_id id)1262 mark_write_dependency(state &st, const perf_desc &perf,
1263 enum intel_eu_dependency_id id)
1264 {
1265 if (id >= EU_DEPENDENCY_ID_ACCUM0 && id < EU_DEPENDENCY_ID_FLAG0)
1266 st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.la;
1267 else if (id >= EU_DEPENDENCY_ID_FLAG0 && id < EU_DEPENDENCY_ID_SBID_WR0)
1268 st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.lf;
1269 else if (id < ARRAY_SIZE(st.dep_ready))
1270 st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ld;
1271 }
1272
1273 /**
1274 * Return the dependency ID of a elk_backend_reg, offset by \p delta GRFs.
1275 */
1276 enum intel_eu_dependency_id
reg_dependency_id(const intel_device_info * devinfo,const elk_backend_reg & r,const int delta)1277 reg_dependency_id(const intel_device_info *devinfo, const elk_backend_reg &r,
1278 const int delta)
1279 {
1280 if (r.file == VGRF) {
1281 const unsigned i = r.nr + r.offset / REG_SIZE + delta;
1282 assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
1283 return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
1284
1285 } else if (r.file == FIXED_GRF) {
1286 const unsigned i = r.nr + delta;
1287 assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
1288 return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
1289
1290 } else if (r.file == MRF && devinfo->ver >= 7) {
1291 const unsigned i = GFX7_MRF_HACK_START +
1292 r.nr + r.offset / REG_SIZE + delta;
1293 assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
1294 return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
1295
1296 } else if (r.file == MRF && devinfo->ver < 7) {
1297 const unsigned i = (r.nr & ~ELK_MRF_COMPR4) +
1298 r.offset / REG_SIZE + delta;
1299 assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_MRF0);
1300 return intel_eu_dependency_id(EU_DEPENDENCY_ID_MRF0 + i);
1301
1302 } else if (r.file == ARF && r.nr >= ELK_ARF_ADDRESS &&
1303 r.nr < ELK_ARF_ACCUMULATOR) {
1304 assert(delta == 0);
1305 return EU_DEPENDENCY_ID_ADDR0;
1306
1307 } else if (r.file == ARF && r.nr >= ELK_ARF_ACCUMULATOR &&
1308 r.nr < ELK_ARF_FLAG) {
1309 const unsigned i = r.nr - ELK_ARF_ACCUMULATOR + delta;
1310 assert(i < EU_DEPENDENCY_ID_FLAG0 - EU_DEPENDENCY_ID_ACCUM0);
1311 return intel_eu_dependency_id(EU_DEPENDENCY_ID_ACCUM0 + i);
1312
1313 } else {
1314 return EU_NUM_DEPENDENCY_IDS;
1315 }
1316 }
1317
1318 /**
1319 * Return the dependency ID of flag register starting at offset \p i.
1320 */
1321 enum intel_eu_dependency_id
flag_dependency_id(unsigned i)1322 flag_dependency_id(unsigned i)
1323 {
1324 assert(i < EU_DEPENDENCY_ID_SBID_WR0 - EU_DEPENDENCY_ID_FLAG0);
1325 return intel_eu_dependency_id(EU_DEPENDENCY_ID_FLAG0 + i);
1326 }
1327
1328 /**
1329 * Return the dependency ID corresponding to the SBID read completion
1330 * condition of a Gfx12+ SWSB.
1331 */
1332 enum intel_eu_dependency_id
tgl_swsb_rd_dependency_id(tgl_swsb swsb)1333 tgl_swsb_rd_dependency_id(tgl_swsb swsb)
1334 {
1335 if (swsb.mode) {
1336 assert(swsb.sbid <
1337 EU_NUM_DEPENDENCY_IDS - EU_DEPENDENCY_ID_SBID_RD0);
1338 return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_RD0 + swsb.sbid);
1339 } else {
1340 return EU_NUM_DEPENDENCY_IDS;
1341 }
1342 }
1343
1344 /**
1345 * Return the dependency ID corresponding to the SBID write completion
1346 * condition of a Gfx12+ SWSB.
1347 */
1348 enum intel_eu_dependency_id
tgl_swsb_wr_dependency_id(tgl_swsb swsb)1349 tgl_swsb_wr_dependency_id(tgl_swsb swsb)
1350 {
1351 if (swsb.mode) {
1352 assert(swsb.sbid <
1353 EU_DEPENDENCY_ID_SBID_RD0 - EU_DEPENDENCY_ID_SBID_WR0);
1354 return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_WR0 + swsb.sbid);
1355 } else {
1356 return EU_NUM_DEPENDENCY_IDS;
1357 }
1358 }
1359
1360 /**
1361 * Return the implicit accumulator register accessed by channel \p i of the
1362 * instruction.
1363 */
1364 unsigned
accum_reg_of_channel(const intel_device_info * devinfo,const elk_backend_instruction * inst,elk_reg_type tx,unsigned i)1365 accum_reg_of_channel(const intel_device_info *devinfo,
1366 const elk_backend_instruction *inst,
1367 elk_reg_type tx, unsigned i)
1368 {
1369 assert(inst->reads_accumulator_implicitly() ||
1370 inst->writes_accumulator_implicitly(devinfo));
1371 const unsigned offset = (inst->group + i) * type_sz(tx) *
1372 (devinfo->ver < 7 || elk_reg_type_is_floating_point(tx) ? 1 : 2);
1373 return offset / (reg_unit(devinfo) * REG_SIZE) % 2;
1374 }
1375
/**
 * Model the performance behavior of an FS back-end instruction.
 *
 * Processing happens in three ordered phases: stall the front-end on every
 * dependency the instruction consumes or overwrites, execute the
 * instruction, then publish the read/write dependencies it provides for
 * later instructions.  State is accumulated into \p st.
 */
void
issue_fs_inst(state &st, const struct elk_isa_info *isa,
              const elk_backend_instruction *be_inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;
   const elk_fs_inst *inst = static_cast<const elk_fs_inst *>(be_inst);
   const instruction_info info(isa, inst);
   const perf_desc perf = instruction_desc(info);

   /* Stall on any source dependencies. */
   for (unsigned i = 0; i < inst->sources; i++) {
      for (unsigned j = 0; j < regs_read(inst, i); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, inst->src[i], j));
   }

   /* Stall on every accumulator register touched by any channel of the
    * instruction's execution group.
    */
   if (inst->reads_accumulator_implicitly()) {
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, elk_acc_reg(8), j));
   }

   /* MRF-based message payloads are additional implicit sources. */
   if (is_send(inst) && inst->base_mrf != -1) {
      for (unsigned j = 0; j < inst->mlen; j++)
         stall_on_dependency(
            st, reg_dependency_id(
               devinfo, elk_uvec_mrf(8, inst->base_mrf, 0), j));
   }

   /* Stall on each flag subregister named in the read mask. */
   if (const unsigned mask = inst->flags_read(devinfo)) {
      for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
         if (mask & (1 << i))
            stall_on_dependency(st, flag_dependency_id(i));
      }
   }

   /* Stall on any write dependencies.  Skipped when the instruction is
    * marked no_dd_check, which disables destination dependency checking.
    */
   if (!inst->no_dd_check) {
      if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, inst->dst, j));
      }

      if (inst->writes_accumulator_implicitly(devinfo)) {
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, elk_acc_reg(8), j));
      }

      if (const unsigned mask = inst->flags_written(devinfo)) {
         for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
            if (mask & (1 << i))
               stall_on_dependency(st, flag_dependency_id(i));
         }
      }
   }

   /* Stall on any SBID dependencies (Gfx12+ software scoreboard). */
   if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST))
      stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched));
   else if (inst->sched.mode & TGL_SBID_SRC)
      stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched));

   /* Execute the instruction. */
   execute_instruction(st, perf);

   /* Mark any source dependencies: payload sources of a send-from-GRF
    * remain busy until the message has been read out.
    */
   if (inst->is_send_from_grf()) {
      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->is_payload(i)) {
            for (unsigned j = 0; j < regs_read(inst, i); j++)
               mark_read_dependency(
                  st, perf, reg_dependency_id(devinfo, inst->src[i], j));
         }
      }
   }

   if (is_send(inst) && inst->base_mrf != -1) {
      for (unsigned j = 0; j < inst->mlen; j++)
         mark_read_dependency(st, perf,
            reg_dependency_id(devinfo, elk_uvec_mrf(8, inst->base_mrf, 0), j));
   }

   /* Mark any destination dependencies. */
   if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
      for (unsigned j = 0; j < regs_written(inst); j++) {
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, inst->dst, j));
      }
   }

   if (inst->writes_accumulator_implicitly(devinfo)) {
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, elk_acc_reg(8), j));
   }

   if (const unsigned mask = inst->flags_written(devinfo)) {
      for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
         if (mask & (1 << i))
            mark_write_dependency(st, perf, flag_dependency_id(i));
      }
   }

   /* Mark any SBID dependencies: a SET provides both the read and write
    * completion conditions of the SBID.
    */
   if (inst->sched.mode & TGL_SBID_SET) {
      mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched));
      mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched));
   }
}
1496
/**
 * Model the performance behavior of a VEC4 back-end instruction.
 *
 * Mirrors issue_fs_inst() for the vec4 IR: stall on consumed and
 * overwritten dependencies, execute, then publish the dependencies the
 * instruction provides.  The vec4 IR has a fixed-size source array and a
 * single flag register (EU_DEPENDENCY_ID_FLAG0).
 */
void
issue_vec4_instruction(state &st, const struct elk_isa_info *isa,
                       const elk_backend_instruction *be_inst)
{
   const struct intel_device_info *devinfo = isa->devinfo;
   const vec4_instruction *inst =
      static_cast<const vec4_instruction *>(be_inst);
   const instruction_info info(isa, inst);
   const perf_desc perf = instruction_desc(info);

   /* Stall on any source dependencies. */
   for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
      for (unsigned j = 0; j < regs_read(inst, i); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, inst->src[i], j));
   }

   /* Stall on every accumulator register touched by any channel. */
   if (inst->reads_accumulator_implicitly()) {
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, elk_acc_reg(8), j));
   }

   /* MRF-based message payloads are additional implicit sources. */
   if (inst->base_mrf != -1) {
      for (unsigned j = 0; j < inst->mlen; j++)
         stall_on_dependency(
            st, reg_dependency_id(
               devinfo, elk_uvec_mrf(8, inst->base_mrf, 0), j));
   }

   if (inst->reads_flag())
      stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0);

   /* Stall on any write dependencies, unless dependency checking is
    * disabled for this instruction via no_dd_check.
    */
   if (!inst->no_dd_check) {
      if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, inst->dst, j));
      }

      if (inst->writes_accumulator_implicitly(devinfo)) {
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, elk_acc_reg(8), j));
      }

      if (inst->writes_flag(devinfo))
         stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0);
   }

   /* Execute the instruction. */
   execute_instruction(st, perf);

   /* Mark any source dependencies: all sources of a send-from-GRF remain
    * busy until the message payload has been read out.
    */
   if (inst->is_send_from_grf()) {
      for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
         for (unsigned j = 0; j < regs_read(inst, i); j++)
            mark_read_dependency(
               st, perf, reg_dependency_id(devinfo, inst->src[i], j));
      }
   }

   if (inst->base_mrf != -1) {
      for (unsigned j = 0; j < inst->mlen; j++)
         mark_read_dependency(st, perf,
            reg_dependency_id(devinfo, elk_uvec_mrf(8, inst->base_mrf, 0), j));
   }

   /* Mark any destination dependencies. */
   if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
      for (unsigned j = 0; j < regs_written(inst); j++) {
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, inst->dst, j));
      }
   }

   if (inst->writes_accumulator_implicitly(devinfo)) {
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, elk_acc_reg(8), j));
   }

   if (inst->writes_flag(devinfo))
      mark_write_dependency(st, perf, EU_DEPENDENCY_ID_FLAG0);
}
1592
1593 /**
1594 * Calculate the maximum possible throughput of the program compatible with
1595 * the cycle-count utilization estimated for each asynchronous unit, in
1596 * threads-per-cycle units.
1597 */
1598 float
calculate_thread_throughput(const state & st,float busy)1599 calculate_thread_throughput(const state &st, float busy)
1600 {
1601 for (unsigned i = 0; i < EU_NUM_UNITS; i++)
1602 busy = MAX2(busy, st.unit_busy[i]);
1603
1604 return 1.0 / busy;
1605 }
1606
/**
 * Estimate the performance of the specified shader.
 *
 * Walks the CFG once, issuing every instruction through
 * \p issue_instruction while tracking a control-flow weight that scales
 * the contribution of loop bodies up and of post-discard code down.
 * Fills in per-block latencies, the total latency estimate and the
 * throughput estimate of \p p.
 */
void
calculate_performance(performance &p, const elk_backend_shader *s,
                      void (*issue_instruction)(
                         state &, const struct elk_isa_info *,
                         const elk_backend_instruction *),
                      unsigned dispatch_width)
{
   /* XXX - Note that the previous version of this code used worst-case
    *       scenario estimation of branching divergence for SIMD32 shaders,
    *       but this heuristic was removed to improve performance in common
    *       scenarios. Wider shader variants are less optimal when divergence
    *       is high, e.g. when application renders complex scene on a small
    *       surface. It is assumed that such renders are short, so their
    *       time doesn't matter and when it comes to the overall performance,
    *       they are dominated by more optimal larger renders.
    *
    *       It's possible that we could do better with divergence analysis
    *       by isolating branches which are 100% uniform.
    *
    *       Plumbing the trip counts from NIR loop analysis would allow us
    *       to do a better job regarding the loop weights.
    *
    *       In the meantime use values that roughly match the control flow
    *       weights used elsewhere in the compiler back-end.
    *
    *       Note that we provide slightly more pessimistic weights on
    *       Gfx12+ for SIMD32, since the effective warp size on that
    *       platform is 2x the SIMD width due to EU fusion, which increases
    *       the likelihood of divergent control flow in comparison to
    *       previous generations, giving narrower SIMD modes a performance
    *       advantage in several test-cases with non-uniform discard jumps.
    */
   const float discard_weight = (dispatch_width > 16 || s->devinfo->ver < 12 ?
                                 1.0 : 0.5);
   const float loop_weight = 10;
   unsigned halt_count = 0;
   unsigned elapsed = 0;
   state st;

   foreach_block(block, s->cfg) {
      const unsigned elapsed0 = elapsed;

      foreach_inst_in_block(elk_backend_instruction, inst, block) {
         /* Snapshot the front-end clock so this instruction's latency can
          * be attributed to the current block.
          */
         const unsigned clock0 = st.unit_ready[EU_UNIT_FE];

         issue_instruction(st, &s->compiler->isa, inst);

         /* Leaving the post-discard region: undo the discard weighting. */
         if (inst->opcode == ELK_SHADER_OPCODE_HALT_TARGET && halt_count)
            st.weight /= discard_weight;

         elapsed += (st.unit_ready[EU_UNIT_FE] - clock0) * st.weight;

         /* Scale the weight up inside loops and down after the first HALT
          * (discard), matching the control-flow weights used elsewhere.
          */
         if (inst->opcode == ELK_OPCODE_DO)
            st.weight *= loop_weight;
         else if (inst->opcode == ELK_OPCODE_WHILE)
            st.weight /= loop_weight;
         else if (inst->opcode == ELK_OPCODE_HALT && !halt_count++)
            st.weight *= discard_weight;
      }

      p.block_latency[block->num] = elapsed - elapsed0;
   }

   p.latency = elapsed;
   p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
}
1676 }
1677
/* Build a performance estimate for an FS program: one latency slot per CFG
 * block, populated by simulating each instruction with the FS issue model.
 */
elk::performance::performance(const elk_fs_visitor *v) :
   block_latency(new unsigned[v->cfg->num_blocks])
{
   calculate_performance(*this, v, issue_fs_inst, v->dispatch_width);
}
1683
/* Build a performance estimate for a vec4 program.  Vec4 dispatch is always
 * 8-wide, hence the hard-coded dispatch width.
 */
elk::performance::performance(const vec4_visitor *v) :
   block_latency(new unsigned[v->cfg->num_blocks])
{
   calculate_performance(*this, v, issue_vec4_instruction, 8);
}
1689
elk::performance::~performance()
{
   /* block_latency is a plain owned array allocated in the constructors. */
   delete[] block_latency;
}
1694