• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2020 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "elk_eu.h"
25 #include "elk_fs.h"
26 #include "elk_vec4.h"
27 #include "elk_cfg.h"
28 
29 using namespace elk;
30 
31 namespace {
32    /**
33     * Enumeration representing the various asynchronous units that can run
34     * computations in parallel on behalf of a shader thread.
35     */
36    enum intel_eu_unit {
37       /** EU front-end. */
38       EU_UNIT_FE,
39       /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
40       EU_UNIT_FPU,
41       /** Extended Math unit (AKA FPU1 on Gfx8-11, part of the EU on Gfx6+). */
42       EU_UNIT_EM,
43       /** Sampler shared function. */
44       EU_UNIT_SAMPLER,
45       /** Pixel Interpolator shared function. */
46       EU_UNIT_PI,
47       /** Unified Return Buffer shared function. */
48       EU_UNIT_URB,
49       /** Data Port Data Cache shared function. */
50       EU_UNIT_DP_DC,
51       /** Data Port Render Cache shared function. */
52       EU_UNIT_DP_RC,
53       /** Data Port Constant Cache shared function. */
54       EU_UNIT_DP_CC,
55       /** Message Gateway shared function. */
56       EU_UNIT_GATEWAY,
57       /** Thread Spawner shared function. */
58       EU_UNIT_SPAWNER,
59       /* EU_UNIT_VME, */
60       /* EU_UNIT_CRE, */
61       /** Number of asynchronous units currently tracked. */
62       EU_NUM_UNITS,
63       /** Dummy unit for instructions that don't consume runtime from the above. */
64       EU_UNIT_NULL = EU_NUM_UNITS
65    };
66 
67    /**
68     * Enumeration representing a computation result another computation can
69     * potentially depend on.
70     */
71    enum intel_eu_dependency_id {
72       /* Register part of the GRF. */
73       EU_DEPENDENCY_ID_GRF0 = 0,
74       /* Register part of the MRF.  Only used on Gfx4-6. */
75       EU_DEPENDENCY_ID_MRF0 = EU_DEPENDENCY_ID_GRF0 + XE2_MAX_GRF,
76       /* Address register part of the ARF. */
77       EU_DEPENDENCY_ID_ADDR0 = EU_DEPENDENCY_ID_MRF0 + 24,
78       /* Accumulator register part of the ARF. */
79       EU_DEPENDENCY_ID_ACCUM0 = EU_DEPENDENCY_ID_ADDR0 + 1,
80       /* Flag register part of the ARF. */
81       EU_DEPENDENCY_ID_FLAG0 = EU_DEPENDENCY_ID_ACCUM0 + 12,
82       /* SBID token write completion.  Only used on Gfx12+. */
83       EU_DEPENDENCY_ID_SBID_WR0 = EU_DEPENDENCY_ID_FLAG0 + 8,
84       /* SBID token read completion.  Only used on Gfx12+. */
85       EU_DEPENDENCY_ID_SBID_RD0 = EU_DEPENDENCY_ID_SBID_WR0 + 32,
86       /* Number of computation dependencies currently tracked. */
87       EU_NUM_DEPENDENCY_IDS = EU_DEPENDENCY_ID_SBID_RD0 + 32
88    };
89 
90    /**
91     * State of our modeling of the program execution.
92     */
93    struct state {
state__anon789f65800111::state94       state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
95       /**
96        * Time at which a given unit will be ready to execute the next
97        * computation, in clock units.
98        */
99       unsigned unit_ready[EU_NUM_UNITS];
100       /**
101        * Time at which an instruction dependent on a given dependency ID will
102        * be ready to execute, in clock units.
103        */
104       unsigned dep_ready[EU_NUM_DEPENDENCY_IDS];
105       /**
106        * Aggregated utilization of a given unit excluding idle cycles,
107        * in clock units.
108        */
109       float unit_busy[EU_NUM_UNITS];
110       /**
111        * Factor of the overhead of a computation accounted for in the
112        * aggregated utilization calculation.
113        */
114       float weight;
115    };
116 
117    /**
118     * Information derived from an IR instruction used to compute performance
119     * estimates.  Allows the timing calculation to work on both FS and VEC4
120     * instructions.
121     */
122    struct instruction_info {
instruction_info__anon789f65800111::instruction_info123       instruction_info(const struct elk_isa_info *isa, const elk_fs_inst *inst) :
124          isa(isa), devinfo(isa->devinfo), op(inst->opcode),
125          td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
126          tx(get_exec_type(inst)), sx(0), ss(0),
127          sc(elk_has_bank_conflict(isa, inst) ? sd : 0),
128          desc(inst->desc), sfid(inst->sfid)
129       {
130          /* We typically want the maximum source size, except for split send
131           * messages which require the total size.
132           */
133          if (inst->opcode == ELK_SHADER_OPCODE_SEND) {
134             ss = DIV_ROUND_UP(inst->size_read(2), REG_SIZE) +
135                  DIV_ROUND_UP(inst->size_read(3), REG_SIZE);
136          } else {
137             for (unsigned i = 0; i < inst->sources; i++)
138                ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
139          }
140 
141          /* Convert the execution size to GRF units. */
142          sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
143 
144          /* 32x32 integer multiplication has half the usual ALU throughput.
145           * Treat it as double-precision.
146           */
147          if ((inst->opcode == ELK_OPCODE_MUL || inst->opcode == ELK_OPCODE_MAD) &&
148              !elk_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
149              type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
150             tx = elk_int_type(8, tx == ELK_REGISTER_TYPE_D);
151 
152          rcount = inst->opcode == ELK_OPCODE_DPAS ? inst->rcount : 0;
153       }
154 
instruction_info__anon789f65800111::instruction_info155       instruction_info(const struct elk_isa_info *isa,
156                        const vec4_instruction *inst) :
157          isa(isa), devinfo(isa->devinfo), op(inst->opcode),
158          td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
159          tx(get_exec_type(inst)), sx(0), ss(0), sc(0),
160          desc(inst->desc), sfid(inst->sfid), rcount(0)
161       {
162          /* Compute the maximum source size. */
163          for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++)
164             ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
165 
166          /* Convert the execution size to GRF units. */
167          sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
168 
169          /* 32x32 integer multiplication has half the usual ALU throughput.
170           * Treat it as double-precision.
171           */
172          if ((inst->opcode == ELK_OPCODE_MUL || inst->opcode == ELK_OPCODE_MAD) &&
173              !elk_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
174              type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
175             tx = elk_int_type(8, tx == ELK_REGISTER_TYPE_D);
176       }
177 
178       /** ISA encoding information */
179       const struct elk_isa_info *isa;
180       /** Device information. */
181       const struct intel_device_info *devinfo;
182       /** Instruction opcode. */
183       elk_opcode op;
184       /** Destination type. */
185       elk_reg_type td;
186       /** Destination size in GRF units. */
187       unsigned sd;
188       /** Execution type. */
189       elk_reg_type tx;
190       /** Execution size in GRF units. */
191       unsigned sx;
192       /** Source size. */
193       unsigned ss;
194       /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
195       unsigned sc;
196       /** Send message descriptor. */
197       uint32_t desc;
198       /** Send message shared function ID. */
199       uint8_t sfid;
200       /** Repeat count for DPAS instructions. */
201       uint8_t rcount;
202    };
203 
204    /**
205     * Timing information of an instruction used to estimate the performance of
206     * the program.
207     */
208    struct perf_desc {
perf_desc__anon789f65800111::perf_desc209       perf_desc(enum intel_eu_unit u, int df, int db,
210                 int ls, int ld, int la, int lf) :
211          u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}
212 
213       /**
214        * Back-end unit its runtime shall be accounted to, in addition to the
215        * EU front-end which is always assumed to be involved.
216        */
217       enum intel_eu_unit u;
218       /**
219        * Overhead cycles from the time that the EU front-end starts executing
220        * the instruction until it's ready to execute the next instruction.
221        */
222       int df;
223       /**
224        * Overhead cycles from the time that the back-end starts executing the
225        * instruction until it's ready to execute the next instruction.
226        */
227       int db;
228       /**
229        * Latency cycles from the time that the back-end starts executing the
230        * instruction until its sources have been read from the register file.
231        */
232       int ls;
233       /**
234        * Latency cycles from the time that the back-end starts executing the
235        * instruction until its regular destination has been written to the
236        * register file.
237        */
238       int ld;
239       /**
240        * Latency cycles from the time that the back-end starts executing the
241        * instruction until its accumulator destination has been written to the
242        * ARF file.
243        *
244        * Note that this is an approximation of the real behavior of
245        * accumulating instructions in the hardware: Instead of modeling a pair
246        * of back-to-back accumulating instructions as a first computation with
247        * latency equal to ld followed by another computation with a
248        * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
249        * model the stall as if it occurred at the top of the pipeline, with
250        * the latency of the accumulator computation offset accordingly.
251        */
252       int la;
253       /**
254        * Latency cycles from the time that the back-end starts executing the
255        * instruction until its flag destination has been written to the ARF
256        * file.
257        */
258       int lf;
259    };
260 
261    /**
262     * Compute the timing information of an instruction based on any relevant
263     * information from the IR and a number of parameters specifying a linear
264     * approximation: Parameter X_Y specifies the derivative of timing X
265     * relative to info field Y, while X_1 specifies the independent term of
266     * the approximation of timing X.
267     */
268    perf_desc
calculate_desc(const instruction_info & info,enum intel_eu_unit u,int df_1,int df_sd,int df_sc,int db_1,int db_sx,int ls_1,int ld_1,int la_1,int lf_1,int l_ss,int l_sd)269    calculate_desc(const instruction_info &info, enum intel_eu_unit u,
270                   int df_1, int df_sd, int df_sc,
271                   int db_1, int db_sx,
272                   int ls_1, int ld_1, int la_1, int lf_1,
273                   int l_ss, int l_sd)
274    {
275       return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
276                           db_1 + db_sx * int(info.sx),
277                           ls_1 + l_ss * int(info.ss),
278                           ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
279                           la_1, lf_1);
280    }
281 
282    /**
283     * Compute the timing information of an instruction based on any relevant
284     * information from the IR and a number of linear approximation parameters
285     * hard-coded for each IR instruction.
286     *
287     * Most timing parameters are obtained from the multivariate linear
288     * regression of a sample of empirical timings measured using the tm0
289     * register (as can be done today by using the shader_time debugging
290     * option).  The Gfx4-5 math timings are obtained from BSpec Volume 5c.3
291     * "Shared Functions - Extended Math", Section 3.2 "Performance".
292     * Parameters marked XXX shall be considered low-quality, they're possibly
293     * high variance or completely guessed in cases where experimental data was
294     * unavailable.
295     */
296    const perf_desc
instruction_desc(const instruction_info & info)297    instruction_desc(const instruction_info &info)
298    {
299       const struct intel_device_info *devinfo = info.devinfo;
300 
301       switch (info.op) {
302       case ELK_OPCODE_SYNC:
303       case ELK_OPCODE_SEL:
304       case ELK_OPCODE_NOT:
305       case ELK_OPCODE_AND:
306       case ELK_OPCODE_OR:
307       case ELK_OPCODE_XOR:
308       case ELK_OPCODE_SHR:
309       case ELK_OPCODE_SHL:
310       case ELK_OPCODE_DIM:
311       case ELK_OPCODE_ASR:
312       case ELK_OPCODE_CMPN:
313       case ELK_OPCODE_F16TO32:
314       case ELK_OPCODE_BFREV:
315       case ELK_OPCODE_BFI1:
316       case ELK_OPCODE_AVG:
317       case ELK_OPCODE_FRC:
318       case ELK_OPCODE_RNDU:
319       case ELK_OPCODE_RNDD:
320       case ELK_OPCODE_RNDE:
321       case ELK_OPCODE_RNDZ:
322       case ELK_OPCODE_MAC:
323       case ELK_OPCODE_MACH:
324       case ELK_OPCODE_LZD:
325       case ELK_OPCODE_FBH:
326       case ELK_OPCODE_FBL:
327       case ELK_OPCODE_CBIT:
328       case ELK_OPCODE_ADDC:
329       case ELK_OPCODE_ROR:
330       case ELK_OPCODE_ROL:
331       case ELK_OPCODE_SUBB:
332       case ELK_OPCODE_SAD2:
333       case ELK_OPCODE_SADA2:
334       case ELK_OPCODE_LINE:
335       case ELK_OPCODE_NOP:
336       case ELK_SHADER_OPCODE_CLUSTER_BROADCAST:
337       case ELK_SHADER_OPCODE_SCRATCH_HEADER:
338       case ELK_FS_OPCODE_DDX_COARSE:
339       case ELK_FS_OPCODE_DDX_FINE:
340       case ELK_FS_OPCODE_DDY_COARSE:
341       case ELK_FS_OPCODE_PIXEL_X:
342       case ELK_FS_OPCODE_PIXEL_Y:
343       case ELK_FS_OPCODE_SET_SAMPLE_ID:
344       case ELK_VEC4_OPCODE_MOV_BYTES:
345       case ELK_VEC4_OPCODE_UNPACK_UNIFORM:
346       case ELK_VEC4_OPCODE_DOUBLE_TO_F32:
347       case ELK_VEC4_OPCODE_DOUBLE_TO_D32:
348       case ELK_VEC4_OPCODE_DOUBLE_TO_U32:
349       case ELK_VEC4_OPCODE_TO_DOUBLE:
350       case ELK_VEC4_OPCODE_PICK_LOW_32BIT:
351       case ELK_VEC4_OPCODE_PICK_HIGH_32BIT:
352       case ELK_VEC4_OPCODE_SET_LOW_32BIT:
353       case ELK_VEC4_OPCODE_SET_HIGH_32BIT:
354       case ELK_VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
355       case ELK_GS_OPCODE_SET_DWORD_2:
356       case ELK_GS_OPCODE_SET_WRITE_OFFSET:
357       case ELK_GS_OPCODE_SET_VERTEX_COUNT:
358       case ELK_GS_OPCODE_PREPARE_CHANNEL_MASKS:
359       case ELK_GS_OPCODE_SET_CHANNEL_MASKS:
360       case ELK_GS_OPCODE_GET_INSTANCE_ID:
361       case ELK_GS_OPCODE_SET_PRIMITIVE_ID:
362       case ELK_GS_OPCODE_SVB_SET_DST_INDEX:
363       case ELK_TCS_OPCODE_SRC0_010_IS_ZERO:
364       case ELK_TCS_OPCODE_GET_PRIMITIVE_ID:
365       case ELK_TES_OPCODE_GET_PRIMITIVE_ID:
366       case ELK_SHADER_OPCODE_READ_SR_REG:
367          if (devinfo->ver >= 11) {
368             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
369                                   0, 10, 6 /* XXX */, 14, 0, 0);
370          } else if (devinfo->ver >= 8) {
371             if (type_sz(info.tx) > 4)
372                return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
373                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
374             else
375                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
376                                      0, 8, 4, 12, 0, 0);
377          } else if (devinfo->verx10 >= 75) {
378             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
379                                   0, 10, 6 /* XXX */, 16, 0, 0);
380          } else {
381             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
382                                   0, 12, 8 /* XXX */, 18, 0, 0);
383          }
384 
385       case ELK_OPCODE_MOV:
386       case ELK_OPCODE_CMP:
387       case ELK_OPCODE_ADD:
388       case ELK_OPCODE_ADD3:
389       case ELK_OPCODE_MUL:
390       case ELK_SHADER_OPCODE_MOV_RELOC_IMM:
391       case ELK_VEC4_OPCODE_MOV_FOR_SCRATCH:
392          if (devinfo->ver >= 11) {
393             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
394                                   0, 10, 6, 14, 0, 0);
395          } else if (devinfo->ver >= 8) {
396             if (type_sz(info.tx) > 4)
397                return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
398                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
399             else
400                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
401                                      0, 8, 4, 12, 0, 0);
402          } else if (devinfo->verx10 >= 75) {
403             if (info.tx == ELK_REGISTER_TYPE_F)
404                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
405                                      0, 12, 8 /* XXX */, 18, 0, 0);
406             else
407                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
408                                      0, 10, 6 /* XXX */, 16, 0, 0);
409          } else if (devinfo->ver >= 7) {
410             if (info.tx == ELK_REGISTER_TYPE_F)
411                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
412                                      0, 14, 10 /* XXX */, 20, 0, 0);
413             else
414                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
415                                      0, 12, 8 /* XXX */, 18, 0, 0);
416          } else {
417             return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
418                                   0, 2 /* XXX */,
419                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
420                                   0, 0);
421          }
422 
423       case ELK_OPCODE_BFE:
424       case ELK_OPCODE_BFI2:
425       case ELK_OPCODE_CSEL:
426          if (devinfo->ver >= 11)
427             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
428                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
429          else if (devinfo->ver >= 8)
430             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
431                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
432          else if (devinfo->verx10 >= 75)
433             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
434                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
435          else if (devinfo->ver >= 7)
436             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
437                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
438          else
439             abort();
440 
441       case ELK_OPCODE_MAD:
442          if (devinfo->ver >= 11) {
443             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
444                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
445          } else if (devinfo->ver >= 8) {
446             if (type_sz(info.tx) > 4)
447                return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
448                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
449             else
450                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
451                                      0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
452          } else if (devinfo->verx10 >= 75) {
453             if (info.tx == ELK_REGISTER_TYPE_F)
454                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
455                                      0, 12, 8 /* XXX */, 18, 0, 0);
456             else
457                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
458                                      0, 10, 6 /* XXX */, 16, 0, 0);
459          } else if (devinfo->ver >= 7) {
460             if (info.tx == ELK_REGISTER_TYPE_F)
461                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
462                                      0, 14, 10 /* XXX */, 20, 0, 0);
463             else
464                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
465                                      0, 12, 8 /* XXX */, 18, 0, 0);
466          } else if (devinfo->ver >= 6) {
467             return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 1 /* XXX */,
468                                   0, 2 /* XXX */,
469                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
470                                   0, 0);
471          } else {
472             abort();
473          }
474 
475       case ELK_OPCODE_F32TO16:
476          if (devinfo->ver >= 11)
477             return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
478                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
479          else if (devinfo->ver >= 8)
480             return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
481                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
482          else if (devinfo->verx10 >= 75)
483             return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
484                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
485          else if (devinfo->ver >= 7)
486             return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
487                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
488          else
489             abort();
490 
491       case ELK_OPCODE_DP4:
492       case ELK_OPCODE_DPH:
493       case ELK_OPCODE_DP3:
494       case ELK_OPCODE_DP2:
495          if (devinfo->ver >= 8)
496             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
497                                   0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
498          else if (devinfo->verx10 >= 75)
499             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
500                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
501          else
502             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
503                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
504 
505       case ELK_OPCODE_DP4A:
506          if (devinfo->ver >= 12)
507             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
508                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
509          else
510             abort();
511 
512       case ELK_OPCODE_DPAS: {
513          unsigned ld;
514 
515          switch (info.rcount) {
516          case 1:
517             ld = 21;
518             break;
519          case 2:
520             ld = 22;
521             break;
522          case 8:
523          default:
524             ld = 32;
525             break;
526          }
527 
528          /* DPAS cannot write the accumulator or the flags, so pass UINT_MAX
529           * for la and lf.
530           */
531          if (devinfo->verx10 >= 125)
532             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
533                                   0, ld, UINT_MAX, UINT_MAX, 0, 0);
534          else
535             abort();
536       }
537 
538       case ELK_SHADER_OPCODE_RCP:
539       case ELK_SHADER_OPCODE_RSQ:
540       case ELK_SHADER_OPCODE_SQRT:
541       case ELK_SHADER_OPCODE_EXP2:
542       case ELK_SHADER_OPCODE_LOG2:
543       case ELK_SHADER_OPCODE_SIN:
544       case ELK_SHADER_OPCODE_COS:
545       case ELK_SHADER_OPCODE_POW:
546       case ELK_SHADER_OPCODE_INT_QUOTIENT:
547       case ELK_SHADER_OPCODE_INT_REMAINDER:
548          if (devinfo->ver >= 6) {
549             switch (info.op) {
550             case ELK_SHADER_OPCODE_RCP:
551             case ELK_SHADER_OPCODE_RSQ:
552             case ELK_SHADER_OPCODE_SQRT:
553             case ELK_SHADER_OPCODE_EXP2:
554             case ELK_SHADER_OPCODE_LOG2:
555             case ELK_SHADER_OPCODE_SIN:
556             case ELK_SHADER_OPCODE_COS:
557                if (devinfo->ver >= 8)
558                   return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 4,
559                                         0, 16, 0, 0, 0, 0);
560                else if (devinfo->verx10 >= 75)
561                   return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 2,
562                                         0, 12, 0, 0, 0, 0);
563                else
564                   return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 2,
565                                         0, 14, 0, 0, 0, 0);
566 
567             case ELK_SHADER_OPCODE_POW:
568                if (devinfo->ver >= 8)
569                   return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 8,
570                                         0, 24, 0, 0, 0, 0);
571                else if (devinfo->verx10 >= 75)
572                   return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 4,
573                                         0, 20, 0, 0, 0, 0);
574                else
575                   return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 4,
576                                         0, 22, 0, 0, 0, 0);
577 
578             case ELK_SHADER_OPCODE_INT_QUOTIENT:
579             case ELK_SHADER_OPCODE_INT_REMAINDER:
580                return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 26, 0,
581                                      0, 28 /* XXX */, 0, 0, 0, 0);
582 
583             default:
584                abort();
585             }
586          } else {
587             switch (info.op) {
588             case ELK_SHADER_OPCODE_RCP:
589                return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 8,
590                                      0, 22, 0, 0, 0, 8);
591 
592             case ELK_SHADER_OPCODE_RSQ:
593                return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 16,
594                                      0, 44, 0, 0, 0, 8);
595 
596             case ELK_SHADER_OPCODE_INT_QUOTIENT:
597             case ELK_SHADER_OPCODE_SQRT:
598             case ELK_SHADER_OPCODE_LOG2:
599                return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 24,
600                                      0, 66, 0, 0, 0, 8);
601 
602             case ELK_SHADER_OPCODE_INT_REMAINDER:
603             case ELK_SHADER_OPCODE_EXP2:
604                return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 32,
605                                      0, 88, 0, 0, 0, 8);
606 
607             case ELK_SHADER_OPCODE_SIN:
608             case ELK_SHADER_OPCODE_COS:
609                return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 48,
610                                      0, 132, 0, 0, 0, 8);
611 
612             case ELK_SHADER_OPCODE_POW:
613                return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 64,
614                                      0, 176, 0, 0, 0, 8);
615 
616             default:
617                abort();
618             }
619          }
620 
621       case ELK_OPCODE_DO:
622          if (devinfo->ver >= 6)
623             return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
624                                   0, 0, 0, 0, 0, 0);
625          else
626             return calculate_desc(info, EU_UNIT_NULL, 2 /* XXX */, 0, 0, 0, 0,
627                                   0, 0, 0, 0, 0, 0);
628 
629       case ELK_OPCODE_IF:
630       case ELK_OPCODE_ELSE:
631       case ELK_OPCODE_ENDIF:
632       case ELK_OPCODE_WHILE:
633       case ELK_OPCODE_BREAK:
634       case ELK_OPCODE_CONTINUE:
635       case ELK_OPCODE_HALT:
636          if (devinfo->ver >= 8)
637             return calculate_desc(info, EU_UNIT_NULL, 8, 0, 0, 0, 0,
638                                   0, 0, 0, 0, 0, 0);
639          else if (devinfo->verx10 >= 75)
640             return calculate_desc(info, EU_UNIT_NULL, 6, 0, 0, 0, 0,
641                                   0, 0, 0, 0, 0, 0);
642          else
643             return calculate_desc(info, EU_UNIT_NULL, 2, 0, 0, 0, 0,
644                                   0, 0, 0, 0, 0, 0);
645 
646       case ELK_FS_OPCODE_LINTERP:
647          if (devinfo->ver >= 8)
648             return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
649                                   0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
650          else if (devinfo->verx10 >= 75)
651             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
652                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
653          else
654             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
655                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
656 
657       case ELK_OPCODE_LRP:
658          if (devinfo->ver >= 8)
659             return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
660                                   0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
661          else if (devinfo->verx10 >= 75)
662             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
663                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
664          else if (devinfo->ver >= 6)
665             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
666                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
667          else
668             abort();
669 
670       case ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT:
671          if (devinfo->ver >= 11)
672             return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
673                                   0, 10 /* XXX */, 6 /* XXX */,
674                                   14 /* XXX */, 0, 0);
675          else if (devinfo->ver >= 8)
676             return calculate_desc(info, EU_UNIT_FPU, 16, 6, 0, 0, 6,
677                                   0, 8 /* XXX */, 4 /* XXX */,
678                                   12 /* XXX */, 0, 0);
679          else if (devinfo->verx10 >= 75)
680             return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
681                                   0, 10 /* XXX */, 6 /* XXX */,
682                                   16 /* XXX */, 0, 0);
683          else if (devinfo->ver >= 7)
684             return calculate_desc(info, EU_UNIT_FPU, 24, 6, 0, 0, 6,
685                                   0, 12 /* XXX */, 8 /* XXX */,
686                                   18 /* XXX */, 0, 0);
687          else
688             abort();
689 
690       case ELK_SHADER_OPCODE_MOV_INDIRECT:
691          if (devinfo->ver >= 11)
692             return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
693                                   0, 10 /* XXX */, 6 /* XXX */,
694                                   14 /* XXX */, 0, 0);
695          else if (devinfo->ver >= 8)
696             return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
697                                   0, 8 /* XXX */, 4 /* XXX */,
698                                   12 /* XXX */, 0, 0);
699          else if (devinfo->verx10 >= 75)
700             return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
701                                   0, 10 /* XXX */, 6 /* XXX */,
702                                   16 /* XXX */, 0, 0);
703          else
704             return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
705                                   0, 12 /* XXX */, 8 /* XXX */,
706                                   18 /* XXX */, 0, 0);
707 
708       case ELK_SHADER_OPCODE_BROADCAST:
709          if (devinfo->ver >= 11)
710             return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0, 4, 0,
711                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
712          else if (devinfo->ver >= 8)
713             return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
714                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
715          else if (devinfo->verx10 >= 75)
716             return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
717                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
718          else if (devinfo->ver >= 7)
719             return calculate_desc(info, EU_UNIT_FPU, 20, 0, 0, 4, 0,
720                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
721          else
722             abort();
723 
724       case ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL:
725       case ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
726          if (devinfo->ver >= 11)
727             return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
728                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
729          else if (devinfo->ver >= 8)
730             return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
731                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
732          else if (devinfo->verx10 >= 75)
733             return calculate_desc(info, EU_UNIT_FPU, 36, 0, 0, 6, 0,
734                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
735          else if (devinfo->ver >= 7)
736             return calculate_desc(info, EU_UNIT_FPU, 40, 0, 0, 6, 0,
737                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
738          else
739             abort();
740 
741       case ELK_SHADER_OPCODE_RND_MODE:
742       case ELK_SHADER_OPCODE_FLOAT_CONTROL_MODE:
743          if (devinfo->ver >= 11)
744             return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0,
745                                   4 /* XXX */, 0,
746                                   0, 0, 0, 0, 0, 0);
747          else if (devinfo->ver >= 8)
748             return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0,
749                                   4 /* XXX */, 0,
750                                   0, 0, 0, 0, 0, 0);
751          else if (devinfo->verx10 >= 75)
752             return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0,
753                                   4 /* XXX */, 0,
754                                   0, 0, 0, 0, 0, 0);
755          else if (devinfo->ver >= 6)
756             return calculate_desc(info, EU_UNIT_FPU, 28 /* XXX */, 0, 0,
757                                   4 /* XXX */, 0,
758                                   0, 0, 0, 0, 0, 0);
759          else
760             abort();
761 
762       case ELK_SHADER_OPCODE_SHUFFLE:
763          if (devinfo->ver >= 11)
764             return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0,
765                                   44 /* XXX */, 0,
766                                   0, 10 /* XXX */, 6 /* XXX */,
767                                   14 /* XXX */, 0, 0);
768          else if (devinfo->ver >= 8)
769             return calculate_desc(info, EU_UNIT_FPU, 42 /* XXX */, 0, 0,
770                                   42 /* XXX */, 0,
771                                   0, 8 /* XXX */, 4 /* XXX */,
772                                   12 /* XXX */, 0, 0);
773          else if (devinfo->verx10 >= 75)
774             return calculate_desc(info, EU_UNIT_FPU, 0, 44 /* XXX */, 0,
775                                   0, 44 /* XXX */,
776                                   0, 10 /* XXX */, 6 /* XXX */,
777                                   16 /* XXX */, 0, 0);
778          else if (devinfo->ver >= 6)
779             return calculate_desc(info, EU_UNIT_FPU, 0, 46 /* XXX */, 0,
780                                   0, 46 /* XXX */,
781                                   0, 12 /* XXX */, 8 /* XXX */,
782                                   18 /* XXX */, 0, 0);
783          else
784             abort();
785 
786       case ELK_SHADER_OPCODE_SEL_EXEC:
787          if (devinfo->ver >= 11)
788             return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0,
789                                   0, 4 /* XXX */,
790                                   0, 10 /* XXX */, 6 /* XXX */,
791                                   14 /* XXX */, 0, 0);
792          else if (devinfo->ver >= 8)
793             return calculate_desc(info, EU_UNIT_FPU, 8 /* XXX */, 4 /* XXX */, 0,
794                                   0, 4 /* XXX */,
795                                   0, 8 /* XXX */, 4 /* XXX */,
796                                   12 /* XXX */, 0, 0);
797          else if (devinfo->verx10 >= 75)
798             return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0,
799                                   0, 4 /* XXX */,
800                                   0, 10 /* XXX */, 6 /* XXX */,
801                                   16 /* XXX */, 0, 0);
802          else
803             return calculate_desc(info, EU_UNIT_FPU, 12 /* XXX */, 4 /* XXX */, 0,
804                                   0, 4 /* XXX */,
805                                   0, 12 /* XXX */, 8 /* XXX */,
806                                   18 /* XXX */, 0, 0);
807 
808       case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
809          if (devinfo->ver >= 11)
810             return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
811                                   0, 8 /* XXX */,
812                                   0, 10 /* XXX */, 6 /* XXX */,
813                                   14 /* XXX */, 0, 0);
814          else if (devinfo->ver >= 8)
815             return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
816                                   0, 8 /* XXX */,
817                                   0, 8 /* XXX */, 4 /* XXX */,
818                                   12 /* XXX */, 0, 0);
819          else if (devinfo->verx10 >= 75)
820             return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
821                                   0, 8 /* XXX */,
822                                   0, 10 /* XXX */, 6 /* XXX */,
823                                   16 /* XXX */, 0, 0);
824          else
825             return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
826                                   0, 8 /* XXX */,
827                                   0, 12 /* XXX */, 8 /* XXX */,
828                                   18 /* XXX */, 0, 0);
829 
830       case ELK_FS_OPCODE_DDY_FINE:
831          if (devinfo->ver >= 11)
832             return calculate_desc(info, EU_UNIT_FPU, 0, 14, 0, 0, 4,
833                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
834          else if (devinfo->ver >= 8)
835             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
836                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
837          else if (devinfo->verx10 >= 75)
838             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
839                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
840          else
841             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
842                                   0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0);
843 
844       case ELK_FS_OPCODE_LOAD_LIVE_CHANNELS:
845          if (devinfo->ver >= 11)
846             return calculate_desc(info, EU_UNIT_FPU, 2 /* XXX */, 0, 0,
847                                   2 /* XXX */, 0,
848                                   0, 0, 0, 10 /* XXX */, 0, 0);
849          else if (devinfo->ver >= 8)
850             return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
851                                   0, 2 /* XXX */,
852                                   0, 0, 0, 8 /* XXX */, 0, 0);
853          else
854             abort();
855 
856       case ELK_VEC4_OPCODE_PACK_BYTES:
857          if (devinfo->ver >= 8)
858             return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
859                                   4 /* XXX */, 0,
860                                   0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
861                                   0, 0);
862          else if (devinfo->verx10 >= 75)
863             return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
864                                   4 /* XXX */, 0,
865                                   0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
866                                   0, 0);
867          else
868             return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
869                                   4 /* XXX */, 0,
870                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
871                                   0, 0);
872 
873       case ELK_VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
874       case ELK_TCS_OPCODE_GET_INSTANCE_ID:
875       case ELK_VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS:
876       case ELK_VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
877       case ELK_TES_OPCODE_CREATE_INPUT_READ_HEADER:
878          if (devinfo->ver >= 8)
879             return calculate_desc(info, EU_UNIT_FPU, 22 /* XXX */, 0, 0,
880                                   6 /* XXX */, 0,
881                                   0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
882                                   0, 0);
883          else if (devinfo->verx10 >= 75)
884             return calculate_desc(info, EU_UNIT_FPU, 26 /* XXX */, 0, 0,
885                                   6 /* XXX */, 0,
886                                   0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
887                                   0, 0);
888          else
889             return calculate_desc(info, EU_UNIT_FPU, 30 /* XXX */, 0, 0,
890                                   6 /* XXX */, 0,
891                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
892                                   0, 0);
893 
894       case ELK_GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
895       case ELK_TCS_OPCODE_CREATE_BARRIER_HEADER:
896          if (devinfo->ver >= 8)
897             return calculate_desc(info, EU_UNIT_FPU, 32 /* XXX */, 0, 0,
898                                   8 /* XXX */, 0,
899                                   0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
900                                   0, 0);
901          else if (devinfo->verx10 >= 75)
902             return calculate_desc(info, EU_UNIT_FPU, 38 /* XXX */, 0, 0,
903                                   8 /* XXX */, 0,
904                                   0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
905                                   0, 0);
906          else if (devinfo->ver >= 6)
907             return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0,
908                                   8 /* XXX */, 0,
909                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
910                                   0, 0);
911          else
912             abort();
913 
914       case ELK_TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
915          if (devinfo->ver >= 8)
916             return calculate_desc(info, EU_UNIT_FPU, 12 /* XXX */, 0, 0,
917                                   4 /* XXX */, 0,
918                                   0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
919                                   0, 0);
920          else if (devinfo->verx10 >= 75)
921             return calculate_desc(info, EU_UNIT_FPU, 14 /* XXX */, 0, 0,
922                                   4 /* XXX */, 0,
923                                   0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
924                                   0, 0);
925          else if (devinfo->ver >= 7)
926             return calculate_desc(info, EU_UNIT_FPU, 16 /* XXX */, 0, 0,
927                                   4 /* XXX */, 0,
928                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
929                                   0, 0);
930          else
931             abort();
932 
933       case ELK_SHADER_OPCODE_TEX:
934       case ELK_FS_OPCODE_TXB:
935       case ELK_SHADER_OPCODE_TXD:
936       case ELK_SHADER_OPCODE_TXF:
937       case ELK_SHADER_OPCODE_TXF_LZ:
938       case ELK_SHADER_OPCODE_TXL:
939       case ELK_SHADER_OPCODE_TXL_LZ:
940       case ELK_SHADER_OPCODE_TXF_CMS:
941       case ELK_SHADER_OPCODE_TXF_CMS_W:
942       case ELK_SHADER_OPCODE_TXF_UMS:
943       case ELK_SHADER_OPCODE_TXF_MCS:
944       case ELK_SHADER_OPCODE_TXS:
945       case ELK_SHADER_OPCODE_LOD:
946       case ELK_SHADER_OPCODE_GET_BUFFER_SIZE:
947       case ELK_SHADER_OPCODE_TG4:
948       case ELK_SHADER_OPCODE_TG4_OFFSET:
949       case ELK_SHADER_OPCODE_SAMPLEINFO:
950       case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
951          return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16 /* XXX */,
952                                8 /* XXX */, 750 /* XXX */, 0, 0,
953                                2 /* XXX */, 0);
954 
955       case ELK_VEC4_OPCODE_URB_READ:
956       case ELK_VEC4_VS_OPCODE_URB_WRITE:
957       case ELK_VEC4_GS_OPCODE_URB_WRITE:
958       case ELK_VEC4_GS_OPCODE_URB_WRITE_ALLOCATE:
959       case ELK_GS_OPCODE_THREAD_END:
960       case ELK_GS_OPCODE_FF_SYNC:
961       case ELK_VEC4_TCS_OPCODE_URB_WRITE:
962       case ELK_TCS_OPCODE_RELEASE_INPUT:
963       case ELK_TCS_OPCODE_THREAD_END:
964          return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
965                                32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
966 
967       case ELK_SHADER_OPCODE_MEMORY_FENCE:
968       case ELK_SHADER_OPCODE_INTERLOCK:
969          switch (info.sfid) {
970          case GFX6_SFID_DATAPORT_RENDER_CACHE:
971             if (devinfo->ver >= 7)
972                return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 30 /* XXX */, 0,
973                                      10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
974             else
975                abort();
976 
977          case ELK_SFID_URB:
978          case GFX7_SFID_DATAPORT_DATA_CACHE:
979          case GFX12_SFID_SLM:
980          case GFX12_SFID_TGM:
981          case GFX12_SFID_UGM:
982          case HSW_SFID_DATAPORT_DATA_CACHE_1:
983             if (devinfo->ver >= 7)
984                return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 30 /* XXX */, 0,
985                                      10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
986             else
987                abort();
988 
989          default:
990             abort();
991          }
992 
993       case ELK_SHADER_OPCODE_GFX4_SCRATCH_READ:
994       case ELK_SHADER_OPCODE_GFX4_SCRATCH_WRITE:
995       case ELK_SHADER_OPCODE_GFX7_SCRATCH_READ:
996          return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 0, 8 /* XXX */,
997                                10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
998 
999       case ELK_VEC4_OPCODE_UNTYPED_ATOMIC:
1000          if (devinfo->ver >= 7)
1001             return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1002                                   30 /* XXX */, 400 /* XXX */,
1003                                   10 /* XXX */, 100 /* XXX */, 0, 0,
1004                                   0, 400 /* XXX */);
1005          else
1006             abort();
1007 
1008       case ELK_VEC4_OPCODE_UNTYPED_SURFACE_READ:
1009       case ELK_VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
1010          if (devinfo->ver >= 7)
1011             return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1012                                   0, 20 /* XXX */,
1013                                   10 /* XXX */, 100 /* XXX */, 0, 0,
1014                                   0, 0);
1015          else
1016             abort();
1017 
1018       case ELK_FS_OPCODE_FB_WRITE:
1019       case ELK_FS_OPCODE_FB_READ:
1020       case ELK_FS_OPCODE_REP_FB_WRITE:
1021          return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 0, 450 /* XXX */,
1022                                10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
1023 
1024       case ELK_GS_OPCODE_SVB_WRITE:
1025          if (devinfo->ver >= 6)
1026             return calculate_desc(info, EU_UNIT_DP_RC, 2 /* XXX */, 0, 0,
1027                                   0, 450 /* XXX */,
1028                                   10 /* XXX */, 300 /* XXX */, 0, 0,
1029                                   0, 0);
1030          else
1031             abort();
1032 
1033       case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1034          return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
1035                                10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
1036 
1037       case ELK_VS_OPCODE_PULL_CONSTANT_LOAD:
1038       case ELK_VS_OPCODE_PULL_CONSTANT_LOAD_GFX7:
1039          return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16,
1040                                8, 750, 0, 0, 2, 0);
1041 
1042       case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1043       case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1044       case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1045          if (devinfo->ver >= 7)
1046             return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
1047                                   0, 90 /* XXX */, 0, 0, 0, 0);
1048          else
1049             abort();
1050 
1051       case ELK_SHADER_OPCODE_BARRIER:
1052          if (devinfo->ver >= 7)
1053             return calculate_desc(info, EU_UNIT_GATEWAY, 90 /* XXX */, 0, 0,
1054                                   0 /* XXX */, 0,
1055                                   0, 0, 0, 0, 0, 0);
1056          else
1057             abort();
1058 
1059       case ELK_CS_OPCODE_CS_TERMINATE:
1060          if (devinfo->ver >= 7)
1061             return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0,
1062                                   10 /* XXX */, 0, 0, 0, 0, 0);
1063          else
1064             abort();
1065 
1066       case ELK_SHADER_OPCODE_SEND:
1067          switch (info.sfid) {
1068          case GFX6_SFID_DATAPORT_CONSTANT_CACHE:
1069             if (devinfo->ver >= 7) {
1070                /* See ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */
1071                return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
1072                                      10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
1073             } else {
1074                abort();
1075             }
1076          case GFX6_SFID_DATAPORT_RENDER_CACHE:
1077             if (devinfo->ver >= 7) {
1078                switch (elk_dp_desc_msg_type(devinfo, info.desc)) {
1079                case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP:
1080                   return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
1081                                         30 /* XXX */, 450 /* XXX */,
1082                                         10 /* XXX */, 100 /* XXX */,
1083                                         0, 0, 0, 400 /* XXX */);
1084                default:
1085                   return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
1086                                         0, 450 /* XXX */,
1087                                         10 /* XXX */, 300 /* XXX */, 0, 0,
1088                                         0, 0);
1089                }
1090             } else if (devinfo->ver >= 6)  {
1091                return calculate_desc(info, EU_UNIT_DP_RC, 2 /* XXX */, 0, 0,
1092                                      0, 450 /* XXX */,
1093                                      10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
1094             } else {
1095                abort();
1096             }
1097          case ELK_SFID_SAMPLER: {
1098             if (devinfo->ver >= 6)
1099                return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16,
1100                                      8, 750, 0, 0, 2, 0);
1101             else
1102                abort();
1103          }
1104          case GFX7_SFID_DATAPORT_DATA_CACHE:
1105          case HSW_SFID_DATAPORT_DATA_CACHE_1:
1106             if (devinfo->verx10 >= 75) {
1107                switch (elk_dp_desc_msg_type(devinfo, info.desc)) {
1108                case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
1109                case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
1110                case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
1111                case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
1112                   return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1113                                         30 /* XXX */, 400 /* XXX */,
1114                                         10 /* XXX */, 100 /* XXX */, 0, 0,
1115                                         0, 400 /* XXX */);
1116 
1117                default:
1118                   return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1119                                         0, 20 /* XXX */,
1120                                         10 /* XXX */, 100 /* XXX */, 0, 0,
1121                                         0, 0);
1122                }
1123             } else if (devinfo->ver >= 7) {
1124                switch (elk_dp_desc_msg_type(devinfo, info.desc)) {
1125                case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
1126                   return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1127                                         30 /* XXX */, 400 /* XXX */,
1128                                         10 /* XXX */, 100 /* XXX */,
1129                                         0, 0, 0, 400 /* XXX */);
1130                default:
1131                   return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1132                                         0, 20 /* XXX */,
1133                                         10 /* XXX */, 100 /* XXX */, 0, 0,
1134                                         0, 0);
1135                }
1136             } else {
1137                abort();
1138             }
1139 
1140          case GFX7_SFID_PIXEL_INTERPOLATOR:
1141             if (devinfo->ver >= 7)
1142                return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
1143                                      0, 90 /* XXX */, 0, 0, 0, 0);
1144             else
1145                abort();
1146 
1147          case GFX12_SFID_UGM:
1148          case GFX12_SFID_TGM:
1149          case GFX12_SFID_SLM:
1150             switch (lsc_msg_desc_opcode(devinfo, info.desc)) {
1151             case LSC_OP_LOAD:
1152             case LSC_OP_STORE:
1153             case LSC_OP_LOAD_CMASK:
1154             case LSC_OP_STORE_CMASK:
1155                return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1156                                      0, 20 /* XXX */,
1157                                      10 /* XXX */, 100 /* XXX */, 0, 0,
1158                                      0, 0);
1159 
1160             case LSC_OP_FENCE:
1161             case LSC_OP_ATOMIC_INC:
1162             case LSC_OP_ATOMIC_DEC:
1163             case LSC_OP_ATOMIC_LOAD:
1164             case LSC_OP_ATOMIC_STORE:
1165             case LSC_OP_ATOMIC_ADD:
1166             case LSC_OP_ATOMIC_SUB:
1167             case LSC_OP_ATOMIC_MIN:
1168             case LSC_OP_ATOMIC_MAX:
1169             case LSC_OP_ATOMIC_UMIN:
1170             case LSC_OP_ATOMIC_UMAX:
1171             case LSC_OP_ATOMIC_CMPXCHG:
1172             case LSC_OP_ATOMIC_FADD:
1173             case LSC_OP_ATOMIC_FSUB:
1174             case LSC_OP_ATOMIC_FMIN:
1175             case LSC_OP_ATOMIC_FMAX:
1176             case LSC_OP_ATOMIC_FCMPXCHG:
1177             case LSC_OP_ATOMIC_AND:
1178             case LSC_OP_ATOMIC_OR:
1179             case LSC_OP_ATOMIC_XOR:
1180                return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1181                                      30 /* XXX */, 400 /* XXX */,
1182                                      10 /* XXX */, 100 /* XXX */, 0, 0,
1183                                      0, 400 /* XXX */);
1184             default:
1185                abort();
1186             }
1187 
1188          case ELK_SFID_URB:
1189             return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
1190                                   32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
1191 
1192          default:
1193             abort();
1194          }
1195 
1196       case ELK_SHADER_OPCODE_UNDEF:
1197       case ELK_SHADER_OPCODE_HALT_TARGET:
1198       case ELK_FS_OPCODE_SCHEDULING_FENCE:
1199          return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
1200                                0, 0, 0, 0, 0, 0);
1201 
1202       default:
1203          abort();
1204       }
1205    }
1206 
1207    /**
1208     * Model the performance behavior of a stall on the specified dependency
1209     * ID.
1210     */
1211    void
stall_on_dependency(state & st,enum intel_eu_dependency_id id)1212    stall_on_dependency(state &st, enum intel_eu_dependency_id id)
1213    {
1214       if (id < ARRAY_SIZE(st.dep_ready))
1215          st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
1216                                        st.dep_ready[id]);
1217    }
1218 
1219    /**
1220     * Model the performance behavior of the front-end and back-end while
1221     * executing an instruction with the specified timing information, assuming
1222     * all dependencies are already clear.
1223     */
1224    void
execute_instruction(state & st,const perf_desc & perf)1225    execute_instruction(state &st, const perf_desc &perf)
1226    {
1227       /* Compute the time at which the front-end will be ready to execute the
1228        * next instruction.
1229        */
1230       st.unit_ready[EU_UNIT_FE] += perf.df;
1231 
1232       if (perf.u < EU_NUM_UNITS) {
1233          /* Wait for the back-end to be ready to execute this instruction. */
1234          st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
1235                                        st.unit_ready[perf.u]);
1236 
1237          /* Compute the time at which the back-end will be ready to execute
1238           * the next instruction, and update the back-end utilization.
1239           */
1240          st.unit_ready[perf.u] = st.unit_ready[EU_UNIT_FE] + perf.db;
1241          st.unit_busy[perf.u] += perf.db * st.weight;
1242       }
1243    }
1244 
1245    /**
1246     * Model the performance behavior of a read dependency provided by an
1247     * instruction.
1248     */
1249    void
mark_read_dependency(state & st,const perf_desc & perf,enum intel_eu_dependency_id id)1250    mark_read_dependency(state &st, const perf_desc &perf,
1251                         enum intel_eu_dependency_id id)
1252    {
1253       if (id < ARRAY_SIZE(st.dep_ready))
1254          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ls;
1255    }
1256 
1257    /**
1258     * Model the performance behavior of a write dependency provided by an
1259     * instruction.
1260     */
1261    void
mark_write_dependency(state & st,const perf_desc & perf,enum intel_eu_dependency_id id)1262    mark_write_dependency(state &st, const perf_desc &perf,
1263                          enum intel_eu_dependency_id id)
1264    {
1265       if (id >= EU_DEPENDENCY_ID_ACCUM0 && id < EU_DEPENDENCY_ID_FLAG0)
1266          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.la;
1267       else if (id >= EU_DEPENDENCY_ID_FLAG0 && id < EU_DEPENDENCY_ID_SBID_WR0)
1268          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.lf;
1269       else if (id < ARRAY_SIZE(st.dep_ready))
1270          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ld;
1271    }
1272 
1273    /**
1274     * Return the dependency ID of a elk_backend_reg, offset by \p delta GRFs.
1275     */
1276    enum intel_eu_dependency_id
reg_dependency_id(const intel_device_info * devinfo,const elk_backend_reg & r,const int delta)1277    reg_dependency_id(const intel_device_info *devinfo, const elk_backend_reg &r,
1278                      const int delta)
1279    {
1280       if (r.file == VGRF) {
1281          const unsigned i = r.nr + r.offset / REG_SIZE + delta;
1282          assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
1283          return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
1284 
1285       } else if (r.file == FIXED_GRF) {
1286          const unsigned i = r.nr + delta;
1287          assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
1288          return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
1289 
1290       } else if (r.file == MRF && devinfo->ver >= 7) {
1291          const unsigned i = GFX7_MRF_HACK_START +
1292                             r.nr + r.offset / REG_SIZE + delta;
1293          assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
1294          return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
1295 
1296       } else if (r.file == MRF && devinfo->ver < 7) {
1297          const unsigned i = (r.nr & ~ELK_MRF_COMPR4) +
1298                             r.offset / REG_SIZE + delta;
1299          assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_MRF0);
1300          return intel_eu_dependency_id(EU_DEPENDENCY_ID_MRF0 + i);
1301 
1302       } else if (r.file == ARF && r.nr >= ELK_ARF_ADDRESS &&
1303                  r.nr < ELK_ARF_ACCUMULATOR) {
1304          assert(delta == 0);
1305          return EU_DEPENDENCY_ID_ADDR0;
1306 
1307       } else if (r.file == ARF && r.nr >= ELK_ARF_ACCUMULATOR &&
1308                  r.nr < ELK_ARF_FLAG) {
1309          const unsigned i = r.nr - ELK_ARF_ACCUMULATOR + delta;
1310          assert(i < EU_DEPENDENCY_ID_FLAG0 - EU_DEPENDENCY_ID_ACCUM0);
1311          return intel_eu_dependency_id(EU_DEPENDENCY_ID_ACCUM0 + i);
1312 
1313       } else {
1314          return EU_NUM_DEPENDENCY_IDS;
1315       }
1316    }
1317 
1318    /**
1319     * Return the dependency ID of flag register starting at offset \p i.
1320     */
1321    enum intel_eu_dependency_id
flag_dependency_id(unsigned i)1322    flag_dependency_id(unsigned i)
1323    {
1324       assert(i < EU_DEPENDENCY_ID_SBID_WR0 - EU_DEPENDENCY_ID_FLAG0);
1325       return intel_eu_dependency_id(EU_DEPENDENCY_ID_FLAG0 + i);
1326    }
1327 
1328    /**
1329     * Return the dependency ID corresponding to the SBID read completion
1330     * condition of a Gfx12+ SWSB.
1331     */
1332    enum intel_eu_dependency_id
tgl_swsb_rd_dependency_id(tgl_swsb swsb)1333    tgl_swsb_rd_dependency_id(tgl_swsb swsb)
1334    {
1335       if (swsb.mode) {
1336          assert(swsb.sbid <
1337                 EU_NUM_DEPENDENCY_IDS - EU_DEPENDENCY_ID_SBID_RD0);
1338          return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_RD0 + swsb.sbid);
1339       } else {
1340          return EU_NUM_DEPENDENCY_IDS;
1341       }
1342    }
1343 
1344    /**
1345     * Return the dependency ID corresponding to the SBID write completion
1346     * condition of a Gfx12+ SWSB.
1347     */
1348    enum intel_eu_dependency_id
tgl_swsb_wr_dependency_id(tgl_swsb swsb)1349    tgl_swsb_wr_dependency_id(tgl_swsb swsb)
1350    {
1351       if (swsb.mode) {
1352          assert(swsb.sbid <
1353                 EU_DEPENDENCY_ID_SBID_RD0 - EU_DEPENDENCY_ID_SBID_WR0);
1354          return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_WR0 + swsb.sbid);
1355       } else {
1356          return EU_NUM_DEPENDENCY_IDS;
1357       }
1358    }
1359 
1360    /**
1361     * Return the implicit accumulator register accessed by channel \p i of the
1362     * instruction.
1363     */
1364    unsigned
accum_reg_of_channel(const intel_device_info * devinfo,const elk_backend_instruction * inst,elk_reg_type tx,unsigned i)1365    accum_reg_of_channel(const intel_device_info *devinfo,
1366                         const elk_backend_instruction *inst,
1367                         elk_reg_type tx, unsigned i)
1368    {
1369       assert(inst->reads_accumulator_implicitly() ||
1370              inst->writes_accumulator_implicitly(devinfo));
1371       const unsigned offset = (inst->group + i) * type_sz(tx) *
1372          (devinfo->ver < 7 || elk_reg_type_is_floating_point(tx) ? 1 : 2);
1373       return offset / (reg_unit(devinfo) * REG_SIZE) % 2;
1374    }
1375 
1376    /**
1377     * Model the performance behavior of an FS back-end instruction.
1378     */
1379    void
issue_fs_inst(state & st,const struct elk_isa_info * isa,const elk_backend_instruction * be_inst)1380    issue_fs_inst(state &st, const struct elk_isa_info *isa,
1381                  const elk_backend_instruction *be_inst)
1382    {
1383       const struct intel_device_info *devinfo = isa->devinfo;
1384       const elk_fs_inst *inst = static_cast<const elk_fs_inst *>(be_inst);
1385       const instruction_info info(isa, inst);
1386       const perf_desc perf = instruction_desc(info);
1387 
1388       /* Stall on any source dependencies. */
1389       for (unsigned i = 0; i < inst->sources; i++) {
1390          for (unsigned j = 0; j < regs_read(inst, i); j++)
1391             stall_on_dependency(
1392                st, reg_dependency_id(devinfo, inst->src[i], j));
1393       }
1394 
1395       if (inst->reads_accumulator_implicitly()) {
1396          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1397               j <= accum_reg_of_channel(devinfo, inst, info.tx,
1398                                         inst->exec_size - 1); j++)
1399             stall_on_dependency(
1400                st, reg_dependency_id(devinfo, elk_acc_reg(8), j));
1401       }
1402 
1403       if (is_send(inst) && inst->base_mrf != -1) {
1404          for (unsigned j = 0; j < inst->mlen; j++)
1405             stall_on_dependency(
1406                st, reg_dependency_id(
1407                   devinfo, elk_uvec_mrf(8, inst->base_mrf, 0), j));
1408       }
1409 
1410       if (const unsigned mask = inst->flags_read(devinfo)) {
1411          for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1412             if (mask & (1 << i))
1413                stall_on_dependency(st, flag_dependency_id(i));
1414          }
1415       }
1416 
1417       /* Stall on any write dependencies. */
1418       if (!inst->no_dd_check) {
1419          if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1420             for (unsigned j = 0; j < regs_written(inst); j++)
1421                stall_on_dependency(
1422                   st, reg_dependency_id(devinfo, inst->dst, j));
1423          }
1424 
1425          if (inst->writes_accumulator_implicitly(devinfo)) {
1426             for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1427                  j <= accum_reg_of_channel(devinfo, inst, info.tx,
1428                                            inst->exec_size - 1); j++)
1429                stall_on_dependency(
1430                   st, reg_dependency_id(devinfo, elk_acc_reg(8), j));
1431          }
1432 
1433          if (const unsigned mask = inst->flags_written(devinfo)) {
1434             for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1435                if (mask & (1 << i))
1436                   stall_on_dependency(st, flag_dependency_id(i));
1437             }
1438          }
1439       }
1440 
1441       /* Stall on any SBID dependencies. */
1442       if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST))
1443          stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched));
1444       else if (inst->sched.mode & TGL_SBID_SRC)
1445          stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched));
1446 
1447       /* Execute the instruction. */
1448       execute_instruction(st, perf);
1449 
1450       /* Mark any source dependencies. */
1451       if (inst->is_send_from_grf()) {
1452          for (unsigned i = 0; i < inst->sources; i++) {
1453             if (inst->is_payload(i)) {
1454                for (unsigned j = 0; j < regs_read(inst, i); j++)
1455                   mark_read_dependency(
1456                      st, perf, reg_dependency_id(devinfo, inst->src[i], j));
1457             }
1458          }
1459       }
1460 
1461       if (is_send(inst) && inst->base_mrf != -1) {
1462          for (unsigned j = 0; j < inst->mlen; j++)
1463             mark_read_dependency(st, perf,
1464                reg_dependency_id(devinfo, elk_uvec_mrf(8, inst->base_mrf, 0), j));
1465       }
1466 
1467       /* Mark any destination dependencies. */
1468       if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1469          for (unsigned j = 0; j < regs_written(inst); j++) {
1470             mark_write_dependency(st, perf,
1471                                   reg_dependency_id(devinfo, inst->dst, j));
1472          }
1473       }
1474 
1475       if (inst->writes_accumulator_implicitly(devinfo)) {
1476          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1477               j <= accum_reg_of_channel(devinfo, inst, info.tx,
1478                                         inst->exec_size - 1); j++)
1479             mark_write_dependency(st, perf,
1480                                   reg_dependency_id(devinfo, elk_acc_reg(8), j));
1481       }
1482 
1483       if (const unsigned mask = inst->flags_written(devinfo)) {
1484          for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1485             if (mask & (1 << i))
1486                mark_write_dependency(st, perf, flag_dependency_id(i));
1487          }
1488       }
1489 
1490       /* Mark any SBID dependencies. */
1491       if (inst->sched.mode & TGL_SBID_SET) {
1492          mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched));
1493          mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched));
1494       }
1495    }
1496 
1497    /**
1498     * Model the performance behavior of a VEC4 back-end instruction.
1499     */
1500    void
issue_vec4_instruction(state & st,const struct elk_isa_info * isa,const elk_backend_instruction * be_inst)1501    issue_vec4_instruction(state &st, const struct elk_isa_info *isa,
1502                           const elk_backend_instruction *be_inst)
1503    {
1504       const struct intel_device_info *devinfo = isa->devinfo;
1505       const vec4_instruction *inst =
1506          static_cast<const vec4_instruction *>(be_inst);
1507       const instruction_info info(isa, inst);
1508       const perf_desc perf = instruction_desc(info);
1509 
1510       /* Stall on any source dependencies. */
1511       for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
1512          for (unsigned j = 0; j < regs_read(inst, i); j++)
1513             stall_on_dependency(
1514                st, reg_dependency_id(devinfo, inst->src[i], j));
1515       }
1516 
1517       if (inst->reads_accumulator_implicitly()) {
1518          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1519               j <= accum_reg_of_channel(devinfo, inst, info.tx,
1520                                         inst->exec_size - 1); j++)
1521             stall_on_dependency(
1522                st, reg_dependency_id(devinfo, elk_acc_reg(8), j));
1523       }
1524 
1525       if (inst->base_mrf != -1) {
1526          for (unsigned j = 0; j < inst->mlen; j++)
1527             stall_on_dependency(
1528                st, reg_dependency_id(
1529                   devinfo, elk_uvec_mrf(8, inst->base_mrf, 0), j));
1530       }
1531 
1532       if (inst->reads_flag())
1533          stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0);
1534 
1535       /* Stall on any write dependencies. */
1536       if (!inst->no_dd_check) {
1537          if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1538             for (unsigned j = 0; j < regs_written(inst); j++)
1539                stall_on_dependency(
1540                   st, reg_dependency_id(devinfo, inst->dst, j));
1541          }
1542 
1543          if (inst->writes_accumulator_implicitly(devinfo)) {
1544             for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1545                  j <= accum_reg_of_channel(devinfo, inst, info.tx,
1546                                            inst->exec_size - 1); j++)
1547                stall_on_dependency(
1548                   st, reg_dependency_id(devinfo, elk_acc_reg(8), j));
1549          }
1550 
1551          if (inst->writes_flag(devinfo))
1552             stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0);
1553       }
1554 
1555       /* Execute the instruction. */
1556       execute_instruction(st, perf);
1557 
1558       /* Mark any source dependencies. */
1559       if (inst->is_send_from_grf()) {
1560          for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
1561             for (unsigned j = 0; j < regs_read(inst, i); j++)
1562                mark_read_dependency(
1563                   st, perf, reg_dependency_id(devinfo, inst->src[i], j));
1564          }
1565       }
1566 
1567       if (inst->base_mrf != -1) {
1568          for (unsigned j = 0; j < inst->mlen; j++)
1569             mark_read_dependency(st, perf,
1570                reg_dependency_id(devinfo, elk_uvec_mrf(8, inst->base_mrf, 0), j));
1571       }
1572 
1573       /* Mark any destination dependencies. */
1574       if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1575          for (unsigned j = 0; j < regs_written(inst); j++) {
1576             mark_write_dependency(st, perf,
1577                                   reg_dependency_id(devinfo, inst->dst, j));
1578          }
1579       }
1580 
1581       if (inst->writes_accumulator_implicitly(devinfo)) {
1582          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1583               j <= accum_reg_of_channel(devinfo, inst, info.tx,
1584                                         inst->exec_size - 1); j++)
1585             mark_write_dependency(st, perf,
1586                                   reg_dependency_id(devinfo, elk_acc_reg(8), j));
1587       }
1588 
1589       if (inst->writes_flag(devinfo))
1590          mark_write_dependency(st, perf, EU_DEPENDENCY_ID_FLAG0);
1591    }
1592 
1593    /**
1594     * Calculate the maximum possible throughput of the program compatible with
1595     * the cycle-count utilization estimated for each asynchronous unit, in
1596     * threads-per-cycle units.
1597     */
1598    float
calculate_thread_throughput(const state & st,float busy)1599    calculate_thread_throughput(const state &st, float busy)
1600    {
1601       for (unsigned i = 0; i < EU_NUM_UNITS; i++)
1602          busy = MAX2(busy, st.unit_busy[i]);
1603 
1604       return 1.0 / busy;
1605    }
1606 
1607    /**
1608     * Estimate the performance of the specified shader.
1609     */
1610    void
calculate_performance(performance & p,const elk_backend_shader * s,void (* issue_instruction)(state &,const struct elk_isa_info *,const elk_backend_instruction *),unsigned dispatch_width)1611    calculate_performance(performance &p, const elk_backend_shader *s,
1612                          void (*issue_instruction)(
1613                             state &, const struct elk_isa_info *,
1614                             const elk_backend_instruction *),
1615                          unsigned dispatch_width)
1616    {
1617       /* XXX - Note that the previous version of this code used worst-case
1618        *       scenario estimation of branching divergence for SIMD32 shaders,
1619        *       but this heuristic was removed to improve performance in common
1620        *       scenarios. Wider shader variants are less optimal when divergence
1621        *       is high, e.g. when application renders complex scene on a small
1622        *       surface. It is assumed that such renders are short, so their
1623        *       time doesn't matter and when it comes to the overall performance,
1624        *       they are dominated by more optimal larger renders.
1625        *
1626        *       It's possible that we could do better with divergence analysis
1627        *       by isolating branches which are 100% uniform.
1628        *
1629        *       Plumbing the trip counts from NIR loop analysis would allow us
1630        *       to do a better job regarding the loop weights.
1631        *
1632        *       In the meantime use values that roughly match the control flow
1633        *       weights used elsewhere in the compiler back-end.
1634        *
1635        *       Note that we provide slightly more pessimistic weights on
1636        *       Gfx12+ for SIMD32, since the effective warp size on that
1637        *       platform is 2x the SIMD width due to EU fusion, which increases
1638        *       the likelihood of divergent control flow in comparison to
1639        *       previous generations, giving narrower SIMD modes a performance
1640        *       advantage in several test-cases with non-uniform discard jumps.
1641        */
1642       const float discard_weight = (dispatch_width > 16 || s->devinfo->ver < 12 ?
1643                                     1.0 : 0.5);
1644       const float loop_weight = 10;
1645       unsigned halt_count = 0;
1646       unsigned elapsed = 0;
1647       state st;
1648 
1649       foreach_block(block, s->cfg) {
1650          const unsigned elapsed0 = elapsed;
1651 
1652          foreach_inst_in_block(elk_backend_instruction, inst, block) {
1653             const unsigned clock0 = st.unit_ready[EU_UNIT_FE];
1654 
1655             issue_instruction(st, &s->compiler->isa, inst);
1656 
1657             if (inst->opcode == ELK_SHADER_OPCODE_HALT_TARGET && halt_count)
1658                st.weight /= discard_weight;
1659 
1660             elapsed += (st.unit_ready[EU_UNIT_FE] - clock0) * st.weight;
1661 
1662             if (inst->opcode == ELK_OPCODE_DO)
1663                st.weight *= loop_weight;
1664             else if (inst->opcode == ELK_OPCODE_WHILE)
1665                st.weight /= loop_weight;
1666             else if (inst->opcode == ELK_OPCODE_HALT && !halt_count++)
1667                st.weight *= discard_weight;
1668          }
1669 
1670          p.block_latency[block->num] = elapsed - elapsed0;
1671       }
1672 
1673       p.latency = elapsed;
1674       p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
1675    }
1676 }
1677 
performance(const elk_fs_visitor * v)1678 elk::performance::performance(const elk_fs_visitor *v) :
1679    block_latency(new unsigned[v->cfg->num_blocks])
1680 {
1681    calculate_performance(*this, v, issue_fs_inst, v->dispatch_width);
1682 }
1683 
performance(const vec4_visitor * v)1684 elk::performance::performance(const vec4_visitor *v) :
1685    block_latency(new unsigned[v->cfg->num_blocks])
1686 {
1687    calculate_performance(*this, v, issue_vec4_instruction, 8);
1688 }
1689 
~performance()1690 elk::performance::~performance()
1691 {
1692    delete[] block_latency;
1693 }
1694