/*
 * Copyright © 2020 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
23 
24 #include "brw_eu.h"
25 #include "brw_fs.h"
26 #include "brw_vec4.h"
27 #include "brw_cfg.h"
28 
29 using namespace brw;
30 
31 namespace {
32    /**
33     * Enumeration representing the various asynchronous units that can run
34     * computations in parallel on behalf of a shader thread.
35     */
36    enum intel_eu_unit {
37       /** EU front-end. */
38       EU_UNIT_FE,
39       /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
40       EU_UNIT_FPU,
41       /** Extended Math unit (AKA FPU1 on Gfx8-11, part of the EU on Gfx6+). */
42       EU_UNIT_EM,
43       /** Sampler shared function. */
44       EU_UNIT_SAMPLER,
45       /** Pixel Interpolator shared function. */
46       EU_UNIT_PI,
47       /** Unified Return Buffer shared function. */
48       EU_UNIT_URB,
49       /** Data Port Data Cache shared function. */
50       EU_UNIT_DP_DC,
51       /** Data Port Render Cache shared function. */
52       EU_UNIT_DP_RC,
53       /** Data Port Constant Cache shared function. */
54       EU_UNIT_DP_CC,
55       /** Message Gateway shared function. */
56       EU_UNIT_GATEWAY,
57       /** Thread Spawner shared function. */
58       EU_UNIT_SPAWNER,
59       /* EU_UNIT_VME, */
60       /* EU_UNIT_CRE, */
61       /** Number of asynchronous units currently tracked. */
62       EU_NUM_UNITS,
63       /** Dummy unit for instructions that don't consume runtime from the above. */
64       EU_UNIT_NULL = EU_NUM_UNITS
65    };
66 
67    /**
68     * Enumeration representing a computation result another computation can
69     * potentially depend on.
70     */
71    enum intel_eu_dependency_id {
72       /* Register part of the GRF. */
73       EU_DEPENDENCY_ID_GRF0 = 0,
74       /* Register part of the MRF.  Only used on Gfx4-6. */
75       EU_DEPENDENCY_ID_MRF0 = EU_DEPENDENCY_ID_GRF0 + BRW_MAX_GRF,
76       /* Address register part of the ARF. */
77       EU_DEPENDENCY_ID_ADDR0 = EU_DEPENDENCY_ID_MRF0 + 24,
78       /* Accumulator register part of the ARF. */
79       EU_DEPENDENCY_ID_ACCUM0 = EU_DEPENDENCY_ID_ADDR0 + 1,
80       /* Flag register part of the ARF. */
81       EU_DEPENDENCY_ID_FLAG0 = EU_DEPENDENCY_ID_ACCUM0 + 12,
82       /* SBID token write completion.  Only used on Gfx12+. */
83       EU_DEPENDENCY_ID_SBID_WR0 = EU_DEPENDENCY_ID_FLAG0 + 8,
84       /* SBID token read completion.  Only used on Gfx12+. */
85       EU_DEPENDENCY_ID_SBID_RD0 = EU_DEPENDENCY_ID_SBID_WR0 + 16,
86       /* Number of computation dependencies currently tracked. */
87       EU_NUM_DEPENDENCY_IDS = EU_DEPENDENCY_ID_SBID_RD0 + 16
88    };
89 
90    /**
91     * State of our modeling of the program execution.
92     */
93    struct state {
state__anon5a8962000111::state94       state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
95       /**
96        * Time at which a given unit will be ready to execute the next
97        * computation, in clock units.
98        */
99       unsigned unit_ready[EU_NUM_UNITS];
100       /**
101        * Time at which an instruction dependent on a given dependency ID will
102        * be ready to execute, in clock units.
103        */
104       unsigned dep_ready[EU_NUM_DEPENDENCY_IDS];
105       /**
106        * Aggregated utilization of a given unit excluding idle cycles,
107        * in clock units.
108        */
109       float unit_busy[EU_NUM_UNITS];
110       /**
111        * Factor of the overhead of a computation accounted for in the
112        * aggregated utilization calculation.
113        */
114       float weight;
115    };
116 
117    /**
118     * Information derived from an IR instruction used to compute performance
119     * estimates.  Allows the timing calculation to work on both FS and VEC4
120     * instructions.
121     */
122    struct instruction_info {
instruction_info__anon5a8962000111::instruction_info123       instruction_info(const struct brw_isa_info *isa, const fs_inst *inst) :
124          isa(isa), devinfo(isa->devinfo), op(inst->opcode),
125          td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
126          tx(get_exec_type(inst)), sx(0), ss(0),
127          sc(has_bank_conflict(isa, inst) ? sd : 0),
128          desc(inst->desc), sfid(inst->sfid)
129       {
130          /* We typically want the maximum source size, except for split send
131           * messages which require the total size.
132           */
133          if (inst->opcode == SHADER_OPCODE_SEND) {
134             ss = DIV_ROUND_UP(inst->size_read(2), REG_SIZE) +
135                  DIV_ROUND_UP(inst->size_read(3), REG_SIZE);
136          } else {
137             for (unsigned i = 0; i < inst->sources; i++)
138                ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
139          }
140 
141          /* Convert the execution size to GRF units. */
142          sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
143 
144          /* 32x32 integer multiplication has half the usual ALU throughput.
145           * Treat it as double-precision.
146           */
147          if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
148              !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
149              type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
150             tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
151       }
152 
instruction_info__anon5a8962000111::instruction_info153       instruction_info(const struct brw_isa_info *isa,
154                        const vec4_instruction *inst) :
155          isa(isa), devinfo(isa->devinfo), op(inst->opcode),
156          td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
157          tx(get_exec_type(inst)), sx(0), ss(0), sc(0),
158          desc(inst->desc), sfid(inst->sfid)
159       {
160          /* Compute the maximum source size. */
161          for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++)
162             ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
163 
164          /* Convert the execution size to GRF units. */
165          sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
166 
167          /* 32x32 integer multiplication has half the usual ALU throughput.
168           * Treat it as double-precision.
169           */
170          if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
171              !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
172              type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
173             tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
174       }
175 
176       /** ISA encoding information */
177       const struct brw_isa_info *isa;
178       /** Device information. */
179       const struct intel_device_info *devinfo;
180       /** Instruction opcode. */
181       opcode op;
182       /** Destination type. */
183       brw_reg_type td;
184       /** Destination size in GRF units. */
185       unsigned sd;
186       /** Execution type. */
187       brw_reg_type tx;
188       /** Execution size in GRF units. */
189       unsigned sx;
190       /** Source size. */
191       unsigned ss;
192       /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
193       unsigned sc;
194       /** Send message descriptor. */
195       uint32_t desc;
196       /** Send message shared function ID. */
197       uint8_t sfid;
198    };
199 
200    /**
201     * Timing information of an instruction used to estimate the performance of
202     * the program.
203     */
204    struct perf_desc {
perf_desc__anon5a8962000111::perf_desc205       perf_desc(enum intel_eu_unit u, int df, int db,
206                 int ls, int ld, int la, int lf) :
207          u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}
208 
209       /**
210        * Back-end unit its runtime shall be accounted to, in addition to the
211        * EU front-end which is always assumed to be involved.
212        */
213       enum intel_eu_unit u;
214       /**
215        * Overhead cycles from the time that the EU front-end starts executing
216        * the instruction until it's ready to execute the next instruction.
217        */
218       int df;
219       /**
220        * Overhead cycles from the time that the back-end starts executing the
221        * instruction until it's ready to execute the next instruction.
222        */
223       int db;
224       /**
225        * Latency cycles from the time that the back-end starts executing the
226        * instruction until its sources have been read from the register file.
227        */
228       int ls;
229       /**
230        * Latency cycles from the time that the back-end starts executing the
231        * instruction until its regular destination has been written to the
232        * register file.
233        */
234       int ld;
235       /**
236        * Latency cycles from the time that the back-end starts executing the
237        * instruction until its accumulator destination has been written to the
238        * ARF file.
239        *
240        * Note that this is an approximation of the real behavior of
241        * accumulating instructions in the hardware: Instead of modeling a pair
242        * of back-to-back accumulating instructions as a first computation with
243        * latency equal to ld followed by another computation with a
244        * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
245        * model the stall as if it occurred at the top of the pipeline, with
246        * the latency of the accumulator computation offset accordingly.
247        */
248       int la;
249       /**
250        * Latency cycles from the time that the back-end starts executing the
251        * instruction until its flag destination has been written to the ARF
252        * file.
253        */
254       int lf;
255    };
256 
257    /**
258     * Compute the timing information of an instruction based on any relevant
259     * information from the IR and a number of parameters specifying a linear
260     * approximation: Parameter X_Y specifies the derivative of timing X
261     * relative to info field Y, while X_1 specifies the independent term of
262     * the approximation of timing X.
263     */
264    perf_desc
calculate_desc(const instruction_info & info,enum intel_eu_unit u,int df_1,int df_sd,int df_sc,int db_1,int db_sx,int ls_1,int ld_1,int la_1,int lf_1,int l_ss,int l_sd)265    calculate_desc(const instruction_info &info, enum intel_eu_unit u,
266                   int df_1, int df_sd, int df_sc,
267                   int db_1, int db_sx,
268                   int ls_1, int ld_1, int la_1, int lf_1,
269                   int l_ss, int l_sd)
270    {
271       return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
272                           db_1 + db_sx * int(info.sx),
273                           ls_1 + l_ss * int(info.ss),
274                           ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
275                           la_1, lf_1);
276    }
277 
278    /**
279     * Compute the timing information of an instruction based on any relevant
280     * information from the IR and a number of linear approximation parameters
281     * hard-coded for each IR instruction.
282     *
283     * Most timing parameters are obtained from the multivariate linear
284     * regression of a sample of empirical timings measured using the tm0
285     * register (as can be done today by using the shader_time debugging
286     * option).  The Gfx4-5 math timings are obtained from BSpec Volume 5c.3
287     * "Shared Functions - Extended Math", Section 3.2 "Performance".
288     * Parameters marked XXX shall be considered low-quality, they're possibly
289     * high variance or completely guessed in cases where experimental data was
290     * unavailable.
291     */
292    const perf_desc
instruction_desc(const instruction_info & info)293    instruction_desc(const instruction_info &info)
294    {
295       const struct intel_device_info *devinfo = info.devinfo;
296 
297       switch (info.op) {
298       case BRW_OPCODE_SYNC:
299       case BRW_OPCODE_SEL:
300       case BRW_OPCODE_NOT:
301       case BRW_OPCODE_AND:
302       case BRW_OPCODE_OR:
303       case BRW_OPCODE_XOR:
304       case BRW_OPCODE_SHR:
305       case BRW_OPCODE_SHL:
306       case BRW_OPCODE_DIM:
307       case BRW_OPCODE_ASR:
308       case BRW_OPCODE_CMPN:
309       case BRW_OPCODE_F16TO32:
310       case BRW_OPCODE_BFREV:
311       case BRW_OPCODE_BFI1:
312       case BRW_OPCODE_AVG:
313       case BRW_OPCODE_FRC:
314       case BRW_OPCODE_RNDU:
315       case BRW_OPCODE_RNDD:
316       case BRW_OPCODE_RNDE:
317       case BRW_OPCODE_RNDZ:
318       case BRW_OPCODE_MAC:
319       case BRW_OPCODE_MACH:
320       case BRW_OPCODE_LZD:
321       case BRW_OPCODE_FBH:
322       case BRW_OPCODE_FBL:
323       case BRW_OPCODE_CBIT:
324       case BRW_OPCODE_ADDC:
325       case BRW_OPCODE_ROR:
326       case BRW_OPCODE_ROL:
327       case BRW_OPCODE_SUBB:
328       case BRW_OPCODE_SAD2:
329       case BRW_OPCODE_SADA2:
330       case BRW_OPCODE_LINE:
331       case BRW_OPCODE_NOP:
332       case SHADER_OPCODE_CLUSTER_BROADCAST:
333       case SHADER_OPCODE_SCRATCH_HEADER:
334       case FS_OPCODE_DDX_COARSE:
335       case FS_OPCODE_DDX_FINE:
336       case FS_OPCODE_DDY_COARSE:
337       case FS_OPCODE_PIXEL_X:
338       case FS_OPCODE_PIXEL_Y:
339       case FS_OPCODE_SET_SAMPLE_ID:
340       case VEC4_OPCODE_MOV_BYTES:
341       case VEC4_OPCODE_UNPACK_UNIFORM:
342       case VEC4_OPCODE_DOUBLE_TO_F32:
343       case VEC4_OPCODE_DOUBLE_TO_D32:
344       case VEC4_OPCODE_DOUBLE_TO_U32:
345       case VEC4_OPCODE_TO_DOUBLE:
346       case VEC4_OPCODE_PICK_LOW_32BIT:
347       case VEC4_OPCODE_PICK_HIGH_32BIT:
348       case VEC4_OPCODE_SET_LOW_32BIT:
349       case VEC4_OPCODE_SET_HIGH_32BIT:
350       case VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
351       case GS_OPCODE_SET_DWORD_2:
352       case GS_OPCODE_SET_WRITE_OFFSET:
353       case GS_OPCODE_SET_VERTEX_COUNT:
354       case GS_OPCODE_PREPARE_CHANNEL_MASKS:
355       case GS_OPCODE_SET_CHANNEL_MASKS:
356       case GS_OPCODE_GET_INSTANCE_ID:
357       case GS_OPCODE_SET_PRIMITIVE_ID:
358       case GS_OPCODE_SVB_SET_DST_INDEX:
359       case TCS_OPCODE_SRC0_010_IS_ZERO:
360       case TCS_OPCODE_GET_PRIMITIVE_ID:
361       case TES_OPCODE_GET_PRIMITIVE_ID:
362       case SHADER_OPCODE_READ_SR_REG:
363          if (devinfo->ver >= 11) {
364             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
365                                   0, 10, 6 /* XXX */, 14, 0, 0);
366          } else if (devinfo->ver >= 8) {
367             if (type_sz(info.tx) > 4)
368                return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
369                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
370             else
371                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
372                                      0, 8, 4, 12, 0, 0);
373          } else if (devinfo->verx10 >= 75) {
374             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
375                                   0, 10, 6 /* XXX */, 16, 0, 0);
376          } else {
377             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
378                                   0, 12, 8 /* XXX */, 18, 0, 0);
379          }
380 
381       case BRW_OPCODE_MOV:
382       case BRW_OPCODE_CMP:
383       case BRW_OPCODE_ADD:
384       case BRW_OPCODE_ADD3:
385       case BRW_OPCODE_MUL:
386       case SHADER_OPCODE_MOV_RELOC_IMM:
387       case VEC4_OPCODE_MOV_FOR_SCRATCH:
388          if (devinfo->ver >= 11) {
389             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
390                                   0, 10, 6, 14, 0, 0);
391          } else if (devinfo->ver >= 8) {
392             if (type_sz(info.tx) > 4)
393                return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
394                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
395             else
396                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
397                                      0, 8, 4, 12, 0, 0);
398          } else if (devinfo->verx10 >= 75) {
399             if (info.tx == BRW_REGISTER_TYPE_F)
400                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
401                                      0, 12, 8 /* XXX */, 18, 0, 0);
402             else
403                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
404                                      0, 10, 6 /* XXX */, 16, 0, 0);
405          } else if (devinfo->ver >= 7) {
406             if (info.tx == BRW_REGISTER_TYPE_F)
407                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
408                                      0, 14, 10 /* XXX */, 20, 0, 0);
409             else
410                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
411                                      0, 12, 8 /* XXX */, 18, 0, 0);
412          } else {
413             return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
414                                   0, 2 /* XXX */,
415                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
416                                   0, 0);
417          }
418 
419       case BRW_OPCODE_BFE:
420       case BRW_OPCODE_BFI2:
421       case BRW_OPCODE_CSEL:
422          if (devinfo->ver >= 11)
423             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
424                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
425          else if (devinfo->ver >= 8)
426             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
427                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
428          else if (devinfo->verx10 >= 75)
429             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
430                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
431          else if (devinfo->ver >= 7)
432             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
433                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
434          else
435             abort();
436 
437       case BRW_OPCODE_MAD:
438          if (devinfo->ver >= 11) {
439             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
440                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
441          } else if (devinfo->ver >= 8) {
442             if (type_sz(info.tx) > 4)
443                return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
444                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
445             else
446                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
447                                      0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
448          } else if (devinfo->verx10 >= 75) {
449             if (info.tx == BRW_REGISTER_TYPE_F)
450                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
451                                      0, 12, 8 /* XXX */, 18, 0, 0);
452             else
453                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
454                                      0, 10, 6 /* XXX */, 16, 0, 0);
455          } else if (devinfo->ver >= 7) {
456             if (info.tx == BRW_REGISTER_TYPE_F)
457                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
458                                      0, 14, 10 /* XXX */, 20, 0, 0);
459             else
460                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
461                                      0, 12, 8 /* XXX */, 18, 0, 0);
462          } else if (devinfo->ver >= 6) {
463             return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 1 /* XXX */,
464                                   0, 2 /* XXX */,
465                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
466                                   0, 0);
467          } else {
468             abort();
469          }
470 
471       case BRW_OPCODE_F32TO16:
472          if (devinfo->ver >= 11)
473             return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
474                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
475          else if (devinfo->ver >= 8)
476             return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
477                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
478          else if (devinfo->verx10 >= 75)
479             return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
480                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
481          else if (devinfo->ver >= 7)
482             return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
483                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
484          else
485             abort();
486 
487       case BRW_OPCODE_DP4:
488       case BRW_OPCODE_DPH:
489       case BRW_OPCODE_DP3:
490       case BRW_OPCODE_DP2:
491          if (devinfo->ver >= 8)
492             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
493                                   0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
494          else if (devinfo->verx10 >= 75)
495             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
496                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
497          else
498             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
499                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
500 
501       case BRW_OPCODE_DP4A:
502          if (devinfo->ver >= 12)
503             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
504                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
505          else
506             abort();
507 
508       case SHADER_OPCODE_RCP:
509       case SHADER_OPCODE_RSQ:
510       case SHADER_OPCODE_SQRT:
511       case SHADER_OPCODE_EXP2:
512       case SHADER_OPCODE_LOG2:
513       case SHADER_OPCODE_SIN:
514       case SHADER_OPCODE_COS:
515       case SHADER_OPCODE_POW:
516       case SHADER_OPCODE_INT_QUOTIENT:
517       case SHADER_OPCODE_INT_REMAINDER:
518          if (devinfo->ver >= 6) {
519             switch (info.op) {
520             case SHADER_OPCODE_RCP:
521             case SHADER_OPCODE_RSQ:
522             case SHADER_OPCODE_SQRT:
523             case SHADER_OPCODE_EXP2:
524             case SHADER_OPCODE_LOG2:
525             case SHADER_OPCODE_SIN:
526             case SHADER_OPCODE_COS:
527                if (devinfo->ver >= 8)
528                   return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 4,
529                                         0, 16, 0, 0, 0, 0);
530                else if (devinfo->verx10 >= 75)
531                   return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 2,
532                                         0, 12, 0, 0, 0, 0);
533                else
534                   return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 2,
535                                         0, 14, 0, 0, 0, 0);
536 
537             case SHADER_OPCODE_POW:
538                if (devinfo->ver >= 8)
539                   return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 8,
540                                         0, 24, 0, 0, 0, 0);
541                else if (devinfo->verx10 >= 75)
542                   return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 4,
543                                         0, 20, 0, 0, 0, 0);
544                else
545                   return calculate_desc(info, EU_UNIT_EM, 0, 2, 0, 0, 4,
546                                         0, 22, 0, 0, 0, 0);
547 
548             case SHADER_OPCODE_INT_QUOTIENT:
549             case SHADER_OPCODE_INT_REMAINDER:
550                return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 26, 0,
551                                      0, 28 /* XXX */, 0, 0, 0, 0);
552 
553             default:
554                abort();
555             }
556          } else {
557             switch (info.op) {
558             case SHADER_OPCODE_RCP:
559                return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 8,
560                                      0, 22, 0, 0, 0, 8);
561 
562             case SHADER_OPCODE_RSQ:
563                return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 16,
564                                      0, 44, 0, 0, 0, 8);
565 
566             case SHADER_OPCODE_INT_QUOTIENT:
567             case SHADER_OPCODE_SQRT:
568             case SHADER_OPCODE_LOG2:
569                return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 24,
570                                      0, 66, 0, 0, 0, 8);
571 
572             case SHADER_OPCODE_INT_REMAINDER:
573             case SHADER_OPCODE_EXP2:
574                return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 32,
575                                      0, 88, 0, 0, 0, 8);
576 
577             case SHADER_OPCODE_SIN:
578             case SHADER_OPCODE_COS:
579                return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 48,
580                                      0, 132, 0, 0, 0, 8);
581 
582             case SHADER_OPCODE_POW:
583                return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 0, 64,
584                                      0, 176, 0, 0, 0, 8);
585 
586             default:
587                abort();
588             }
589          }
590 
591       case BRW_OPCODE_DO:
592          if (devinfo->ver >= 6)
593             return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
594                                   0, 0, 0, 0, 0, 0);
595          else
596             return calculate_desc(info, EU_UNIT_NULL, 2 /* XXX */, 0, 0, 0, 0,
597                                   0, 0, 0, 0, 0, 0);
598 
599       case BRW_OPCODE_IF:
600       case BRW_OPCODE_ELSE:
601       case BRW_OPCODE_ENDIF:
602       case BRW_OPCODE_WHILE:
603       case BRW_OPCODE_BREAK:
604       case BRW_OPCODE_CONTINUE:
605       case BRW_OPCODE_HALT:
606          if (devinfo->ver >= 8)
607             return calculate_desc(info, EU_UNIT_NULL, 8, 0, 0, 0, 0,
608                                   0, 0, 0, 0, 0, 0);
609          else if (devinfo->verx10 >= 75)
610             return calculate_desc(info, EU_UNIT_NULL, 6, 0, 0, 0, 0,
611                                   0, 0, 0, 0, 0, 0);
612          else
613             return calculate_desc(info, EU_UNIT_NULL, 2, 0, 0, 0, 0,
614                                   0, 0, 0, 0, 0, 0);
615 
616       case FS_OPCODE_LINTERP:
617          if (devinfo->ver >= 8)
618             return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
619                                   0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
620          else if (devinfo->verx10 >= 75)
621             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
622                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
623          else
624             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
625                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
626 
627       case BRW_OPCODE_LRP:
628          if (devinfo->ver >= 8)
629             return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
630                                   0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
631          else if (devinfo->verx10 >= 75)
632             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
633                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
634          else if (devinfo->ver >= 6)
635             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
636                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
637          else
638             abort();
639 
640       case FS_OPCODE_PACK_HALF_2x16_SPLIT:
641          if (devinfo->ver >= 11)
642             return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
643                                   0, 10 /* XXX */, 6 /* XXX */,
644                                   14 /* XXX */, 0, 0);
645          else if (devinfo->ver >= 8)
646             return calculate_desc(info, EU_UNIT_FPU, 16, 6, 0, 0, 6,
647                                   0, 8 /* XXX */, 4 /* XXX */,
648                                   12 /* XXX */, 0, 0);
649          else if (devinfo->verx10 >= 75)
650             return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
651                                   0, 10 /* XXX */, 6 /* XXX */,
652                                   16 /* XXX */, 0, 0);
653          else if (devinfo->ver >= 7)
654             return calculate_desc(info, EU_UNIT_FPU, 24, 6, 0, 0, 6,
655                                   0, 12 /* XXX */, 8 /* XXX */,
656                                   18 /* XXX */, 0, 0);
657          else
658             abort();
659 
660       case SHADER_OPCODE_MOV_INDIRECT:
661          if (devinfo->ver >= 11)
662             return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
663                                   0, 10 /* XXX */, 6 /* XXX */,
664                                   14 /* XXX */, 0, 0);
665          else if (devinfo->ver >= 8)
666             return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
667                                   0, 8 /* XXX */, 4 /* XXX */,
668                                   12 /* XXX */, 0, 0);
669          else if (devinfo->verx10 >= 75)
670             return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
671                                   0, 10 /* XXX */, 6 /* XXX */,
672                                   16 /* XXX */, 0, 0);
673          else
674             return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
675                                   0, 12 /* XXX */, 8 /* XXX */,
676                                   18 /* XXX */, 0, 0);
677 
678       case SHADER_OPCODE_BROADCAST:
679          if (devinfo->ver >= 11)
680             return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0, 4, 0,
681                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
682          else if (devinfo->ver >= 8)
683             return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
684                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
685          else if (devinfo->verx10 >= 75)
686             return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
687                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
688          else if (devinfo->ver >= 7)
689             return calculate_desc(info, EU_UNIT_FPU, 20, 0, 0, 4, 0,
690                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
691          else
692             abort();
693 
694       case SHADER_OPCODE_FIND_LIVE_CHANNEL:
695       case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
696          if (devinfo->ver >= 11)
697             return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
698                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
699          else if (devinfo->ver >= 8)
700             return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
701                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
702          else if (devinfo->verx10 >= 75)
703             return calculate_desc(info, EU_UNIT_FPU, 36, 0, 0, 6, 0,
704                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
705          else if (devinfo->ver >= 7)
706             return calculate_desc(info, EU_UNIT_FPU, 40, 0, 0, 6, 0,
707                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
708          else
709             abort();
710 
711       case SHADER_OPCODE_RND_MODE:
712       case SHADER_OPCODE_FLOAT_CONTROL_MODE:
713          if (devinfo->ver >= 11)
714             return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0,
715                                   4 /* XXX */, 0,
716                                   0, 0, 0, 0, 0, 0);
717          else if (devinfo->ver >= 8)
718             return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0,
719                                   4 /* XXX */, 0,
720                                   0, 0, 0, 0, 0, 0);
721          else if (devinfo->verx10 >= 75)
722             return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0,
723                                   4 /* XXX */, 0,
724                                   0, 0, 0, 0, 0, 0);
725          else if (devinfo->ver >= 6)
726             return calculate_desc(info, EU_UNIT_FPU, 28 /* XXX */, 0, 0,
727                                   4 /* XXX */, 0,
728                                   0, 0, 0, 0, 0, 0);
729          else
730             abort();
731 
732       case SHADER_OPCODE_SHUFFLE:
733          if (devinfo->ver >= 11)
734             return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0,
735                                   44 /* XXX */, 0,
736                                   0, 10 /* XXX */, 6 /* XXX */,
737                                   14 /* XXX */, 0, 0);
738          else if (devinfo->ver >= 8)
739             return calculate_desc(info, EU_UNIT_FPU, 42 /* XXX */, 0, 0,
740                                   42 /* XXX */, 0,
741                                   0, 8 /* XXX */, 4 /* XXX */,
742                                   12 /* XXX */, 0, 0);
743          else if (devinfo->verx10 >= 75)
744             return calculate_desc(info, EU_UNIT_FPU, 0, 44 /* XXX */, 0,
745                                   0, 44 /* XXX */,
746                                   0, 10 /* XXX */, 6 /* XXX */,
747                                   16 /* XXX */, 0, 0);
748          else if (devinfo->ver >= 6)
749             return calculate_desc(info, EU_UNIT_FPU, 0, 46 /* XXX */, 0,
750                                   0, 46 /* XXX */,
751                                   0, 12 /* XXX */, 8 /* XXX */,
752                                   18 /* XXX */, 0, 0);
753          else
754             abort();
755 
756       case SHADER_OPCODE_SEL_EXEC:
757          if (devinfo->ver >= 11)
758             return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0,
759                                   0, 4 /* XXX */,
760                                   0, 10 /* XXX */, 6 /* XXX */,
761                                   14 /* XXX */, 0, 0);
762          else if (devinfo->ver >= 8)
763             return calculate_desc(info, EU_UNIT_FPU, 8 /* XXX */, 4 /* XXX */, 0,
764                                   0, 4 /* XXX */,
765                                   0, 8 /* XXX */, 4 /* XXX */,
766                                   12 /* XXX */, 0, 0);
767          else if (devinfo->verx10 >= 75)
768             return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0,
769                                   0, 4 /* XXX */,
770                                   0, 10 /* XXX */, 6 /* XXX */,
771                                   16 /* XXX */, 0, 0);
772          else
773             return calculate_desc(info, EU_UNIT_FPU, 12 /* XXX */, 4 /* XXX */, 0,
774                                   0, 4 /* XXX */,
775                                   0, 12 /* XXX */, 8 /* XXX */,
776                                   18 /* XXX */, 0, 0);
777 
778       case SHADER_OPCODE_QUAD_SWIZZLE:
779          if (devinfo->ver >= 11)
780             return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
781                                   0, 8 /* XXX */,
782                                   0, 10 /* XXX */, 6 /* XXX */,
783                                   14 /* XXX */, 0, 0);
784          else if (devinfo->ver >= 8)
785             return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
786                                   0, 8 /* XXX */,
787                                   0, 8 /* XXX */, 4 /* XXX */,
788                                   12 /* XXX */, 0, 0);
789          else if (devinfo->verx10 >= 75)
790             return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
791                                   0, 8 /* XXX */,
792                                   0, 10 /* XXX */, 6 /* XXX */,
793                                   16 /* XXX */, 0, 0);
794          else
795             return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
796                                   0, 8 /* XXX */,
797                                   0, 12 /* XXX */, 8 /* XXX */,
798                                   18 /* XXX */, 0, 0);
799 
800       case FS_OPCODE_DDY_FINE:
801          if (devinfo->ver >= 11)
802             return calculate_desc(info, EU_UNIT_FPU, 0, 14, 0, 0, 4,
803                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
804          else if (devinfo->ver >= 8)
805             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
806                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
807          else if (devinfo->verx10 >= 75)
808             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
809                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
810          else
811             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
812                                   0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0);
813 
814       case FS_OPCODE_LOAD_LIVE_CHANNELS:
815          if (devinfo->ver >= 11)
816             return calculate_desc(info, EU_UNIT_FPU, 2 /* XXX */, 0, 0,
817                                   2 /* XXX */, 0,
818                                   0, 0, 0, 10 /* XXX */, 0, 0);
819          else if (devinfo->ver >= 8)
820             return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
821                                   0, 2 /* XXX */,
822                                   0, 0, 0, 8 /* XXX */, 0, 0);
823          else
824             abort();
825 
826       case VEC4_OPCODE_PACK_BYTES:
827          if (devinfo->ver >= 8)
828             return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
829                                   4 /* XXX */, 0,
830                                   0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
831                                   0, 0);
832          else if (devinfo->verx10 >= 75)
833             return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
834                                   4 /* XXX */, 0,
835                                   0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
836                                   0, 0);
837          else
838             return calculate_desc(info, EU_UNIT_FPU, 4 /* XXX */, 0, 0,
839                                   4 /* XXX */, 0,
840                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
841                                   0, 0);
842 
843       case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
844       case TCS_OPCODE_GET_INSTANCE_ID:
845       case VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS:
846       case VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
847       case TES_OPCODE_CREATE_INPUT_READ_HEADER:
848          if (devinfo->ver >= 8)
849             return calculate_desc(info, EU_UNIT_FPU, 22 /* XXX */, 0, 0,
850                                   6 /* XXX */, 0,
851                                   0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
852                                   0, 0);
853          else if (devinfo->verx10 >= 75)
854             return calculate_desc(info, EU_UNIT_FPU, 26 /* XXX */, 0, 0,
855                                   6 /* XXX */, 0,
856                                   0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
857                                   0, 0);
858          else
859             return calculate_desc(info, EU_UNIT_FPU, 30 /* XXX */, 0, 0,
860                                   6 /* XXX */, 0,
861                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
862                                   0, 0);
863 
864       case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
865       case TCS_OPCODE_CREATE_BARRIER_HEADER:
866          if (devinfo->ver >= 8)
867             return calculate_desc(info, EU_UNIT_FPU, 32 /* XXX */, 0, 0,
868                                   8 /* XXX */, 0,
869                                   0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
870                                   0, 0);
871          else if (devinfo->verx10 >= 75)
872             return calculate_desc(info, EU_UNIT_FPU, 38 /* XXX */, 0, 0,
873                                   8 /* XXX */, 0,
874                                   0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
875                                   0, 0);
876          else if (devinfo->ver >= 6)
877             return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0,
878                                   8 /* XXX */, 0,
879                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
880                                   0, 0);
881          else
882             abort();
883 
884       case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
885          if (devinfo->ver >= 8)
886             return calculate_desc(info, EU_UNIT_FPU, 12 /* XXX */, 0, 0,
887                                   4 /* XXX */, 0,
888                                   0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
889                                   0, 0);
890          else if (devinfo->verx10 >= 75)
891             return calculate_desc(info, EU_UNIT_FPU, 14 /* XXX */, 0, 0,
892                                   4 /* XXX */, 0,
893                                   0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
894                                   0, 0);
895          else if (devinfo->ver >= 7)
896             return calculate_desc(info, EU_UNIT_FPU, 16 /* XXX */, 0, 0,
897                                   4 /* XXX */, 0,
898                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
899                                   0, 0);
900          else
901             abort();
902 
903       case SHADER_OPCODE_TEX:
904       case FS_OPCODE_TXB:
905       case SHADER_OPCODE_TXD:
906       case SHADER_OPCODE_TXF:
907       case SHADER_OPCODE_TXF_LZ:
908       case SHADER_OPCODE_TXL:
909       case SHADER_OPCODE_TXL_LZ:
910       case SHADER_OPCODE_TXF_CMS:
911       case SHADER_OPCODE_TXF_CMS_W:
912       case SHADER_OPCODE_TXF_UMS:
913       case SHADER_OPCODE_TXF_MCS:
914       case SHADER_OPCODE_TXS:
915       case SHADER_OPCODE_LOD:
916       case SHADER_OPCODE_GET_BUFFER_SIZE:
917       case SHADER_OPCODE_TG4:
918       case SHADER_OPCODE_TG4_OFFSET:
919       case SHADER_OPCODE_SAMPLEINFO:
920       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
921          return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16 /* XXX */,
922                                8 /* XXX */, 750 /* XXX */, 0, 0,
923                                2 /* XXX */, 0);
924 
925       case VEC4_OPCODE_URB_READ:
926       case VEC4_VS_OPCODE_URB_WRITE:
927       case VEC4_GS_OPCODE_URB_WRITE:
928       case VEC4_GS_OPCODE_URB_WRITE_ALLOCATE:
929       case GS_OPCODE_THREAD_END:
930       case GS_OPCODE_FF_SYNC:
931       case VEC4_TCS_OPCODE_URB_WRITE:
932       case TCS_OPCODE_RELEASE_INPUT:
933       case TCS_OPCODE_THREAD_END:
934          return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
935                                32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
936 
937       case SHADER_OPCODE_MEMORY_FENCE:
938       case SHADER_OPCODE_INTERLOCK:
939          switch (info.sfid) {
940          case GFX6_SFID_DATAPORT_RENDER_CACHE:
941             if (devinfo->ver >= 7)
942                return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 30 /* XXX */, 0,
943                                      10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
944             else
945                abort();
946 
947          case BRW_SFID_URB:
948          case GFX7_SFID_DATAPORT_DATA_CACHE:
949          case GFX12_SFID_SLM:
950          case GFX12_SFID_TGM:
951          case GFX12_SFID_UGM:
952          case HSW_SFID_DATAPORT_DATA_CACHE_1:
953             if (devinfo->ver >= 7)
954                return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 30 /* XXX */, 0,
955                                      10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
956             else
957                abort();
958 
959          default:
960             abort();
961          }
962 
963       case SHADER_OPCODE_GFX4_SCRATCH_READ:
964       case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
965       case SHADER_OPCODE_GFX7_SCRATCH_READ:
966          return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 0, 8 /* XXX */,
967                                10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
968 
969       case VEC4_OPCODE_UNTYPED_ATOMIC:
970          if (devinfo->ver >= 7)
971             return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
972                                   30 /* XXX */, 400 /* XXX */,
973                                   10 /* XXX */, 100 /* XXX */, 0, 0,
974                                   0, 400 /* XXX */);
975          else
976             abort();
977 
978       case VEC4_OPCODE_UNTYPED_SURFACE_READ:
979       case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
980          if (devinfo->ver >= 7)
981             return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
982                                   0, 20 /* XXX */,
983                                   10 /* XXX */, 100 /* XXX */, 0, 0,
984                                   0, 0);
985          else
986             abort();
987 
988       case FS_OPCODE_FB_WRITE:
989       case FS_OPCODE_FB_READ:
990       case FS_OPCODE_REP_FB_WRITE:
991          return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 0, 450 /* XXX */,
992                                10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
993 
994       case GS_OPCODE_SVB_WRITE:
995          if (devinfo->ver >= 6)
996             return calculate_desc(info, EU_UNIT_DP_RC, 2 /* XXX */, 0, 0,
997                                   0, 450 /* XXX */,
998                                   10 /* XXX */, 300 /* XXX */, 0, 0,
999                                   0, 0);
1000          else
1001             abort();
1002 
1003       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1004       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
1005          return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
1006                                10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
1007 
1008       case VS_OPCODE_PULL_CONSTANT_LOAD:
1009       case VS_OPCODE_PULL_CONSTANT_LOAD_GFX7:
1010          return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16,
1011                                8, 750, 0, 0, 2, 0);
1012 
1013       case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1014       case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1015       case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1016          if (devinfo->ver >= 7)
1017             return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
1018                                   0, 90 /* XXX */, 0, 0, 0, 0);
1019          else
1020             abort();
1021 
1022       case SHADER_OPCODE_BARRIER:
1023          if (devinfo->ver >= 7)
1024             return calculate_desc(info, EU_UNIT_GATEWAY, 90 /* XXX */, 0, 0,
1025                                   0 /* XXX */, 0,
1026                                   0, 0, 0, 0, 0, 0);
1027          else
1028             abort();
1029 
1030       case CS_OPCODE_CS_TERMINATE:
1031          if (devinfo->ver >= 7)
1032             return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0,
1033                                   10 /* XXX */, 0, 0, 0, 0, 0);
1034          else
1035             abort();
1036 
1037       case SHADER_OPCODE_SEND:
1038          switch (info.sfid) {
1039          case GFX6_SFID_DATAPORT_RENDER_CACHE:
1040             if (devinfo->ver >= 7) {
1041                switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1042                case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP:
1043                   return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
1044                                         30 /* XXX */, 450 /* XXX */,
1045                                         10 /* XXX */, 100 /* XXX */,
1046                                         0, 0, 0, 400 /* XXX */);
1047                default:
1048                   return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
1049                                         0, 450 /* XXX */,
1050                                         10 /* XXX */, 300 /* XXX */, 0, 0,
1051                                         0, 0);
1052                }
1053             } else if (devinfo->ver >= 6)  {
1054                return calculate_desc(info, EU_UNIT_DP_RC, 2 /* XXX */, 0, 0,
1055                                      0, 450 /* XXX */,
1056                                      10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
1057             } else {
1058                abort();
1059             }
1060          case BRW_SFID_SAMPLER: {
1061             if (devinfo->ver >= 6)
1062                return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16,
1063                                      8, 750, 0, 0, 2, 0);
1064             else
1065                abort();
1066          }
1067          case GFX7_SFID_DATAPORT_DATA_CACHE:
1068          case HSW_SFID_DATAPORT_DATA_CACHE_1:
1069             if (devinfo->verx10 >= 75) {
1070                switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1071                case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
1072                case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
1073                case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
1074                case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
1075                   return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1076                                         30 /* XXX */, 400 /* XXX */,
1077                                         10 /* XXX */, 100 /* XXX */, 0, 0,
1078                                         0, 400 /* XXX */);
1079 
1080                default:
1081                   return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1082                                         0, 20 /* XXX */,
1083                                         10 /* XXX */, 100 /* XXX */, 0, 0,
1084                                         0, 0);
1085                }
1086             } else if (devinfo->ver >= 7) {
1087                switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1088                case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
1089                   return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1090                                         30 /* XXX */, 400 /* XXX */,
1091                                         10 /* XXX */, 100 /* XXX */,
1092                                         0, 0, 0, 400 /* XXX */);
1093                default:
1094                   return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1095                                         0, 20 /* XXX */,
1096                                         10 /* XXX */, 100 /* XXX */, 0, 0,
1097                                         0, 0);
1098                }
1099             } else {
1100                abort();
1101             }
1102 
1103          case GFX12_SFID_UGM:
1104          case GFX12_SFID_TGM:
1105          case GFX12_SFID_SLM:
1106             switch (lsc_msg_desc_opcode(devinfo, info.desc)) {
1107             case LSC_OP_LOAD:
1108             case LSC_OP_STORE:
1109             case LSC_OP_LOAD_CMASK:
1110             case LSC_OP_STORE_CMASK:
1111                return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1112                                      0, 20 /* XXX */,
1113                                      10 /* XXX */, 100 /* XXX */, 0, 0,
1114                                      0, 0);
1115 
1116             case LSC_OP_FENCE:
1117             case LSC_OP_ATOMIC_INC:
1118             case LSC_OP_ATOMIC_DEC:
1119             case LSC_OP_ATOMIC_LOAD:
1120             case LSC_OP_ATOMIC_STORE:
1121             case LSC_OP_ATOMIC_ADD:
1122             case LSC_OP_ATOMIC_SUB:
1123             case LSC_OP_ATOMIC_MIN:
1124             case LSC_OP_ATOMIC_MAX:
1125             case LSC_OP_ATOMIC_UMIN:
1126             case LSC_OP_ATOMIC_UMAX:
1127             case LSC_OP_ATOMIC_CMPXCHG:
1128             case LSC_OP_ATOMIC_FADD:
1129             case LSC_OP_ATOMIC_FSUB:
1130             case LSC_OP_ATOMIC_FMIN:
1131             case LSC_OP_ATOMIC_FMAX:
1132             case LSC_OP_ATOMIC_FCMPXCHG:
1133             case LSC_OP_ATOMIC_AND:
1134             case LSC_OP_ATOMIC_OR:
1135             case LSC_OP_ATOMIC_XOR:
1136                return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
1137                                      30 /* XXX */, 400 /* XXX */,
1138                                      10 /* XXX */, 100 /* XXX */, 0, 0,
1139                                      0, 400 /* XXX */);
1140             default:
1141                abort();
1142             }
1143 
1144          case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH:
1145          case GEN_RT_SFID_RAY_TRACE_ACCELERATOR:
1146             return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0,
1147                                   10 /* XXX */, 0, 0, 0, 0, 0);
1148 
1149          case BRW_SFID_URB:
1150             return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
1151                                   32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
1152 
1153          default:
1154             abort();
1155          }
1156 
1157       case SHADER_OPCODE_UNDEF:
1158       case SHADER_OPCODE_HALT_TARGET:
1159       case FS_OPCODE_SCHEDULING_FENCE:
1160          return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
1161                                0, 0, 0, 0, 0, 0);
1162 
1163       default:
1164          abort();
1165       }
1166    }
1167 
1168    /**
1169     * Model the performance behavior of a stall on the specified dependency
1170     * ID.
1171     */
1172    void
stall_on_dependency(state & st,enum intel_eu_dependency_id id)1173    stall_on_dependency(state &st, enum intel_eu_dependency_id id)
1174    {
1175       if (id < ARRAY_SIZE(st.dep_ready))
1176          st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
1177                                        st.dep_ready[id]);
1178    }
1179 
1180    /**
1181     * Model the performance behavior of the front-end and back-end while
1182     * executing an instruction with the specified timing information, assuming
1183     * all dependencies are already clear.
1184     */
1185    void
execute_instruction(state & st,const perf_desc & perf)1186    execute_instruction(state &st, const perf_desc &perf)
1187    {
1188       /* Compute the time at which the front-end will be ready to execute the
1189        * next instruction.
1190        */
1191       st.unit_ready[EU_UNIT_FE] += perf.df;
1192 
1193       if (perf.u < EU_NUM_UNITS) {
1194          /* Wait for the back-end to be ready to execute this instruction. */
1195          st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
1196                                        st.unit_ready[perf.u]);
1197 
1198          /* Compute the time at which the back-end will be ready to execute
1199           * the next instruction, and update the back-end utilization.
1200           */
1201          st.unit_ready[perf.u] = st.unit_ready[EU_UNIT_FE] + perf.db;
1202          st.unit_busy[perf.u] += perf.db * st.weight;
1203       }
1204    }
1205 
1206    /**
1207     * Model the performance behavior of a read dependency provided by an
1208     * instruction.
1209     */
1210    void
mark_read_dependency(state & st,const perf_desc & perf,enum intel_eu_dependency_id id)1211    mark_read_dependency(state &st, const perf_desc &perf,
1212                         enum intel_eu_dependency_id id)
1213    {
1214       if (id < ARRAY_SIZE(st.dep_ready))
1215          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ls;
1216    }
1217 
1218    /**
1219     * Model the performance behavior of a write dependency provided by an
1220     * instruction.
1221     */
1222    void
mark_write_dependency(state & st,const perf_desc & perf,enum intel_eu_dependency_id id)1223    mark_write_dependency(state &st, const perf_desc &perf,
1224                          enum intel_eu_dependency_id id)
1225    {
1226       if (id >= EU_DEPENDENCY_ID_ACCUM0 && id < EU_DEPENDENCY_ID_FLAG0)
1227          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.la;
1228       else if (id >= EU_DEPENDENCY_ID_FLAG0 && id < EU_DEPENDENCY_ID_SBID_WR0)
1229          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.lf;
1230       else if (id < ARRAY_SIZE(st.dep_ready))
1231          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ld;
1232    }
1233 
1234    /**
1235     * Return the dependency ID of a backend_reg, offset by \p delta GRFs.
1236     */
1237    enum intel_eu_dependency_id
reg_dependency_id(const intel_device_info * devinfo,const backend_reg & r,const int delta)1238    reg_dependency_id(const intel_device_info *devinfo, const backend_reg &r,
1239                      const int delta)
1240    {
1241       if (r.file == VGRF) {
1242          const unsigned i = r.nr + r.offset / REG_SIZE + delta;
1243          assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
1244          return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
1245 
1246       } else if (r.file == FIXED_GRF) {
1247          const unsigned i = r.nr + delta;
1248          assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
1249          return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
1250 
1251       } else if (r.file == MRF && devinfo->ver >= 7) {
1252          const unsigned i = GFX7_MRF_HACK_START +
1253                             r.nr + r.offset / REG_SIZE + delta;
1254          assert(i < EU_DEPENDENCY_ID_MRF0 - EU_DEPENDENCY_ID_GRF0);
1255          return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
1256 
1257       } else if (r.file == MRF && devinfo->ver < 7) {
1258          const unsigned i = (r.nr & ~BRW_MRF_COMPR4) +
1259                             r.offset / REG_SIZE + delta;
1260          assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_MRF0);
1261          return intel_eu_dependency_id(EU_DEPENDENCY_ID_MRF0 + i);
1262 
1263       } else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS &&
1264                  r.nr < BRW_ARF_ACCUMULATOR) {
1265          assert(delta == 0);
1266          return EU_DEPENDENCY_ID_ADDR0;
1267 
1268       } else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR &&
1269                  r.nr < BRW_ARF_FLAG) {
1270          const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta;
1271          assert(i < EU_DEPENDENCY_ID_FLAG0 - EU_DEPENDENCY_ID_ACCUM0);
1272          return intel_eu_dependency_id(EU_DEPENDENCY_ID_ACCUM0 + i);
1273 
1274       } else {
1275          return EU_NUM_DEPENDENCY_IDS;
1276       }
1277    }
1278 
1279    /**
1280     * Return the dependency ID of flag register starting at offset \p i.
1281     */
1282    enum intel_eu_dependency_id
flag_dependency_id(unsigned i)1283    flag_dependency_id(unsigned i)
1284    {
1285       assert(i < EU_DEPENDENCY_ID_SBID_WR0 - EU_DEPENDENCY_ID_FLAG0);
1286       return intel_eu_dependency_id(EU_DEPENDENCY_ID_FLAG0 + i);
1287    }
1288 
1289    /**
1290     * Return the dependency ID corresponding to the SBID read completion
1291     * condition of a Gfx12+ SWSB.
1292     */
1293    enum intel_eu_dependency_id
tgl_swsb_rd_dependency_id(tgl_swsb swsb)1294    tgl_swsb_rd_dependency_id(tgl_swsb swsb)
1295    {
1296       if (swsb.mode) {
1297          assert(swsb.sbid <
1298                 EU_NUM_DEPENDENCY_IDS - EU_DEPENDENCY_ID_SBID_RD0);
1299          return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_RD0 + swsb.sbid);
1300       } else {
1301          return EU_NUM_DEPENDENCY_IDS;
1302       }
1303    }
1304 
1305    /**
1306     * Return the dependency ID corresponding to the SBID write completion
1307     * condition of a Gfx12+ SWSB.
1308     */
1309    enum intel_eu_dependency_id
tgl_swsb_wr_dependency_id(tgl_swsb swsb)1310    tgl_swsb_wr_dependency_id(tgl_swsb swsb)
1311    {
1312       if (swsb.mode) {
1313          assert(swsb.sbid <
1314                 EU_DEPENDENCY_ID_SBID_RD0 - EU_DEPENDENCY_ID_SBID_WR0);
1315          return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_WR0 + swsb.sbid);
1316       } else {
1317          return EU_NUM_DEPENDENCY_IDS;
1318       }
1319    }
1320 
1321    /**
1322     * Return the implicit accumulator register accessed by channel \p i of the
1323     * instruction.
1324     */
1325    unsigned
accum_reg_of_channel(const intel_device_info * devinfo,const backend_instruction * inst,brw_reg_type tx,unsigned i)1326    accum_reg_of_channel(const intel_device_info *devinfo,
1327                         const backend_instruction *inst,
1328                         brw_reg_type tx, unsigned i)
1329    {
1330       assert(inst->reads_accumulator_implicitly() ||
1331              inst->writes_accumulator_implicitly(devinfo));
1332       const unsigned offset = (inst->group + i) * type_sz(tx) *
1333          (devinfo->ver < 7 || brw_reg_type_is_floating_point(tx) ? 1 : 2);
1334       return offset / REG_SIZE % 2;
1335    }
1336 
1337    /**
1338     * Model the performance behavior of an FS back-end instruction.
1339     */
1340    void
issue_fs_inst(state & st,const struct brw_isa_info * isa,const backend_instruction * be_inst)1341    issue_fs_inst(state &st, const struct brw_isa_info *isa,
1342                  const backend_instruction *be_inst)
1343    {
1344       const struct intel_device_info *devinfo = isa->devinfo;
1345       const fs_inst *inst = static_cast<const fs_inst *>(be_inst);
1346       const instruction_info info(isa, inst);
1347       const perf_desc perf = instruction_desc(info);
1348 
1349       /* Stall on any source dependencies. */
1350       for (unsigned i = 0; i < inst->sources; i++) {
1351          for (unsigned j = 0; j < regs_read(inst, i); j++)
1352             stall_on_dependency(
1353                st, reg_dependency_id(devinfo, inst->src[i], j));
1354       }
1355 
1356       if (inst->reads_accumulator_implicitly()) {
1357          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1358               j <= accum_reg_of_channel(devinfo, inst, info.tx,
1359                                         inst->exec_size - 1); j++)
1360             stall_on_dependency(
1361                st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1362       }
1363 
1364       if (is_send(inst) && inst->base_mrf != -1) {
1365          for (unsigned j = 0; j < inst->mlen; j++)
1366             stall_on_dependency(
1367                st, reg_dependency_id(
1368                   devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1369       }
1370 
1371       if (const unsigned mask = inst->flags_read(devinfo)) {
1372          for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1373             if (mask & (1 << i))
1374                stall_on_dependency(st, flag_dependency_id(i));
1375          }
1376       }
1377 
1378       /* Stall on any write dependencies. */
1379       if (!inst->no_dd_check) {
1380          if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1381             for (unsigned j = 0; j < regs_written(inst); j++)
1382                stall_on_dependency(
1383                   st, reg_dependency_id(devinfo, inst->dst, j));
1384          }
1385 
1386          if (inst->writes_accumulator_implicitly(devinfo)) {
1387             for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1388                  j <= accum_reg_of_channel(devinfo, inst, info.tx,
1389                                            inst->exec_size - 1); j++)
1390                stall_on_dependency(
1391                   st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1392          }
1393 
1394          if (const unsigned mask = inst->flags_written(devinfo)) {
1395             for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1396                if (mask & (1 << i))
1397                   stall_on_dependency(st, flag_dependency_id(i));
1398             }
1399          }
1400       }
1401 
1402       /* Stall on any SBID dependencies. */
1403       if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST))
1404          stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched));
1405       else if (inst->sched.mode & TGL_SBID_SRC)
1406          stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched));
1407 
1408       /* Execute the instruction. */
1409       execute_instruction(st, perf);
1410 
1411       /* Mark any source dependencies. */
1412       if (inst->is_send_from_grf()) {
1413          for (unsigned i = 0; i < inst->sources; i++) {
1414             if (inst->is_payload(i)) {
1415                for (unsigned j = 0; j < regs_read(inst, i); j++)
1416                   mark_read_dependency(
1417                      st, perf, reg_dependency_id(devinfo, inst->src[i], j));
1418             }
1419          }
1420       }
1421 
1422       if (is_send(inst) && inst->base_mrf != -1) {
1423          for (unsigned j = 0; j < inst->mlen; j++)
1424             mark_read_dependency(st, perf,
1425                reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1426       }
1427 
1428       /* Mark any destination dependencies. */
1429       if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1430          for (unsigned j = 0; j < regs_written(inst); j++) {
1431             mark_write_dependency(st, perf,
1432                                   reg_dependency_id(devinfo, inst->dst, j));
1433          }
1434       }
1435 
1436       if (inst->writes_accumulator_implicitly(devinfo)) {
1437          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1438               j <= accum_reg_of_channel(devinfo, inst, info.tx,
1439                                         inst->exec_size - 1); j++)
1440             mark_write_dependency(st, perf,
1441                                   reg_dependency_id(devinfo, brw_acc_reg(8), j));
1442       }
1443 
1444       if (const unsigned mask = inst->flags_written(devinfo)) {
1445          for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1446             if (mask & (1 << i))
1447                mark_write_dependency(st, perf, flag_dependency_id(i));
1448          }
1449       }
1450 
1451       /* Mark any SBID dependencies. */
1452       if (inst->sched.mode & TGL_SBID_SET) {
1453          mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched));
1454          mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched));
1455       }
1456    }
1457 
1458    /**
1459     * Model the performance behavior of a VEC4 back-end instruction.
1460     */
1461    void
issue_vec4_instruction(state & st,const struct brw_isa_info * isa,const backend_instruction * be_inst)1462    issue_vec4_instruction(state &st, const struct brw_isa_info *isa,
1463                           const backend_instruction *be_inst)
1464    {
1465       const struct intel_device_info *devinfo = isa->devinfo;
1466       const vec4_instruction *inst =
1467          static_cast<const vec4_instruction *>(be_inst);
1468       const instruction_info info(isa, inst);
1469       const perf_desc perf = instruction_desc(info);
1470 
1471       /* Stall on any source dependencies. */
1472       for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
1473          for (unsigned j = 0; j < regs_read(inst, i); j++)
1474             stall_on_dependency(
1475                st, reg_dependency_id(devinfo, inst->src[i], j));
1476       }
1477 
1478       if (inst->reads_accumulator_implicitly()) {
1479          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1480               j <= accum_reg_of_channel(devinfo, inst, info.tx,
1481                                         inst->exec_size - 1); j++)
1482             stall_on_dependency(
1483                st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1484       }
1485 
1486       if (inst->base_mrf != -1) {
1487          for (unsigned j = 0; j < inst->mlen; j++)
1488             stall_on_dependency(
1489                st, reg_dependency_id(
1490                   devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1491       }
1492 
1493       if (inst->reads_flag())
1494          stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0);
1495 
1496       /* Stall on any write dependencies. */
1497       if (!inst->no_dd_check) {
1498          if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1499             for (unsigned j = 0; j < regs_written(inst); j++)
1500                stall_on_dependency(
1501                   st, reg_dependency_id(devinfo, inst->dst, j));
1502          }
1503 
1504          if (inst->writes_accumulator_implicitly(devinfo)) {
1505             for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1506                  j <= accum_reg_of_channel(devinfo, inst, info.tx,
1507                                            inst->exec_size - 1); j++)
1508                stall_on_dependency(
1509                   st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1510          }
1511 
1512          if (inst->writes_flag(devinfo))
1513             stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0);
1514       }
1515 
1516       /* Execute the instruction. */
1517       execute_instruction(st, perf);
1518 
1519       /* Mark any source dependencies. */
1520       if (inst->is_send_from_grf()) {
1521          for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
1522             for (unsigned j = 0; j < regs_read(inst, i); j++)
1523                mark_read_dependency(
1524                   st, perf, reg_dependency_id(devinfo, inst->src[i], j));
1525          }
1526       }
1527 
1528       if (inst->base_mrf != -1) {
1529          for (unsigned j = 0; j < inst->mlen; j++)
1530             mark_read_dependency(st, perf,
1531                reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1532       }
1533 
1534       /* Mark any destination dependencies. */
1535       if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1536          for (unsigned j = 0; j < regs_written(inst); j++) {
1537             mark_write_dependency(st, perf,
1538                                   reg_dependency_id(devinfo, inst->dst, j));
1539          }
1540       }
1541 
1542       if (inst->writes_accumulator_implicitly(devinfo)) {
1543          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1544               j <= accum_reg_of_channel(devinfo, inst, info.tx,
1545                                         inst->exec_size - 1); j++)
1546             mark_write_dependency(st, perf,
1547                                   reg_dependency_id(devinfo, brw_acc_reg(8), j));
1548       }
1549 
1550       if (inst->writes_flag(devinfo))
1551          mark_write_dependency(st, perf, EU_DEPENDENCY_ID_FLAG0);
1552    }
1553 
1554    /**
1555     * Calculate the maximum possible throughput of the program compatible with
1556     * the cycle-count utilization estimated for each asynchronous unit, in
1557     * threads-per-cycle units.
1558     */
1559    float
calculate_thread_throughput(const state & st,float busy)1560    calculate_thread_throughput(const state &st, float busy)
1561    {
1562       for (unsigned i = 0; i < EU_NUM_UNITS; i++)
1563          busy = MAX2(busy, st.unit_busy[i]);
1564 
1565       return 1.0 / busy;
1566    }
1567 
1568    /**
1569     * Estimate the performance of the specified shader.
1570     */
1571    void
calculate_performance(performance & p,const backend_shader * s,void (* issue_instruction)(state &,const struct brw_isa_info *,const backend_instruction *),unsigned dispatch_width)1572    calculate_performance(performance &p, const backend_shader *s,
1573                          void (*issue_instruction)(
1574                             state &, const struct brw_isa_info *,
1575                             const backend_instruction *),
1576                          unsigned dispatch_width)
1577    {
1578       /* XXX - Note that the previous version of this code used worst-case
1579        *       scenario estimation of branching divergence for SIMD32 shaders,
1580        *       but this heuristic was removed to improve performance in common
1581        *       scenarios. Wider shader variants are less optimal when divergence
1582        *       is high, e.g. when application renders complex scene on a small
1583        *       surface. It is assumed that such renders are short, so their
1584        *       time doesn't matter and when it comes to the overall performance,
1585        *       they are dominated by more optimal larger renders.
1586        *
1587        *       It's possible that we could do better with divergence analysis
1588        *       by isolating branches which are 100% uniform.
1589        *
1590        *       Plumbing the trip counts from NIR loop analysis would allow us
1591        *       to do a better job regarding the loop weights.
1592        *
1593        *       In the meantime use values that roughly match the control flow
1594        *       weights used elsewhere in the compiler back-end.
1595        *
1596        *       Note that we provide slightly more pessimistic weights on
1597        *       Gfx12+ for SIMD32, since the effective warp size on that
1598        *       platform is 2x the SIMD width due to EU fusion, which increases
1599        *       the likelihood of divergent control flow in comparison to
1600        *       previous generations, giving narrower SIMD modes a performance
1601        *       advantage in several test-cases with non-uniform discard jumps.
1602        */
1603       const float discard_weight = (dispatch_width > 16 || s->devinfo->ver < 12 ?
1604                                     1.0 : 0.5);
1605       const float loop_weight = 10;
1606       unsigned halt_count = 0;
1607       unsigned elapsed = 0;
1608       state st;
1609 
1610       foreach_block(block, s->cfg) {
1611          const unsigned elapsed0 = elapsed;
1612 
1613          foreach_inst_in_block(backend_instruction, inst, block) {
1614             const unsigned clock0 = st.unit_ready[EU_UNIT_FE];
1615 
1616             issue_instruction(st, &s->compiler->isa, inst);
1617 
1618             if (inst->opcode == SHADER_OPCODE_HALT_TARGET && halt_count)
1619                st.weight /= discard_weight;
1620 
1621             elapsed += (st.unit_ready[EU_UNIT_FE] - clock0) * st.weight;
1622 
1623             if (inst->opcode == BRW_OPCODE_DO)
1624                st.weight *= loop_weight;
1625             else if (inst->opcode == BRW_OPCODE_WHILE)
1626                st.weight /= loop_weight;
1627             else if (inst->opcode == BRW_OPCODE_HALT && !halt_count++)
1628                st.weight *= discard_weight;
1629          }
1630 
1631          p.block_latency[block->num] = elapsed - elapsed0;
1632       }
1633 
1634       p.latency = elapsed;
1635       p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
1636    }
1637 }
1638 
performance(const fs_visitor * v)1639 brw::performance::performance(const fs_visitor *v) :
1640    block_latency(new unsigned[v->cfg->num_blocks])
1641 {
1642    calculate_performance(*this, v, issue_fs_inst, v->dispatch_width);
1643 }
1644 
performance(const vec4_visitor * v)1645 brw::performance::performance(const vec4_visitor *v) :
1646    block_latency(new unsigned[v->cfg->num_blocks])
1647 {
1648    calculate_performance(*this, v, issue_vec4_instruction, 8);
1649 }
1650 
~performance()1651 brw::performance::~performance()
1652 {
1653    delete[] block_latency;
1654 }
1655