• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2020 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "brw_eu.h"
25 #include "brw_fs.h"
26 #include "brw_vec4.h"
27 #include "brw_cfg.h"
28 
29 using namespace brw;
30 
31 namespace {
/**
 * Enumeration representing the various asynchronous units that can run
 * computations in parallel on behalf of a shader thread.
 *
 * The enumerators preceding num_units are numbered consecutively from
 * zero, so num_units equals the count of tracked units.
 */
enum unit {
   /** EU front-end. */
   unit_fe,
   /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
   unit_fpu,
   /** Extended Math unit (AKA FPU1 on Gfx8-11, part of the EU on Gfx6+). */
   unit_em,
   /** Sampler shared function. */
   unit_sampler,
   /** Pixel Interpolator shared function. */
   unit_pi,
   /** Unified Return Buffer shared function. */
   unit_urb,
   /** Data Port Data Cache shared function. */
   unit_dp_dc,
   /** Data Port Render Cache shared function. */
   unit_dp_rc,
   /** Data Port Constant Cache shared function. */
   unit_dp_cc,
   /** Message Gateway shared function. */
   unit_gateway,
   /** Thread Spawner shared function. */
   unit_spawner,
   /* unit_vme, */
   /* unit_cre, */
   /** Number of asynchronous units currently tracked. */
   num_units,
   /**
    * Dummy unit for instructions that don't consume runtime from the above.
    * It aliases num_units, i.e. it lies one past the last tracked unit.
    */
   unit_null = num_units
};
66 
67    /**
68     * Enumeration representing a computation result another computation can
69     * potentially depend on.
70     */
71    enum dependency_id {
72       /* Register part of the GRF. */
73       dependency_id_grf0 = 0,
74       /* Register part of the MRF.  Only used on Gfx4-6. */
75       dependency_id_mrf0 = dependency_id_grf0 + BRW_MAX_GRF,
76       /* Address register part of the ARF. */
77       dependency_id_addr0 = dependency_id_mrf0 + 24,
78       /* Accumulator register part of the ARF. */
79       dependency_id_accum0 = dependency_id_addr0 + 1,
80       /* Flag register part of the ARF. */
81       dependency_id_flag0 = dependency_id_accum0 + 12,
82       /* SBID token write completion.  Only used on Gfx12+. */
83       dependency_id_sbid_wr0 = dependency_id_flag0 + 8,
84       /* SBID token read completion.  Only used on Gfx12+. */
85       dependency_id_sbid_rd0 = dependency_id_sbid_wr0 + 16,
86       /* Number of computation dependencies currently tracked. */
87       num_dependency_ids = dependency_id_sbid_rd0 + 16
88    };
89 
90    /**
91     * State of our modeling of the program execution.
92     */
93    struct state {
state__anon63ed41e20111::state94       state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
95       /**
96        * Time at which a given unit will be ready to execute the next
97        * computation, in clock units.
98        */
99       unsigned unit_ready[num_units];
100       /**
101        * Time at which an instruction dependent on a given dependency ID will
102        * be ready to execute, in clock units.
103        */
104       unsigned dep_ready[num_dependency_ids];
105       /**
106        * Aggregated utilization of a given unit excluding idle cycles,
107        * in clock units.
108        */
109       float unit_busy[num_units];
110       /**
111        * Factor of the overhead of a computation accounted for in the
112        * aggregated utilization calculation.
113        */
114       float weight;
115    };
116 
117    /**
118     * Information derived from an IR instruction used to compute performance
119     * estimates.  Allows the timing calculation to work on both FS and VEC4
120     * instructions.
121     */
122    struct instruction_info {
instruction_info__anon63ed41e20111::instruction_info123       instruction_info(const intel_device_info *devinfo, const fs_inst *inst) :
124          devinfo(devinfo), op(inst->opcode),
125          td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
126          tx(get_exec_type(inst)), sx(0), ss(0),
127          sc(has_bank_conflict(devinfo, inst) ? sd : 0),
128          desc(inst->desc), sfid(inst->sfid)
129       {
130          /* We typically want the maximum source size, except for split send
131           * messages which require the total size.
132           */
133          if (inst->opcode == SHADER_OPCODE_SEND) {
134             ss = DIV_ROUND_UP(inst->size_read(2), REG_SIZE) +
135                  DIV_ROUND_UP(inst->size_read(3), REG_SIZE);
136          } else {
137             for (unsigned i = 0; i < inst->sources; i++)
138                ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
139          }
140 
141          /* Convert the execution size to GRF units. */
142          sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
143 
144          /* 32x32 integer multiplication has half the usual ALU throughput.
145           * Treat it as double-precision.
146           */
147          if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
148              !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
149              type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
150             tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
151       }
152 
instruction_info__anon63ed41e20111::instruction_info153       instruction_info(const intel_device_info *devinfo,
154                        const vec4_instruction *inst) :
155          devinfo(devinfo), op(inst->opcode),
156          td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
157          tx(get_exec_type(inst)), sx(0), ss(0), sc(0),
158          desc(inst->desc), sfid(inst->sfid)
159       {
160          /* Compute the maximum source size. */
161          for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++)
162             ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
163 
164          /* Convert the execution size to GRF units. */
165          sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
166 
167          /* 32x32 integer multiplication has half the usual ALU throughput.
168           * Treat it as double-precision.
169           */
170          if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
171              !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
172              type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
173             tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
174       }
175 
176       /** Device information. */
177       const struct intel_device_info *devinfo;
178       /** Instruction opcode. */
179       opcode op;
180       /** Destination type. */
181       brw_reg_type td;
182       /** Destination size in GRF units. */
183       unsigned sd;
184       /** Execution type. */
185       brw_reg_type tx;
186       /** Execution size in GRF units. */
187       unsigned sx;
188       /** Source size. */
189       unsigned ss;
190       /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
191       unsigned sc;
192       /** Send message descriptor. */
193       uint32_t desc;
194       /** Send message shared function ID. */
195       uint8_t sfid;
196    };
197 
198    /**
199     * Timing information of an instruction used to estimate the performance of
200     * the program.
201     */
202    struct perf_desc {
perf_desc__anon63ed41e20111::perf_desc203       perf_desc(unit u, int df, int db, int ls, int ld, int la, int lf) :
204          u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}
205 
206       /**
207        * Back-end unit its runtime shall be accounted to, in addition to the
208        * EU front-end which is always assumed to be involved.
209        */
210       unit u;
211       /**
212        * Overhead cycles from the time that the EU front-end starts executing
213        * the instruction until it's ready to execute the next instruction.
214        */
215       int df;
216       /**
217        * Overhead cycles from the time that the back-end starts executing the
218        * instruction until it's ready to execute the next instruction.
219        */
220       int db;
221       /**
222        * Latency cycles from the time that the back-end starts executing the
223        * instruction until its sources have been read from the register file.
224        */
225       int ls;
226       /**
227        * Latency cycles from the time that the back-end starts executing the
228        * instruction until its regular destination has been written to the
229        * register file.
230        */
231       int ld;
232       /**
233        * Latency cycles from the time that the back-end starts executing the
234        * instruction until its accumulator destination has been written to the
235        * ARF file.
236        *
237        * Note that this is an approximation of the real behavior of
238        * accumulating instructions in the hardware: Instead of modeling a pair
239        * of back-to-back accumulating instructions as a first computation with
240        * latency equal to ld followed by another computation with a
241        * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
242        * model the stall as if it occurred at the top of the pipeline, with
243        * the latency of the accumulator computation offset accordingly.
244        */
245       int la;
246       /**
247        * Latency cycles from the time that the back-end starts executing the
248        * instruction until its flag destination has been written to the ARF
249        * file.
250        */
251       int lf;
252    };
253 
254    /**
255     * Compute the timing information of an instruction based on any relevant
256     * information from the IR and a number of parameters specifying a linear
257     * approximation: Parameter X_Y specifies the derivative of timing X
258     * relative to info field Y, while X_1 specifies the independent term of
259     * the approximation of timing X.
260     */
261    perf_desc
calculate_desc(const instruction_info & info,unit u,int df_1,int df_sd,int df_sc,int db_1,int db_sx,int ls_1,int ld_1,int la_1,int lf_1,int l_ss,int l_sd)262    calculate_desc(const instruction_info &info, unit u,
263                   int df_1, int df_sd, int df_sc,
264                   int db_1, int db_sx,
265                   int ls_1, int ld_1, int la_1, int lf_1,
266                   int l_ss, int l_sd)
267    {
268       return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
269                           db_1 + db_sx * int(info.sx),
270                           ls_1 + l_ss * int(info.ss),
271                           ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
272                           la_1, lf_1);
273    }
274 
275    /**
276     * Compute the timing information of an instruction based on any relevant
277     * information from the IR and a number of linear approximation parameters
278     * hard-coded for each IR instruction.
279     *
280     * Most timing parameters are obtained from the multivariate linear
281     * regression of a sample of empirical timings measured using the tm0
282     * register (as can be done today by using the shader_time debugging
283     * option).  The Gfx4-5 math timings are obtained from BSpec Volume 5c.3
284     * "Shared Functions - Extended Math", Section 3.2 "Performance".
285     * Parameters marked XXX shall be considered low-quality, they're possibly
286     * high variance or completely guessed in cases where experimental data was
287     * unavailable.
288     */
289    const perf_desc
instruction_desc(const instruction_info & info)290    instruction_desc(const instruction_info &info)
291    {
292       const struct intel_device_info *devinfo = info.devinfo;
293 
294       switch (info.op) {
295       case BRW_OPCODE_SYNC:
296       case BRW_OPCODE_SEL:
297       case BRW_OPCODE_NOT:
298       case BRW_OPCODE_AND:
299       case BRW_OPCODE_OR:
300       case BRW_OPCODE_XOR:
301       case BRW_OPCODE_SHR:
302       case BRW_OPCODE_SHL:
303       case BRW_OPCODE_DIM:
304       case BRW_OPCODE_ASR:
305       case BRW_OPCODE_CMPN:
306       case BRW_OPCODE_F16TO32:
307       case BRW_OPCODE_BFREV:
308       case BRW_OPCODE_BFI1:
309       case BRW_OPCODE_AVG:
310       case BRW_OPCODE_FRC:
311       case BRW_OPCODE_RNDU:
312       case BRW_OPCODE_RNDD:
313       case BRW_OPCODE_RNDE:
314       case BRW_OPCODE_RNDZ:
315       case BRW_OPCODE_MAC:
316       case BRW_OPCODE_MACH:
317       case BRW_OPCODE_LZD:
318       case BRW_OPCODE_FBH:
319       case BRW_OPCODE_FBL:
320       case BRW_OPCODE_CBIT:
321       case BRW_OPCODE_ADDC:
322       case BRW_OPCODE_ROR:
323       case BRW_OPCODE_ROL:
324       case BRW_OPCODE_SUBB:
325       case BRW_OPCODE_SAD2:
326       case BRW_OPCODE_SADA2:
327       case BRW_OPCODE_LINE:
328       case BRW_OPCODE_NOP:
329       case SHADER_OPCODE_CLUSTER_BROADCAST:
330       case SHADER_OPCODE_SCRATCH_HEADER:
331       case FS_OPCODE_DDX_COARSE:
332       case FS_OPCODE_DDX_FINE:
333       case FS_OPCODE_DDY_COARSE:
334       case FS_OPCODE_PIXEL_X:
335       case FS_OPCODE_PIXEL_Y:
336       case FS_OPCODE_SET_SAMPLE_ID:
337       case VEC4_OPCODE_MOV_BYTES:
338       case VEC4_OPCODE_UNPACK_UNIFORM:
339       case VEC4_OPCODE_DOUBLE_TO_F32:
340       case VEC4_OPCODE_DOUBLE_TO_D32:
341       case VEC4_OPCODE_DOUBLE_TO_U32:
342       case VEC4_OPCODE_TO_DOUBLE:
343       case VEC4_OPCODE_PICK_LOW_32BIT:
344       case VEC4_OPCODE_PICK_HIGH_32BIT:
345       case VEC4_OPCODE_SET_LOW_32BIT:
346       case VEC4_OPCODE_SET_HIGH_32BIT:
347       case VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
348       case GS_OPCODE_SET_DWORD_2:
349       case GS_OPCODE_SET_WRITE_OFFSET:
350       case GS_OPCODE_SET_VERTEX_COUNT:
351       case GS_OPCODE_PREPARE_CHANNEL_MASKS:
352       case GS_OPCODE_SET_CHANNEL_MASKS:
353       case GS_OPCODE_GET_INSTANCE_ID:
354       case GS_OPCODE_SET_PRIMITIVE_ID:
355       case GS_OPCODE_SVB_SET_DST_INDEX:
356       case TCS_OPCODE_SRC0_010_IS_ZERO:
357       case TCS_OPCODE_GET_PRIMITIVE_ID:
358       case TES_OPCODE_GET_PRIMITIVE_ID:
359       case SHADER_OPCODE_GET_DSS_ID:
360          if (devinfo->ver >= 11) {
361             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
362                                   0, 10, 6 /* XXX */, 14, 0, 0);
363          } else if (devinfo->ver >= 8) {
364             if (type_sz(info.tx) > 4)
365                return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
366                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
367             else
368                return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
369                                      0, 8, 4, 12, 0, 0);
370          } else if (devinfo->is_haswell) {
371             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
372                                   0, 10, 6 /* XXX */, 16, 0, 0);
373          } else {
374             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
375                                   0, 12, 8 /* XXX */, 18, 0, 0);
376          }
377 
378       case BRW_OPCODE_MOV:
379       case BRW_OPCODE_CMP:
380       case BRW_OPCODE_ADD:
381       case BRW_OPCODE_ADD3:
382       case BRW_OPCODE_MUL:
383       case SHADER_OPCODE_MOV_RELOC_IMM:
384       case VEC4_OPCODE_MOV_FOR_SCRATCH:
385          if (devinfo->ver >= 11) {
386             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
387                                   0, 10, 6, 14, 0, 0);
388          } else if (devinfo->ver >= 8) {
389             if (type_sz(info.tx) > 4)
390                return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
391                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
392             else
393                return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
394                                      0, 8, 4, 12, 0, 0);
395          } else if (devinfo->is_haswell) {
396             if (info.tx == BRW_REGISTER_TYPE_F)
397                return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
398                                      0, 12, 8 /* XXX */, 18, 0, 0);
399             else
400                return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
401                                      0, 10, 6 /* XXX */, 16, 0, 0);
402          } else if (devinfo->ver >= 7) {
403             if (info.tx == BRW_REGISTER_TYPE_F)
404                return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
405                                      0, 14, 10 /* XXX */, 20, 0, 0);
406             else
407                return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
408                                      0, 12, 8 /* XXX */, 18, 0, 0);
409          } else {
410             return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 0,
411                                   0, 2 /* XXX */,
412                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
413                                   0, 0);
414          }
415 
416       case BRW_OPCODE_BFE:
417       case BRW_OPCODE_BFI2:
418       case BRW_OPCODE_CSEL:
419          if (devinfo->ver >= 11)
420             return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
421                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
422          else if (devinfo->ver >= 8)
423             return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
424                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
425          else if (devinfo->is_haswell)
426             return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
427                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
428          else if (devinfo->ver >= 7)
429             return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
430                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
431          else
432             abort();
433 
434       case BRW_OPCODE_MAD:
435          if (devinfo->ver >= 11) {
436             return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
437                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
438          } else if (devinfo->ver >= 8) {
439             if (type_sz(info.tx) > 4)
440                return calculate_desc(info, unit_fpu, 0, 4, 1, 0, 4,
441                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
442             else
443                return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
444                                      0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
445          } else if (devinfo->is_haswell) {
446             if (info.tx == BRW_REGISTER_TYPE_F)
447                return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
448                                      0, 12, 8 /* XXX */, 18, 0, 0);
449             else
450                return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
451                                      0, 10, 6 /* XXX */, 16, 0, 0);
452          } else if (devinfo->ver >= 7) {
453             if (info.tx == BRW_REGISTER_TYPE_F)
454                return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
455                                      0, 14, 10 /* XXX */, 20, 0, 0);
456             else
457                return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
458                                      0, 12, 8 /* XXX */, 18, 0, 0);
459          } else if (devinfo->ver >= 6) {
460             return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 1 /* XXX */,
461                                   0, 2 /* XXX */,
462                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
463                                   0, 0);
464          } else {
465             abort();
466          }
467 
468       case BRW_OPCODE_F32TO16:
469          if (devinfo->ver >= 11)
470             return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
471                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
472          else if (devinfo->ver >= 8)
473             return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
474                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
475          else if (devinfo->is_haswell)
476             return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
477                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
478          else if (devinfo->ver >= 7)
479             return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
480                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
481          else
482             abort();
483 
484       case BRW_OPCODE_DP4:
485       case BRW_OPCODE_DPH:
486       case BRW_OPCODE_DP3:
487       case BRW_OPCODE_DP2:
488          if (devinfo->ver >= 8)
489             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
490                                   0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
491          else if (devinfo->is_haswell)
492             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
493                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
494          else
495             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
496                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
497 
498       case BRW_OPCODE_DP4A:
499          if (devinfo->ver >= 12)
500             return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
501                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
502          else
503             abort();
504 
505       case SHADER_OPCODE_RCP:
506       case SHADER_OPCODE_RSQ:
507       case SHADER_OPCODE_SQRT:
508       case SHADER_OPCODE_EXP2:
509       case SHADER_OPCODE_LOG2:
510       case SHADER_OPCODE_SIN:
511       case SHADER_OPCODE_COS:
512       case SHADER_OPCODE_POW:
513       case SHADER_OPCODE_INT_QUOTIENT:
514       case SHADER_OPCODE_INT_REMAINDER:
515          if (devinfo->ver >= 6) {
516             switch (info.op) {
517             case SHADER_OPCODE_RCP:
518             case SHADER_OPCODE_RSQ:
519             case SHADER_OPCODE_SQRT:
520             case SHADER_OPCODE_EXP2:
521             case SHADER_OPCODE_LOG2:
522             case SHADER_OPCODE_SIN:
523             case SHADER_OPCODE_COS:
524                if (devinfo->ver >= 8)
525                   return calculate_desc(info, unit_em, -2, 4, 0, 0, 4,
526                                         0, 16, 0, 0, 0, 0);
527                else if (devinfo->is_haswell)
528                   return calculate_desc(info, unit_em, 0, 2, 0, 0, 2,
529                                         0, 12, 0, 0, 0, 0);
530                else
531                   return calculate_desc(info, unit_em, 0, 2, 0, 0, 2,
532                                         0, 14, 0, 0, 0, 0);
533 
534             case SHADER_OPCODE_POW:
535                if (devinfo->ver >= 8)
536                   return calculate_desc(info, unit_em, -2, 4, 0, 0, 8,
537                                         0, 24, 0, 0, 0, 0);
538                else if (devinfo->is_haswell)
539                   return calculate_desc(info, unit_em, 0, 2, 0, 0, 4,
540                                         0, 20, 0, 0, 0, 0);
541                else
542                   return calculate_desc(info, unit_em, 0, 2, 0, 0, 4,
543                                         0, 22, 0, 0, 0, 0);
544 
545             case SHADER_OPCODE_INT_QUOTIENT:
546             case SHADER_OPCODE_INT_REMAINDER:
547                return calculate_desc(info, unit_em, 2, 0, 0, 26, 0,
548                                      0, 28 /* XXX */, 0, 0, 0, 0);
549 
550             default:
551                abort();
552             }
553          } else {
554             switch (info.op) {
555             case SHADER_OPCODE_RCP:
556                return calculate_desc(info, unit_em, 2, 0, 0, 0, 8,
557                                      0, 22, 0, 0, 0, 8);
558 
559             case SHADER_OPCODE_RSQ:
560                return calculate_desc(info, unit_em, 2, 0, 0, 0, 16,
561                                      0, 44, 0, 0, 0, 8);
562 
563             case SHADER_OPCODE_INT_QUOTIENT:
564             case SHADER_OPCODE_SQRT:
565             case SHADER_OPCODE_LOG2:
566                return calculate_desc(info, unit_em, 2, 0, 0, 0, 24,
567                                      0, 66, 0, 0, 0, 8);
568 
569             case SHADER_OPCODE_INT_REMAINDER:
570             case SHADER_OPCODE_EXP2:
571                return calculate_desc(info, unit_em, 2, 0, 0, 0, 32,
572                                      0, 88, 0, 0, 0, 8);
573 
574             case SHADER_OPCODE_SIN:
575             case SHADER_OPCODE_COS:
576                return calculate_desc(info, unit_em, 2, 0, 0, 0, 48,
577                                      0, 132, 0, 0, 0, 8);
578 
579             case SHADER_OPCODE_POW:
580                return calculate_desc(info, unit_em, 2, 0, 0, 0, 64,
581                                      0, 176, 0, 0, 0, 8);
582 
583             default:
584                abort();
585             }
586          }
587 
588       case BRW_OPCODE_DO:
589          if (devinfo->ver >= 6)
590             return calculate_desc(info, unit_null, 0, 0, 0, 0, 0,
591                                   0, 0, 0, 0, 0, 0);
592          else
593             return calculate_desc(info, unit_null, 2 /* XXX */, 0, 0, 0, 0,
594                                   0, 0, 0, 0, 0, 0);
595 
596       case BRW_OPCODE_IF:
597       case BRW_OPCODE_ELSE:
598       case BRW_OPCODE_ENDIF:
599       case BRW_OPCODE_WHILE:
600       case BRW_OPCODE_BREAK:
601       case BRW_OPCODE_CONTINUE:
602       case BRW_OPCODE_HALT:
603          if (devinfo->ver >= 8)
604             return calculate_desc(info, unit_null, 8, 0, 0, 0, 0,
605                                   0, 0, 0, 0, 0, 0);
606          else if (devinfo->is_haswell)
607             return calculate_desc(info, unit_null, 6, 0, 0, 0, 0,
608                                   0, 0, 0, 0, 0, 0);
609          else
610             return calculate_desc(info, unit_null, 2, 0, 0, 0, 0,
611                                   0, 0, 0, 0, 0, 0);
612 
613       case FS_OPCODE_LINTERP:
614          if (devinfo->ver >= 8)
615             return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
616                                   0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
617          else if (devinfo->is_haswell)
618             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
619                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
620          else
621             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
622                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
623 
624       case BRW_OPCODE_LRP:
625          if (devinfo->ver >= 8)
626             return calculate_desc(info, unit_fpu, 0, 4, 1, 0, 4,
627                                   0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
628          else if (devinfo->is_haswell)
629             return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
630                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
631          else if (devinfo->ver >= 6)
632             return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
633                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
634          else
635             abort();
636 
637       case FS_OPCODE_PACK_HALF_2x16_SPLIT:
638          if (devinfo->ver >= 11)
639             return calculate_desc(info, unit_fpu, 20, 6, 0, 0, 6,
640                                   0, 10 /* XXX */, 6 /* XXX */,
641                                   14 /* XXX */, 0, 0);
642          else if (devinfo->ver >= 8)
643             return calculate_desc(info, unit_fpu, 16, 6, 0, 0, 6,
644                                   0, 8 /* XXX */, 4 /* XXX */,
645                                   12 /* XXX */, 0, 0);
646          else if (devinfo->is_haswell)
647             return calculate_desc(info, unit_fpu, 20, 6, 0, 0, 6,
648                                   0, 10 /* XXX */, 6 /* XXX */,
649                                   16 /* XXX */, 0, 0);
650          else if (devinfo->ver >= 7)
651             return calculate_desc(info, unit_fpu, 24, 6, 0, 0, 6,
652                                   0, 12 /* XXX */, 8 /* XXX */,
653                                   18 /* XXX */, 0, 0);
654          else
655             abort();
656 
657       case SHADER_OPCODE_MOV_INDIRECT:
658          if (devinfo->ver >= 11)
659             return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
660                                   0, 10 /* XXX */, 6 /* XXX */,
661                                   14 /* XXX */, 0, 0);
662          else if (devinfo->ver >= 8)
663             return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
664                                   0, 8 /* XXX */, 4 /* XXX */,
665                                   12 /* XXX */, 0, 0);
666          else if (devinfo->is_haswell)
667             return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
668                                   0, 10 /* XXX */, 6 /* XXX */,
669                                   16 /* XXX */, 0, 0);
670          else
671             return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
672                                   0, 12 /* XXX */, 8 /* XXX */,
673                                   18 /* XXX */, 0, 0);
674 
675       case SHADER_OPCODE_BROADCAST:
676          if (devinfo->ver >= 11)
677             return calculate_desc(info, unit_fpu, 20 /* XXX */, 0, 0, 4, 0,
678                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
679          else if (devinfo->ver >= 8)
680             return calculate_desc(info, unit_fpu, 18, 0, 0, 4, 0,
681                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
682          else if (devinfo->is_haswell)
683             return calculate_desc(info, unit_fpu, 18, 0, 0, 4, 0,
684                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
685          else if (devinfo->ver >= 7)
686             return calculate_desc(info, unit_fpu, 20, 0, 0, 4, 0,
687                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
688          else
689             abort();
690 
691       case SHADER_OPCODE_FIND_LIVE_CHANNEL:
692          if (devinfo->ver >= 11)
693             return calculate_desc(info, unit_fpu, 2, 0, 0, 2, 0,
694                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
695          else if (devinfo->ver >= 8)
696             return calculate_desc(info, unit_fpu, 2, 0, 0, 2, 0,
697                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
698          else if (devinfo->is_haswell)
699             return calculate_desc(info, unit_fpu, 36, 0, 0, 6, 0,
700                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
701          else if (devinfo->ver >= 7)
702             return calculate_desc(info, unit_fpu, 40, 0, 0, 6, 0,
703                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
704          else
705             abort();
706 
707       case SHADER_OPCODE_RND_MODE:
708       case SHADER_OPCODE_FLOAT_CONTROL_MODE:
709          if (devinfo->ver >= 11)
710             return calculate_desc(info, unit_fpu, 24 /* XXX */, 0, 0,
711                                   4 /* XXX */, 0,
712                                   0, 0, 0, 0, 0, 0);
713          else if (devinfo->ver >= 8)
714             return calculate_desc(info, unit_fpu, 20 /* XXX */, 0, 0,
715                                   4 /* XXX */, 0,
716                                   0, 0, 0, 0, 0, 0);
717          else if (devinfo->is_haswell)
718             return calculate_desc(info, unit_fpu, 24 /* XXX */, 0, 0,
719                                   4 /* XXX */, 0,
720                                   0, 0, 0, 0, 0, 0);
721          else if (devinfo->ver >= 6)
722             return calculate_desc(info, unit_fpu, 28 /* XXX */, 0, 0,
723                                   4 /* XXX */, 0,
724                                   0, 0, 0, 0, 0, 0);
725          else
726             abort();
727 
728       case SHADER_OPCODE_SHUFFLE:
729          if (devinfo->ver >= 11)
730             return calculate_desc(info, unit_fpu, 44 /* XXX */, 0, 0,
731                                   44 /* XXX */, 0,
732                                   0, 10 /* XXX */, 6 /* XXX */,
733                                   14 /* XXX */, 0, 0);
734          else if (devinfo->ver >= 8)
735             return calculate_desc(info, unit_fpu, 42 /* XXX */, 0, 0,
736                                   42 /* XXX */, 0,
737                                   0, 8 /* XXX */, 4 /* XXX */,
738                                   12 /* XXX */, 0, 0);
739          else if (devinfo->is_haswell)
740             return calculate_desc(info, unit_fpu, 0, 44 /* XXX */, 0,
741                                   0, 44 /* XXX */,
742                                   0, 10 /* XXX */, 6 /* XXX */,
743                                   16 /* XXX */, 0, 0);
744          else if (devinfo->ver >= 6)
745             return calculate_desc(info, unit_fpu, 0, 46 /* XXX */, 0,
746                                   0, 46 /* XXX */,
747                                   0, 12 /* XXX */, 8 /* XXX */,
748                                   18 /* XXX */, 0, 0);
749          else
750             abort();
751 
752       case SHADER_OPCODE_SEL_EXEC:
753          if (devinfo->ver >= 11)
754             return calculate_desc(info, unit_fpu, 10 /* XXX */, 4 /* XXX */, 0,
755                                   0, 4 /* XXX */,
756                                   0, 10 /* XXX */, 6 /* XXX */,
757                                   14 /* XXX */, 0, 0);
758          else if (devinfo->ver >= 8)
759             return calculate_desc(info, unit_fpu, 8 /* XXX */, 4 /* XXX */, 0,
760                                   0, 4 /* XXX */,
761                                   0, 8 /* XXX */, 4 /* XXX */,
762                                   12 /* XXX */, 0, 0);
763          else if (devinfo->is_haswell)
764             return calculate_desc(info, unit_fpu, 10 /* XXX */, 4 /* XXX */, 0,
765                                   0, 4 /* XXX */,
766                                   0, 10 /* XXX */, 6 /* XXX */,
767                                   16 /* XXX */, 0, 0);
768          else
769             return calculate_desc(info, unit_fpu, 12 /* XXX */, 4 /* XXX */, 0,
770                                   0, 4 /* XXX */,
771                                   0, 12 /* XXX */, 8 /* XXX */,
772                                   18 /* XXX */, 0, 0);
773 
774       case SHADER_OPCODE_QUAD_SWIZZLE:
775          if (devinfo->ver >= 11)
776             return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
777                                   0, 8 /* XXX */,
778                                   0, 10 /* XXX */, 6 /* XXX */,
779                                   14 /* XXX */, 0, 0);
780          else if (devinfo->ver >= 8)
781             return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
782                                   0, 8 /* XXX */,
783                                   0, 8 /* XXX */, 4 /* XXX */,
784                                   12 /* XXX */, 0, 0);
785          else if (devinfo->is_haswell)
786             return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
787                                   0, 8 /* XXX */,
788                                   0, 10 /* XXX */, 6 /* XXX */,
789                                   16 /* XXX */, 0, 0);
790          else
791             return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
792                                   0, 8 /* XXX */,
793                                   0, 12 /* XXX */, 8 /* XXX */,
794                                   18 /* XXX */, 0, 0);
795 
796       case FS_OPCODE_DDY_FINE:
797          if (devinfo->ver >= 11)
798             return calculate_desc(info, unit_fpu, 0, 14, 0, 0, 4,
799                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
800          else if (devinfo->ver >= 8)
801             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
802                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
803          else if (devinfo->is_haswell)
804             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
805                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
806          else
807             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
808                                   0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0);
809 
810       case FS_OPCODE_LOAD_LIVE_CHANNELS:
811          if (devinfo->ver >= 11)
812             return calculate_desc(info, unit_fpu, 2 /* XXX */, 0, 0,
813                                   2 /* XXX */, 0,
814                                   0, 0, 0, 10 /* XXX */, 0, 0);
815          else if (devinfo->ver >= 8)
816             return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 0,
817                                   0, 2 /* XXX */,
818                                   0, 0, 0, 8 /* XXX */, 0, 0);
819          else
820             abort();
821 
822       case VEC4_OPCODE_PACK_BYTES:
823          if (devinfo->ver >= 8)
824             return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
825                                   4 /* XXX */, 0,
826                                   0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
827                                   0, 0);
828          else if (devinfo->is_haswell)
829             return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
830                                   4 /* XXX */, 0,
831                                   0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
832                                   0, 0);
833          else
834             return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
835                                   4 /* XXX */, 0,
836                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
837                                   0, 0);
838 
839       case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
840       case TCS_OPCODE_GET_INSTANCE_ID:
841       case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
842       case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
843       case TES_OPCODE_CREATE_INPUT_READ_HEADER:
844          if (devinfo->ver >= 8)
845             return calculate_desc(info, unit_fpu, 22 /* XXX */, 0, 0,
846                                   6 /* XXX */, 0,
847                                   0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
848                                   0, 0);
849          else if (devinfo->is_haswell)
850             return calculate_desc(info, unit_fpu, 26 /* XXX */, 0, 0,
851                                   6 /* XXX */, 0,
852                                   0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
853                                   0, 0);
854          else
855             return calculate_desc(info, unit_fpu, 30 /* XXX */, 0, 0,
856                                   6 /* XXX */, 0,
857                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
858                                   0, 0);
859 
860       case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
861       case TCS_OPCODE_CREATE_BARRIER_HEADER:
862          if (devinfo->ver >= 8)
863             return calculate_desc(info, unit_fpu, 32 /* XXX */, 0, 0,
864                                   8 /* XXX */, 0,
865                                   0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
866                                   0, 0);
867          else if (devinfo->is_haswell)
868             return calculate_desc(info, unit_fpu, 38 /* XXX */, 0, 0,
869                                   8 /* XXX */, 0,
870                                   0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
871                                   0, 0);
872          else if (devinfo->ver >= 6)
873             return calculate_desc(info, unit_fpu, 44 /* XXX */, 0, 0,
874                                   8 /* XXX */, 0,
875                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
876                                   0, 0);
877          else
878             abort();
879 
880       case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
881          if (devinfo->ver >= 8)
882             return calculate_desc(info, unit_fpu, 12 /* XXX */, 0, 0,
883                                   4 /* XXX */, 0,
884                                   0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
885                                   0, 0);
886          else if (devinfo->is_haswell)
887             return calculate_desc(info, unit_fpu, 14 /* XXX */, 0, 0,
888                                   4 /* XXX */, 0,
889                                   0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
890                                   0, 0);
891          else if (devinfo->ver >= 7)
892             return calculate_desc(info, unit_fpu, 16 /* XXX */, 0, 0,
893                                   4 /* XXX */, 0,
894                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
895                                   0, 0);
896          else
897             abort();
898 
899       case SHADER_OPCODE_TEX:
900       case FS_OPCODE_TXB:
901       case SHADER_OPCODE_TXD:
902       case SHADER_OPCODE_TXF:
903       case SHADER_OPCODE_TXF_LZ:
904       case SHADER_OPCODE_TXL:
905       case SHADER_OPCODE_TXL_LZ:
906       case SHADER_OPCODE_TXF_CMS:
907       case SHADER_OPCODE_TXF_CMS_W:
908       case SHADER_OPCODE_TXF_UMS:
909       case SHADER_OPCODE_TXF_MCS:
910       case SHADER_OPCODE_TXS:
911       case SHADER_OPCODE_LOD:
912       case SHADER_OPCODE_GET_BUFFER_SIZE:
913       case SHADER_OPCODE_TG4:
914       case SHADER_OPCODE_TG4_OFFSET:
915       case SHADER_OPCODE_SAMPLEINFO:
916       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
917          return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16 /* XXX */,
918                                8 /* XXX */, 750 /* XXX */, 0, 0,
919                                2 /* XXX */, 0);
920 
921       case SHADER_OPCODE_URB_READ_SIMD8:
922       case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
923       case SHADER_OPCODE_URB_WRITE_SIMD8:
924       case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
925       case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
926       case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
927       case VEC4_OPCODE_URB_READ:
928       case VS_OPCODE_URB_WRITE:
929       case GS_OPCODE_URB_WRITE:
930       case GS_OPCODE_URB_WRITE_ALLOCATE:
931       case GS_OPCODE_THREAD_END:
932       case GS_OPCODE_FF_SYNC:
933       case TCS_OPCODE_URB_WRITE:
934       case TCS_OPCODE_RELEASE_INPUT:
935       case TCS_OPCODE_THREAD_END:
936          return calculate_desc(info, unit_urb, 2, 0, 0, 0, 6 /* XXX */,
937                                32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
938 
939       case SHADER_OPCODE_MEMORY_FENCE:
940       case SHADER_OPCODE_INTERLOCK:
941          switch (info.sfid) {
942          case GFX6_SFID_DATAPORT_RENDER_CACHE:
943             if (devinfo->ver >= 7)
944                return calculate_desc(info, unit_dp_rc, 2, 0, 0, 30 /* XXX */, 0,
945                                      10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
946             else
947                abort();
948 
949          case BRW_SFID_URB:
950          case GFX7_SFID_DATAPORT_DATA_CACHE:
951          case GFX12_SFID_SLM:
952          case GFX12_SFID_TGM:
953          case GFX12_SFID_UGM:
954          case HSW_SFID_DATAPORT_DATA_CACHE_1:
955             if (devinfo->ver >= 7)
956                return calculate_desc(info, unit_dp_dc, 2, 0, 0, 30 /* XXX */, 0,
957                                      10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
958             else
959                abort();
960 
961          default:
962             abort();
963          }
964 
965       case SHADER_OPCODE_GFX4_SCRATCH_READ:
966       case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
967       case SHADER_OPCODE_GFX7_SCRATCH_READ:
968          return calculate_desc(info, unit_dp_dc, 2, 0, 0, 0, 8 /* XXX */,
969                                10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
970 
971       case VEC4_OPCODE_UNTYPED_ATOMIC:
972          if (devinfo->ver >= 7)
973             return calculate_desc(info, unit_dp_dc, 2, 0, 0,
974                                   30 /* XXX */, 400 /* XXX */,
975                                   10 /* XXX */, 100 /* XXX */, 0, 0,
976                                   0, 400 /* XXX */);
977          else
978             abort();
979 
980       case VEC4_OPCODE_UNTYPED_SURFACE_READ:
981       case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
982          if (devinfo->ver >= 7)
983             return calculate_desc(info, unit_dp_dc, 2, 0, 0,
984                                   0, 20 /* XXX */,
985                                   10 /* XXX */, 100 /* XXX */, 0, 0,
986                                   0, 0);
987          else
988             abort();
989 
990       case FS_OPCODE_FB_WRITE:
991       case FS_OPCODE_FB_READ:
992       case FS_OPCODE_REP_FB_WRITE:
993          return calculate_desc(info, unit_dp_rc, 2, 0, 0, 0, 450 /* XXX */,
994                                10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
995 
996       case GS_OPCODE_SVB_WRITE:
997          if (devinfo->ver >= 6)
998             return calculate_desc(info, unit_dp_rc, 2 /* XXX */, 0, 0,
999                                   0, 450 /* XXX */,
1000                                   10 /* XXX */, 300 /* XXX */, 0, 0,
1001                                   0, 0);
1002          else
1003             abort();
1004 
1005       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1006       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
1007          return calculate_desc(info, unit_dp_cc, 2, 0, 0, 0, 16 /* XXX */,
1008                                10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
1009 
1010       case VS_OPCODE_PULL_CONSTANT_LOAD:
1011       case VS_OPCODE_PULL_CONSTANT_LOAD_GFX7:
1012          return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16,
1013                                8, 750, 0, 0, 2, 0);
1014 
1015       case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1016       case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1017       case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1018          if (devinfo->ver >= 7)
1019             return calculate_desc(info, unit_pi, 2, 0, 0, 14 /* XXX */, 0,
1020                                   0, 90 /* XXX */, 0, 0, 0, 0);
1021          else
1022             abort();
1023 
1024       case SHADER_OPCODE_BARRIER:
1025          if (devinfo->ver >= 7)
1026             return calculate_desc(info, unit_gateway, 90 /* XXX */, 0, 0,
1027                                   0 /* XXX */, 0,
1028                                   0, 0, 0, 0, 0, 0);
1029          else
1030             abort();
1031 
1032       case CS_OPCODE_CS_TERMINATE:
1033          if (devinfo->ver >= 7)
1034             return calculate_desc(info, unit_spawner, 2, 0, 0, 0 /* XXX */, 0,
1035                                   10 /* XXX */, 0, 0, 0, 0, 0);
1036          else
1037             abort();
1038 
1039       case SHADER_OPCODE_SEND:
1040          switch (info.sfid) {
1041          case GFX6_SFID_DATAPORT_RENDER_CACHE:
1042             if (devinfo->ver >= 7) {
1043                switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1044                case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP:
1045                   return calculate_desc(info, unit_dp_rc, 2, 0, 0,
1046                                         30 /* XXX */, 450 /* XXX */,
1047                                         10 /* XXX */, 100 /* XXX */,
1048                                         0, 0, 0, 400 /* XXX */);
1049                default:
1050                   return calculate_desc(info, unit_dp_rc, 2, 0, 0,
1051                                         0, 450 /* XXX */,
1052                                         10 /* XXX */, 300 /* XXX */, 0, 0,
1053                                         0, 0);
1054                }
1055             } else if (devinfo->ver >= 6)  {
1056                return calculate_desc(info, unit_dp_rc, 2 /* XXX */, 0, 0,
1057                                      0, 450 /* XXX */,
1058                                      10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
1059             } else {
1060                abort();
1061             }
1062          case BRW_SFID_SAMPLER: {
1063             if (devinfo->ver >= 6)
1064                return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16,
1065                                      8, 750, 0, 0, 2, 0);
1066             else
1067                abort();
1068          }
1069          case GFX7_SFID_DATAPORT_DATA_CACHE:
1070          case HSW_SFID_DATAPORT_DATA_CACHE_1:
1071             if (devinfo->verx10 >= 75) {
1072                switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1073                case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
1074                case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
1075                case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
1076                case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
1077                   return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1078                                         30 /* XXX */, 400 /* XXX */,
1079                                         10 /* XXX */, 100 /* XXX */, 0, 0,
1080                                         0, 400 /* XXX */);
1081 
1082                default:
1083                   return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1084                                         0, 20 /* XXX */,
1085                                         10 /* XXX */, 100 /* XXX */, 0, 0,
1086                                         0, 0);
1087                }
1088             } else if (devinfo->ver >= 7) {
1089                switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1090                case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
1091                   return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1092                                         30 /* XXX */, 400 /* XXX */,
1093                                         10 /* XXX */, 100 /* XXX */,
1094                                         0, 0, 0, 400 /* XXX */);
1095                default:
1096                   return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1097                                         0, 20 /* XXX */,
1098                                         10 /* XXX */, 100 /* XXX */, 0, 0,
1099                                         0, 0);
1100                }
1101             } else {
1102                abort();
1103             }
1104 
1105          case GFX12_SFID_UGM:
1106          case GFX12_SFID_TGM:
1107          case GFX12_SFID_SLM:
1108             switch (lsc_msg_desc_opcode(devinfo, info.desc)) {
1109             case LSC_OP_LOAD:
1110             case LSC_OP_STORE:
1111             case LSC_OP_LOAD_CMASK:
1112             case LSC_OP_STORE_CMASK:
1113                return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1114                                      0, 20 /* XXX */,
1115                                      10 /* XXX */, 100 /* XXX */, 0, 0,
1116                                      0, 0);
1117 
1118             case LSC_OP_FENCE:
1119             case LSC_OP_ATOMIC_INC:
1120             case LSC_OP_ATOMIC_DEC:
1121             case LSC_OP_ATOMIC_LOAD:
1122             case LSC_OP_ATOMIC_STORE:
1123             case LSC_OP_ATOMIC_ADD:
1124             case LSC_OP_ATOMIC_SUB:
1125             case LSC_OP_ATOMIC_MIN:
1126             case LSC_OP_ATOMIC_MAX:
1127             case LSC_OP_ATOMIC_UMIN:
1128             case LSC_OP_ATOMIC_UMAX:
1129             case LSC_OP_ATOMIC_CMPXCHG:
1130             case LSC_OP_ATOMIC_FADD:
1131             case LSC_OP_ATOMIC_FSUB:
1132             case LSC_OP_ATOMIC_FMIN:
1133             case LSC_OP_ATOMIC_FMAX:
1134             case LSC_OP_ATOMIC_FCMPXCHG:
1135             case LSC_OP_ATOMIC_AND:
1136             case LSC_OP_ATOMIC_OR:
1137             case LSC_OP_ATOMIC_XOR:
1138                return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1139                                      30 /* XXX */, 400 /* XXX */,
1140                                      10 /* XXX */, 100 /* XXX */, 0, 0,
1141                                      0, 400 /* XXX */);
1142             default:
1143                abort();
1144             }
1145 
1146          case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH:
1147          case GEN_RT_SFID_RAY_TRACE_ACCELERATOR:
1148             return calculate_desc(info, unit_spawner, 2, 0, 0, 0 /* XXX */, 0,
1149                                   10 /* XXX */, 0, 0, 0, 0, 0);
1150 
1151          default:
1152             abort();
1153          }
1154 
1155       case SHADER_OPCODE_UNDEF:
1156       case SHADER_OPCODE_HALT_TARGET:
1157       case FS_OPCODE_SCHEDULING_FENCE:
1158          return calculate_desc(info, unit_null, 0, 0, 0, 0, 0,
1159                                0, 0, 0, 0, 0, 0);
1160 
1161       default:
1162          abort();
1163       }
1164    }
1165 
1166    /**
1167     * Model the performance behavior of a stall on the specified dependency
1168     * ID.
1169     */
1170    void
stall_on_dependency(state & st,dependency_id id)1171    stall_on_dependency(state &st, dependency_id id)
1172    {
1173       if (id < ARRAY_SIZE(st.dep_ready))
1174          st.unit_ready[unit_fe] = MAX2(st.unit_ready[unit_fe],
1175                                        st.dep_ready[id]);
1176    }
1177 
1178    /**
1179     * Model the performance behavior of the front-end and back-end while
1180     * executing an instruction with the specified timing information, assuming
1181     * all dependencies are already clear.
1182     */
1183    void
execute_instruction(state & st,const perf_desc & perf)1184    execute_instruction(state &st, const perf_desc &perf)
1185    {
1186       /* Compute the time at which the front-end will be ready to execute the
1187        * next instruction.
1188        */
1189       st.unit_ready[unit_fe] += perf.df;
1190 
1191       if (perf.u < num_units) {
1192          /* Wait for the back-end to be ready to execute this instruction. */
1193          st.unit_ready[unit_fe] = MAX2(st.unit_ready[unit_fe],
1194                                        st.unit_ready[perf.u]);
1195 
1196          /* Compute the time at which the back-end will be ready to execute
1197           * the next instruction, and update the back-end utilization.
1198           */
1199          st.unit_ready[perf.u] = st.unit_ready[unit_fe] + perf.db;
1200          st.unit_busy[perf.u] += perf.db * st.weight;
1201       }
1202    }
1203 
1204    /**
1205     * Model the performance behavior of a read dependency provided by an
1206     * instruction.
1207     */
1208    void
mark_read_dependency(state & st,const perf_desc & perf,dependency_id id)1209    mark_read_dependency(state &st, const perf_desc &perf, dependency_id id)
1210    {
1211       if (id < ARRAY_SIZE(st.dep_ready))
1212          st.dep_ready[id] = st.unit_ready[unit_fe] + perf.ls;
1213    }
1214 
1215    /**
1216     * Model the performance behavior of a write dependency provided by an
1217     * instruction.
1218     */
1219    void
mark_write_dependency(state & st,const perf_desc & perf,dependency_id id)1220    mark_write_dependency(state &st, const perf_desc &perf, dependency_id id)
1221    {
1222       if (id >= dependency_id_accum0 && id < dependency_id_flag0)
1223          st.dep_ready[id] = st.unit_ready[unit_fe] + perf.la;
1224       else if (id >= dependency_id_flag0 && id < dependency_id_sbid_wr0)
1225          st.dep_ready[id] = st.unit_ready[unit_fe] + perf.lf;
1226       else if (id < ARRAY_SIZE(st.dep_ready))
1227          st.dep_ready[id] = st.unit_ready[unit_fe] + perf.ld;
1228    }
1229 
1230    /**
1231     * Return the dependency ID of a backend_reg, offset by \p delta GRFs.
1232     */
1233    dependency_id
reg_dependency_id(const intel_device_info * devinfo,const backend_reg & r,const int delta)1234    reg_dependency_id(const intel_device_info *devinfo, const backend_reg &r,
1235                      const int delta)
1236    {
1237       if (r.file == VGRF) {
1238          const unsigned i = r.nr + r.offset / REG_SIZE + delta;
1239          assert(i < dependency_id_mrf0 - dependency_id_grf0);
1240          return dependency_id(dependency_id_grf0 + i);
1241 
1242       } else if (r.file == FIXED_GRF) {
1243          const unsigned i = r.nr + delta;
1244          assert(i < dependency_id_mrf0 - dependency_id_grf0);
1245          return dependency_id(dependency_id_grf0 + i);
1246 
1247       } else if (r.file == MRF && devinfo->ver >= 7) {
1248          const unsigned i = GFX7_MRF_HACK_START +
1249                             r.nr + r.offset / REG_SIZE + delta;
1250          assert(i < dependency_id_mrf0 - dependency_id_grf0);
1251          return dependency_id(dependency_id_grf0 + i);
1252 
1253       } else if (r.file == MRF && devinfo->ver < 7) {
1254          const unsigned i = (r.nr & ~BRW_MRF_COMPR4) +
1255                             r.offset / REG_SIZE + delta;
1256          assert(i < dependency_id_addr0 - dependency_id_mrf0);
1257          return dependency_id(dependency_id_mrf0 + i);
1258 
1259       } else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS &&
1260                  r.nr < BRW_ARF_ACCUMULATOR) {
1261          assert(delta == 0);
1262          return dependency_id_addr0;
1263 
1264       } else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR &&
1265                  r.nr < BRW_ARF_FLAG) {
1266          const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta;
1267          assert(i < dependency_id_flag0 - dependency_id_accum0);
1268          return dependency_id(dependency_id_accum0 + i);
1269 
1270       } else {
1271          return num_dependency_ids;
1272       }
1273    }
1274 
1275    /**
1276     * Return the dependency ID of flag register starting at offset \p i.
1277     */
1278    dependency_id
flag_dependency_id(unsigned i)1279    flag_dependency_id(unsigned i)
1280    {
1281       assert(i < dependency_id_sbid_wr0 - dependency_id_flag0);
1282       return dependency_id(dependency_id_flag0 + i);
1283    }
1284 
1285    /**
1286     * Return the dependency ID corresponding to the SBID read completion
1287     * condition of a Gfx12+ SWSB.
1288     */
1289    dependency_id
tgl_swsb_rd_dependency_id(tgl_swsb swsb)1290    tgl_swsb_rd_dependency_id(tgl_swsb swsb)
1291    {
1292       if (swsb.mode) {
1293          assert(swsb.sbid < num_dependency_ids - dependency_id_sbid_rd0);
1294          return dependency_id(dependency_id_sbid_rd0 + swsb.sbid);
1295       } else {
1296          return num_dependency_ids;
1297       }
1298    }
1299 
1300    /**
1301     * Return the dependency ID corresponding to the SBID write completion
1302     * condition of a Gfx12+ SWSB.
1303     */
1304    dependency_id
tgl_swsb_wr_dependency_id(tgl_swsb swsb)1305    tgl_swsb_wr_dependency_id(tgl_swsb swsb)
1306    {
1307       if (swsb.mode) {
1308          assert(swsb.sbid < dependency_id_sbid_rd0 - dependency_id_sbid_wr0);
1309          return dependency_id(dependency_id_sbid_wr0 + swsb.sbid);
1310       } else {
1311          return num_dependency_ids;
1312       }
1313    }
1314 
1315    /**
1316     * Return the implicit accumulator register accessed by channel \p i of the
1317     * instruction.
1318     */
1319    unsigned
accum_reg_of_channel(const intel_device_info * devinfo,const backend_instruction * inst,brw_reg_type tx,unsigned i)1320    accum_reg_of_channel(const intel_device_info *devinfo,
1321                         const backend_instruction *inst,
1322                         brw_reg_type tx, unsigned i)
1323    {
1324       assert(inst->reads_accumulator_implicitly() ||
1325              inst->writes_accumulator_implicitly(devinfo));
1326       const unsigned offset = (inst->group + i) * type_sz(tx) *
1327          (devinfo->ver < 7 || brw_reg_type_is_floating_point(tx) ? 1 : 2);
1328       return offset / REG_SIZE % 2;
1329    }
1330 
1331    /**
1332     * Model the performance behavior of an FS back-end instruction.
1333     */
1334    void
issue_fs_inst(state & st,const intel_device_info * devinfo,const backend_instruction * be_inst)1335    issue_fs_inst(state &st, const intel_device_info *devinfo,
1336                  const backend_instruction *be_inst)
1337    {
1338       const fs_inst *inst = static_cast<const fs_inst *>(be_inst);
1339       const instruction_info info(devinfo, inst);
1340       const perf_desc perf = instruction_desc(info);
1341 
1342       /* Stall on any source dependencies. */
1343       for (unsigned i = 0; i < inst->sources; i++) {
1344          for (unsigned j = 0; j < regs_read(inst, i); j++)
1345             stall_on_dependency(
1346                st, reg_dependency_id(devinfo, inst->src[i], j));
1347       }
1348 
1349       if (inst->reads_accumulator_implicitly()) {
1350          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1351               j <= accum_reg_of_channel(devinfo, inst, info.tx,
1352                                         inst->exec_size - 1); j++)
1353             stall_on_dependency(
1354                st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1355       }
1356 
1357       if (is_send(inst) && inst->base_mrf != -1) {
1358          for (unsigned j = 0; j < inst->mlen; j++)
1359             stall_on_dependency(
1360                st, reg_dependency_id(
1361                   devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1362       }
1363 
1364       if (const unsigned mask = inst->flags_read(devinfo)) {
1365          for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1366             if (mask & (1 << i))
1367                stall_on_dependency(st, flag_dependency_id(i));
1368          }
1369       }
1370 
1371       /* Stall on any write dependencies. */
1372       if (!inst->no_dd_check) {
1373          if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1374             for (unsigned j = 0; j < regs_written(inst); j++)
1375                stall_on_dependency(
1376                   st, reg_dependency_id(devinfo, inst->dst, j));
1377          }
1378 
1379          if (inst->writes_accumulator_implicitly(devinfo)) {
1380             for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1381                  j <= accum_reg_of_channel(devinfo, inst, info.tx,
1382                                            inst->exec_size - 1); j++)
1383                stall_on_dependency(
1384                   st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1385          }
1386 
1387          if (const unsigned mask = inst->flags_written(devinfo)) {
1388             for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1389                if (mask & (1 << i))
1390                   stall_on_dependency(st, flag_dependency_id(i));
1391             }
1392          }
1393       }
1394 
1395       /* Stall on any SBID dependencies. */
1396       if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST))
1397          stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched));
1398       else if (inst->sched.mode & TGL_SBID_SRC)
1399          stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched));
1400 
1401       /* Execute the instruction. */
1402       execute_instruction(st, perf);
1403 
1404       /* Mark any source dependencies. */
1405       if (inst->is_send_from_grf()) {
1406          for (unsigned i = 0; i < inst->sources; i++) {
1407             if (inst->is_payload(i)) {
1408                for (unsigned j = 0; j < regs_read(inst, i); j++)
1409                   mark_read_dependency(
1410                      st, perf, reg_dependency_id(devinfo, inst->src[i], j));
1411             }
1412          }
1413       }
1414 
1415       if (is_send(inst) && inst->base_mrf != -1) {
1416          for (unsigned j = 0; j < inst->mlen; j++)
1417             mark_read_dependency(st, perf,
1418                reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1419       }
1420 
1421       /* Mark any destination dependencies. */
1422       if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1423          for (unsigned j = 0; j < regs_written(inst); j++) {
1424             mark_write_dependency(st, perf,
1425                                   reg_dependency_id(devinfo, inst->dst, j));
1426          }
1427       }
1428 
1429       if (inst->writes_accumulator_implicitly(devinfo)) {
1430          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1431               j <= accum_reg_of_channel(devinfo, inst, info.tx,
1432                                         inst->exec_size - 1); j++)
1433             mark_write_dependency(st, perf,
1434                                   reg_dependency_id(devinfo, brw_acc_reg(8), j));
1435       }
1436 
1437       if (const unsigned mask = inst->flags_written(devinfo)) {
1438          for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1439             if (mask & (1 << i))
1440                mark_write_dependency(st, perf, flag_dependency_id(i));
1441          }
1442       }
1443 
1444       /* Mark any SBID dependencies. */
1445       if (inst->sched.mode & TGL_SBID_SET) {
1446          mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched));
1447          mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched));
1448       }
1449    }
1450 
1451    /**
1452     * Model the performance behavior of a VEC4 back-end instruction.
1453     */
1454    void
issue_vec4_instruction(state & st,const intel_device_info * devinfo,const backend_instruction * be_inst)1455    issue_vec4_instruction(state &st, const intel_device_info *devinfo,
1456                           const backend_instruction *be_inst)
1457    {
1458       const vec4_instruction *inst =
1459          static_cast<const vec4_instruction *>(be_inst);
1460       const instruction_info info(devinfo, inst);
1461       const perf_desc perf = instruction_desc(info);
1462 
1463       /* Stall on any source dependencies. */
1464       for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
1465          for (unsigned j = 0; j < regs_read(inst, i); j++)
1466             stall_on_dependency(
1467                st, reg_dependency_id(devinfo, inst->src[i], j));
1468       }
1469 
1470       if (inst->reads_accumulator_implicitly()) {
1471          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1472               j <= accum_reg_of_channel(devinfo, inst, info.tx,
1473                                         inst->exec_size - 1); j++)
1474             stall_on_dependency(
1475                st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1476       }
1477 
1478       if (inst->base_mrf != -1) {
1479          for (unsigned j = 0; j < inst->mlen; j++)
1480             stall_on_dependency(
1481                st, reg_dependency_id(
1482                   devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1483       }
1484 
1485       if (inst->reads_flag())
1486          stall_on_dependency(st, dependency_id_flag0);
1487 
1488       /* Stall on any write dependencies. */
1489       if (!inst->no_dd_check) {
1490          if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1491             for (unsigned j = 0; j < regs_written(inst); j++)
1492                stall_on_dependency(
1493                   st, reg_dependency_id(devinfo, inst->dst, j));
1494          }
1495 
1496          if (inst->writes_accumulator_implicitly(devinfo)) {
1497             for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1498                  j <= accum_reg_of_channel(devinfo, inst, info.tx,
1499                                            inst->exec_size - 1); j++)
1500                stall_on_dependency(
1501                   st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1502          }
1503 
1504          if (inst->writes_flag(devinfo))
1505             stall_on_dependency(st, dependency_id_flag0);
1506       }
1507 
1508       /* Execute the instruction. */
1509       execute_instruction(st, perf);
1510 
1511       /* Mark any source dependencies. */
1512       if (inst->is_send_from_grf()) {
1513          for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
1514             for (unsigned j = 0; j < regs_read(inst, i); j++)
1515                mark_read_dependency(
1516                   st, perf, reg_dependency_id(devinfo, inst->src[i], j));
1517          }
1518       }
1519 
1520       if (inst->base_mrf != -1) {
1521          for (unsigned j = 0; j < inst->mlen; j++)
1522             mark_read_dependency(st, perf,
1523                reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1524       }
1525 
1526       /* Mark any destination dependencies. */
1527       if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1528          for (unsigned j = 0; j < regs_written(inst); j++) {
1529             mark_write_dependency(st, perf,
1530                                   reg_dependency_id(devinfo, inst->dst, j));
1531          }
1532       }
1533 
1534       if (inst->writes_accumulator_implicitly(devinfo)) {
1535          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1536               j <= accum_reg_of_channel(devinfo, inst, info.tx,
1537                                         inst->exec_size - 1); j++)
1538             mark_write_dependency(st, perf,
1539                                   reg_dependency_id(devinfo, brw_acc_reg(8), j));
1540       }
1541 
1542       if (inst->writes_flag(devinfo))
1543          mark_write_dependency(st, perf, dependency_id_flag0);
1544    }
1545 
1546    /**
1547     * Calculate the maximum possible throughput of the program compatible with
1548     * the cycle-count utilization estimated for each asynchronous unit, in
1549     * threads-per-cycle units.
1550     */
1551    float
calculate_thread_throughput(const state & st,float busy)1552    calculate_thread_throughput(const state &st, float busy)
1553    {
1554       for (unsigned i = 0; i < num_units; i++)
1555          busy = MAX2(busy, st.unit_busy[i]);
1556 
1557       return 1.0 / busy;
1558    }
1559 
1560    /**
1561     * Estimate the performance of the specified shader.
1562     */
1563    void
calculate_performance(performance & p,const backend_shader * s,void (* issue_instruction)(state &,const intel_device_info *,const backend_instruction *),unsigned dispatch_width)1564    calculate_performance(performance &p, const backend_shader *s,
1565                          void (*issue_instruction)(
1566                             state &, const intel_device_info *,
1567                             const backend_instruction *),
1568                          unsigned dispatch_width)
1569    {
1570       /* XXX - Note that the previous version of this code used worst-case
1571        *       scenario estimation of branching divergence for SIMD32 shaders,
1572        *       but this heuristic was removed to improve performance in common
1573        *       scenarios. Wider shader variants are less optimal when divergence
1574        *       is high, e.g. when application renders complex scene on a small
1575        *       surface. It is assumed that such renders are short, so their
1576        *       time doesn't matter and when it comes to the overall performance,
1577        *       they are dominated by more optimal larger renders.
1578        *
1579        *       It's possible that we could do better with divergence analysis
1580        *       by isolating branches which are 100% uniform.
1581        *
1582        *       Plumbing the trip counts from NIR loop analysis would allow us
1583        *       to do a better job regarding the loop weights.
1584        *
1585        *       In the meantime use values that roughly match the control flow
1586        *       weights used elsewhere in the compiler back-end.
1587        *
1588        *       Note that we provide slightly more pessimistic weights on
1589        *       Gfx12+ for SIMD32, since the effective warp size on that
1590        *       platform is 2x the SIMD width due to EU fusion, which increases
1591        *       the likelihood of divergent control flow in comparison to
1592        *       previous generations, giving narrower SIMD modes a performance
1593        *       advantage in several test-cases with non-uniform discard jumps.
1594        */
1595       const float discard_weight = (dispatch_width > 16 || s->devinfo->ver < 12 ?
1596                                     1.0 : 0.5);
1597       const float loop_weight = 10;
1598       unsigned halt_count = 0;
1599       unsigned elapsed = 0;
1600       state st;
1601 
1602       foreach_block(block, s->cfg) {
1603          const unsigned elapsed0 = elapsed;
1604 
1605          foreach_inst_in_block(backend_instruction, inst, block) {
1606             const unsigned clock0 = st.unit_ready[unit_fe];
1607 
1608             issue_instruction(st, s->devinfo, inst);
1609 
1610             if (inst->opcode == SHADER_OPCODE_HALT_TARGET && halt_count)
1611                st.weight /= discard_weight;
1612 
1613             elapsed += (st.unit_ready[unit_fe] - clock0) * st.weight;
1614 
1615             if (inst->opcode == BRW_OPCODE_DO)
1616                st.weight *= loop_weight;
1617             else if (inst->opcode == BRW_OPCODE_WHILE)
1618                st.weight /= loop_weight;
1619             else if (inst->opcode == BRW_OPCODE_HALT && !halt_count++)
1620                st.weight *= discard_weight;
1621          }
1622 
1623          p.block_latency[block->num] = elapsed - elapsed0;
1624       }
1625 
1626       p.latency = elapsed;
1627       p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
1628    }
1629 }
1630 
performance(const fs_visitor * v)1631 brw::performance::performance(const fs_visitor *v) :
1632    block_latency(new unsigned[v->cfg->num_blocks])
1633 {
1634    calculate_performance(*this, v, issue_fs_inst, v->dispatch_width);
1635 }
1636 
performance(const vec4_visitor * v)1637 brw::performance::performance(const vec4_visitor *v) :
1638    block_latency(new unsigned[v->cfg->num_blocks])
1639 {
1640    calculate_performance(*this, v, issue_vec4_instruction, 8);
1641 }
1642 
~performance()1643 brw::performance::~performance()
1644 {
1645    delete[] block_latency;
1646 }
1647