/*
 * Copyright © 2020 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
23 
24 #include "brw_eu.h"
25 #include "brw_fs.h"
26 #include "brw_vec4.h"
27 #include "brw_cfg.h"
28 
29 using namespace brw;
30 
31 namespace {
/**
 * Enumeration representing the various asynchronous units that can run
 * computations in parallel on behalf of a shader thread.
 *
 * The enumerators preceding num_units are consecutive and zero-based, so
 * they can index arrays declared with num_units elements.  unit_null shares
 * the value of num_units and is therefore NOT a valid index into such
 * arrays.
 */
enum unit {
   /** EU front-end. */
   unit_fe,
   /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
   unit_fpu,
   /** Extended Math unit (AKA FPU1 on Gen8-11, part of the EU on Gen6+). */
   unit_em,
   /** Sampler shared function. */
   unit_sampler,
   /** Pixel Interpolator shared function. */
   unit_pi,
   /** Unified Return Buffer shared function. */
   unit_urb,
   /** Data Port Data Cache shared function. */
   unit_dp_dc,
   /** Data Port Render Cache shared function. */
   unit_dp_rc,
   /** Data Port Constant Cache shared function. */
   unit_dp_cc,
   /** Message Gateway shared function. */
   unit_gateway,
   /** Thread Spawner shared function. */
   unit_spawner,
   /* unit_vme, */
   /* unit_cre, */
   /** Number of asynchronous units currently tracked. */
   num_units,
   /** Dummy unit for instructions that don't consume runtime from the above. */
   unit_null = num_units
};
66 
67    /**
68     * Enumeration representing a computation result another computation can
69     * potentially depend on.
70     */
71    enum dependency_id {
72       /* Register part of the GRF. */
73       dependency_id_grf0 = 0,
74       /* Register part of the MRF.  Only used on Gen4-6. */
75       dependency_id_mrf0 = dependency_id_grf0 + BRW_MAX_GRF,
76       /* Address register part of the ARF. */
77       dependency_id_addr0 = dependency_id_mrf0 + 24,
78       /* Accumulator register part of the ARF. */
79       dependency_id_accum0 = dependency_id_addr0 + 1,
80       /* Flag register part of the ARF. */
81       dependency_id_flag0 = dependency_id_accum0 + 12,
82       /* SBID token write completion.  Only used on Gen12+. */
83       dependency_id_sbid_wr0 = dependency_id_flag0 + 8,
84       /* SBID token read completion.  Only used on Gen12+. */
85       dependency_id_sbid_rd0 = dependency_id_sbid_wr0 + 16,
86       /* Number of computation dependencies currently tracked. */
87       num_dependency_ids = dependency_id_sbid_rd0 + 16
88    };
89 
90    /**
91     * State of our modeling of the program execution.
92     */
93    struct state {
state__anonccf5b65c0111::state94       state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
95       /**
96        * Time at which a given unit will be ready to execute the next
97        * computation, in clock units.
98        */
99       unsigned unit_ready[num_units];
100       /**
101        * Time at which an instruction dependent on a given dependency ID will
102        * be ready to execute, in clock units.
103        */
104       unsigned dep_ready[num_dependency_ids];
105       /**
106        * Aggregated utilization of a given unit excluding idle cycles,
107        * in clock units.
108        */
109       float unit_busy[num_units];
110       /**
111        * Factor of the overhead of a computation accounted for in the
112        * aggregated utilization calculation.
113        */
114       float weight;
115    };
116 
117    /**
118     * Information derived from an IR instruction used to compute performance
119     * estimates.  Allows the timing calculation to work on both FS and VEC4
120     * instructions.
121     */
122    struct instruction_info {
instruction_info__anonccf5b65c0111::instruction_info123       instruction_info(const gen_device_info *devinfo, const fs_inst *inst) :
124          devinfo(devinfo), op(inst->opcode),
125          td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
126          tx(get_exec_type(inst)), sx(0), ss(0),
127          sc(has_bank_conflict(devinfo, inst) ? sd : 0),
128          desc(inst->desc), sfid(inst->sfid)
129       {
130          /* We typically want the maximum source size, except for split send
131           * messages which require the total size.
132           */
133          if (inst->opcode == SHADER_OPCODE_SEND) {
134             ss = DIV_ROUND_UP(inst->size_read(2), REG_SIZE) +
135                  DIV_ROUND_UP(inst->size_read(3), REG_SIZE);
136          } else {
137             for (unsigned i = 0; i < inst->sources; i++)
138                ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
139          }
140 
141          /* Convert the execution size to GRF units. */
142          sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
143 
144          /* 32x32 integer multiplication has half the usual ALU throughput.
145           * Treat it as double-precision.
146           */
147          if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
148              !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
149              type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
150             tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
151       }
152 
instruction_info__anonccf5b65c0111::instruction_info153       instruction_info(const gen_device_info *devinfo,
154                        const vec4_instruction *inst) :
155          devinfo(devinfo), op(inst->opcode),
156          td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
157          tx(get_exec_type(inst)), sx(0), ss(0), sc(0),
158          desc(inst->desc), sfid(inst->sfid)
159       {
160          /* Compute the maximum source size. */
161          for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++)
162             ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
163 
164          /* Convert the execution size to GRF units. */
165          sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
166 
167          /* 32x32 integer multiplication has half the usual ALU throughput.
168           * Treat it as double-precision.
169           */
170          if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
171              !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
172              type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
173             tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
174       }
175 
176       /** Device information. */
177       const struct gen_device_info *devinfo;
178       /** Instruction opcode. */
179       opcode op;
180       /** Destination type. */
181       brw_reg_type td;
182       /** Destination size in GRF units. */
183       unsigned sd;
184       /** Execution type. */
185       brw_reg_type tx;
186       /** Execution size in GRF units. */
187       unsigned sx;
188       /** Source size. */
189       unsigned ss;
190       /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
191       unsigned sc;
192       /** Send message descriptor. */
193       uint32_t desc;
194       /** Send message shared function ID. */
195       uint8_t sfid;
196    };
197 
198    /**
199     * Timing information of an instruction used to estimate the performance of
200     * the program.
201     */
202    struct perf_desc {
perf_desc__anonccf5b65c0111::perf_desc203       perf_desc(unit u, int df, int db, int ls, int ld, int la, int lf) :
204          u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}
205 
206       /**
207        * Back-end unit its runtime shall be accounted to, in addition to the
208        * EU front-end which is always assumed to be involved.
209        */
210       unit u;
211       /**
212        * Overhead cycles from the time that the EU front-end starts executing
213        * the instruction until it's ready to execute the next instruction.
214        */
215       int df;
216       /**
217        * Overhead cycles from the time that the back-end starts executing the
218        * instruction until it's ready to execute the next instruction.
219        */
220       int db;
221       /**
222        * Latency cycles from the time that the back-end starts executing the
223        * instruction until its sources have been read from the register file.
224        */
225       int ls;
226       /**
227        * Latency cycles from the time that the back-end starts executing the
228        * instruction until its regular destination has been written to the
229        * register file.
230        */
231       int ld;
232       /**
233        * Latency cycles from the time that the back-end starts executing the
234        * instruction until its accumulator destination has been written to the
235        * ARF file.
236        *
237        * Note that this is an approximation of the real behavior of
238        * accumulating instructions in the hardware: Instead of modeling a pair
239        * of back-to-back accumulating instructions as a first computation with
240        * latency equal to ld followed by another computation with a
241        * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
242        * model the stall as if it occurred at the top of the pipeline, with
243        * the latency of the accumulator computation offset accordingly.
244        */
245       int la;
246       /**
247        * Latency cycles from the time that the back-end starts executing the
248        * instruction until its flag destination has been written to the ARF
249        * file.
250        */
251       int lf;
252    };
253 
254    /**
255     * Compute the timing information of an instruction based on any relevant
256     * information from the IR and a number of parameters specifying a linear
257     * approximation: Parameter X_Y specifies the derivative of timing X
258     * relative to info field Y, while X_1 specifies the independent term of
259     * the approximation of timing X.
260     */
261    perf_desc
calculate_desc(const instruction_info & info,unit u,int df_1,int df_sd,int df_sc,int db_1,int db_sx,int ls_1,int ld_1,int la_1,int lf_1,int l_ss,int l_sd)262    calculate_desc(const instruction_info &info, unit u,
263                   int df_1, int df_sd, int df_sc,
264                   int db_1, int db_sx,
265                   int ls_1, int ld_1, int la_1, int lf_1,
266                   int l_ss, int l_sd)
267    {
268       return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
269                           db_1 + db_sx * int(info.sx),
270                           ls_1 + l_ss * int(info.ss),
271                           ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
272                           la_1, lf_1);
273    }
274 
/**
 * Compute the timing information of an instruction based on any relevant
 * information from the IR and a number of linear approximation parameters
 * hard-coded for each IR instruction.
 *
 * Most timing parameters are obtained from the multivariate linear
 * regression of a sample of empirical timings measured using the tm0
 * register (as can be done today by using the shader_time debugging
 * option).  The Gen4-5 math timings are obtained from BSpec Volume 5c.3
 * "Shared Functions - Extended Math", Section 3.2 "Performance".
 * Parameters marked XXX shall be considered low-quality: they either have
 * high variance or were completely guessed in cases where experimental data
 * was unavailable.
 */
289    const perf_desc
instruction_desc(const instruction_info & info)290    instruction_desc(const instruction_info &info)
291    {
292       const struct gen_device_info *devinfo = info.devinfo;
293 
294       switch (info.op) {
295       case BRW_OPCODE_SYNC:
296       case BRW_OPCODE_SEL:
297       case BRW_OPCODE_NOT:
298       case BRW_OPCODE_AND:
299       case BRW_OPCODE_OR:
300       case BRW_OPCODE_XOR:
301       case BRW_OPCODE_SHR:
302       case BRW_OPCODE_SHL:
303       case BRW_OPCODE_DIM:
304       case BRW_OPCODE_ASR:
305       case BRW_OPCODE_CMPN:
306       case BRW_OPCODE_F16TO32:
307       case BRW_OPCODE_BFREV:
308       case BRW_OPCODE_BFI1:
309       case BRW_OPCODE_AVG:
310       case BRW_OPCODE_FRC:
311       case BRW_OPCODE_RNDU:
312       case BRW_OPCODE_RNDD:
313       case BRW_OPCODE_RNDE:
314       case BRW_OPCODE_RNDZ:
315       case BRW_OPCODE_MAC:
316       case BRW_OPCODE_MACH:
317       case BRW_OPCODE_LZD:
318       case BRW_OPCODE_FBH:
319       case BRW_OPCODE_FBL:
320       case BRW_OPCODE_CBIT:
321       case BRW_OPCODE_ADDC:
322       case BRW_OPCODE_ROR:
323       case BRW_OPCODE_ROL:
324       case BRW_OPCODE_SUBB:
325       case BRW_OPCODE_SAD2:
326       case BRW_OPCODE_SADA2:
327       case BRW_OPCODE_LINE:
328       case BRW_OPCODE_NOP:
329       case SHADER_OPCODE_CLUSTER_BROADCAST:
330       case SHADER_OPCODE_SCRATCH_HEADER:
331       case FS_OPCODE_DDX_COARSE:
332       case FS_OPCODE_DDX_FINE:
333       case FS_OPCODE_DDY_COARSE:
334       case FS_OPCODE_PIXEL_X:
335       case FS_OPCODE_PIXEL_Y:
336       case FS_OPCODE_SET_SAMPLE_ID:
337       case VEC4_OPCODE_MOV_BYTES:
338       case VEC4_OPCODE_UNPACK_UNIFORM:
339       case VEC4_OPCODE_DOUBLE_TO_F32:
340       case VEC4_OPCODE_DOUBLE_TO_D32:
341       case VEC4_OPCODE_DOUBLE_TO_U32:
342       case VEC4_OPCODE_TO_DOUBLE:
343       case VEC4_OPCODE_PICK_LOW_32BIT:
344       case VEC4_OPCODE_PICK_HIGH_32BIT:
345       case VEC4_OPCODE_SET_LOW_32BIT:
346       case VEC4_OPCODE_SET_HIGH_32BIT:
347       case GS_OPCODE_SET_DWORD_2:
348       case GS_OPCODE_SET_WRITE_OFFSET:
349       case GS_OPCODE_SET_VERTEX_COUNT:
350       case GS_OPCODE_PREPARE_CHANNEL_MASKS:
351       case GS_OPCODE_SET_CHANNEL_MASKS:
352       case GS_OPCODE_GET_INSTANCE_ID:
353       case GS_OPCODE_SET_PRIMITIVE_ID:
354       case GS_OPCODE_SVB_SET_DST_INDEX:
355       case TCS_OPCODE_SRC0_010_IS_ZERO:
356       case TCS_OPCODE_GET_PRIMITIVE_ID:
357       case TES_OPCODE_GET_PRIMITIVE_ID:
358          if (devinfo->gen >= 11) {
359             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
360                                   0, 10, 6 /* XXX */, 14, 0, 0);
361          } else if (devinfo->gen >= 8) {
362             if (type_sz(info.tx) > 4)
363                return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
364                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
365             else
366                return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
367                                      0, 8, 4, 12, 0, 0);
368          } else if (devinfo->is_haswell) {
369             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
370                                   0, 10, 6 /* XXX */, 16, 0, 0);
371          } else {
372             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
373                                   0, 12, 8 /* XXX */, 18, 0, 0);
374          }
375 
376       case BRW_OPCODE_MOV:
377       case BRW_OPCODE_CMP:
378       case BRW_OPCODE_ADD:
379       case BRW_OPCODE_MUL:
380       case SHADER_OPCODE_MOV_RELOC_IMM:
381          if (devinfo->gen >= 11) {
382             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
383                                   0, 10, 6, 14, 0, 0);
384          } else if (devinfo->gen >= 8) {
385             if (type_sz(info.tx) > 4)
386                return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
387                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
388             else
389                return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
390                                      0, 8, 4, 12, 0, 0);
391          } else if (devinfo->is_haswell) {
392             if (info.tx == BRW_REGISTER_TYPE_F)
393                return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
394                                      0, 12, 8 /* XXX */, 18, 0, 0);
395             else
396                return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
397                                      0, 10, 6 /* XXX */, 16, 0, 0);
398          } else if (devinfo->gen >= 7) {
399             if (info.tx == BRW_REGISTER_TYPE_F)
400                return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
401                                      0, 14, 10 /* XXX */, 20, 0, 0);
402             else
403                return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
404                                      0, 12, 8 /* XXX */, 18, 0, 0);
405          } else {
406             return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 0,
407                                   0, 2 /* XXX */,
408                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
409                                   0, 0);
410          }
411 
412       case BRW_OPCODE_BFE:
413       case BRW_OPCODE_BFI2:
414       case BRW_OPCODE_CSEL:
415          if (devinfo->gen >= 11)
416             return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
417                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
418          else if (devinfo->gen >= 8)
419             return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
420                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
421          else if (devinfo->is_haswell)
422             return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
423                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
424          else if (devinfo->gen >= 7)
425             return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
426                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
427          else
428             abort();
429 
430       case BRW_OPCODE_MAD:
431          if (devinfo->gen >= 11) {
432             return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
433                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
434          } else if (devinfo->gen >= 8) {
435             if (type_sz(info.tx) > 4)
436                return calculate_desc(info, unit_fpu, 0, 4, 1, 0, 4,
437                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
438             else
439                return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
440                                      0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
441          } else if (devinfo->is_haswell) {
442             if (info.tx == BRW_REGISTER_TYPE_F)
443                return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
444                                      0, 12, 8 /* XXX */, 18, 0, 0);
445             else
446                return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
447                                      0, 10, 6 /* XXX */, 16, 0, 0);
448          } else if (devinfo->gen >= 7) {
449             if (info.tx == BRW_REGISTER_TYPE_F)
450                return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
451                                      0, 14, 10 /* XXX */, 20, 0, 0);
452             else
453                return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
454                                      0, 12, 8 /* XXX */, 18, 0, 0);
455          } else if (devinfo->gen >= 6) {
456             return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 1 /* XXX */,
457                                   0, 2 /* XXX */,
458                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
459                                   0, 0);
460          } else {
461             abort();
462          }
463 
464       case BRW_OPCODE_F32TO16:
465          if (devinfo->gen >= 11)
466             return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
467                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
468          else if (devinfo->gen >= 8)
469             return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
470                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
471          else if (devinfo->is_haswell)
472             return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
473                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
474          else if (devinfo->gen >= 7)
475             return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
476                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
477          else
478             abort();
479 
480       case BRW_OPCODE_DP4:
481       case BRW_OPCODE_DPH:
482       case BRW_OPCODE_DP3:
483       case BRW_OPCODE_DP2:
484          if (devinfo->gen >= 8)
485             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
486                                   0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
487          else if (devinfo->is_haswell)
488             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
489                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
490          else
491             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
492                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
493 
494       case SHADER_OPCODE_RCP:
495       case SHADER_OPCODE_RSQ:
496       case SHADER_OPCODE_SQRT:
497       case SHADER_OPCODE_EXP2:
498       case SHADER_OPCODE_LOG2:
499       case SHADER_OPCODE_SIN:
500       case SHADER_OPCODE_COS:
501       case SHADER_OPCODE_POW:
502       case SHADER_OPCODE_INT_QUOTIENT:
503       case SHADER_OPCODE_INT_REMAINDER:
504          if (devinfo->gen >= 6) {
505             switch (info.op) {
506             case SHADER_OPCODE_RCP:
507             case SHADER_OPCODE_RSQ:
508             case SHADER_OPCODE_SQRT:
509             case SHADER_OPCODE_EXP2:
510             case SHADER_OPCODE_LOG2:
511             case SHADER_OPCODE_SIN:
512             case SHADER_OPCODE_COS:
513                if (devinfo->gen >= 8)
514                   return calculate_desc(info, unit_em, -2, 4, 0, 0, 4,
515                                         0, 16, 0, 0, 0, 0);
516                else if (devinfo->is_haswell)
517                   return calculate_desc(info, unit_em, 0, 2, 0, 0, 2,
518                                         0, 12, 0, 0, 0, 0);
519                else
520                   return calculate_desc(info, unit_em, 0, 2, 0, 0, 2,
521                                         0, 14, 0, 0, 0, 0);
522 
523             case SHADER_OPCODE_POW:
524                if (devinfo->gen >= 8)
525                   return calculate_desc(info, unit_em, -2, 4, 0, 0, 8,
526                                         0, 24, 0, 0, 0, 0);
527                else if (devinfo->is_haswell)
528                   return calculate_desc(info, unit_em, 0, 2, 0, 0, 4,
529                                         0, 20, 0, 0, 0, 0);
530                else
531                   return calculate_desc(info, unit_em, 0, 2, 0, 0, 4,
532                                         0, 22, 0, 0, 0, 0);
533 
534             case SHADER_OPCODE_INT_QUOTIENT:
535             case SHADER_OPCODE_INT_REMAINDER:
536                return calculate_desc(info, unit_em, 2, 0, 0, 26, 0,
537                                      0, 28 /* XXX */, 0, 0, 0, 0);
538 
539             default:
540                abort();
541             }
542          } else {
543             switch (info.op) {
544             case SHADER_OPCODE_RCP:
545                return calculate_desc(info, unit_em, 2, 0, 0, 0, 8,
546                                      0, 22, 0, 0, 0, 8);
547 
548             case SHADER_OPCODE_RSQ:
549                return calculate_desc(info, unit_em, 2, 0, 0, 0, 16,
550                                      0, 44, 0, 0, 0, 8);
551 
552             case SHADER_OPCODE_INT_QUOTIENT:
553             case SHADER_OPCODE_SQRT:
554             case SHADER_OPCODE_LOG2:
555                return calculate_desc(info, unit_em, 2, 0, 0, 0, 24,
556                                      0, 66, 0, 0, 0, 8);
557 
558             case SHADER_OPCODE_INT_REMAINDER:
559             case SHADER_OPCODE_EXP2:
560                return calculate_desc(info, unit_em, 2, 0, 0, 0, 32,
561                                      0, 88, 0, 0, 0, 8);
562 
563             case SHADER_OPCODE_SIN:
564             case SHADER_OPCODE_COS:
565                return calculate_desc(info, unit_em, 2, 0, 0, 0, 48,
566                                      0, 132, 0, 0, 0, 8);
567 
568             case SHADER_OPCODE_POW:
569                return calculate_desc(info, unit_em, 2, 0, 0, 0, 64,
570                                      0, 176, 0, 0, 0, 8);
571 
572             default:
573                abort();
574             }
575          }
576 
577       case BRW_OPCODE_DO:
578          if (devinfo->gen >= 6)
579             return calculate_desc(info, unit_null, 0, 0, 0, 0, 0,
580                                   0, 0, 0, 0, 0, 0);
581          else
582             return calculate_desc(info, unit_null, 2 /* XXX */, 0, 0, 0, 0,
583                                   0, 0, 0, 0, 0, 0);
584 
585       case BRW_OPCODE_IF:
586       case BRW_OPCODE_ELSE:
587       case BRW_OPCODE_ENDIF:
588       case BRW_OPCODE_WHILE:
589       case BRW_OPCODE_BREAK:
590       case BRW_OPCODE_CONTINUE:
591       case FS_OPCODE_DISCARD_JUMP:
592          if (devinfo->gen >= 8)
593             return calculate_desc(info, unit_null, 8, 0, 0, 0, 0,
594                                   0, 0, 0, 0, 0, 0);
595          else if (devinfo->is_haswell)
596             return calculate_desc(info, unit_null, 6, 0, 0, 0, 0,
597                                   0, 0, 0, 0, 0, 0);
598          else
599             return calculate_desc(info, unit_null, 2, 0, 0, 0, 0,
600                                   0, 0, 0, 0, 0, 0);
601 
602       case FS_OPCODE_LINTERP:
603          if (devinfo->gen >= 8)
604             return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
605                                   0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
606          else if (devinfo->is_haswell)
607             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
608                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
609          else
610             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
611                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
612 
613       case BRW_OPCODE_LRP:
614          if (devinfo->gen >= 8)
615             return calculate_desc(info, unit_fpu, 0, 4, 1, 0, 4,
616                                   0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
617          else if (devinfo->is_haswell)
618             return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
619                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
620          else if (devinfo->gen >= 6)
621             return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
622                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
623          else
624             abort();
625 
626       case FS_OPCODE_PACK_HALF_2x16_SPLIT:
627          if (devinfo->gen >= 11)
628             return calculate_desc(info, unit_fpu, 20, 6, 0, 0, 6,
629                                   0, 10 /* XXX */, 6 /* XXX */,
630                                   14 /* XXX */, 0, 0);
631          else if (devinfo->gen >= 8)
632             return calculate_desc(info, unit_fpu, 16, 6, 0, 0, 6,
633                                   0, 8 /* XXX */, 4 /* XXX */,
634                                   12 /* XXX */, 0, 0);
635          else if (devinfo->is_haswell)
636             return calculate_desc(info, unit_fpu, 20, 6, 0, 0, 6,
637                                   0, 10 /* XXX */, 6 /* XXX */,
638                                   16 /* XXX */, 0, 0);
639          else if (devinfo->gen >= 7)
640             return calculate_desc(info, unit_fpu, 24, 6, 0, 0, 6,
641                                   0, 12 /* XXX */, 8 /* XXX */,
642                                   18 /* XXX */, 0, 0);
643          else
644             abort();
645 
646       case SHADER_OPCODE_MOV_INDIRECT:
647          if (devinfo->gen >= 11)
648             return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
649                                   0, 10 /* XXX */, 6 /* XXX */,
650                                   14 /* XXX */, 0, 0);
651          else if (devinfo->gen >= 8)
652             return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
653                                   0, 8 /* XXX */, 4 /* XXX */,
654                                   12 /* XXX */, 0, 0);
655          else if (devinfo->is_haswell)
656             return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
657                                   0, 10 /* XXX */, 6 /* XXX */,
658                                   16 /* XXX */, 0, 0);
659          else
660             return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
661                                   0, 12 /* XXX */, 8 /* XXX */,
662                                   18 /* XXX */, 0, 0);
663 
664       case SHADER_OPCODE_BROADCAST:
665          if (devinfo->gen >= 11)
666             return calculate_desc(info, unit_fpu, 20 /* XXX */, 0, 0, 4, 0,
667                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
668          else if (devinfo->gen >= 8)
669             return calculate_desc(info, unit_fpu, 18, 0, 0, 4, 0,
670                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
671          else if (devinfo->is_haswell)
672             return calculate_desc(info, unit_fpu, 18, 0, 0, 4, 0,
673                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
674          else if (devinfo->gen >= 7)
675             return calculate_desc(info, unit_fpu, 20, 0, 0, 4, 0,
676                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
677          else
678             abort();
679 
680       case SHADER_OPCODE_FIND_LIVE_CHANNEL:
681          if (devinfo->gen >= 11)
682             return calculate_desc(info, unit_fpu, 2, 0, 0, 2, 0,
683                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
684          else if (devinfo->gen >= 8)
685             return calculate_desc(info, unit_fpu, 2, 0, 0, 2, 0,
686                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
687          else if (devinfo->is_haswell)
688             return calculate_desc(info, unit_fpu, 36, 0, 0, 6, 0,
689                                   0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
690          else if (devinfo->gen >= 7)
691             return calculate_desc(info, unit_fpu, 40, 0, 0, 6, 0,
692                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
693          else
694             abort();
695 
696       case SHADER_OPCODE_RND_MODE:
697       case SHADER_OPCODE_FLOAT_CONTROL_MODE:
698          if (devinfo->gen >= 11)
699             return calculate_desc(info, unit_fpu, 24 /* XXX */, 0, 0,
700                                   4 /* XXX */, 0,
701                                   0, 0, 0, 0, 0, 0);
702          else if (devinfo->gen >= 8)
703             return calculate_desc(info, unit_fpu, 20 /* XXX */, 0, 0,
704                                   4 /* XXX */, 0,
705                                   0, 0, 0, 0, 0, 0);
706          else if (devinfo->is_haswell)
707             return calculate_desc(info, unit_fpu, 24 /* XXX */, 0, 0,
708                                   4 /* XXX */, 0,
709                                   0, 0, 0, 0, 0, 0);
710          else if (devinfo->gen >= 6)
711             return calculate_desc(info, unit_fpu, 28 /* XXX */, 0, 0,
712                                   4 /* XXX */, 0,
713                                   0, 0, 0, 0, 0, 0);
714          else
715             abort();
716 
717       case SHADER_OPCODE_SHUFFLE:
718          if (devinfo->gen >= 11)
719             return calculate_desc(info, unit_fpu, 44 /* XXX */, 0, 0,
720                                   44 /* XXX */, 0,
721                                   0, 10 /* XXX */, 6 /* XXX */,
722                                   14 /* XXX */, 0, 0);
723          else if (devinfo->gen >= 8)
724             return calculate_desc(info, unit_fpu, 42 /* XXX */, 0, 0,
725                                   42 /* XXX */, 0,
726                                   0, 8 /* XXX */, 4 /* XXX */,
727                                   12 /* XXX */, 0, 0);
728          else if (devinfo->is_haswell)
729             return calculate_desc(info, unit_fpu, 0, 44 /* XXX */, 0,
730                                   0, 44 /* XXX */,
731                                   0, 10 /* XXX */, 6 /* XXX */,
732                                   16 /* XXX */, 0, 0);
733          else if (devinfo->gen >= 6)
734             return calculate_desc(info, unit_fpu, 0, 46 /* XXX */, 0,
735                                   0, 46 /* XXX */,
736                                   0, 12 /* XXX */, 8 /* XXX */,
737                                   18 /* XXX */, 0, 0);
738          else
739             abort();
740 
741       case SHADER_OPCODE_SEL_EXEC:
742          if (devinfo->gen >= 11)
743             return calculate_desc(info, unit_fpu, 10 /* XXX */, 4 /* XXX */, 0,
744                                   0, 4 /* XXX */,
745                                   0, 10 /* XXX */, 6 /* XXX */,
746                                   14 /* XXX */, 0, 0);
747          else if (devinfo->gen >= 8)
748             return calculate_desc(info, unit_fpu, 8 /* XXX */, 4 /* XXX */, 0,
749                                   0, 4 /* XXX */,
750                                   0, 8 /* XXX */, 4 /* XXX */,
751                                   12 /* XXX */, 0, 0);
752          else if (devinfo->is_haswell)
753             return calculate_desc(info, unit_fpu, 10 /* XXX */, 4 /* XXX */, 0,
754                                   0, 4 /* XXX */,
755                                   0, 10 /* XXX */, 6 /* XXX */,
756                                   16 /* XXX */, 0, 0);
757          else
758             return calculate_desc(info, unit_fpu, 12 /* XXX */, 4 /* XXX */, 0,
759                                   0, 4 /* XXX */,
760                                   0, 12 /* XXX */, 8 /* XXX */,
761                                   18 /* XXX */, 0, 0);
762 
763       case SHADER_OPCODE_QUAD_SWIZZLE:
764          if (devinfo->gen >= 11)
765             return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
766                                   0, 8 /* XXX */,
767                                   0, 10 /* XXX */, 6 /* XXX */,
768                                   14 /* XXX */, 0, 0);
769          else if (devinfo->gen >= 8)
770             return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
771                                   0, 8 /* XXX */,
772                                   0, 8 /* XXX */, 4 /* XXX */,
773                                   12 /* XXX */, 0, 0);
774          else if (devinfo->is_haswell)
775             return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
776                                   0, 8 /* XXX */,
777                                   0, 10 /* XXX */, 6 /* XXX */,
778                                   16 /* XXX */, 0, 0);
779          else
780             return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
781                                   0, 8 /* XXX */,
782                                   0, 12 /* XXX */, 8 /* XXX */,
783                                   18 /* XXX */, 0, 0);
784 
785       case FS_OPCODE_DDY_FINE:
786          if (devinfo->gen >= 11)
787             return calculate_desc(info, unit_fpu, 0, 14, 0, 0, 4,
788                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
789          else if (devinfo->gen >= 8)
790             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
791                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
792          else if (devinfo->is_haswell)
793             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
794                                   0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
795          else
796             return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
797                                   0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0);
798 
799       case FS_OPCODE_LOAD_LIVE_CHANNELS:
800          if (devinfo->gen >= 11)
801             return calculate_desc(info, unit_fpu, 2 /* XXX */, 0, 0,
802                                   2 /* XXX */, 0,
803                                   0, 0, 0, 10 /* XXX */, 0, 0);
804          else if (devinfo->gen >= 8)
805             return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 0,
806                                   0, 2 /* XXX */,
807                                   0, 0, 0, 8 /* XXX */, 0, 0);
808          else
809             abort();
810 
811       case VEC4_OPCODE_PACK_BYTES:
812          if (devinfo->gen >= 8)
813             return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
814                                   4 /* XXX */, 0,
815                                   0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
816                                   0, 0);
817          else if (devinfo->is_haswell)
818             return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
819                                   4 /* XXX */, 0,
820                                   0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
821                                   0, 0);
822          else
823             return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
824                                   4 /* XXX */, 0,
825                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
826                                   0, 0);
827 
828       case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
829       case TCS_OPCODE_GET_INSTANCE_ID:
830       case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
831       case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
832       case TES_OPCODE_CREATE_INPUT_READ_HEADER:
833          if (devinfo->gen >= 8)
834             return calculate_desc(info, unit_fpu, 22 /* XXX */, 0, 0,
835                                   6 /* XXX */, 0,
836                                   0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
837                                   0, 0);
838          else if (devinfo->is_haswell)
839             return calculate_desc(info, unit_fpu, 26 /* XXX */, 0, 0,
840                                   6 /* XXX */, 0,
841                                   0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
842                                   0, 0);
843          else
844             return calculate_desc(info, unit_fpu, 30 /* XXX */, 0, 0,
845                                   6 /* XXX */, 0,
846                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
847                                   0, 0);
848 
849       case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
850       case TCS_OPCODE_CREATE_BARRIER_HEADER:
851          if (devinfo->gen >= 8)
852             return calculate_desc(info, unit_fpu, 32 /* XXX */, 0, 0,
853                                   8 /* XXX */, 0,
854                                   0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
855                                   0, 0);
856          else if (devinfo->is_haswell)
857             return calculate_desc(info, unit_fpu, 38 /* XXX */, 0, 0,
858                                   8 /* XXX */, 0,
859                                   0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
860                                   0, 0);
861          else if (devinfo->gen >= 6)
862             return calculate_desc(info, unit_fpu, 44 /* XXX */, 0, 0,
863                                   8 /* XXX */, 0,
864                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
865                                   0, 0);
866          else
867             abort();
868 
869       case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
870          if (devinfo->gen >= 8)
871             return calculate_desc(info, unit_fpu, 12 /* XXX */, 0, 0,
872                                   4 /* XXX */, 0,
873                                   0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
874                                   0, 0);
875          else if (devinfo->is_haswell)
876             return calculate_desc(info, unit_fpu, 14 /* XXX */, 0, 0,
877                                   4 /* XXX */, 0,
878                                   0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
879                                   0, 0);
880          else if (devinfo->gen >= 7)
881             return calculate_desc(info, unit_fpu, 16 /* XXX */, 0, 0,
882                                   4 /* XXX */, 0,
883                                   0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
884                                   0, 0);
885          else
886             abort();
887 
888       case SHADER_OPCODE_TEX:
889       case FS_OPCODE_TXB:
890       case SHADER_OPCODE_TXD:
891       case SHADER_OPCODE_TXF:
892       case SHADER_OPCODE_TXF_LZ:
893       case SHADER_OPCODE_TXL:
894       case SHADER_OPCODE_TXL_LZ:
895       case SHADER_OPCODE_TXF_CMS:
896       case SHADER_OPCODE_TXF_CMS_W:
897       case SHADER_OPCODE_TXF_UMS:
898       case SHADER_OPCODE_TXF_MCS:
899       case SHADER_OPCODE_TXS:
900       case SHADER_OPCODE_LOD:
901       case SHADER_OPCODE_GET_BUFFER_SIZE:
902       case SHADER_OPCODE_TG4:
903       case SHADER_OPCODE_TG4_OFFSET:
904       case SHADER_OPCODE_SAMPLEINFO:
905       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
906          return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16 /* XXX */,
907                                8 /* XXX */, 750 /* XXX */, 0, 0,
908                                2 /* XXX */, 0);
909 
910       case SHADER_OPCODE_URB_READ_SIMD8:
911       case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
912       case SHADER_OPCODE_URB_WRITE_SIMD8:
913       case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
914       case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
915       case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
916       case VEC4_OPCODE_URB_READ:
917       case VS_OPCODE_URB_WRITE:
918       case GS_OPCODE_URB_WRITE:
919       case GS_OPCODE_URB_WRITE_ALLOCATE:
920       case GS_OPCODE_THREAD_END:
921       case GS_OPCODE_FF_SYNC:
922       case TCS_OPCODE_URB_WRITE:
923       case TCS_OPCODE_RELEASE_INPUT:
924       case TCS_OPCODE_THREAD_END:
925          return calculate_desc(info, unit_urb, 2, 0, 0, 0, 6 /* XXX */,
926                                32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
927 
928       case SHADER_OPCODE_MEMORY_FENCE:
929       case SHADER_OPCODE_INTERLOCK:
930          switch (info.sfid) {
931          case GEN6_SFID_DATAPORT_RENDER_CACHE:
932             if (devinfo->gen >= 7)
933                return calculate_desc(info, unit_dp_rc, 2, 0, 0, 30 /* XXX */, 0,
934                                      10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
935             else
936                abort();
937 
938          case GEN7_SFID_DATAPORT_DATA_CACHE:
939          case HSW_SFID_DATAPORT_DATA_CACHE_1:
940             if (devinfo->gen >= 7)
941                return calculate_desc(info, unit_dp_dc, 2, 0, 0, 30 /* XXX */, 0,
942                                      10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
943             else
944                abort();
945 
946          default:
947             abort();
948          }
949 
950       case SHADER_OPCODE_GEN4_SCRATCH_READ:
951       case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
952       case SHADER_OPCODE_GEN7_SCRATCH_READ:
953          return calculate_desc(info, unit_dp_dc, 2, 0, 0, 0, 8 /* XXX */,
954                                10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
955 
956       case VEC4_OPCODE_UNTYPED_ATOMIC:
957          if (devinfo->gen >= 7)
958             return calculate_desc(info, unit_dp_dc, 2, 0, 0,
959                                   30 /* XXX */, 400 /* XXX */,
960                                   10 /* XXX */, 100 /* XXX */, 0, 0,
961                                   0, 400 /* XXX */);
962          else
963             abort();
964 
965       case VEC4_OPCODE_UNTYPED_SURFACE_READ:
966       case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
967          if (devinfo->gen >= 7)
968             return calculate_desc(info, unit_dp_dc, 2, 0, 0,
969                                   0, 20 /* XXX */,
970                                   10 /* XXX */, 100 /* XXX */, 0, 0,
971                                   0, 0);
972          else
973             abort();
974 
975       case FS_OPCODE_FB_WRITE:
976       case FS_OPCODE_FB_READ:
977       case FS_OPCODE_REP_FB_WRITE:
978          return calculate_desc(info, unit_dp_rc, 2, 0, 0, 0, 450 /* XXX */,
979                                10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
980 
981       case GS_OPCODE_SVB_WRITE:
982          if (devinfo->gen >= 6)
983             return calculate_desc(info, unit_dp_rc, 2 /* XXX */, 0, 0,
984                                   0, 450 /* XXX */,
985                                   10 /* XXX */, 300 /* XXX */, 0, 0,
986                                   0, 0);
987          else
988             abort();
989 
990       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
991       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
992          return calculate_desc(info, unit_dp_cc, 2, 0, 0, 0, 16 /* XXX */,
993                                10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
994 
995       case VS_OPCODE_PULL_CONSTANT_LOAD:
996       case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
997          return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16,
998                                8, 750, 0, 0, 2, 0);
999 
1000       case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1001       case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1002       case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1003          if (devinfo->gen >= 7)
1004             return calculate_desc(info, unit_pi, 2, 0, 0, 14 /* XXX */, 0,
1005                                   0, 90 /* XXX */, 0, 0, 0, 0);
1006          else
1007             abort();
1008 
1009       case SHADER_OPCODE_BARRIER:
1010          if (devinfo->gen >= 7)
1011             return calculate_desc(info, unit_gateway, 90 /* XXX */, 0, 0,
1012                                   0 /* XXX */, 0,
1013                                   0, 0, 0, 0, 0, 0);
1014          else
1015             abort();
1016 
1017       case CS_OPCODE_CS_TERMINATE:
1018          if (devinfo->gen >= 7)
1019             return calculate_desc(info, unit_spawner, 2, 0, 0, 0 /* XXX */, 0,
1020                                   10 /* XXX */, 0, 0, 0, 0, 0);
1021          else
1022             abort();
1023 
1024       case SHADER_OPCODE_SEND:
1025          switch (info.sfid) {
1026          case GEN6_SFID_DATAPORT_RENDER_CACHE:
1027             if (devinfo->gen >= 7) {
1028                switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1029                case GEN7_DATAPORT_RC_TYPED_ATOMIC_OP:
1030                   return calculate_desc(info, unit_dp_rc, 2, 0, 0,
1031                                         30 /* XXX */, 450 /* XXX */,
1032                                         10 /* XXX */, 100 /* XXX */,
1033                                         0, 0, 0, 400 /* XXX */);
1034                default:
1035                   return calculate_desc(info, unit_dp_rc, 2, 0, 0,
1036                                         0, 450 /* XXX */,
1037                                         10 /* XXX */, 300 /* XXX */, 0, 0,
1038                                         0, 0);
1039                }
1040             } else if (devinfo->gen >= 6)  {
1041                return calculate_desc(info, unit_dp_rc, 2 /* XXX */, 0, 0,
1042                                      0, 450 /* XXX */,
1043                                      10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
1044             } else {
1045                abort();
1046             }
1047          case BRW_SFID_SAMPLER: {
1048             if (devinfo->gen >= 6)
1049                return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16,
1050                                      8, 750, 0, 0, 2, 0);
1051             else
1052                abort();
1053          }
1054          case GEN7_SFID_DATAPORT_DATA_CACHE:
1055          case HSW_SFID_DATAPORT_DATA_CACHE_1:
1056             if (devinfo->gen >= 8 || devinfo->is_haswell) {
1057                switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1058                case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
1059                case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
1060                case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
1061                case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
1062                   return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1063                                         30 /* XXX */, 400 /* XXX */,
1064                                         10 /* XXX */, 100 /* XXX */, 0, 0,
1065                                         0, 400 /* XXX */);
1066 
1067                default:
1068                   return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1069                                         0, 20 /* XXX */,
1070                                         10 /* XXX */, 100 /* XXX */, 0, 0,
1071                                         0, 0);
1072                }
1073             } else if (devinfo->gen >= 7) {
1074                switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1075                case GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
1076                   return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1077                                         30 /* XXX */, 400 /* XXX */,
1078                                         10 /* XXX */, 100 /* XXX */,
1079                                         0, 0, 0, 400 /* XXX */);
1080                default:
1081                   return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1082                                         0, 20 /* XXX */,
1083                                         10 /* XXX */, 100 /* XXX */, 0, 0,
1084                                         0, 0);
1085                }
1086             } else {
1087                abort();
1088             }
1089          default:
1090             abort();
1091          }
1092 
1093       case SHADER_OPCODE_UNDEF:
1094       case FS_OPCODE_PLACEHOLDER_HALT:
1095       case FS_OPCODE_SCHEDULING_FENCE:
1096          return calculate_desc(info, unit_null, 0, 0, 0, 0, 0,
1097                                0, 0, 0, 0, 0, 0);
1098 
1099       default:
1100          abort();
1101       }
1102    }
1103 
1104    /**
1105     * Model the performance behavior of a stall on the specified dependency
1106     * ID.
1107     */
1108    void
stall_on_dependency(state & st,dependency_id id)1109    stall_on_dependency(state &st, dependency_id id)
1110    {
1111       if (id < ARRAY_SIZE(st.dep_ready))
1112          st.unit_ready[unit_fe] = MAX2(st.unit_ready[unit_fe],
1113                                        st.dep_ready[id]);
1114    }
1115 
1116    /**
1117     * Model the performance behavior of the front-end and back-end while
1118     * executing an instruction with the specified timing information, assuming
1119     * all dependencies are already clear.
1120     */
1121    void
execute_instruction(state & st,const perf_desc & perf)1122    execute_instruction(state &st, const perf_desc &perf)
1123    {
1124       /* Compute the time at which the front-end will be ready to execute the
1125        * next instruction.
1126        */
1127       st.unit_ready[unit_fe] += perf.df;
1128 
1129       if (perf.u < num_units) {
1130          /* Wait for the back-end to be ready to execute this instruction. */
1131          st.unit_ready[unit_fe] = MAX2(st.unit_ready[unit_fe],
1132                                        st.unit_ready[perf.u]);
1133 
1134          /* Compute the time at which the back-end will be ready to execute
1135           * the next instruction, and update the back-end utilization.
1136           */
1137          st.unit_ready[perf.u] = st.unit_ready[unit_fe] + perf.db;
1138          st.unit_busy[perf.u] += perf.db * st.weight;
1139       }
1140    }
1141 
1142    /**
1143     * Model the performance behavior of a read dependency provided by an
1144     * instruction.
1145     */
1146    void
mark_read_dependency(state & st,const perf_desc & perf,dependency_id id)1147    mark_read_dependency(state &st, const perf_desc &perf, dependency_id id)
1148    {
1149       if (id < ARRAY_SIZE(st.dep_ready))
1150          st.dep_ready[id] = st.unit_ready[unit_fe] + perf.ls;
1151    }
1152 
1153    /**
1154     * Model the performance behavior of a write dependency provided by an
1155     * instruction.
1156     */
1157    void
mark_write_dependency(state & st,const perf_desc & perf,dependency_id id)1158    mark_write_dependency(state &st, const perf_desc &perf, dependency_id id)
1159    {
1160       if (id >= dependency_id_accum0 && id < dependency_id_flag0)
1161          st.dep_ready[id] = st.unit_ready[unit_fe] + perf.la;
1162       else if (id >= dependency_id_flag0 && id < dependency_id_sbid_wr0)
1163          st.dep_ready[id] = st.unit_ready[unit_fe] + perf.lf;
1164       else if (id < ARRAY_SIZE(st.dep_ready))
1165          st.dep_ready[id] = st.unit_ready[unit_fe] + perf.ld;
1166    }
1167 
1168    /**
1169     * Return the dependency ID of a backend_reg, offset by \p delta GRFs.
1170     */
1171    dependency_id
reg_dependency_id(const gen_device_info * devinfo,const backend_reg & r,const int delta)1172    reg_dependency_id(const gen_device_info *devinfo, const backend_reg &r,
1173                      const int delta)
1174    {
1175       if (r.file == VGRF) {
1176          const unsigned i = r.nr + r.offset / REG_SIZE + delta;
1177          assert(i < dependency_id_mrf0 - dependency_id_grf0);
1178          return dependency_id(dependency_id_grf0 + i);
1179 
1180       } else if (r.file == FIXED_GRF) {
1181          const unsigned i = r.nr + delta;
1182          assert(i < dependency_id_mrf0 - dependency_id_grf0);
1183          return dependency_id(dependency_id_grf0 + i);
1184 
1185       } else if (r.file == MRF && devinfo->gen >= 7) {
1186          const unsigned i = GEN7_MRF_HACK_START +
1187                             r.nr + r.offset / REG_SIZE + delta;
1188          assert(i < dependency_id_mrf0 - dependency_id_grf0);
1189          return dependency_id(dependency_id_grf0 + i);
1190 
1191       } else if (r.file == MRF && devinfo->gen < 7) {
1192          const unsigned i = (r.nr & ~BRW_MRF_COMPR4) +
1193                             r.offset / REG_SIZE + delta;
1194          assert(i < dependency_id_addr0 - dependency_id_mrf0);
1195          return dependency_id(dependency_id_mrf0 + i);
1196 
1197       } else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS &&
1198                  r.nr < BRW_ARF_ACCUMULATOR) {
1199          assert(delta == 0);
1200          return dependency_id_addr0;
1201 
1202       } else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR &&
1203                  r.nr < BRW_ARF_FLAG) {
1204          const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta;
1205          assert(i < dependency_id_flag0 - dependency_id_accum0);
1206          return dependency_id(dependency_id_accum0 + i);
1207 
1208       } else {
1209          return num_dependency_ids;
1210       }
1211    }
1212 
1213    /**
1214     * Return the dependency ID of flag register starting at offset \p i.
1215     */
1216    dependency_id
flag_dependency_id(unsigned i)1217    flag_dependency_id(unsigned i)
1218    {
1219       assert(i < dependency_id_sbid_wr0 - dependency_id_flag0);
1220       return dependency_id(dependency_id_flag0 + i);
1221    }
1222 
1223    /**
1224     * Return the dependency ID corresponding to the SBID read completion
1225     * condition of a Gen12+ SWSB.
1226     */
1227    dependency_id
tgl_swsb_rd_dependency_id(tgl_swsb swsb)1228    tgl_swsb_rd_dependency_id(tgl_swsb swsb)
1229    {
1230       if (swsb.mode) {
1231          assert(swsb.sbid < num_dependency_ids - dependency_id_sbid_rd0);
1232          return dependency_id(dependency_id_sbid_rd0 + swsb.sbid);
1233       } else {
1234          return num_dependency_ids;
1235       }
1236    }
1237 
1238    /**
1239     * Return the dependency ID corresponding to the SBID write completion
1240     * condition of a Gen12+ SWSB.
1241     */
1242    dependency_id
tgl_swsb_wr_dependency_id(tgl_swsb swsb)1243    tgl_swsb_wr_dependency_id(tgl_swsb swsb)
1244    {
1245       if (swsb.mode) {
1246          assert(swsb.sbid < dependency_id_sbid_rd0 - dependency_id_sbid_wr0);
1247          return dependency_id(dependency_id_sbid_wr0 + swsb.sbid);
1248       } else {
1249          return num_dependency_ids;
1250       }
1251    }
1252 
1253    /**
1254     * Return the implicit accumulator register accessed by channel \p i of the
1255     * instruction.
1256     */
1257    unsigned
accum_reg_of_channel(const gen_device_info * devinfo,const backend_instruction * inst,brw_reg_type tx,unsigned i)1258    accum_reg_of_channel(const gen_device_info *devinfo,
1259                         const backend_instruction *inst,
1260                         brw_reg_type tx, unsigned i)
1261    {
1262       assert(inst->reads_accumulator_implicitly() ||
1263              inst->writes_accumulator_implicitly(devinfo));
1264       const unsigned offset = (inst->group + i) * type_sz(tx) *
1265          (devinfo->gen < 7 || brw_reg_type_is_floating_point(tx) ? 1 : 2);
1266       return offset / REG_SIZE % 2;
1267    }
1268 
1269    /**
1270     * Model the performance behavior of an FS back-end instruction.
1271     */
1272    void
issue_fs_inst(state & st,const gen_device_info * devinfo,const backend_instruction * be_inst)1273    issue_fs_inst(state &st, const gen_device_info *devinfo,
1274                  const backend_instruction *be_inst)
1275    {
1276       const fs_inst *inst = static_cast<const fs_inst *>(be_inst);
1277       const instruction_info info(devinfo, inst);
1278       const perf_desc perf = instruction_desc(info);
1279 
1280       /* Stall on any source dependencies. */
1281       for (unsigned i = 0; i < inst->sources; i++) {
1282          for (unsigned j = 0; j < regs_read(inst, i); j++)
1283             stall_on_dependency(
1284                st, reg_dependency_id(devinfo, inst->src[i], j));
1285       }
1286 
1287       if (inst->reads_accumulator_implicitly()) {
1288          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1289               j <= accum_reg_of_channel(devinfo, inst, info.tx,
1290                                         inst->exec_size - 1); j++)
1291             stall_on_dependency(
1292                st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1293       }
1294 
1295       if (is_send(inst) && inst->base_mrf != -1) {
1296          for (unsigned j = 0; j < inst->mlen; j++)
1297             stall_on_dependency(
1298                st, reg_dependency_id(
1299                   devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1300       }
1301 
1302       if (const unsigned mask = inst->flags_read(devinfo)) {
1303          for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1304             if (mask & (1 << i))
1305                stall_on_dependency(st, flag_dependency_id(i));
1306          }
1307       }
1308 
1309       /* Stall on any write dependencies. */
1310       if (!inst->no_dd_check) {
1311          if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1312             for (unsigned j = 0; j < regs_written(inst); j++)
1313                stall_on_dependency(
1314                   st, reg_dependency_id(devinfo, inst->dst, j));
1315          }
1316 
1317          if (inst->writes_accumulator_implicitly(devinfo)) {
1318             for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1319                  j <= accum_reg_of_channel(devinfo, inst, info.tx,
1320                                            inst->exec_size - 1); j++)
1321                stall_on_dependency(
1322                   st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1323          }
1324 
1325          if (const unsigned mask = inst->flags_written()) {
1326             for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1327                if (mask & (1 << i))
1328                   stall_on_dependency(st, flag_dependency_id(i));
1329             }
1330          }
1331       }
1332 
1333       /* Stall on any SBID dependencies. */
1334       if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST))
1335          stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched));
1336       else if (inst->sched.mode & TGL_SBID_SRC)
1337          stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched));
1338 
1339       /* Execute the instruction. */
1340       execute_instruction(st, perf);
1341 
1342       /* Mark any source dependencies. */
1343       if (inst->is_send_from_grf()) {
1344          for (unsigned i = 0; i < inst->sources; i++) {
1345             if (inst->is_payload(i)) {
1346                for (unsigned j = 0; j < regs_read(inst, i); j++)
1347                   mark_read_dependency(
1348                      st, perf, reg_dependency_id(devinfo, inst->src[i], j));
1349             }
1350          }
1351       }
1352 
1353       if (is_send(inst) && inst->base_mrf != -1) {
1354          for (unsigned j = 0; j < inst->mlen; j++)
1355             mark_read_dependency(st, perf,
1356                reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1357       }
1358 
1359       /* Mark any destination dependencies. */
1360       if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1361          for (unsigned j = 0; j < regs_written(inst); j++) {
1362             mark_write_dependency(st, perf,
1363                                   reg_dependency_id(devinfo, inst->dst, j));
1364          }
1365       }
1366 
1367       if (inst->writes_accumulator_implicitly(devinfo)) {
1368          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1369               j <= accum_reg_of_channel(devinfo, inst, info.tx,
1370                                         inst->exec_size - 1); j++)
1371             mark_write_dependency(st, perf,
1372                                   reg_dependency_id(devinfo, brw_acc_reg(8), j));
1373       }
1374 
1375       if (const unsigned mask = inst->flags_written()) {
1376          for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1377             if (mask & (1 << i))
1378                mark_write_dependency(st, perf, flag_dependency_id(i));
1379          }
1380       }
1381 
1382       /* Mark any SBID dependencies. */
1383       if (inst->sched.mode & TGL_SBID_SET) {
1384          mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched));
1385          mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched));
1386       }
1387    }
1388 
1389    /**
1390     * Model the performance behavior of a VEC4 back-end instruction.
1391     */
1392    void
issue_vec4_instruction(state & st,const gen_device_info * devinfo,const backend_instruction * be_inst)1393    issue_vec4_instruction(state &st, const gen_device_info *devinfo,
1394                           const backend_instruction *be_inst)
1395    {
1396       const vec4_instruction *inst =
1397          static_cast<const vec4_instruction *>(be_inst);
1398       const instruction_info info(devinfo, inst);
1399       const perf_desc perf = instruction_desc(info);
1400 
1401       /* Stall on any source dependencies. */
1402       for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
1403          for (unsigned j = 0; j < regs_read(inst, i); j++)
1404             stall_on_dependency(
1405                st, reg_dependency_id(devinfo, inst->src[i], j));
1406       }
1407 
1408       if (inst->reads_accumulator_implicitly()) {
1409          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1410               j <= accum_reg_of_channel(devinfo, inst, info.tx,
1411                                         inst->exec_size - 1); j++)
1412             stall_on_dependency(
1413                st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1414       }
1415 
1416       if (inst->base_mrf != -1) {
1417          for (unsigned j = 0; j < inst->mlen; j++)
1418             stall_on_dependency(
1419                st, reg_dependency_id(
1420                   devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1421       }
1422 
1423       if (inst->reads_flag())
1424          stall_on_dependency(st, dependency_id_flag0);
1425 
1426       /* Stall on any write dependencies. */
1427       if (!inst->no_dd_check) {
1428          if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1429             for (unsigned j = 0; j < regs_written(inst); j++)
1430                stall_on_dependency(
1431                   st, reg_dependency_id(devinfo, inst->dst, j));
1432          }
1433 
1434          if (inst->writes_accumulator_implicitly(devinfo)) {
1435             for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1436                  j <= accum_reg_of_channel(devinfo, inst, info.tx,
1437                                            inst->exec_size - 1); j++)
1438                stall_on_dependency(
1439                   st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1440          }
1441 
1442          if (inst->writes_flag())
1443             stall_on_dependency(st, dependency_id_flag0);
1444       }
1445 
1446       /* Execute the instruction. */
1447       execute_instruction(st, perf);
1448 
1449       /* Mark any source dependencies. */
1450       if (inst->is_send_from_grf()) {
1451          for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
1452             for (unsigned j = 0; j < regs_read(inst, i); j++)
1453                mark_read_dependency(
1454                   st, perf, reg_dependency_id(devinfo, inst->src[i], j));
1455          }
1456       }
1457 
1458       if (inst->base_mrf != -1) {
1459          for (unsigned j = 0; j < inst->mlen; j++)
1460             mark_read_dependency(st, perf,
1461                reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1462       }
1463 
1464       /* Mark any destination dependencies. */
1465       if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1466          for (unsigned j = 0; j < regs_written(inst); j++) {
1467             mark_write_dependency(st, perf,
1468                                   reg_dependency_id(devinfo, inst->dst, j));
1469          }
1470       }
1471 
1472       if (inst->writes_accumulator_implicitly(devinfo)) {
1473          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1474               j <= accum_reg_of_channel(devinfo, inst, info.tx,
1475                                         inst->exec_size - 1); j++)
1476             mark_write_dependency(st, perf,
1477                                   reg_dependency_id(devinfo, brw_acc_reg(8), j));
1478       }
1479 
1480       if (inst->writes_flag())
1481          mark_write_dependency(st, perf, dependency_id_flag0);
1482    }
1483 
1484    /**
1485     * Calculate the maximum possible throughput of the program compatible with
1486     * the cycle-count utilization estimated for each asynchronous unit, in
1487     * threads-per-cycle units.
1488     */
1489    float
calculate_thread_throughput(const state & st,float busy)1490    calculate_thread_throughput(const state &st, float busy)
1491    {
1492       for (unsigned i = 0; i < num_units; i++)
1493          busy = MAX2(busy, st.unit_busy[i]);
1494 
1495       return 1.0 / busy;
1496    }
1497 
1498    /**
1499     * Estimate the performance of the specified shader.
1500     */
1501    void
calculate_performance(performance & p,const backend_shader * s,void (* issue_instruction)(state &,const gen_device_info *,const backend_instruction *),unsigned dispatch_width)1502    calculate_performance(performance &p, const backend_shader *s,
1503                          void (*issue_instruction)(
1504                             state &, const gen_device_info *,
1505                             const backend_instruction *),
1506                          unsigned dispatch_width)
1507    {
1508       /* XXX - Note that the previous version of this code used worst-case
1509        *       scenario estimation of branching divergence for SIMD32 shaders,
1510        *       but this heuristic was removed to improve performance in common
1511        *       scenarios. Wider shader variants are less optimal when divergence
1512        *       is high, e.g. when application renders complex scene on a small
1513        *       surface. It is assumed that such renders are short, so their
1514        *       time doesn't matter and when it comes to the overall performance,
1515        *       they are dominated by more optimal larger renders.
1516        *
1517        *       It's possible that we could do better with divergence analysis
1518        *       by isolating branches which are 100% uniform.
1519        *
1520        *       Plumbing the trip counts from NIR loop analysis would allow us
1521        *       to do a better job regarding the loop weights.
1522        *
1523        *       In the meantime use values that roughly match the control flow
1524        *       weights used elsewhere in the compiler back-end.
1525        *
1526        *       Note that we provide slightly more pessimistic weights on
1527        *       Gen12+ for SIMD32, since the effective warp size on that
1528        *       platform is 2x the SIMD width due to EU fusion, which increases
1529        *       the likelihood of divergent control flow in comparison to
1530        *       previous generations, giving narrower SIMD modes a performance
1531        *       advantage in several test-cases with non-uniform discard jumps.
1532        */
1533       const float discard_weight = (dispatch_width > 16 || s->devinfo->gen < 12 ?
1534                                     1.0 : 0.5);
1535       const float loop_weight = 10;
1536       unsigned discard_count = 0;
1537       unsigned elapsed = 0;
1538       state st;
1539 
1540       foreach_block(block, s->cfg) {
1541          const unsigned elapsed0 = elapsed;
1542 
1543          foreach_inst_in_block(backend_instruction, inst, block) {
1544             const unsigned clock0 = st.unit_ready[unit_fe];
1545 
1546             issue_instruction(st, s->devinfo, inst);
1547 
1548             if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT && discard_count)
1549                st.weight /= discard_weight;
1550 
1551             elapsed += (st.unit_ready[unit_fe] - clock0) * st.weight;
1552 
1553             if (inst->opcode == BRW_OPCODE_DO)
1554                st.weight *= loop_weight;
1555             else if (inst->opcode == BRW_OPCODE_WHILE)
1556                st.weight /= loop_weight;
1557             else if (inst->opcode == FS_OPCODE_DISCARD_JUMP && !discard_count++)
1558                st.weight *= discard_weight;
1559          }
1560 
1561          p.block_latency[block->num] = elapsed - elapsed0;
1562       }
1563 
1564       p.latency = elapsed;
1565       p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
1566    }
1567 }
1568 
performance(const fs_visitor * v)1569 brw::performance::performance(const fs_visitor *v) :
1570    block_latency(new unsigned[v->cfg->num_blocks])
1571 {
1572    calculate_performance(*this, v, issue_fs_inst, v->dispatch_width);
1573 }
1574 
performance(const vec4_visitor * v)1575 brw::performance::performance(const vec4_visitor *v) :
1576    block_latency(new unsigned[v->cfg->num_blocks])
1577 {
1578    calculate_performance(*this, v, issue_vec4_instruction, 8);
1579 }
1580 
~performance()1581 brw::performance::~performance()
1582 {
1583    delete[] block_latency;
1584 }
1585