1 /*
2  * Copyright © 2020 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "brw_eu.h"
25 #include "brw_fs.h"
26 #include "brw_cfg.h"
27 
28 using namespace brw;
29 
30 namespace {
31    /**
32     * Enumeration representing the various asynchronous units that can run
33     * computations in parallel on behalf of a shader thread.
34     */
35    enum intel_eu_unit {
36       /** EU front-end. */
37       EU_UNIT_FE,
38       /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
39       EU_UNIT_FPU,
40       /** Extended Math unit (AKA FPU1 on Gfx8-11, part of the EU on Gfx6+). */
41       EU_UNIT_EM,
42       /** Sampler shared function. */
43       EU_UNIT_SAMPLER,
44       /** Pixel Interpolator shared function. */
45       EU_UNIT_PI,
46       /** Unified Return Buffer shared function. */
47       EU_UNIT_URB,
48       /** Data Port Data Cache shared function. */
49       EU_UNIT_DP_DC,
50       /** Data Port Render Cache shared function. */
51       EU_UNIT_DP_RC,
52       /** Data Port Constant Cache shared function. */
53       EU_UNIT_DP_CC,
54       /** Message Gateway shared function. */
55       EU_UNIT_GATEWAY,
56       /** Thread Spawner shared function. */
57       EU_UNIT_SPAWNER,
58       /* EU_UNIT_VME, */
59       /* EU_UNIT_CRE, */
60       /** Number of asynchronous units currently tracked. */
61       EU_NUM_UNITS,
62       /** Dummy unit for instructions that don't consume runtime from the above. */
63       EU_UNIT_NULL = EU_NUM_UNITS
64    };
65 
66    /**
67     * Enumeration representing a computation result another computation can
68     * potentially depend on.
69     */
70    enum intel_eu_dependency_id {
71       /* Register part of the GRF. */
72       EU_DEPENDENCY_ID_GRF0 = 0,
73       /* Address register part of the ARF. */
74       EU_DEPENDENCY_ID_ADDR0 = EU_DEPENDENCY_ID_GRF0 + XE2_MAX_GRF,
75       /* Accumulator register part of the ARF. */
76       EU_DEPENDENCY_ID_ACCUM0 = EU_DEPENDENCY_ID_ADDR0 + 1,
77       /* Flag register part of the ARF. */
78       EU_DEPENDENCY_ID_FLAG0 = EU_DEPENDENCY_ID_ACCUM0 + 12,
79       /* SBID token write completion.  Only used on Gfx12+. */
80       EU_DEPENDENCY_ID_SBID_WR0 = EU_DEPENDENCY_ID_FLAG0 + 8,
81       /* SBID token read completion.  Only used on Gfx12+. */
82       EU_DEPENDENCY_ID_SBID_RD0 = EU_DEPENDENCY_ID_SBID_WR0 + 32,
83       /* Number of computation dependencies currently tracked. */
84       EU_NUM_DEPENDENCY_IDS = EU_DEPENDENCY_ID_SBID_RD0 + 32
85    };
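
   /* Example of how the flat dependency-ID space above is laid out (purely
    * illustrative; see reg_dependency_id() and related helpers below for the
    * authoritative mapping): GRF r5 corresponds to EU_DEPENDENCY_ID_GRF0 + 5,
    * the second accumulator register to EU_DEPENDENCY_ID_ACCUM0 + 1, and the
    * write completion of SBID token 3 on Gfx12+ to
    * EU_DEPENDENCY_ID_SBID_WR0 + 3.
    */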
86 
87    /**
88     * State of our modeling of the program execution.
89     */
90    struct state {
91       state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
92       /**
93        * Time at which a given unit will be ready to execute the next
94        * computation, in clock units.
95        */
96       unsigned unit_ready[EU_NUM_UNITS];
97       /**
98        * Time at which an instruction dependent on a given dependency ID will
99        * be ready to execute, in clock units.
100        */
101       unsigned dep_ready[EU_NUM_DEPENDENCY_IDS];
102       /**
103        * Aggregated utilization of a given unit excluding idle cycles,
104        * in clock units.
105        */
106       float unit_busy[EU_NUM_UNITS];
107       /**
108        * Factor of the overhead of a computation accounted for in the
109        * aggregated utilization calculation.
110        */
111       float weight;
112    };
113 
114    /**
115     * Information derived from an IR instruction used to compute performance
116     * estimates.  Allows the timing calculation to work on both FS and VEC4
117     * instructions.
118     */
119    struct instruction_info {
120       instruction_info(const struct brw_isa_info *isa, const fs_inst *inst) :
121          isa(isa), devinfo(isa->devinfo), op(inst->opcode),
122          td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
123          tx(get_exec_type(inst)), sx(0), ss(0),
124          sc(has_bank_conflict(isa, inst) ? sd : 0),
125          desc(inst->desc), sfid(inst->sfid)
126       {
127          /* We typically want the maximum source size, except for split send
128           * messages, which require the total size.
129           */
130          if (inst->opcode == SHADER_OPCODE_SEND) {
131             ss = DIV_ROUND_UP(inst->size_read(2), REG_SIZE) +
132                  DIV_ROUND_UP(inst->size_read(3), REG_SIZE);
133          } else {
134             for (unsigned i = 0; i < inst->sources; i++)
135                ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
136          }
137 
138          /* Convert the execution size to GRF units. */
139          sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
140 
141          /* 32x32 integer multiplication has half the usual ALU throughput.
142           * Treat it as double-precision.
143           */
144          if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
145              !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
146              type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
147             tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
148 
149          rcount = inst->opcode == BRW_OPCODE_DPAS ? inst->rcount : 0;
150       }
151 
152       /** ISA encoding information */
153       const struct brw_isa_info *isa;
154       /** Device information. */
155       const struct intel_device_info *devinfo;
156       /** Instruction opcode. */
157       opcode op;
158       /** Destination type. */
159       brw_reg_type td;
160       /** Destination size in GRF units. */
161       unsigned sd;
162       /** Execution type. */
163       brw_reg_type tx;
164       /** Execution size in GRF units. */
165       unsigned sx;
166       /** Source size. */
167       unsigned ss;
168       /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
169       unsigned sc;
170       /** Send message descriptor. */
171       uint32_t desc;
172       /** Send message shared function ID. */
173       uint8_t sfid;
174       /** Repeat count for DPAS instructions. */
175       uint8_t rcount;
176    };
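
   /* Worked example of the size fields above, assuming a hypothetical SIMD16
    * ADD with 32-bit float sources and destination on a platform with 32-byte
    * GRFs: the instruction writes 16 * 4 = 64 bytes, so sd = 2 GRF units;
    * each source likewise spans 2 GRFs, so ss = 2; the execution size is
    * sx = DIV_ROUND_UP(16 * 4, REG_SIZE) = 2; and with no bank conflict
    * sc = 0.
    */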
177 
178    /**
179     * Timing information of an instruction used to estimate the performance of
180     * the program.
181     */
182    struct perf_desc {
183       perf_desc(enum intel_eu_unit u, int df, int db,
184                 int ls, int ld, int la, int lf) :
185          u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}
186 
187       /**
188        * Back-end unit to which its runtime shall be accounted, in addition to
189        * the EU front-end, which is always assumed to be involved.
190        */
191       enum intel_eu_unit u;
192       /**
193        * Overhead cycles from the time that the EU front-end starts executing
194        * the instruction until it's ready to execute the next instruction.
195        */
196       int df;
197       /**
198        * Overhead cycles from the time that the back-end starts executing the
199        * instruction until it's ready to execute the next instruction.
200        */
201       int db;
202       /**
203        * Latency cycles from the time that the back-end starts executing the
204        * instruction until its sources have been read from the register file.
205        */
206       int ls;
207       /**
208        * Latency cycles from the time that the back-end starts executing the
209        * instruction until its regular destination has been written to the
210        * register file.
211        */
212       int ld;
213       /**
214        * Latency cycles from the time that the back-end starts executing the
215        * instruction until its accumulator destination has been written to the
216        * ARF file.
217        *
218        * Note that this is an approximation of the real behavior of
219        * accumulating instructions in the hardware: Instead of modeling a pair
220        * of back-to-back accumulating instructions as a first computation with
221        * latency equal to ld followed by another computation with a
222        * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
223        * model the stall as if it occurred at the top of the pipeline, with
224        * the latency of the accumulator computation offset accordingly.
225        */
226       int la;
227       /**
228        * Latency cycles from the time that the back-end starts executing the
229        * instruction until its flag destination has been written to the ARF
230        * file.
231        */
232       int lf;
233    };
234 
235    /**
236     * Compute the timing information of an instruction based on any relevant
237     * information from the IR and a number of parameters specifying a linear
238     * approximation: Parameter X_Y specifies the derivative of timing X
239     * relative to info field Y, while X_1 specifies the independent term of
240     * the approximation of timing X.
241     */
242    perf_desc
243    calculate_desc(const instruction_info &info, enum intel_eu_unit u,
244                   int df_1, int df_sd, int df_sc,
245                   int db_1, int db_sx,
246                   int ls_1, int ld_1, int la_1, int lf_1,
247                   int l_ss, int l_sd)
248    {
249       return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
250                           db_1 + db_sx * int(info.sx),
251                           ls_1 + l_ss * int(info.ss),
252                           ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
253                           la_1, lf_1);
254    }
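
   /* Worked example with made-up operand sizes: for an instruction with
    * sd = 2, sc = 0, sx = 2 and ss = 1, a call like
    *
    *    calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2, 0, 8, 4, 12, 0, 0);
    *
    * yields df = 0 + 2 * 2 + 0 * 0 = 4, db = 0 + 2 * 2 = 4, ls = 0,
    * ld = 8 + 0 * 1 + 0 * 2 = 8, la = 4 and lf = 12 cycles.
    */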
255 
256    /**
257     * Compute the timing information of an instruction based on any relevant
258     * information from the IR and a number of linear approximation parameters
259     * hard-coded for each IR instruction.
260     *
261     * Most timing parameters are obtained from the multivariate linear
262     * regression of a sample of empirical timings measured using the tm0
263     * register (as can be done today by using the shader_time debugging
264     * option).  The Gfx4-5 math timings are obtained from BSpec Volume 5c.3
265     * "Shared Functions - Extended Math", Section 3.2 "Performance".
266     * Parameters marked XXX shall be considered low-quality; they're possibly
267     * high-variance or completely guessed in cases where experimental data was
268     * unavailable.
269     */
270    const perf_desc
271    instruction_desc(const instruction_info &info)
272    {
273       const struct intel_device_info *devinfo = info.devinfo;
274 
275       switch (info.op) {
276       case BRW_OPCODE_SYNC:
277       case BRW_OPCODE_SEL:
278       case BRW_OPCODE_NOT:
279       case BRW_OPCODE_AND:
280       case BRW_OPCODE_OR:
281       case BRW_OPCODE_XOR:
282       case BRW_OPCODE_SHR:
283       case BRW_OPCODE_SHL:
284       case BRW_OPCODE_ASR:
285       case BRW_OPCODE_CMPN:
286       case BRW_OPCODE_BFREV:
287       case BRW_OPCODE_BFI1:
288       case BRW_OPCODE_AVG:
289       case BRW_OPCODE_FRC:
290       case BRW_OPCODE_RNDU:
291       case BRW_OPCODE_RNDD:
292       case BRW_OPCODE_RNDE:
293       case BRW_OPCODE_RNDZ:
294       case BRW_OPCODE_MAC:
295       case BRW_OPCODE_MACH:
296       case BRW_OPCODE_LZD:
297       case BRW_OPCODE_FBH:
298       case BRW_OPCODE_FBL:
299       case BRW_OPCODE_CBIT:
300       case BRW_OPCODE_ADDC:
301       case BRW_OPCODE_ROR:
302       case BRW_OPCODE_ROL:
303       case BRW_OPCODE_SUBB:
304       case BRW_OPCODE_SAD2:
305       case BRW_OPCODE_SADA2:
306       case BRW_OPCODE_LINE:
307       case BRW_OPCODE_NOP:
308       case SHADER_OPCODE_CLUSTER_BROADCAST:
309       case SHADER_OPCODE_SCRATCH_HEADER:
310       case FS_OPCODE_DDX_COARSE:
311       case FS_OPCODE_DDX_FINE:
312       case FS_OPCODE_DDY_COARSE:
313       case FS_OPCODE_PIXEL_X:
314       case FS_OPCODE_PIXEL_Y:
315       case SHADER_OPCODE_READ_SR_REG:
316          if (devinfo->ver >= 11) {
317             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
318                                   0, 10, 6 /* XXX */, 14, 0, 0);
319          } else {
320             if (type_sz(info.tx) > 4)
321                return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
322                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
323             else
324                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
325                                      0, 8, 4, 12, 0, 0);
326          }
327 
328       case BRW_OPCODE_MOV:
329       case BRW_OPCODE_CMP:
330       case BRW_OPCODE_ADD:
331       case BRW_OPCODE_ADD3:
332       case BRW_OPCODE_MUL:
333       case SHADER_OPCODE_MOV_RELOC_IMM:
334          if (devinfo->ver >= 11) {
335             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
336                                   0, 10, 6, 14, 0, 0);
337          } else {
338             if (type_sz(info.tx) > 4)
339                return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
340                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
341             else
342                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
343                                      0, 8, 4, 12, 0, 0);
344          }
345 
346       case BRW_OPCODE_BFE:
347       case BRW_OPCODE_BFI2:
348       case BRW_OPCODE_CSEL:
349          if (devinfo->ver >= 11)
350             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
351                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
352          else
353             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
354                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
355 
356       case BRW_OPCODE_MAD:
357          if (devinfo->ver >= 11) {
358             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
359                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
360          } else {
361             if (type_sz(info.tx) > 4)
362                return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
363                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
364             else
365                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
366                                      0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
367          }
368 
369       case BRW_OPCODE_DP4:
370       case BRW_OPCODE_DPH:
371       case BRW_OPCODE_DP3:
372       case BRW_OPCODE_DP2:
373          return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
374                                0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
375 
376       case BRW_OPCODE_DP4A:
377          if (devinfo->ver >= 12)
378             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
379                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
380          else
381             abort();
382 
383       case BRW_OPCODE_DPAS: {
384          unsigned ld;
385 
386          switch (info.rcount) {
387          case 1:
388             ld = 21;
389             break;
390          case 2:
391             ld = 22;
392             break;
393          case 8:
394          default:
395             ld = 32;
396             break;
397          }
398 
399          /* DPAS cannot write the accumulator or the flags, so pass UINT_MAX
400           * for la and lf.
401           */
402          if (devinfo->verx10 >= 125)
403             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
404                                   0, ld, UINT_MAX, UINT_MAX, 0, 0);
405          else
406             abort();
407       }
408 
409       case SHADER_OPCODE_RCP:
410       case SHADER_OPCODE_RSQ:
411       case SHADER_OPCODE_SQRT:
412       case SHADER_OPCODE_EXP2:
413       case SHADER_OPCODE_LOG2:
414       case SHADER_OPCODE_SIN:
415       case SHADER_OPCODE_COS:
416          return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 4,
417                                0, 16, 0, 0, 0, 0);
418 
419       case SHADER_OPCODE_POW:
420          return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 8,
421                                0, 24, 0, 0, 0, 0);
422 
423       case SHADER_OPCODE_INT_QUOTIENT:
424       case SHADER_OPCODE_INT_REMAINDER:
425          return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 26, 0,
426                                0, 28 /* XXX */, 0, 0, 0, 0);
427 
428       case BRW_OPCODE_DO:
429          return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
430                                0, 0, 0, 0, 0, 0);
431 
432       case BRW_OPCODE_IF:
433       case BRW_OPCODE_ELSE:
434       case BRW_OPCODE_ENDIF:
435       case BRW_OPCODE_WHILE:
436       case BRW_OPCODE_BREAK:
437       case BRW_OPCODE_CONTINUE:
438       case BRW_OPCODE_HALT:
439          return calculate_desc(info, EU_UNIT_NULL, 8, 0, 0, 0, 0,
440                                0, 0, 0, 0, 0, 0);
441 
442       case FS_OPCODE_LINTERP:
443          return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
444                                0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
445 
446       case BRW_OPCODE_LRP:
447          return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
448                                0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
449 
450       case FS_OPCODE_PACK_HALF_2x16_SPLIT:
451          if (devinfo->ver >= 11)
452             return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
453                                   0, 10 /* XXX */, 6 /* XXX */,
454                                   14 /* XXX */, 0, 0);
455          else
456             return calculate_desc(info, EU_UNIT_FPU, 16, 6, 0, 0, 6,
457                                   0, 8 /* XXX */, 4 /* XXX */,
458                                   12 /* XXX */, 0, 0);
459 
460       case SHADER_OPCODE_MOV_INDIRECT:
461          if (devinfo->ver >= 11)
462             return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
463                                   0, 10 /* XXX */, 6 /* XXX */,
464                                   14 /* XXX */, 0, 0);
465          else
466             return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
467                                   0, 8 /* XXX */, 4 /* XXX */,
468                                   12 /* XXX */, 0, 0);
469 
470       case SHADER_OPCODE_BROADCAST:
471          if (devinfo->ver >= 11)
472             return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0, 4, 0,
473                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
474          else
475             return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
476                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
477 
478       case SHADER_OPCODE_FIND_LIVE_CHANNEL:
479       case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
480       case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
481          if (devinfo->ver >= 11)
482             return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
483                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
484          else
485             return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
486                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
487 
488       case SHADER_OPCODE_RND_MODE:
489       case SHADER_OPCODE_FLOAT_CONTROL_MODE:
490          if (devinfo->ver >= 11)
491             return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0,
492                                   4 /* XXX */, 0,
493                                   0, 0, 0, 0, 0, 0);
494          else
495             return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0,
496                                   4 /* XXX */, 0,
497                                   0, 0, 0, 0, 0, 0);
498 
499       case SHADER_OPCODE_SHUFFLE:
500          if (devinfo->ver >= 11)
501             return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0,
502                                   44 /* XXX */, 0,
503                                   0, 10 /* XXX */, 6 /* XXX */,
504                                   14 /* XXX */, 0, 0);
505          else
506             return calculate_desc(info, EU_UNIT_FPU, 42 /* XXX */, 0, 0,
507                                   42 /* XXX */, 0,
508                                   0, 8 /* XXX */, 4 /* XXX */,
509                                   12 /* XXX */, 0, 0);
510 
511       case SHADER_OPCODE_SEL_EXEC:
512          if (devinfo->ver >= 11)
513             return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0,
514                                   0, 4 /* XXX */,
515                                   0, 10 /* XXX */, 6 /* XXX */,
516                                   14 /* XXX */, 0, 0);
517          else
518             return calculate_desc(info, EU_UNIT_FPU, 8 /* XXX */, 4 /* XXX */, 0,
519                                   0, 4 /* XXX */,
520                                   0, 8 /* XXX */, 4 /* XXX */,
521                                   12 /* XXX */, 0, 0);
522 
523       case SHADER_OPCODE_QUAD_SWIZZLE:
524          if (devinfo->ver >= 11)
525             return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
526                                   0, 8 /* XXX */,
527                                   0, 10 /* XXX */, 6 /* XXX */,
528                                   14 /* XXX */, 0, 0);
529          else
530             return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
531                                   0, 8 /* XXX */,
532                                   0, 8 /* XXX */, 4 /* XXX */,
533                                   12 /* XXX */, 0, 0);
534 
535       case FS_OPCODE_DDY_FINE:
536          if (devinfo->ver >= 11)
537             return calculate_desc(info, EU_UNIT_FPU, 0, 14, 0, 0, 4,
538                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
539          else
540             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
541                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
542 
543       case FS_OPCODE_LOAD_LIVE_CHANNELS:
544          if (devinfo->ver >= 11)
545             return calculate_desc(info, EU_UNIT_FPU, 2 /* XXX */, 0, 0,
546                                   2 /* XXX */, 0,
547                                   0, 0, 0, 10 /* XXX */, 0, 0);
548          else
549             return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
550                                   0, 2 /* XXX */,
551                                   0, 0, 0, 8 /* XXX */, 0, 0);
552 
553       case SHADER_OPCODE_TEX:
554       case FS_OPCODE_TXB:
555       case SHADER_OPCODE_TXD:
556       case SHADER_OPCODE_TXF:
557       case SHADER_OPCODE_TXF_LZ:
558       case SHADER_OPCODE_TXL:
559       case SHADER_OPCODE_TXL_LZ:
560       case SHADER_OPCODE_TXF_CMS:
561       case SHADER_OPCODE_TXF_CMS_W:
562       case SHADER_OPCODE_TXF_UMS:
563       case SHADER_OPCODE_TXF_MCS:
564       case SHADER_OPCODE_TXS:
565       case SHADER_OPCODE_LOD:
566       case SHADER_OPCODE_GET_BUFFER_SIZE:
567       case SHADER_OPCODE_TG4:
568       case SHADER_OPCODE_TG4_BIAS:
569       case SHADER_OPCODE_TG4_EXPLICIT_LOD:
570       case SHADER_OPCODE_TG4_IMPLICIT_LOD:
571       case SHADER_OPCODE_TG4_OFFSET:
572       case SHADER_OPCODE_TG4_OFFSET_LOD:
573       case SHADER_OPCODE_TG4_OFFSET_BIAS:
574       case SHADER_OPCODE_SAMPLEINFO:
575          return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16 /* XXX */,
576                                8 /* XXX */, 750 /* XXX */, 0, 0,
577                                2 /* XXX */, 0);
578 
579       case SHADER_OPCODE_MEMORY_FENCE:
580       case SHADER_OPCODE_INTERLOCK:
581          switch (info.sfid) {
582          case GFX6_SFID_DATAPORT_RENDER_CACHE:
583             return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 30 /* XXX */, 0,
584                                   10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
585 
586          case BRW_SFID_URB:
587          case GFX7_SFID_DATAPORT_DATA_CACHE:
588          case GFX12_SFID_SLM:
589          case GFX12_SFID_TGM:
590          case GFX12_SFID_UGM:
591          case HSW_SFID_DATAPORT_DATA_CACHE_1:
592             return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 30 /* XXX */, 0,
593                                   10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
594 
595          default:
596             abort();
597          }
598 
599       case FS_OPCODE_FB_READ:
600          return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 0, 450 /* XXX */,
601                                10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
602 
603       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
604          return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
605                                10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
606 
607       case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
608       case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
609       case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
610          return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
611                                0, 90 /* XXX */, 0, 0, 0, 0);
612 
613       case SHADER_OPCODE_BARRIER:
614          return calculate_desc(info, EU_UNIT_GATEWAY, 90 /* XXX */, 0, 0,
615                                0 /* XXX */, 0,
616                                0, 0, 0, 0, 0, 0);
617 
618       case CS_OPCODE_CS_TERMINATE:
619          return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0,
620                                10 /* XXX */, 0, 0, 0, 0, 0);
621 
622       case SHADER_OPCODE_SEND:
623          switch (info.sfid) {
624          case GFX6_SFID_DATAPORT_CONSTANT_CACHE:
625             /* See FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */
626             return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
627                                   10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
628          case GFX6_SFID_DATAPORT_RENDER_CACHE:
629             switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
630             case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP:
631                return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
632                                      30 /* XXX */, 450 /* XXX */,
633                                      10 /* XXX */, 100 /* XXX */,
634                                      0, 0, 0, 400 /* XXX */);
635             default:
636                return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
637                                      0, 450 /* XXX */,
638                                      10 /* XXX */, 300 /* XXX */, 0, 0,
639                                      0, 0);
640             }
641          case BRW_SFID_SAMPLER: {
642             return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16,
643                                   8, 750, 0, 0, 2, 0);
644          }
645          case GFX7_SFID_DATAPORT_DATA_CACHE:
646          case HSW_SFID_DATAPORT_DATA_CACHE_1:
647             switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
648             case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
649             case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
650             case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
651             case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
652                return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
653                                      30 /* XXX */, 400 /* XXX */,
654                                      10 /* XXX */, 100 /* XXX */, 0, 0,
655                                      0, 400 /* XXX */);
656 
657             default:
658                return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
659                                      0, 20 /* XXX */,
660                                      10 /* XXX */, 100 /* XXX */, 0, 0,
661                                      0, 0);
662             }
663 
664          case GFX7_SFID_PIXEL_INTERPOLATOR:
665             return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
666                                   0, 90 /* XXX */, 0, 0, 0, 0);
667 
668          case GFX12_SFID_UGM:
669          case GFX12_SFID_TGM:
670          case GFX12_SFID_SLM:
671             switch (lsc_msg_desc_opcode(devinfo, info.desc)) {
672             case LSC_OP_LOAD:
673             case LSC_OP_STORE:
674             case LSC_OP_LOAD_CMASK:
675             case LSC_OP_STORE_CMASK:
676                return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
677                                      0, 20 /* XXX */,
678                                      10 /* XXX */, 100 /* XXX */, 0, 0,
679                                      0, 0);
680 
681             case LSC_OP_FENCE:
682             case LSC_OP_ATOMIC_INC:
683             case LSC_OP_ATOMIC_DEC:
684             case LSC_OP_ATOMIC_LOAD:
685             case LSC_OP_ATOMIC_STORE:
686             case LSC_OP_ATOMIC_ADD:
687             case LSC_OP_ATOMIC_SUB:
688             case LSC_OP_ATOMIC_MIN:
689             case LSC_OP_ATOMIC_MAX:
690             case LSC_OP_ATOMIC_UMIN:
691             case LSC_OP_ATOMIC_UMAX:
692             case LSC_OP_ATOMIC_CMPXCHG:
693             case LSC_OP_ATOMIC_FADD:
694             case LSC_OP_ATOMIC_FSUB:
695             case LSC_OP_ATOMIC_FMIN:
696             case LSC_OP_ATOMIC_FMAX:
697             case LSC_OP_ATOMIC_FCMPXCHG:
698             case LSC_OP_ATOMIC_AND:
699             case LSC_OP_ATOMIC_OR:
700             case LSC_OP_ATOMIC_XOR:
701                return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
702                                      30 /* XXX */, 400 /* XXX */,
703                                      10 /* XXX */, 100 /* XXX */, 0, 0,
704                                      0, 400 /* XXX */);
705             default:
706                abort();
707             }
708 
709          case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH:
710          case GEN_RT_SFID_RAY_TRACE_ACCELERATOR:
711             return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0,
712                                   10 /* XXX */, 0, 0, 0, 0, 0);
713 
714          case BRW_SFID_URB:
715             return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
716                                   32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
717 
718          default:
719             abort();
720          }
721 
722       case SHADER_OPCODE_UNDEF:
723       case SHADER_OPCODE_HALT_TARGET:
724       case FS_OPCODE_SCHEDULING_FENCE:
725          return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
726                                0, 0, 0, 0, 0, 0);
727 
728       default:
729          abort();
730       }
731    }
732 
733    /**
734     * Model the performance behavior of a stall on the specified dependency
735     * ID.
736     */
737    void
738    stall_on_dependency(state &st, enum intel_eu_dependency_id id)
739    {
740       if (id < ARRAY_SIZE(st.dep_ready))
741          st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
742                                        st.dep_ready[id]);
743    }
744 
745    /**
746     * Model the performance behavior of the front-end and back-end while
747     * executing an instruction with the specified timing information, assuming
748     * all dependencies are already clear.
749     */
750    void
751    execute_instruction(state &st, const perf_desc &perf)
752    {
753       /* Compute the time at which the front-end will be ready to execute the
754        * next instruction.
755        */
756       st.unit_ready[EU_UNIT_FE] += perf.df;
757 
758       if (perf.u < EU_NUM_UNITS) {
759          /* Wait for the back-end to be ready to execute this instruction. */
760          st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
761                                        st.unit_ready[perf.u]);
762 
763          /* Compute the time at which the back-end will be ready to execute
764           * the next instruction, and update the back-end utilization.
765           */
766          st.unit_ready[perf.u] = st.unit_ready[EU_UNIT_FE] + perf.db;
767          st.unit_busy[perf.u] += perf.db * st.weight;
768       }
769    }
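
   /* Illustrative trace with made-up timings: starting from an idle state, an
    * instruction with df = 2 and db = 20 accounted to EU_UNIT_SAMPLER leaves
    * the front-end ready again at cycle 2 and the sampler at cycle 22.  A
    * second identical instruction advances the front-end to cycle 4, stalls
    * it until cycle 22 waiting for the sampler, and leaves the sampler ready
    * at cycle 42, i.e. back-to-back messages to the same unit serialize on db
    * in this model.
    */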
770 
771    /**
772     * Model the performance behavior of a read dependency provided by an
773     * instruction.
774     */
775    void
776    mark_read_dependency(state &st, const perf_desc &perf,
777                         enum intel_eu_dependency_id id)
778    {
779       if (id < ARRAY_SIZE(st.dep_ready))
780          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ls;
781    }
782 
783    /**
784     * Model the performance behavior of a write dependency provided by an
785     * instruction.
786     */
787    void
788    mark_write_dependency(state &st, const perf_desc &perf,
789                          enum intel_eu_dependency_id id)
790    {
791       if (id >= EU_DEPENDENCY_ID_ACCUM0 && id < EU_DEPENDENCY_ID_FLAG0)
792          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.la;
793       else if (id >= EU_DEPENDENCY_ID_FLAG0 && id < EU_DEPENDENCY_ID_SBID_WR0)
794          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.lf;
795       else if (id < ARRAY_SIZE(st.dep_ready))
796          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ld;
797    }
798 
799    /**
800     * Return the dependency ID of a backend_reg, offset by \p delta GRFs.
801     */
802    enum intel_eu_dependency_id
803    reg_dependency_id(const intel_device_info *devinfo, const backend_reg &r,
804                      const int delta)
805    {
806       if (r.file == VGRF) {
807          const unsigned i = r.nr + r.offset / REG_SIZE + delta;
808          assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_GRF0);
809          return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
810 
811       } else if (r.file == FIXED_GRF) {
812          const unsigned i = r.nr + delta;
813          assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_GRF0);
814          return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
815 
816       } else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS &&
817                  r.nr < BRW_ARF_ACCUMULATOR) {
818          assert(delta == 0);
819          return EU_DEPENDENCY_ID_ADDR0;
820 
821       } else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR &&
822                  r.nr < BRW_ARF_FLAG) {
823          const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta;
824          assert(i < EU_DEPENDENCY_ID_FLAG0 - EU_DEPENDENCY_ID_ACCUM0);
825          return intel_eu_dependency_id(EU_DEPENDENCY_ID_ACCUM0 + i);
826 
827       } else {
828          return EU_NUM_DEPENDENCY_IDS;
829       }
830    }
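
   /* E.g. a FIXED_GRF register with nr = 20 and delta = 3 yields
    * EU_DEPENDENCY_ID_GRF0 + 23, while a register file not tracked by the
    * model falls through to EU_NUM_DEPENDENCY_IDS, which the dependency
    * helpers above and below treat as a no-op.
    */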
831 
832    /**
833     * Return the dependency ID of flag register starting at offset \p i.
834     */
835    enum intel_eu_dependency_id
836    flag_dependency_id(unsigned i)
837    {
838       assert(i < EU_DEPENDENCY_ID_SBID_WR0 - EU_DEPENDENCY_ID_FLAG0);
839       return intel_eu_dependency_id(EU_DEPENDENCY_ID_FLAG0 + i);
840    }
841 
842    /**
843     * Return the dependency ID corresponding to the SBID read completion
844     * condition of a Gfx12+ SWSB.
845     */
846    enum intel_eu_dependency_id
847    tgl_swsb_rd_dependency_id(tgl_swsb swsb)
848    {
849       if (swsb.mode) {
850          assert(swsb.sbid <
851                 EU_NUM_DEPENDENCY_IDS - EU_DEPENDENCY_ID_SBID_RD0);
852          return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_RD0 + swsb.sbid);
853       } else {
854          return EU_NUM_DEPENDENCY_IDS;
855       }
856    }
857 
858    /**
859     * Return the dependency ID corresponding to the SBID write completion
860     * condition of a Gfx12+ SWSB.
861     */
862    enum intel_eu_dependency_id
863    tgl_swsb_wr_dependency_id(tgl_swsb swsb)
864    {
865       if (swsb.mode) {
866          assert(swsb.sbid <
867                 EU_DEPENDENCY_ID_SBID_RD0 - EU_DEPENDENCY_ID_SBID_WR0);
868          return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_WR0 + swsb.sbid);
869       } else {
870          return EU_NUM_DEPENDENCY_IDS;
871       }
872    }
873 
874    /**
875     * Return the implicit accumulator register accessed by channel \p i of the
876     * instruction.
877     */
878    unsigned
879    accum_reg_of_channel(const intel_device_info *devinfo,
880                         const backend_instruction *inst,
881                         brw_reg_type tx, unsigned i)
882    {
883       assert(inst->reads_accumulator_implicitly() ||
884              inst->writes_accumulator_implicitly(devinfo));
885       const unsigned offset = (inst->group + i) * type_sz(tx) *
886          (brw_reg_type_is_floating_point(tx) ? 1 : 2);
887       return offset / (reg_unit(devinfo) * REG_SIZE) % 2;
888    }
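
   /* Worked example, assuming reg_unit() == 1 and 32-byte GRFs: for a SIMD16
    * instruction with a 32-bit integer execution type starting at channel
    * group 0, channel 15 gives an offset of 15 * 4 * 2 = 120 bytes, and
    * 120 / 32 % 2 = 1, i.e. the upper channels implicitly access the second
    * accumulator register.
    */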
889 
890    /**
891     * Model the performance behavior of an FS back-end instruction.
892     */
893    void
894    issue_fs_inst(state &st, const struct brw_isa_info *isa,
895                  const backend_instruction *be_inst)
896    {
897       const struct intel_device_info *devinfo = isa->devinfo;
898       const fs_inst *inst = static_cast<const fs_inst *>(be_inst);
899       const instruction_info info(isa, inst);
900       const perf_desc perf = instruction_desc(info);
901 
902       /* Stall on any source dependencies. */
903       for (unsigned i = 0; i < inst->sources; i++) {
904          for (unsigned j = 0; j < regs_read(inst, i); j++)
905             stall_on_dependency(
906                st, reg_dependency_id(devinfo, inst->src[i], j));
907       }
908 
909       if (inst->reads_accumulator_implicitly()) {
910          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
911               j <= accum_reg_of_channel(devinfo, inst, info.tx,
912                                         inst->exec_size - 1); j++)
913             stall_on_dependency(
914                st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
915       }
916 
917       if (const unsigned mask = inst->flags_read(devinfo)) {
918          for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
919             if (mask & (1 << i))
920                stall_on_dependency(st, flag_dependency_id(i));
921          }
922       }
923 
924       /* Stall on any write dependencies. */
925       if (!inst->no_dd_check) {
926          if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
927             for (unsigned j = 0; j < regs_written(inst); j++)
928                stall_on_dependency(
929                   st, reg_dependency_id(devinfo, inst->dst, j));
930          }
931 
932          if (inst->writes_accumulator_implicitly(devinfo)) {
933             for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
934                  j <= accum_reg_of_channel(devinfo, inst, info.tx,
935                                            inst->exec_size - 1); j++)
936                stall_on_dependency(
937                   st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
938          }
939 
940          if (const unsigned mask = inst->flags_written(devinfo)) {
941             for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
942                if (mask & (1 << i))
943                   stall_on_dependency(st, flag_dependency_id(i));
944             }
945          }
946       }
947 
948       /* Stall on any SBID dependencies. */
949       if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST))
950          stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched));
951       else if (inst->sched.mode & TGL_SBID_SRC)
952          stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched));
953 
954       /* Execute the instruction. */
955       execute_instruction(st, perf);
956 
957       /* Mark any source dependencies. */
958       if (inst->is_send_from_grf()) {
959          for (unsigned i = 0; i < inst->sources; i++) {
960             if (inst->is_payload(i)) {
961                for (unsigned j = 0; j < regs_read(inst, i); j++)
962                   mark_read_dependency(
963                      st, perf, reg_dependency_id(devinfo, inst->src[i], j));
964             }
965          }
966       }
967 
968       /* Mark any destination dependencies. */
969       if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
970          for (unsigned j = 0; j < regs_written(inst); j++) {
971             mark_write_dependency(st, perf,
972                                   reg_dependency_id(devinfo, inst->dst, j));
973          }
974       }
975 
976       if (inst->writes_accumulator_implicitly(devinfo)) {
977          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
978               j <= accum_reg_of_channel(devinfo, inst, info.tx,
979                                         inst->exec_size - 1); j++)
980             mark_write_dependency(st, perf,
981                                   reg_dependency_id(devinfo, brw_acc_reg(8), j));
982       }
983 
984       if (const unsigned mask = inst->flags_written(devinfo)) {
985          for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
986             if (mask & (1 << i))
987                mark_write_dependency(st, perf, flag_dependency_id(i));
988          }
989       }
990 
991       /* Mark any SBID dependencies. */
992       if (inst->sched.mode & TGL_SBID_SET) {
993          mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched));
994          mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched));
995       }
996    }
997 
998    /**
999     * Calculate the maximum possible throughput of the program compatible with
1000     * the cycle-count utilization estimated for each asynchronous unit, in
1001     * threads-per-cycle units.
1002     */
1003    float
1004    calculate_thread_throughput(const state &st, float busy)
1005    {
1006       for (unsigned i = 0; i < EU_NUM_UNITS; i++)
1007          busy = MAX2(busy, st.unit_busy[i]);
1008 
1009       return 1.0 / busy;
1010    }
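
   /* For instance, with made-up figures: if the bottleneck (the front-end
    * elapsed time passed in as \p busy, or the busiest asynchronous unit)
    * amounts to 2000 cycles per thread, this returns 1 / 2000 threads per
    * cycle, which the caller below scales by the dispatch width to obtain a
    * throughput in channels per cycle.
    */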
1011 
1012    /**
1013     * Estimate the performance of the specified shader.
1014     */
1015    void
1016    calculate_performance(performance &p, const backend_shader *s,
1017                          void (*issue_instruction)(
1018                             state &, const struct brw_isa_info *,
1019                             const backend_instruction *),
1020                          unsigned dispatch_width)
1021    {
1022       /* XXX - Note that the previous version of this code used worst-case
1023        *       scenario estimation of branching divergence for SIMD32 shaders,
1024        *       but this heuristic was removed to improve performance in common
1025        *       scenarios. Wider shader variants are less optimal when divergence
1026        *       is high, e.g. when an application renders a complex scene on a small
1027        *       surface. It is assumed that such renders are short, so their
1028        *       time doesn't matter and, when it comes to the overall performance,
1029        *       they are dominated by the more optimal larger renders.
1030        *
1031        *       It's possible that we could do better with divergence analysis
1032        *       by isolating branches which are 100% uniform.
1033        *
1034        *       Plumbing the trip counts from NIR loop analysis would allow us
1035        *       to do a better job regarding the loop weights.
1036        *
1037        *       In the meantime use values that roughly match the control flow
1038        *       weights used elsewhere in the compiler back-end.
1039        *
1040        *       Note that we provide slightly more pessimistic weights on
1041        *       Gfx12+ for SIMD32, since the effective warp size on that
1042        *       platform is 2x the SIMD width due to EU fusion, which increases
1043        *       the likelihood of divergent control flow in comparison to
1044        *       previous generations, giving narrower SIMD modes a performance
1045        *       advantage in several test-cases with non-uniform discard jumps.
1046        */
1047       const float discard_weight = (dispatch_width > 16 || s->devinfo->ver < 12 ?
1048                                     1.0 : 0.5);
1049       const float loop_weight = 10;
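      /* Rough illustration of the weighting above: an instruction nested
       * inside two DO...WHILE loops contributes 10 * 10 = 100 times its
       * front-end cycles to the elapsed estimate, while on Gfx12+ SIMD16 an
       * instruction between the first HALT and its HALT_TARGET contributes
       * only half of them.
       */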
1050       unsigned halt_count = 0;
1051       unsigned elapsed = 0;
1052       state st;
1053 
1054       foreach_block(block, s->cfg) {
1055          const unsigned elapsed0 = elapsed;
1056 
1057          foreach_inst_in_block(backend_instruction, inst, block) {
1058             const unsigned clock0 = st.unit_ready[EU_UNIT_FE];
1059 
1060             issue_instruction(st, &s->compiler->isa, inst);
1061 
1062             if (inst->opcode == SHADER_OPCODE_HALT_TARGET && halt_count)
1063                st.weight /= discard_weight;
1064 
1065             elapsed += (st.unit_ready[EU_UNIT_FE] - clock0) * st.weight;
1066 
1067             if (inst->opcode == BRW_OPCODE_DO)
1068                st.weight *= loop_weight;
1069             else if (inst->opcode == BRW_OPCODE_WHILE)
1070                st.weight /= loop_weight;
1071             else if (inst->opcode == BRW_OPCODE_HALT && !halt_count++)
1072                st.weight *= discard_weight;
1073          }
1074 
1075          p.block_latency[block->num] = elapsed - elapsed0;
1076       }
1077 
1078       p.latency = elapsed;
1079       p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
1080    }
1081 }
1082 
1083 brw::performance::performance(const fs_visitor *v) :
1084    block_latency(new unsigned[v->cfg->num_blocks])
1085 {
1086    calculate_performance(*this, v, issue_fs_inst, v->dispatch_width);
1087 }
1088 
1089 brw::performance::~performance()
1090 {
1091    delete[] block_latency;
1092 }
1093