• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2020 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "brw_eu.h"
25 #include "brw_fs.h"
26 #include "brw_cfg.h"
27 
28 using namespace brw;
29 
30 namespace {
/**
 * Asynchronous hardware units that may carry out computations in parallel
 * on behalf of one shader thread.  Enumerators below EU_NUM_UNITS are used
 * as indices into the per-unit arrays of the execution model.
 */
enum intel_eu_unit {
   EU_UNIT_FE = 0,   /**< EU front-end. */
   EU_UNIT_FPU,      /**< EU FPU0 (co-issue to FPU1 is currently not modeled). */
   EU_UNIT_EM,       /**< Extended Math unit (AKA FPU1 on Gfx8-11, part of the EU on Gfx6+). */
   EU_UNIT_SAMPLER,  /**< Sampler shared function. */
   EU_UNIT_PI,       /**< Pixel Interpolator shared function. */
   EU_UNIT_URB,      /**< Unified Return Buffer shared function. */
   EU_UNIT_DP_DC,    /**< Data Port Data Cache shared function. */
   EU_UNIT_DP_RC,    /**< Data Port Render Cache shared function. */
   EU_UNIT_DP_CC,    /**< Data Port Constant Cache shared function. */
   EU_UNIT_GATEWAY,  /**< Message Gateway shared function. */
   EU_UNIT_SPAWNER,  /**< Thread Spawner shared function. */
   /* EU_UNIT_VME, */
   /* EU_UNIT_CRE, */

   /** Count of asynchronous units tracked by the model. */
   EU_NUM_UNITS,

   /**
    * Placeholder unit for instructions that consume no runtime on any of
    * the units listed above.
    */
   EU_UNIT_NULL = EU_NUM_UNITS
};
65 
66    /**
67     * Enumeration representing a computation result another computation can
68     * potentially depend on.
69     */
70    enum intel_eu_dependency_id {
71       /* Register part of the GRF. */
72       EU_DEPENDENCY_ID_GRF0 = 0,
73       /* Address register part of the ARF. */
74       EU_DEPENDENCY_ID_ADDR0 = EU_DEPENDENCY_ID_GRF0 + XE3_MAX_GRF,
75       /* Accumulator register part of the ARF. */
76       EU_DEPENDENCY_ID_ACCUM0 = EU_DEPENDENCY_ID_ADDR0 + 1,
77       /* Flag register part of the ARF. */
78       EU_DEPENDENCY_ID_FLAG0 = EU_DEPENDENCY_ID_ACCUM0 + 12,
79       /* SBID token write completion.  Only used on Gfx12+. */
80       EU_DEPENDENCY_ID_SBID_WR0 = EU_DEPENDENCY_ID_FLAG0 + 8,
81       /* SBID token read completion.  Only used on Gfx12+. */
82       EU_DEPENDENCY_ID_SBID_RD0 = EU_DEPENDENCY_ID_SBID_WR0 + 32,
83       /* Number of computation dependencies currently tracked. */
84       EU_NUM_DEPENDENCY_IDS = EU_DEPENDENCY_ID_SBID_RD0 + 32
85    };
86 
87    /**
88     * State of our modeling of the program execution.
89     */
90    struct state {
state__anon0742f7430111::state91       state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
92       /**
93        * Time at which a given unit will be ready to execute the next
94        * computation, in clock units.
95        */
96       unsigned unit_ready[EU_NUM_UNITS];
97       /**
98        * Time at which an instruction dependent on a given dependency ID will
99        * be ready to execute, in clock units.
100        */
101       unsigned dep_ready[EU_NUM_DEPENDENCY_IDS];
102       /**
103        * Aggregated utilization of a given unit excluding idle cycles,
104        * in clock units.
105        */
106       float unit_busy[EU_NUM_UNITS];
107       /**
108        * Factor of the overhead of a computation accounted for in the
109        * aggregated utilization calculation.
110        */
111       float weight;
112    };
113 
114    /**
115     * Information derived from an IR instruction used to compute performance
116     * estimates.  Allows the timing calculation to work on both FS and VEC4
117     * instructions.
118     */
119    struct instruction_info {
instruction_info__anon0742f7430111::instruction_info120       instruction_info(const struct brw_isa_info *isa, const fs_inst *inst) :
121          isa(isa), devinfo(isa->devinfo), op(inst->opcode),
122          td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
123          tx(get_exec_type(inst)), sx(0), ss(0),
124          sc(has_bank_conflict(isa, inst) ? sd : 0),
125          desc(inst->desc), sfid(inst->sfid)
126       {
127          /* We typically want the maximum source size, except for split send
128           * messages which require the total size.
129           */
130          if (inst->opcode == SHADER_OPCODE_SEND) {
131             ss = DIV_ROUND_UP(inst->size_read(devinfo, 2), REG_SIZE) +
132                  DIV_ROUND_UP(inst->size_read(devinfo, 3), REG_SIZE);
133          } else if (inst->opcode == SHADER_OPCODE_SEND_GATHER) {
134             ss = inst->mlen;
135             /* If haven't lowered yet, count the sources. */
136             if (!ss) {
137                for (int i = 3; i < inst->sources; i++)
138                   ss += DIV_ROUND_UP(inst->size_read(devinfo, i), REG_SIZE);
139             }
140          } else {
141             for (unsigned i = 0; i < inst->sources; i++)
142                ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(devinfo, i), REG_SIZE));
143          }
144 
145          /* Convert the execution size to GRF units. */
146          sx = DIV_ROUND_UP(inst->exec_size * brw_type_size_bytes(tx), REG_SIZE);
147 
148          /* 32x32 integer multiplication has half the usual ALU throughput.
149           * Treat it as double-precision.
150           */
151          if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
152              !brw_type_is_float(tx) && brw_type_size_bytes(tx) == 4 &&
153              brw_type_size_bytes(inst->src[0].type) == brw_type_size_bytes(inst->src[1].type))
154             tx = brw_int_type(8, tx == BRW_TYPE_D);
155 
156          rcount = inst->opcode == BRW_OPCODE_DPAS ? inst->rcount : 0;
157       }
158 
159       /** ISA encoding information */
160       const struct brw_isa_info *isa;
161       /** Device information. */
162       const struct intel_device_info *devinfo;
163       /** Instruction opcode. */
164       opcode op;
165       /** Destination type. */
166       brw_reg_type td;
167       /** Destination size in GRF units. */
168       unsigned sd;
169       /** Execution type. */
170       brw_reg_type tx;
171       /** Execution size in GRF units. */
172       unsigned sx;
173       /** Source size. */
174       unsigned ss;
175       /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
176       unsigned sc;
177       /** Send message descriptor. */
178       uint32_t desc;
179       /** Send message shared function ID. */
180       uint8_t sfid;
181       /** Repeat count for DPAS instructions. */
182       uint8_t rcount;
183    };
184 
185    /**
186     * Timing information of an instruction used to estimate the performance of
187     * the program.
188     */
189    struct perf_desc {
perf_desc__anon0742f7430111::perf_desc190       perf_desc(enum intel_eu_unit u, int df, int db,
191                 int ls, int ld, int la, int lf) :
192          u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}
193 
194       /**
195        * Back-end unit its runtime shall be accounted to, in addition to the
196        * EU front-end which is always assumed to be involved.
197        */
198       enum intel_eu_unit u;
199       /**
200        * Overhead cycles from the time that the EU front-end starts executing
201        * the instruction until it's ready to execute the next instruction.
202        */
203       int df;
204       /**
205        * Overhead cycles from the time that the back-end starts executing the
206        * instruction until it's ready to execute the next instruction.
207        */
208       int db;
209       /**
210        * Latency cycles from the time that the back-end starts executing the
211        * instruction until its sources have been read from the register file.
212        */
213       int ls;
214       /**
215        * Latency cycles from the time that the back-end starts executing the
216        * instruction until its regular destination has been written to the
217        * register file.
218        */
219       int ld;
220       /**
221        * Latency cycles from the time that the back-end starts executing the
222        * instruction until its accumulator destination has been written to the
223        * ARF file.
224        *
225        * Note that this is an approximation of the real behavior of
226        * accumulating instructions in the hardware: Instead of modeling a pair
227        * of back-to-back accumulating instructions as a first computation with
228        * latency equal to ld followed by another computation with a
229        * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
230        * model the stall as if it occurred at the top of the pipeline, with
231        * the latency of the accumulator computation offset accordingly.
232        */
233       int la;
234       /**
235        * Latency cycles from the time that the back-end starts executing the
236        * instruction until its flag destination has been written to the ARF
237        * file.
238        */
239       int lf;
240    };
241 
242    /**
243     * Compute the timing information of an instruction based on any relevant
244     * information from the IR and a number of parameters specifying a linear
245     * approximation: Parameter X_Y specifies the derivative of timing X
246     * relative to info field Y, while X_1 specifies the independent term of
247     * the approximation of timing X.
248     */
249    perf_desc
calculate_desc(const instruction_info & info,enum intel_eu_unit u,int df_1,int df_sd,int df_sc,int db_1,int db_sx,int ls_1,int ld_1,int la_1,int lf_1,int l_ss,int l_sd)250    calculate_desc(const instruction_info &info, enum intel_eu_unit u,
251                   int df_1, int df_sd, int df_sc,
252                   int db_1, int db_sx,
253                   int ls_1, int ld_1, int la_1, int lf_1,
254                   int l_ss, int l_sd)
255    {
256       return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
257                           db_1 + db_sx * int(info.sx),
258                           ls_1 + l_ss * int(info.ss),
259                           ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
260                           la_1, lf_1);
261    }
262 
263    /**
264     * Compute the timing information of an instruction based on any relevant
265     * information from the IR and a number of linear approximation parameters
266     * hard-coded for each IR instruction.
267     *
268     * Most timing parameters are obtained from the multivariate linear
269     * regression of a sample of empirical timings measured using the tm0
270     * register (as can be done today by using the shader_time debugging
271     * option).  The Gfx4-5 math timings are obtained from BSpec Volume 5c.3
272     * "Shared Functions - Extended Math", Section 3.2 "Performance".
273     * Parameters marked XXX shall be considered low-quality, they're possibly
274     * high variance or completely guessed in cases where experimental data was
275     * unavailable.
        *
        * The positional arguments of every calculate_desc() call below are, in
        * order: info, unit, df_1, df_sd, df_sc, db_1, db_sx, ls_1, ld_1, la_1,
        * lf_1, l_ss, l_sd.
        *
        * Opcodes, shared function IDs and message types that have no entry
        * below terminate the process through abort().
276     */
277    const perf_desc
instruction_desc(const instruction_info & info)278    instruction_desc(const instruction_info &info)
279    {
280       const struct intel_device_info *devinfo = info.devinfo;
281 
282       switch (info.op) {
283       case BRW_OPCODE_SYNC:
284       case BRW_OPCODE_SEL:
285       case BRW_OPCODE_NOT:
286       case BRW_OPCODE_AND:
287       case BRW_OPCODE_OR:
288       case BRW_OPCODE_XOR:
289       case BRW_OPCODE_SHR:
290       case BRW_OPCODE_SHL:
291       case BRW_OPCODE_ASR:
292       case BRW_OPCODE_CMPN:
293       case BRW_OPCODE_BFREV:
294       case BRW_OPCODE_BFI1:
295       case BRW_OPCODE_AVG:
296       case BRW_OPCODE_FRC:
297       case BRW_OPCODE_RNDU:
298       case BRW_OPCODE_RNDD:
299       case BRW_OPCODE_RNDE:
300       case BRW_OPCODE_RNDZ:
301       case BRW_OPCODE_MAC:
302       case BRW_OPCODE_MACH:
303       case BRW_OPCODE_LZD:
304       case BRW_OPCODE_FBH:
305       case BRW_OPCODE_FBL:
306       case BRW_OPCODE_CBIT:
307       case BRW_OPCODE_ADDC:
308       case BRW_OPCODE_ROR:
309       case BRW_OPCODE_ROL:
310       case BRW_OPCODE_SUBB:
311       case BRW_OPCODE_LINE:
312       case BRW_OPCODE_NOP:
313       case SHADER_OPCODE_CLUSTER_BROADCAST:
314       case SHADER_OPCODE_SCRATCH_HEADER:
315       case FS_OPCODE_DDX_COARSE:
316       case FS_OPCODE_DDX_FINE:
317       case FS_OPCODE_DDY_COARSE:
318       case FS_OPCODE_PIXEL_X:
319       case FS_OPCODE_PIXEL_Y:
320          if (devinfo->ver >= 11) {
321             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
322                                   0, 10, 6 /* XXX */, 14, 0, 0);
323          } else {
                /* Before ver 11, execution types wider than 4 bytes use a
                 * separate, slower set of parameters.
                 */
324             if (brw_type_size_bytes(info.tx) > 4)
325                return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
326                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
327             else
328                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
329                                      0, 8, 4, 12, 0, 0);
330          }
331 
332       case BRW_OPCODE_MOV:
333       case BRW_OPCODE_CMP:
334       case BRW_OPCODE_ADD:
335       case BRW_OPCODE_ADD3:
336       case BRW_OPCODE_MUL:
337       case SHADER_OPCODE_MOV_RELOC_IMM:
338          if (devinfo->ver >= 11) {
339             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
340                                   0, 10, 6, 14, 0, 0);
341          } else {
342             if (brw_type_size_bytes(info.tx) > 4)
343                return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
344                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
345             else
346                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
347                                      0, 8, 4, 12, 0, 0);
348          }
349 
         /* Here and in the other cases passing df_sc = 1, the bank conflict
          * penalty (info.sc) adds to the front-end overhead.
          */
350       case BRW_OPCODE_BFE:
351       case BRW_OPCODE_BFI2:
352       case BRW_OPCODE_CSEL:
353          if (devinfo->ver >= 11)
354             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
355                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
356          else
357             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
358                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
359 
360       case BRW_OPCODE_MAD:
361          if (devinfo->ver >= 11) {
362             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
363                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
364          } else {
365             if (brw_type_size_bytes(info.tx) > 4)
366                return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
367                                      0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
368             else
369                return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
370                                      0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
371          }
372 
373       case BRW_OPCODE_DP4:
374       case BRW_OPCODE_DPH:
375       case BRW_OPCODE_DP3:
376       case BRW_OPCODE_DP2:
377          return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
378                                0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
379 
380       case BRW_OPCODE_DP4A:
            /* Only modeled for ver >= 12; any other generation aborts. */
381          if (devinfo->ver >= 12)
382             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
383                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
384          else
385             abort();
386 
387       case BRW_OPCODE_DPAS: {
388          unsigned ld;
389 
            /* The destination latency is selected by the repeat count; any
             * count other than 1 or 2 uses the same value as 8.
             */
390          switch (info.rcount) {
391          case 1:
392             ld = 21;
393             break;
394          case 2:
395             ld = 22;
396             break;
397          case 8:
398          default:
399             ld = 32;
400             break;
401          }
402 
403          /* DPAS cannot write the accumulator or the flags, so pass UINT_MAX
404           * for la and lf.
405           */
            /* NOTE(review): la_1 and lf_1 are int parameters of
             * calculate_desc(), so UINT_MAX is converted to int on this call
             * (presumably yielding -1) -- confirm this is the intended
             * sentinel.  Only verx10 >= 125 is modeled; anything else aborts.
             */
406          if (devinfo->verx10 >= 125)
407             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
408                                   0, ld, UINT_MAX, UINT_MAX, 0, 0);
409          else
410             abort();
411       }
412 
413       case SHADER_OPCODE_RCP:
414       case SHADER_OPCODE_RSQ:
415       case SHADER_OPCODE_SQRT:
416       case SHADER_OPCODE_EXP2:
417       case SHADER_OPCODE_LOG2:
418       case SHADER_OPCODE_SIN:
419       case SHADER_OPCODE_COS:
420          return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 4,
421                                0, 16, 0, 0, 0, 0);
422 
423       case SHADER_OPCODE_POW:
424          return calculate_desc(info, EU_UNIT_EM, -2, 4, 0, 0, 8,
425                                0, 24, 0, 0, 0, 0);
426 
427       case SHADER_OPCODE_INT_QUOTIENT:
428       case SHADER_OPCODE_INT_REMAINDER:
429          return calculate_desc(info, EU_UNIT_EM, 2, 0, 0, 26, 0,
430                                0, 28 /* XXX */, 0, 0, 0, 0);
431 
432       case BRW_OPCODE_DO:
433          return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
434                                0, 0, 0, 0, 0, 0);
435 
         /* Control flow is charged to the front-end only (EU_UNIT_NULL). */
436       case BRW_OPCODE_IF:
437       case BRW_OPCODE_ELSE:
438       case BRW_OPCODE_ENDIF:
439       case BRW_OPCODE_WHILE:
440       case BRW_OPCODE_BREAK:
441       case BRW_OPCODE_CONTINUE:
442       case BRW_OPCODE_HALT:
443          return calculate_desc(info, EU_UNIT_NULL, 8, 0, 0, 0, 0,
444                                0, 0, 0, 0, 0, 0);
445 
446       case BRW_OPCODE_PLN:
447          return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
448                                0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
449 
450       case BRW_OPCODE_LRP:
451          return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
452                                0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
453 
454       case FS_OPCODE_PACK_HALF_2x16_SPLIT:
455          if (devinfo->ver >= 11)
456             return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
457                                   0, 10 /* XXX */, 6 /* XXX */,
458                                   14 /* XXX */, 0, 0);
459          else
460             return calculate_desc(info, EU_UNIT_FPU, 16, 6, 0, 0, 6,
461                                   0, 8 /* XXX */, 4 /* XXX */,
462                                   12 /* XXX */, 0, 0);
463 
464       case SHADER_OPCODE_READ_ARCH_REG:
465          if (devinfo->ver >= 12) {
466             return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
467                                   0, 10, 6 /* XXX */, 14, 0, 0);
468          } else {
469             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
470                                   0, 8, 4, 12, 0, 0);
471          }
472 
473       case SHADER_OPCODE_MOV_INDIRECT:
474          if (devinfo->ver >= 11)
475             return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
476                                   0, 10 /* XXX */, 6 /* XXX */,
477                                   14 /* XXX */, 0, 0);
478          else
479             return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
480                                   0, 8 /* XXX */, 4 /* XXX */,
481                                   12 /* XXX */, 0, 0);
482 
483       case SHADER_OPCODE_BROADCAST:
484          if (devinfo->ver >= 11)
485             return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0, 4, 0,
486                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
487          else
488             return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
489                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
490 
491       case SHADER_OPCODE_FIND_LIVE_CHANNEL:
492       case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
493       case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
494          if (devinfo->ver >= 11)
495             return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
496                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
497          else
498             return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
499                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
500 
501       case SHADER_OPCODE_RND_MODE:
502       case SHADER_OPCODE_FLOAT_CONTROL_MODE:
503          if (devinfo->ver >= 11)
504             return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0,
505                                   4 /* XXX */, 0,
506                                   0, 0, 0, 0, 0, 0);
507          else
508             return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0,
509                                   4 /* XXX */, 0,
510                                   0, 0, 0, 0, 0, 0);
511 
512       case SHADER_OPCODE_SHUFFLE:
513          if (devinfo->ver >= 11)
514             return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0,
515                                   44 /* XXX */, 0,
516                                   0, 10 /* XXX */, 6 /* XXX */,
517                                   14 /* XXX */, 0, 0);
518          else
519             return calculate_desc(info, EU_UNIT_FPU, 42 /* XXX */, 0, 0,
520                                   42 /* XXX */, 0,
521                                   0, 8 /* XXX */, 4 /* XXX */,
522                                   12 /* XXX */, 0, 0);
523 
524       case SHADER_OPCODE_SEL_EXEC:
525          if (devinfo->ver >= 11)
526             return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0,
527                                   0, 4 /* XXX */,
528                                   0, 10 /* XXX */, 6 /* XXX */,
529                                   14 /* XXX */, 0, 0);
530          else
531             return calculate_desc(info, EU_UNIT_FPU, 8 /* XXX */, 4 /* XXX */, 0,
532                                   0, 4 /* XXX */,
533                                   0, 8 /* XXX */, 4 /* XXX */,
534                                   12 /* XXX */, 0, 0);
535 
536       case SHADER_OPCODE_QUAD_SWIZZLE:
537          if (devinfo->ver >= 11)
538             return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
539                                   0, 8 /* XXX */,
540                                   0, 10 /* XXX */, 6 /* XXX */,
541                                   14 /* XXX */, 0, 0);
542          else
543             return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
544                                   0, 8 /* XXX */,
545                                   0, 8 /* XXX */, 4 /* XXX */,
546                                   12 /* XXX */, 0, 0);
547 
548       case FS_OPCODE_DDY_FINE:
549          if (devinfo->ver >= 11)
550             return calculate_desc(info, EU_UNIT_FPU, 0, 14, 0, 0, 4,
551                                   0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
552          else
553             return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
554                                   0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
555 
556       case FS_OPCODE_LOAD_LIVE_CHANNELS:
557          if (devinfo->ver >= 11)
558             return calculate_desc(info, EU_UNIT_FPU, 2 /* XXX */, 0, 0,
559                                   2 /* XXX */, 0,
560                                   0, 0, 0, 10 /* XXX */, 0, 0);
561          else
562             return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
563                                   0, 2 /* XXX */,
564                                   0, 0, 0, 8 /* XXX */, 0, 0);
565 
566       case SHADER_OPCODE_GET_BUFFER_SIZE:
567          return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16 /* XXX */,
568                                8 /* XXX */, 750 /* XXX */, 0, 0,
569                                2 /* XXX */, 0);
570 
571       case SHADER_OPCODE_MEMORY_FENCE:
572       case SHADER_OPCODE_INTERLOCK:
            /* Fences are charged to the data port unit selected by the SFID;
             * an SFID not listed here aborts.
             */
573          switch (info.sfid) {
574          case GFX6_SFID_DATAPORT_RENDER_CACHE:
575             return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0, 30 /* XXX */, 0,
576                                   10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
577 
578          case BRW_SFID_URB:
579          case GFX7_SFID_DATAPORT_DATA_CACHE:
580          case GFX12_SFID_SLM:
581          case GFX12_SFID_TGM:
582          case GFX12_SFID_UGM:
583          case HSW_SFID_DATAPORT_DATA_CACHE_1:
584             return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0, 30 /* XXX */, 0,
585                                   10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
586 
587          default:
588             abort();
589          }
590 
591       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
592          return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
593                                10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
594 
595       case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
596       case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
597       case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
598          return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
599                                0, 90 /* XXX */, 0, 0, 0, 0);
600 
601       case SHADER_OPCODE_BARRIER:
602          return calculate_desc(info, EU_UNIT_GATEWAY, 90 /* XXX */, 0, 0,
603                                0 /* XXX */, 0,
604                                0, 0, 0, 0, 0, 0);
605 
606       case SHADER_OPCODE_SEND:
607       case SHADER_OPCODE_SEND_GATHER:
            /* Generic sends are classified by SFID first and, where the
             * parameters differ, by the message type or LSC opcode decoded
             * from the descriptor.
             */
608          switch (info.sfid) {
609          case GFX6_SFID_DATAPORT_CONSTANT_CACHE:
610             /* See FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */
611             return calculate_desc(info, EU_UNIT_DP_CC, 2, 0, 0, 0, 16 /* XXX */,
612                                   10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
613          case GFX6_SFID_DATAPORT_RENDER_CACHE:
614             switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
615             case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP:
616                return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
617                                      30 /* XXX */, 450 /* XXX */,
618                                      10 /* XXX */, 100 /* XXX */,
619                                      0, 0, 0, 400 /* XXX */);
620             default:
621                return calculate_desc(info, EU_UNIT_DP_RC, 2, 0, 0,
622                                      0, 450 /* XXX */,
623                                      10 /* XXX */, 300 /* XXX */, 0, 0,
624                                      0, 0);
625             }
626          case BRW_SFID_SAMPLER: {
               /* Same parameters as SHADER_OPCODE_GET_BUFFER_SIZE. */
627             return calculate_desc(info, EU_UNIT_SAMPLER, 2, 0, 0, 0, 16,
628                                   8, 750, 0, 0, 2, 0);
629          }
630          case GFX7_SFID_DATAPORT_DATA_CACHE:
631          case HSW_SFID_DATAPORT_DATA_CACHE_1:
632             switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
633             case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
634             case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
635             case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
636             case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
637                return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
638                                      30 /* XXX */, 400 /* XXX */,
639                                      10 /* XXX */, 100 /* XXX */, 0, 0,
640                                      0, 400 /* XXX */);
641 
642             default:
643                return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
644                                      0, 20 /* XXX */,
645                                      10 /* XXX */, 100 /* XXX */, 0, 0,
646                                      0, 0);
647             }
648 
649          case GFX7_SFID_PIXEL_INTERPOLATOR:
650             return calculate_desc(info, EU_UNIT_PI, 2, 0, 0, 14 /* XXX */, 0,
651                                   0, 90 /* XXX */, 0, 0, 0, 0);
652 
653          case GFX12_SFID_UGM:
654          case GFX12_SFID_TGM:
655          case GFX12_SFID_SLM:
               /* LSC messages: plain loads/stores and fence/atomic opcodes
                * use different parameters; any other LSC opcode aborts.
                */
656             switch (lsc_msg_desc_opcode(devinfo, info.desc)) {
657             case LSC_OP_LOAD:
658             case LSC_OP_STORE:
659             case LSC_OP_LOAD_CMASK:
660             case LSC_OP_STORE_CMASK:
661                return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
662                                      0, 20 /* XXX */,
663                                      10 /* XXX */, 100 /* XXX */, 0, 0,
664                                      0, 0);
665 
666             case LSC_OP_FENCE:
667             case LSC_OP_ATOMIC_INC:
668             case LSC_OP_ATOMIC_DEC:
669             case LSC_OP_ATOMIC_LOAD:
670             case LSC_OP_ATOMIC_STORE:
671             case LSC_OP_ATOMIC_ADD:
672             case LSC_OP_ATOMIC_SUB:
673             case LSC_OP_ATOMIC_MIN:
674             case LSC_OP_ATOMIC_MAX:
675             case LSC_OP_ATOMIC_UMIN:
676             case LSC_OP_ATOMIC_UMAX:
677             case LSC_OP_ATOMIC_CMPXCHG:
678             case LSC_OP_ATOMIC_FADD:
679             case LSC_OP_ATOMIC_FSUB:
680             case LSC_OP_ATOMIC_FMIN:
681             case LSC_OP_ATOMIC_FMAX:
682             case LSC_OP_ATOMIC_FCMPXCHG:
683             case LSC_OP_ATOMIC_AND:
684             case LSC_OP_ATOMIC_OR:
685             case LSC_OP_ATOMIC_XOR:
686                return calculate_desc(info, EU_UNIT_DP_DC, 2, 0, 0,
687                                      30 /* XXX */, 400 /* XXX */,
688                                      10 /* XXX */, 100 /* XXX */, 0, 0,
689                                      0, 400 /* XXX */);
690             default:
691                abort();
692             }
693 
694          case BRW_SFID_MESSAGE_GATEWAY:
695          case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH: /* or THREAD_SPAWNER */
696          case GEN_RT_SFID_RAY_TRACE_ACCELERATOR:
697             return calculate_desc(info, EU_UNIT_SPAWNER, 2, 0, 0, 0 /* XXX */, 0,
698                                   10 /* XXX */, 0, 0, 0, 0, 0);
699 
700          case BRW_SFID_URB:
701             return calculate_desc(info, EU_UNIT_URB, 2, 0, 0, 0, 6 /* XXX */,
702                                   32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
703 
704          default:
705             abort();
706          }
707 
         /* These opcodes are given an all-zero descriptor on EU_UNIT_NULL. */
708       case SHADER_OPCODE_UNDEF:
709       case SHADER_OPCODE_HALT_TARGET:
710       case FS_OPCODE_SCHEDULING_FENCE:
711          return calculate_desc(info, EU_UNIT_NULL, 0, 0, 0, 0, 0,
712                                0, 0, 0, 0, 0, 0);
713 
714       default:
715          abort();
716       }
717    }
718 
719    /**
720     * Model the performance behavior of a stall on the specified dependency
721     * ID.
722     */
723    void
stall_on_dependency(state & st,enum intel_eu_dependency_id id)724    stall_on_dependency(state &st, enum intel_eu_dependency_id id)
725    {
726       if (id < ARRAY_SIZE(st.dep_ready))
727          st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
728                                        st.dep_ready[id]);
729    }
730 
731    /**
732     * Model the performance behavior of the front-end and back-end while
733     * executing an instruction with the specified timing information, assuming
734     * all dependencies are already clear.
735     */
736    void
execute_instruction(state & st,const perf_desc & perf)737    execute_instruction(state &st, const perf_desc &perf)
738    {
739       /* Compute the time at which the front-end will be ready to execute the
740        * next instruction.
741        */
742       st.unit_ready[EU_UNIT_FE] += perf.df;
743 
744       if (perf.u < EU_NUM_UNITS) {
745          /* Wait for the back-end to be ready to execute this instruction. */
746          st.unit_ready[EU_UNIT_FE] = MAX2(st.unit_ready[EU_UNIT_FE],
747                                        st.unit_ready[perf.u]);
748 
749          /* Compute the time at which the back-end will be ready to execute
750           * the next instruction, and update the back-end utilization.
751           */
752          st.unit_ready[perf.u] = st.unit_ready[EU_UNIT_FE] + perf.db;
753          st.unit_busy[perf.u] += perf.db * st.weight;
754       }
755    }
756 
757    /**
758     * Model the performance behavior of a read dependency provided by an
759     * instruction.
760     */
761    void
mark_read_dependency(state & st,const perf_desc & perf,enum intel_eu_dependency_id id)762    mark_read_dependency(state &st, const perf_desc &perf,
763                         enum intel_eu_dependency_id id)
764    {
765       if (id < ARRAY_SIZE(st.dep_ready))
766          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ls;
767    }
768 
769    /**
770     * Model the performance behavior of a write dependency provided by an
771     * instruction.
772     */
773    void
mark_write_dependency(state & st,const perf_desc & perf,enum intel_eu_dependency_id id)774    mark_write_dependency(state &st, const perf_desc &perf,
775                          enum intel_eu_dependency_id id)
776    {
777       if (id >= EU_DEPENDENCY_ID_ACCUM0 && id < EU_DEPENDENCY_ID_FLAG0)
778          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.la;
779       else if (id >= EU_DEPENDENCY_ID_FLAG0 && id < EU_DEPENDENCY_ID_SBID_WR0)
780          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.lf;
781       else if (id < ARRAY_SIZE(st.dep_ready))
782          st.dep_ready[id] = st.unit_ready[EU_UNIT_FE] + perf.ld;
783    }
784 
785    /**
786     * Return the dependency ID of a backend_reg, offset by \p delta GRFs.
787     */
788    enum intel_eu_dependency_id
reg_dependency_id(const intel_device_info * devinfo,const brw_reg & r,const int delta)789    reg_dependency_id(const intel_device_info *devinfo, const brw_reg &r,
790                      const int delta)
791    {
792       if (r.file == VGRF) {
793          const unsigned i = r.nr + r.offset / REG_SIZE + delta;
794          assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_GRF0);
795          return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
796 
797       } else if (r.file == FIXED_GRF) {
798          const unsigned i = r.nr + delta;
799          assert(i < EU_DEPENDENCY_ID_ADDR0 - EU_DEPENDENCY_ID_GRF0);
800          return intel_eu_dependency_id(EU_DEPENDENCY_ID_GRF0 + i);
801 
802       } else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS &&
803                  r.nr < BRW_ARF_ACCUMULATOR) {
804          assert(delta == 0);
805          return EU_DEPENDENCY_ID_ADDR0;
806 
807       } else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR &&
808                  r.nr < BRW_ARF_FLAG) {
809          const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta;
810          assert(i < EU_DEPENDENCY_ID_FLAG0 - EU_DEPENDENCY_ID_ACCUM0);
811          return intel_eu_dependency_id(EU_DEPENDENCY_ID_ACCUM0 + i);
812 
813       } else {
814          return EU_NUM_DEPENDENCY_IDS;
815       }
816    }
817 
818    /**
819     * Return the dependency ID of flag register starting at offset \p i.
820     */
821    enum intel_eu_dependency_id
flag_dependency_id(unsigned i)822    flag_dependency_id(unsigned i)
823    {
824       assert(i < EU_DEPENDENCY_ID_SBID_WR0 - EU_DEPENDENCY_ID_FLAG0);
825       return intel_eu_dependency_id(EU_DEPENDENCY_ID_FLAG0 + i);
826    }
827 
828    /**
829     * Return the dependency ID corresponding to the SBID read completion
830     * condition of a Gfx12+ SWSB.
831     */
832    enum intel_eu_dependency_id
tgl_swsb_rd_dependency_id(tgl_swsb swsb)833    tgl_swsb_rd_dependency_id(tgl_swsb swsb)
834    {
835       if (swsb.mode) {
836          assert(swsb.sbid <
837                 EU_NUM_DEPENDENCY_IDS - EU_DEPENDENCY_ID_SBID_RD0);
838          return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_RD0 + swsb.sbid);
839       } else {
840          return EU_NUM_DEPENDENCY_IDS;
841       }
842    }
843 
844    /**
845     * Return the dependency ID corresponding to the SBID write completion
846     * condition of a Gfx12+ SWSB.
847     */
848    enum intel_eu_dependency_id
tgl_swsb_wr_dependency_id(tgl_swsb swsb)849    tgl_swsb_wr_dependency_id(tgl_swsb swsb)
850    {
851       if (swsb.mode) {
852          assert(swsb.sbid <
853                 EU_DEPENDENCY_ID_SBID_RD0 - EU_DEPENDENCY_ID_SBID_WR0);
854          return intel_eu_dependency_id(EU_DEPENDENCY_ID_SBID_WR0 + swsb.sbid);
855       } else {
856          return EU_NUM_DEPENDENCY_IDS;
857       }
858    }
859 
860    /**
861     * Return the implicit accumulator register accessed by channel \p i of the
862     * instruction.
863     */
864    unsigned
accum_reg_of_channel(const intel_device_info * devinfo,const fs_inst * inst,brw_reg_type tx,unsigned i)865    accum_reg_of_channel(const intel_device_info *devinfo,
866                         const fs_inst *inst,
867                         brw_reg_type tx, unsigned i)
868    {
869       assert(inst->reads_accumulator_implicitly() ||
870              inst->writes_accumulator_implicitly(devinfo));
871       const unsigned offset = (inst->group + i) * brw_type_size_bytes(tx) *
872          (brw_type_is_float(tx) ? 1 : 2);
873       return offset / (reg_unit(devinfo) * REG_SIZE) % 2;
874    }
875 
876    /**
877     * Model the performance behavior of an FS back-end instruction.
878     */
879    void
issue_inst(state & st,const struct brw_isa_info * isa,const fs_inst * inst)880    issue_inst(state &st, const struct brw_isa_info *isa,
881               const fs_inst *inst)
882    {
883       const struct intel_device_info *devinfo = isa->devinfo;
884       const instruction_info info(isa, inst);
885       const perf_desc perf = instruction_desc(info);
886 
887       /* Stall on any source dependencies. */
888       for (unsigned i = 0; i < inst->sources; i++) {
889          for (unsigned j = 0; j < regs_read(devinfo, inst, i); j++)
890             stall_on_dependency(
891                st, reg_dependency_id(devinfo, inst->src[i], j));
892       }
893 
894       if (inst->reads_accumulator_implicitly()) {
895          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
896               j <= accum_reg_of_channel(devinfo, inst, info.tx,
897                                         inst->exec_size - 1); j++)
898             stall_on_dependency(
899                st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
900       }
901 
902       if (const unsigned mask = inst->flags_read(devinfo)) {
903          for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
904             if (mask & (1 << i))
905                stall_on_dependency(st, flag_dependency_id(i));
906          }
907       }
908 
909       /* Stall on any write dependencies. */
910       if (!inst->no_dd_check) {
911          if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
912             for (unsigned j = 0; j < regs_written(inst); j++)
913                stall_on_dependency(
914                   st, reg_dependency_id(devinfo, inst->dst, j));
915          }
916 
917          if (inst->writes_accumulator_implicitly(devinfo)) {
918             for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
919                  j <= accum_reg_of_channel(devinfo, inst, info.tx,
920                                            inst->exec_size - 1); j++)
921                stall_on_dependency(
922                   st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
923          }
924 
925          if (const unsigned mask = inst->flags_written(devinfo)) {
926             for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
927                if (mask & (1 << i))
928                   stall_on_dependency(st, flag_dependency_id(i));
929             }
930          }
931       }
932 
933       /* Stall on any SBID dependencies. */
934       if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST))
935          stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched));
936       else if (inst->sched.mode & TGL_SBID_SRC)
937          stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched));
938 
939       /* Execute the instruction. */
940       execute_instruction(st, perf);
941 
942       /* Mark any source dependencies. */
943       if (inst->is_send_from_grf()) {
944          for (unsigned i = 0; i < inst->sources; i++) {
945             if (inst->is_payload(i)) {
946                for (unsigned j = 0; j < regs_read(devinfo, inst, i); j++)
947                   mark_read_dependency(
948                      st, perf, reg_dependency_id(devinfo, inst->src[i], j));
949             }
950          }
951       }
952 
953       /* Mark any destination dependencies. */
954       if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
955          for (unsigned j = 0; j < regs_written(inst); j++) {
956             mark_write_dependency(st, perf,
957                                   reg_dependency_id(devinfo, inst->dst, j));
958          }
959       }
960 
961       if (inst->writes_accumulator_implicitly(devinfo)) {
962          for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
963               j <= accum_reg_of_channel(devinfo, inst, info.tx,
964                                         inst->exec_size - 1); j++)
965             mark_write_dependency(st, perf,
966                                   reg_dependency_id(devinfo, brw_acc_reg(8), j));
967       }
968 
969       if (const unsigned mask = inst->flags_written(devinfo)) {
970          for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
971             if (mask & (1 << i))
972                mark_write_dependency(st, perf, flag_dependency_id(i));
973          }
974       }
975 
976       /* Mark any SBID dependencies. */
977       if (inst->sched.mode & TGL_SBID_SET) {
978          mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched));
979          mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched));
980       }
981    }
982 
983    /**
984     * Calculate the maximum possible throughput of the program compatible with
985     * the cycle-count utilization estimated for each asynchronous unit, in
986     * threads-per-cycle units.
987     */
988    float
calculate_thread_throughput(const state & st,float busy)989    calculate_thread_throughput(const state &st, float busy)
990    {
991       for (unsigned i = 0; i < EU_NUM_UNITS; i++)
992          busy = MAX2(busy, st.unit_busy[i]);
993 
994       return 1.0 / busy;
995    }
996 
997    /**
998     * Estimate the performance of the specified shader.
999     */
1000    void
calculate_performance(performance & p,const fs_visitor * s,unsigned dispatch_width)1001    calculate_performance(performance &p, const fs_visitor *s,
1002                          unsigned dispatch_width)
1003    {
1004       /* XXX - Note that the previous version of this code used worst-case
1005        *       scenario estimation of branching divergence for SIMD32 shaders,
1006        *       but this heuristic was removed to improve performance in common
1007        *       scenarios. Wider shader variants are less optimal when divergence
1008        *       is high, e.g. when application renders complex scene on a small
1009        *       surface. It is assumed that such renders are short, so their
1010        *       time doesn't matter and when it comes to the overall performance,
1011        *       they are dominated by more optimal larger renders.
1012        *
1013        *       It's possible that we could do better with divergence analysis
1014        *       by isolating branches which are 100% uniform.
1015        *
1016        *       Plumbing the trip counts from NIR loop analysis would allow us
1017        *       to do a better job regarding the loop weights.
1018        *
1019        *       In the meantime use values that roughly match the control flow
1020        *       weights used elsewhere in the compiler back-end.
1021        *
1022        *       Note that we provide slightly more pessimistic weights on
1023        *       Gfx12.x for SIMD32, since the effective warp size on that
1024        *       platform is 2x the SIMD width due to EU fusion, which increases
1025        *       the likelihood of divergent control flow in comparison to
1026        *       previous generations, giving narrower SIMD modes a performance
1027        *       advantage in several test-cases with non-uniform discard jumps.
1028        *       EU fusion has been removed on Xe2+ so its divergence behavior is
1029        *       expected to be closer to pre-Gfx12 platforms.
1030        */
1031       const float discard_weight = (dispatch_width > 16 || s->devinfo->ver != 12 ?
1032                                     1.0 : 0.5);
1033       const float loop_weight = 10;
1034       unsigned halt_count = 0;
1035       unsigned elapsed = 0;
1036       state st;
1037 
1038       foreach_block(block, s->cfg) {
1039          const unsigned elapsed0 = elapsed;
1040 
1041          foreach_inst_in_block(fs_inst, inst, block) {
1042             const unsigned clock0 = st.unit_ready[EU_UNIT_FE];
1043 
1044             issue_inst(st, &s->compiler->isa, inst);
1045 
1046             if (inst->opcode == SHADER_OPCODE_HALT_TARGET && halt_count)
1047                st.weight /= discard_weight;
1048 
1049             elapsed += (st.unit_ready[EU_UNIT_FE] - clock0) * st.weight;
1050 
1051             if (inst->opcode == BRW_OPCODE_DO)
1052                st.weight *= loop_weight;
1053             else if (inst->opcode == BRW_OPCODE_WHILE)
1054                st.weight /= loop_weight;
1055             else if (inst->opcode == BRW_OPCODE_HALT && !halt_count++)
1056                st.weight *= discard_weight;
1057          }
1058 
1059          p.block_latency[block->num] = elapsed - elapsed0;
1060       }
1061 
1062       p.latency = elapsed;
1063       p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
1064    }
1065 }
1066 
performance(const fs_visitor * v)1067 brw::performance::performance(const fs_visitor *v) :
1068    block_latency(new unsigned[v->cfg->num_blocks])
1069 {
1070    calculate_performance(*this, v, v->dispatch_width);
1071 }
1072 
~performance()1073 brw::performance::~performance()
1074 {
1075    delete[] block_latency;
1076 }
1077