1 /*
2 * Copyright © 2020 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_eu.h"
25 #include "brw_fs.h"
26 #include "brw_vec4.h"
27 #include "brw_cfg.h"
28
29 using namespace brw;
30
31 namespace {
32 /**
33 * Enumeration representing the various asynchronous units that can run
34 * computations in parallel on behalf of a shader thread.
35 */
36 enum unit {
37 /** EU front-end. */
38 unit_fe,
39 /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
40 unit_fpu,
41 /** Extended Math unit (AKA FPU1 on Gen8-11, part of the EU on Gen6+). */
42 unit_em,
43 /** Sampler shared function. */
44 unit_sampler,
45 /** Pixel Interpolator shared function. */
46 unit_pi,
47 /** Unified Return Buffer shared function. */
48 unit_urb,
49 /** Data Port Data Cache shared function. */
50 unit_dp_dc,
51 /** Data Port Render Cache shared function. */
52 unit_dp_rc,
53 /** Data Port Constant Cache shared function. */
54 unit_dp_cc,
55 /** Message Gateway shared function. */
56 unit_gateway,
57 /** Thread Spawner shared function. */
58 unit_spawner,
59 /* unit_vme, */
60 /* unit_cre, */
61 /** Number of asynchronous units currently tracked. */
62 num_units,
63 /** Dummy unit for instructions that don't consume runtime from the above. */
64 unit_null = num_units
65 };
66
67 /**
68 * Enumeration representing a computation result another computation can
69 * potentially depend on.
70 */
71 enum dependency_id {
72 /* Register part of the GRF. */
73 dependency_id_grf0 = 0,
74 /* Register part of the MRF. Only used on Gen4-6. */
75 dependency_id_mrf0 = dependency_id_grf0 + BRW_MAX_GRF,
76 /* Address register part of the ARF. */
77 dependency_id_addr0 = dependency_id_mrf0 + 24,
78 /* Accumulator register part of the ARF. */
79 dependency_id_accum0 = dependency_id_addr0 + 1,
80 /* Flag register part of the ARF. */
81 dependency_id_flag0 = dependency_id_accum0 + 12,
82 /* SBID token write completion. Only used on Gen12+. */
83 dependency_id_sbid_wr0 = dependency_id_flag0 + 8,
84 /* SBID token read completion. Only used on Gen12+. */
85 dependency_id_sbid_rd0 = dependency_id_sbid_wr0 + 16,
86 /* Number of computation dependencies currently tracked. */
87 num_dependency_ids = dependency_id_sbid_rd0 + 16
88 };
89
90 /**
91 * State of our modeling of the program execution.
92 */
93 struct state {
state__anonccf5b65c0111::state94 state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
95 /**
96 * Time at which a given unit will be ready to execute the next
97 * computation, in clock units.
98 */
99 unsigned unit_ready[num_units];
100 /**
101 * Time at which an instruction dependent on a given dependency ID will
102 * be ready to execute, in clock units.
103 */
104 unsigned dep_ready[num_dependency_ids];
105 /**
106 * Aggregated utilization of a given unit excluding idle cycles,
107 * in clock units.
108 */
109 float unit_busy[num_units];
110 /**
111 * Factor of the overhead of a computation accounted for in the
112 * aggregated utilization calculation.
113 */
114 float weight;
115 };
116
117 /**
118 * Information derived from an IR instruction used to compute performance
119 * estimates. Allows the timing calculation to work on both FS and VEC4
120 * instructions.
121 */
122 struct instruction_info {
instruction_info__anonccf5b65c0111::instruction_info123 instruction_info(const gen_device_info *devinfo, const fs_inst *inst) :
124 devinfo(devinfo), op(inst->opcode),
125 td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
126 tx(get_exec_type(inst)), sx(0), ss(0),
127 sc(has_bank_conflict(devinfo, inst) ? sd : 0),
128 desc(inst->desc), sfid(inst->sfid)
129 {
130 /* We typically want the maximum source size, except for split send
131 * messages which require the total size.
132 */
133 if (inst->opcode == SHADER_OPCODE_SEND) {
134 ss = DIV_ROUND_UP(inst->size_read(2), REG_SIZE) +
135 DIV_ROUND_UP(inst->size_read(3), REG_SIZE);
136 } else {
137 for (unsigned i = 0; i < inst->sources; i++)
138 ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
139 }
140
141 /* Convert the execution size to GRF units. */
142 sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
143
144 /* 32x32 integer multiplication has half the usual ALU throughput.
145 * Treat it as double-precision.
146 */
147 if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
148 !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
149 type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
150 tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
151 }
152
instruction_info__anonccf5b65c0111::instruction_info153 instruction_info(const gen_device_info *devinfo,
154 const vec4_instruction *inst) :
155 devinfo(devinfo), op(inst->opcode),
156 td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
157 tx(get_exec_type(inst)), sx(0), ss(0), sc(0),
158 desc(inst->desc), sfid(inst->sfid)
159 {
160 /* Compute the maximum source size. */
161 for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++)
162 ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
163
164 /* Convert the execution size to GRF units. */
165 sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
166
167 /* 32x32 integer multiplication has half the usual ALU throughput.
168 * Treat it as double-precision.
169 */
170 if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
171 !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
172 type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
173 tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
174 }
175
176 /** Device information. */
177 const struct gen_device_info *devinfo;
178 /** Instruction opcode. */
179 opcode op;
180 /** Destination type. */
181 brw_reg_type td;
182 /** Destination size in GRF units. */
183 unsigned sd;
184 /** Execution type. */
185 brw_reg_type tx;
186 /** Execution size in GRF units. */
187 unsigned sx;
188 /** Source size. */
189 unsigned ss;
190 /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
191 unsigned sc;
192 /** Send message descriptor. */
193 uint32_t desc;
194 /** Send message shared function ID. */
195 uint8_t sfid;
196 };
197
198 /**
199 * Timing information of an instruction used to estimate the performance of
200 * the program.
201 */
202 struct perf_desc {
perf_desc__anonccf5b65c0111::perf_desc203 perf_desc(unit u, int df, int db, int ls, int ld, int la, int lf) :
204 u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}
205
206 /**
207 * Back-end unit its runtime shall be accounted to, in addition to the
208 * EU front-end which is always assumed to be involved.
209 */
210 unit u;
211 /**
212 * Overhead cycles from the time that the EU front-end starts executing
213 * the instruction until it's ready to execute the next instruction.
214 */
215 int df;
216 /**
217 * Overhead cycles from the time that the back-end starts executing the
218 * instruction until it's ready to execute the next instruction.
219 */
220 int db;
221 /**
222 * Latency cycles from the time that the back-end starts executing the
223 * instruction until its sources have been read from the register file.
224 */
225 int ls;
226 /**
227 * Latency cycles from the time that the back-end starts executing the
228 * instruction until its regular destination has been written to the
229 * register file.
230 */
231 int ld;
232 /**
233 * Latency cycles from the time that the back-end starts executing the
234 * instruction until its accumulator destination has been written to the
235 * ARF file.
236 *
237 * Note that this is an approximation of the real behavior of
238 * accumulating instructions in the hardware: Instead of modeling a pair
239 * of back-to-back accumulating instructions as a first computation with
240 * latency equal to ld followed by another computation with a
241 * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
242 * model the stall as if it occurred at the top of the pipeline, with
243 * the latency of the accumulator computation offset accordingly.
244 */
245 int la;
246 /**
247 * Latency cycles from the time that the back-end starts executing the
248 * instruction until its flag destination has been written to the ARF
249 * file.
250 */
251 int lf;
252 };
253
254 /**
255 * Compute the timing information of an instruction based on any relevant
256 * information from the IR and a number of parameters specifying a linear
257 * approximation: Parameter X_Y specifies the derivative of timing X
258 * relative to info field Y, while X_1 specifies the independent term of
259 * the approximation of timing X.
260 */
261 perf_desc
calculate_desc(const instruction_info & info,unit u,int df_1,int df_sd,int df_sc,int db_1,int db_sx,int ls_1,int ld_1,int la_1,int lf_1,int l_ss,int l_sd)262 calculate_desc(const instruction_info &info, unit u,
263 int df_1, int df_sd, int df_sc,
264 int db_1, int db_sx,
265 int ls_1, int ld_1, int la_1, int lf_1,
266 int l_ss, int l_sd)
267 {
268 return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
269 db_1 + db_sx * int(info.sx),
270 ls_1 + l_ss * int(info.ss),
271 ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
272 la_1, lf_1);
273 }
274
275 /**
276 * Compute the timing information of an instruction based on any relevant
277 * information from the IR and a number of linear approximation parameters
278 * hard-coded for each IR instruction.
279 *
280 * Most timing parameters are obtained from the multivariate linear
281 * regression of a sample of empirical timings measured using the tm0
282 * register (as can be done today by using the shader_time debugging
283 * option). The Gen4-5 math timings are obtained from BSpec Volume 5c.3
284 * "Shared Functions - Extended Math", Section 3.2 "Performance".
285 * Parameters marked XXX shall be considered low-quality, they're possibly
286 * high variance or completely guessed in cases where experimental data was
287 * unavailable.
288 */
289 const perf_desc
instruction_desc(const instruction_info & info)290 instruction_desc(const instruction_info &info)
291 {
292 const struct gen_device_info *devinfo = info.devinfo;
293
294 switch (info.op) {
295 case BRW_OPCODE_SYNC:
296 case BRW_OPCODE_SEL:
297 case BRW_OPCODE_NOT:
298 case BRW_OPCODE_AND:
299 case BRW_OPCODE_OR:
300 case BRW_OPCODE_XOR:
301 case BRW_OPCODE_SHR:
302 case BRW_OPCODE_SHL:
303 case BRW_OPCODE_DIM:
304 case BRW_OPCODE_ASR:
305 case BRW_OPCODE_CMPN:
306 case BRW_OPCODE_F16TO32:
307 case BRW_OPCODE_BFREV:
308 case BRW_OPCODE_BFI1:
309 case BRW_OPCODE_AVG:
310 case BRW_OPCODE_FRC:
311 case BRW_OPCODE_RNDU:
312 case BRW_OPCODE_RNDD:
313 case BRW_OPCODE_RNDE:
314 case BRW_OPCODE_RNDZ:
315 case BRW_OPCODE_MAC:
316 case BRW_OPCODE_MACH:
317 case BRW_OPCODE_LZD:
318 case BRW_OPCODE_FBH:
319 case BRW_OPCODE_FBL:
320 case BRW_OPCODE_CBIT:
321 case BRW_OPCODE_ADDC:
322 case BRW_OPCODE_ROR:
323 case BRW_OPCODE_ROL:
324 case BRW_OPCODE_SUBB:
325 case BRW_OPCODE_SAD2:
326 case BRW_OPCODE_SADA2:
327 case BRW_OPCODE_LINE:
328 case BRW_OPCODE_NOP:
329 case SHADER_OPCODE_CLUSTER_BROADCAST:
330 case SHADER_OPCODE_SCRATCH_HEADER:
331 case FS_OPCODE_DDX_COARSE:
332 case FS_OPCODE_DDX_FINE:
333 case FS_OPCODE_DDY_COARSE:
334 case FS_OPCODE_PIXEL_X:
335 case FS_OPCODE_PIXEL_Y:
336 case FS_OPCODE_SET_SAMPLE_ID:
337 case VEC4_OPCODE_MOV_BYTES:
338 case VEC4_OPCODE_UNPACK_UNIFORM:
339 case VEC4_OPCODE_DOUBLE_TO_F32:
340 case VEC4_OPCODE_DOUBLE_TO_D32:
341 case VEC4_OPCODE_DOUBLE_TO_U32:
342 case VEC4_OPCODE_TO_DOUBLE:
343 case VEC4_OPCODE_PICK_LOW_32BIT:
344 case VEC4_OPCODE_PICK_HIGH_32BIT:
345 case VEC4_OPCODE_SET_LOW_32BIT:
346 case VEC4_OPCODE_SET_HIGH_32BIT:
347 case GS_OPCODE_SET_DWORD_2:
348 case GS_OPCODE_SET_WRITE_OFFSET:
349 case GS_OPCODE_SET_VERTEX_COUNT:
350 case GS_OPCODE_PREPARE_CHANNEL_MASKS:
351 case GS_OPCODE_SET_CHANNEL_MASKS:
352 case GS_OPCODE_GET_INSTANCE_ID:
353 case GS_OPCODE_SET_PRIMITIVE_ID:
354 case GS_OPCODE_SVB_SET_DST_INDEX:
355 case TCS_OPCODE_SRC0_010_IS_ZERO:
356 case TCS_OPCODE_GET_PRIMITIVE_ID:
357 case TES_OPCODE_GET_PRIMITIVE_ID:
358 if (devinfo->gen >= 11) {
359 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
360 0, 10, 6 /* XXX */, 14, 0, 0);
361 } else if (devinfo->gen >= 8) {
362 if (type_sz(info.tx) > 4)
363 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
364 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
365 else
366 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
367 0, 8, 4, 12, 0, 0);
368 } else if (devinfo->is_haswell) {
369 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
370 0, 10, 6 /* XXX */, 16, 0, 0);
371 } else {
372 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
373 0, 12, 8 /* XXX */, 18, 0, 0);
374 }
375
376 case BRW_OPCODE_MOV:
377 case BRW_OPCODE_CMP:
378 case BRW_OPCODE_ADD:
379 case BRW_OPCODE_MUL:
380 case SHADER_OPCODE_MOV_RELOC_IMM:
381 if (devinfo->gen >= 11) {
382 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
383 0, 10, 6, 14, 0, 0);
384 } else if (devinfo->gen >= 8) {
385 if (type_sz(info.tx) > 4)
386 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
387 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
388 else
389 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
390 0, 8, 4, 12, 0, 0);
391 } else if (devinfo->is_haswell) {
392 if (info.tx == BRW_REGISTER_TYPE_F)
393 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
394 0, 12, 8 /* XXX */, 18, 0, 0);
395 else
396 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
397 0, 10, 6 /* XXX */, 16, 0, 0);
398 } else if (devinfo->gen >= 7) {
399 if (info.tx == BRW_REGISTER_TYPE_F)
400 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
401 0, 14, 10 /* XXX */, 20, 0, 0);
402 else
403 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
404 0, 12, 8 /* XXX */, 18, 0, 0);
405 } else {
406 return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 0,
407 0, 2 /* XXX */,
408 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
409 0, 0);
410 }
411
412 case BRW_OPCODE_BFE:
413 case BRW_OPCODE_BFI2:
414 case BRW_OPCODE_CSEL:
415 if (devinfo->gen >= 11)
416 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
417 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
418 else if (devinfo->gen >= 8)
419 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
420 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
421 else if (devinfo->is_haswell)
422 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
423 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
424 else if (devinfo->gen >= 7)
425 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
426 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
427 else
428 abort();
429
430 case BRW_OPCODE_MAD:
431 if (devinfo->gen >= 11) {
432 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
433 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
434 } else if (devinfo->gen >= 8) {
435 if (type_sz(info.tx) > 4)
436 return calculate_desc(info, unit_fpu, 0, 4, 1, 0, 4,
437 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
438 else
439 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
440 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
441 } else if (devinfo->is_haswell) {
442 if (info.tx == BRW_REGISTER_TYPE_F)
443 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
444 0, 12, 8 /* XXX */, 18, 0, 0);
445 else
446 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
447 0, 10, 6 /* XXX */, 16, 0, 0);
448 } else if (devinfo->gen >= 7) {
449 if (info.tx == BRW_REGISTER_TYPE_F)
450 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
451 0, 14, 10 /* XXX */, 20, 0, 0);
452 else
453 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
454 0, 12, 8 /* XXX */, 18, 0, 0);
455 } else if (devinfo->gen >= 6) {
456 return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 1 /* XXX */,
457 0, 2 /* XXX */,
458 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
459 0, 0);
460 } else {
461 abort();
462 }
463
464 case BRW_OPCODE_F32TO16:
465 if (devinfo->gen >= 11)
466 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
467 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
468 else if (devinfo->gen >= 8)
469 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
470 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
471 else if (devinfo->is_haswell)
472 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
473 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
474 else if (devinfo->gen >= 7)
475 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
476 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
477 else
478 abort();
479
480 case BRW_OPCODE_DP4:
481 case BRW_OPCODE_DPH:
482 case BRW_OPCODE_DP3:
483 case BRW_OPCODE_DP2:
484 if (devinfo->gen >= 8)
485 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
486 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
487 else if (devinfo->is_haswell)
488 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
489 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
490 else
491 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
492 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
493
494 case SHADER_OPCODE_RCP:
495 case SHADER_OPCODE_RSQ:
496 case SHADER_OPCODE_SQRT:
497 case SHADER_OPCODE_EXP2:
498 case SHADER_OPCODE_LOG2:
499 case SHADER_OPCODE_SIN:
500 case SHADER_OPCODE_COS:
501 case SHADER_OPCODE_POW:
502 case SHADER_OPCODE_INT_QUOTIENT:
503 case SHADER_OPCODE_INT_REMAINDER:
504 if (devinfo->gen >= 6) {
505 switch (info.op) {
506 case SHADER_OPCODE_RCP:
507 case SHADER_OPCODE_RSQ:
508 case SHADER_OPCODE_SQRT:
509 case SHADER_OPCODE_EXP2:
510 case SHADER_OPCODE_LOG2:
511 case SHADER_OPCODE_SIN:
512 case SHADER_OPCODE_COS:
513 if (devinfo->gen >= 8)
514 return calculate_desc(info, unit_em, -2, 4, 0, 0, 4,
515 0, 16, 0, 0, 0, 0);
516 else if (devinfo->is_haswell)
517 return calculate_desc(info, unit_em, 0, 2, 0, 0, 2,
518 0, 12, 0, 0, 0, 0);
519 else
520 return calculate_desc(info, unit_em, 0, 2, 0, 0, 2,
521 0, 14, 0, 0, 0, 0);
522
523 case SHADER_OPCODE_POW:
524 if (devinfo->gen >= 8)
525 return calculate_desc(info, unit_em, -2, 4, 0, 0, 8,
526 0, 24, 0, 0, 0, 0);
527 else if (devinfo->is_haswell)
528 return calculate_desc(info, unit_em, 0, 2, 0, 0, 4,
529 0, 20, 0, 0, 0, 0);
530 else
531 return calculate_desc(info, unit_em, 0, 2, 0, 0, 4,
532 0, 22, 0, 0, 0, 0);
533
534 case SHADER_OPCODE_INT_QUOTIENT:
535 case SHADER_OPCODE_INT_REMAINDER:
536 return calculate_desc(info, unit_em, 2, 0, 0, 26, 0,
537 0, 28 /* XXX */, 0, 0, 0, 0);
538
539 default:
540 abort();
541 }
542 } else {
543 switch (info.op) {
544 case SHADER_OPCODE_RCP:
545 return calculate_desc(info, unit_em, 2, 0, 0, 0, 8,
546 0, 22, 0, 0, 0, 8);
547
548 case SHADER_OPCODE_RSQ:
549 return calculate_desc(info, unit_em, 2, 0, 0, 0, 16,
550 0, 44, 0, 0, 0, 8);
551
552 case SHADER_OPCODE_INT_QUOTIENT:
553 case SHADER_OPCODE_SQRT:
554 case SHADER_OPCODE_LOG2:
555 return calculate_desc(info, unit_em, 2, 0, 0, 0, 24,
556 0, 66, 0, 0, 0, 8);
557
558 case SHADER_OPCODE_INT_REMAINDER:
559 case SHADER_OPCODE_EXP2:
560 return calculate_desc(info, unit_em, 2, 0, 0, 0, 32,
561 0, 88, 0, 0, 0, 8);
562
563 case SHADER_OPCODE_SIN:
564 case SHADER_OPCODE_COS:
565 return calculate_desc(info, unit_em, 2, 0, 0, 0, 48,
566 0, 132, 0, 0, 0, 8);
567
568 case SHADER_OPCODE_POW:
569 return calculate_desc(info, unit_em, 2, 0, 0, 0, 64,
570 0, 176, 0, 0, 0, 8);
571
572 default:
573 abort();
574 }
575 }
576
577 case BRW_OPCODE_DO:
578 if (devinfo->gen >= 6)
579 return calculate_desc(info, unit_null, 0, 0, 0, 0, 0,
580 0, 0, 0, 0, 0, 0);
581 else
582 return calculate_desc(info, unit_null, 2 /* XXX */, 0, 0, 0, 0,
583 0, 0, 0, 0, 0, 0);
584
585 case BRW_OPCODE_IF:
586 case BRW_OPCODE_ELSE:
587 case BRW_OPCODE_ENDIF:
588 case BRW_OPCODE_WHILE:
589 case BRW_OPCODE_BREAK:
590 case BRW_OPCODE_CONTINUE:
591 case FS_OPCODE_DISCARD_JUMP:
592 if (devinfo->gen >= 8)
593 return calculate_desc(info, unit_null, 8, 0, 0, 0, 0,
594 0, 0, 0, 0, 0, 0);
595 else if (devinfo->is_haswell)
596 return calculate_desc(info, unit_null, 6, 0, 0, 0, 0,
597 0, 0, 0, 0, 0, 0);
598 else
599 return calculate_desc(info, unit_null, 2, 0, 0, 0, 0,
600 0, 0, 0, 0, 0, 0);
601
602 case FS_OPCODE_LINTERP:
603 if (devinfo->gen >= 8)
604 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
605 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
606 else if (devinfo->is_haswell)
607 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
608 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
609 else
610 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
611 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
612
613 case BRW_OPCODE_LRP:
614 if (devinfo->gen >= 8)
615 return calculate_desc(info, unit_fpu, 0, 4, 1, 0, 4,
616 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
617 else if (devinfo->is_haswell)
618 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
619 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
620 else if (devinfo->gen >= 6)
621 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
622 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
623 else
624 abort();
625
626 case FS_OPCODE_PACK_HALF_2x16_SPLIT:
627 if (devinfo->gen >= 11)
628 return calculate_desc(info, unit_fpu, 20, 6, 0, 0, 6,
629 0, 10 /* XXX */, 6 /* XXX */,
630 14 /* XXX */, 0, 0);
631 else if (devinfo->gen >= 8)
632 return calculate_desc(info, unit_fpu, 16, 6, 0, 0, 6,
633 0, 8 /* XXX */, 4 /* XXX */,
634 12 /* XXX */, 0, 0);
635 else if (devinfo->is_haswell)
636 return calculate_desc(info, unit_fpu, 20, 6, 0, 0, 6,
637 0, 10 /* XXX */, 6 /* XXX */,
638 16 /* XXX */, 0, 0);
639 else if (devinfo->gen >= 7)
640 return calculate_desc(info, unit_fpu, 24, 6, 0, 0, 6,
641 0, 12 /* XXX */, 8 /* XXX */,
642 18 /* XXX */, 0, 0);
643 else
644 abort();
645
646 case SHADER_OPCODE_MOV_INDIRECT:
647 if (devinfo->gen >= 11)
648 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
649 0, 10 /* XXX */, 6 /* XXX */,
650 14 /* XXX */, 0, 0);
651 else if (devinfo->gen >= 8)
652 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
653 0, 8 /* XXX */, 4 /* XXX */,
654 12 /* XXX */, 0, 0);
655 else if (devinfo->is_haswell)
656 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
657 0, 10 /* XXX */, 6 /* XXX */,
658 16 /* XXX */, 0, 0);
659 else
660 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
661 0, 12 /* XXX */, 8 /* XXX */,
662 18 /* XXX */, 0, 0);
663
664 case SHADER_OPCODE_BROADCAST:
665 if (devinfo->gen >= 11)
666 return calculate_desc(info, unit_fpu, 20 /* XXX */, 0, 0, 4, 0,
667 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
668 else if (devinfo->gen >= 8)
669 return calculate_desc(info, unit_fpu, 18, 0, 0, 4, 0,
670 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
671 else if (devinfo->is_haswell)
672 return calculate_desc(info, unit_fpu, 18, 0, 0, 4, 0,
673 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
674 else if (devinfo->gen >= 7)
675 return calculate_desc(info, unit_fpu, 20, 0, 0, 4, 0,
676 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
677 else
678 abort();
679
680 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
681 if (devinfo->gen >= 11)
682 return calculate_desc(info, unit_fpu, 2, 0, 0, 2, 0,
683 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
684 else if (devinfo->gen >= 8)
685 return calculate_desc(info, unit_fpu, 2, 0, 0, 2, 0,
686 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
687 else if (devinfo->is_haswell)
688 return calculate_desc(info, unit_fpu, 36, 0, 0, 6, 0,
689 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
690 else if (devinfo->gen >= 7)
691 return calculate_desc(info, unit_fpu, 40, 0, 0, 6, 0,
692 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
693 else
694 abort();
695
696 case SHADER_OPCODE_RND_MODE:
697 case SHADER_OPCODE_FLOAT_CONTROL_MODE:
698 if (devinfo->gen >= 11)
699 return calculate_desc(info, unit_fpu, 24 /* XXX */, 0, 0,
700 4 /* XXX */, 0,
701 0, 0, 0, 0, 0, 0);
702 else if (devinfo->gen >= 8)
703 return calculate_desc(info, unit_fpu, 20 /* XXX */, 0, 0,
704 4 /* XXX */, 0,
705 0, 0, 0, 0, 0, 0);
706 else if (devinfo->is_haswell)
707 return calculate_desc(info, unit_fpu, 24 /* XXX */, 0, 0,
708 4 /* XXX */, 0,
709 0, 0, 0, 0, 0, 0);
710 else if (devinfo->gen >= 6)
711 return calculate_desc(info, unit_fpu, 28 /* XXX */, 0, 0,
712 4 /* XXX */, 0,
713 0, 0, 0, 0, 0, 0);
714 else
715 abort();
716
717 case SHADER_OPCODE_SHUFFLE:
718 if (devinfo->gen >= 11)
719 return calculate_desc(info, unit_fpu, 44 /* XXX */, 0, 0,
720 44 /* XXX */, 0,
721 0, 10 /* XXX */, 6 /* XXX */,
722 14 /* XXX */, 0, 0);
723 else if (devinfo->gen >= 8)
724 return calculate_desc(info, unit_fpu, 42 /* XXX */, 0, 0,
725 42 /* XXX */, 0,
726 0, 8 /* XXX */, 4 /* XXX */,
727 12 /* XXX */, 0, 0);
728 else if (devinfo->is_haswell)
729 return calculate_desc(info, unit_fpu, 0, 44 /* XXX */, 0,
730 0, 44 /* XXX */,
731 0, 10 /* XXX */, 6 /* XXX */,
732 16 /* XXX */, 0, 0);
733 else if (devinfo->gen >= 6)
734 return calculate_desc(info, unit_fpu, 0, 46 /* XXX */, 0,
735 0, 46 /* XXX */,
736 0, 12 /* XXX */, 8 /* XXX */,
737 18 /* XXX */, 0, 0);
738 else
739 abort();
740
741 case SHADER_OPCODE_SEL_EXEC:
742 if (devinfo->gen >= 11)
743 return calculate_desc(info, unit_fpu, 10 /* XXX */, 4 /* XXX */, 0,
744 0, 4 /* XXX */,
745 0, 10 /* XXX */, 6 /* XXX */,
746 14 /* XXX */, 0, 0);
747 else if (devinfo->gen >= 8)
748 return calculate_desc(info, unit_fpu, 8 /* XXX */, 4 /* XXX */, 0,
749 0, 4 /* XXX */,
750 0, 8 /* XXX */, 4 /* XXX */,
751 12 /* XXX */, 0, 0);
752 else if (devinfo->is_haswell)
753 return calculate_desc(info, unit_fpu, 10 /* XXX */, 4 /* XXX */, 0,
754 0, 4 /* XXX */,
755 0, 10 /* XXX */, 6 /* XXX */,
756 16 /* XXX */, 0, 0);
757 else
758 return calculate_desc(info, unit_fpu, 12 /* XXX */, 4 /* XXX */, 0,
759 0, 4 /* XXX */,
760 0, 12 /* XXX */, 8 /* XXX */,
761 18 /* XXX */, 0, 0);
762
763 case SHADER_OPCODE_QUAD_SWIZZLE:
764 if (devinfo->gen >= 11)
765 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
766 0, 8 /* XXX */,
767 0, 10 /* XXX */, 6 /* XXX */,
768 14 /* XXX */, 0, 0);
769 else if (devinfo->gen >= 8)
770 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
771 0, 8 /* XXX */,
772 0, 8 /* XXX */, 4 /* XXX */,
773 12 /* XXX */, 0, 0);
774 else if (devinfo->is_haswell)
775 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
776 0, 8 /* XXX */,
777 0, 10 /* XXX */, 6 /* XXX */,
778 16 /* XXX */, 0, 0);
779 else
780 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
781 0, 8 /* XXX */,
782 0, 12 /* XXX */, 8 /* XXX */,
783 18 /* XXX */, 0, 0);
784
785 case FS_OPCODE_DDY_FINE:
786 if (devinfo->gen >= 11)
787 return calculate_desc(info, unit_fpu, 0, 14, 0, 0, 4,
788 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
789 else if (devinfo->gen >= 8)
790 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
791 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
792 else if (devinfo->is_haswell)
793 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
794 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
795 else
796 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
797 0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0);
798
799 case FS_OPCODE_LOAD_LIVE_CHANNELS:
800 if (devinfo->gen >= 11)
801 return calculate_desc(info, unit_fpu, 2 /* XXX */, 0, 0,
802 2 /* XXX */, 0,
803 0, 0, 0, 10 /* XXX */, 0, 0);
804 else if (devinfo->gen >= 8)
805 return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 0,
806 0, 2 /* XXX */,
807 0, 0, 0, 8 /* XXX */, 0, 0);
808 else
809 abort();
810
811 case VEC4_OPCODE_PACK_BYTES:
812 if (devinfo->gen >= 8)
813 return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
814 4 /* XXX */, 0,
815 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
816 0, 0);
817 else if (devinfo->is_haswell)
818 return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
819 4 /* XXX */, 0,
820 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
821 0, 0);
822 else
823 return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
824 4 /* XXX */, 0,
825 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
826 0, 0);
827
828 case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
829 case TCS_OPCODE_GET_INSTANCE_ID:
830 case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
831 case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
832 case TES_OPCODE_CREATE_INPUT_READ_HEADER:
833 if (devinfo->gen >= 8)
834 return calculate_desc(info, unit_fpu, 22 /* XXX */, 0, 0,
835 6 /* XXX */, 0,
836 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
837 0, 0);
838 else if (devinfo->is_haswell)
839 return calculate_desc(info, unit_fpu, 26 /* XXX */, 0, 0,
840 6 /* XXX */, 0,
841 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
842 0, 0);
843 else
844 return calculate_desc(info, unit_fpu, 30 /* XXX */, 0, 0,
845 6 /* XXX */, 0,
846 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
847 0, 0);
848
849 case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
850 case TCS_OPCODE_CREATE_BARRIER_HEADER:
851 if (devinfo->gen >= 8)
852 return calculate_desc(info, unit_fpu, 32 /* XXX */, 0, 0,
853 8 /* XXX */, 0,
854 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
855 0, 0);
856 else if (devinfo->is_haswell)
857 return calculate_desc(info, unit_fpu, 38 /* XXX */, 0, 0,
858 8 /* XXX */, 0,
859 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
860 0, 0);
861 else if (devinfo->gen >= 6)
862 return calculate_desc(info, unit_fpu, 44 /* XXX */, 0, 0,
863 8 /* XXX */, 0,
864 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
865 0, 0);
866 else
867 abort();
868
869 case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
870 if (devinfo->gen >= 8)
871 return calculate_desc(info, unit_fpu, 12 /* XXX */, 0, 0,
872 4 /* XXX */, 0,
873 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
874 0, 0);
875 else if (devinfo->is_haswell)
876 return calculate_desc(info, unit_fpu, 14 /* XXX */, 0, 0,
877 4 /* XXX */, 0,
878 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
879 0, 0);
880 else if (devinfo->gen >= 7)
881 return calculate_desc(info, unit_fpu, 16 /* XXX */, 0, 0,
882 4 /* XXX */, 0,
883 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
884 0, 0);
885 else
886 abort();
887
888 case SHADER_OPCODE_TEX:
889 case FS_OPCODE_TXB:
890 case SHADER_OPCODE_TXD:
891 case SHADER_OPCODE_TXF:
892 case SHADER_OPCODE_TXF_LZ:
893 case SHADER_OPCODE_TXL:
894 case SHADER_OPCODE_TXL_LZ:
895 case SHADER_OPCODE_TXF_CMS:
896 case SHADER_OPCODE_TXF_CMS_W:
897 case SHADER_OPCODE_TXF_UMS:
898 case SHADER_OPCODE_TXF_MCS:
899 case SHADER_OPCODE_TXS:
900 case SHADER_OPCODE_LOD:
901 case SHADER_OPCODE_GET_BUFFER_SIZE:
902 case SHADER_OPCODE_TG4:
903 case SHADER_OPCODE_TG4_OFFSET:
904 case SHADER_OPCODE_SAMPLEINFO:
905 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
906 return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16 /* XXX */,
907 8 /* XXX */, 750 /* XXX */, 0, 0,
908 2 /* XXX */, 0);
909
910 case SHADER_OPCODE_URB_READ_SIMD8:
911 case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
912 case SHADER_OPCODE_URB_WRITE_SIMD8:
913 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
914 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
915 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
916 case VEC4_OPCODE_URB_READ:
917 case VS_OPCODE_URB_WRITE:
918 case GS_OPCODE_URB_WRITE:
919 case GS_OPCODE_URB_WRITE_ALLOCATE:
920 case GS_OPCODE_THREAD_END:
921 case GS_OPCODE_FF_SYNC:
922 case TCS_OPCODE_URB_WRITE:
923 case TCS_OPCODE_RELEASE_INPUT:
924 case TCS_OPCODE_THREAD_END:
925 return calculate_desc(info, unit_urb, 2, 0, 0, 0, 6 /* XXX */,
926 32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
927
928 case SHADER_OPCODE_MEMORY_FENCE:
929 case SHADER_OPCODE_INTERLOCK:
930 switch (info.sfid) {
931 case GEN6_SFID_DATAPORT_RENDER_CACHE:
932 if (devinfo->gen >= 7)
933 return calculate_desc(info, unit_dp_rc, 2, 0, 0, 30 /* XXX */, 0,
934 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
935 else
936 abort();
937
938 case GEN7_SFID_DATAPORT_DATA_CACHE:
939 case HSW_SFID_DATAPORT_DATA_CACHE_1:
940 if (devinfo->gen >= 7)
941 return calculate_desc(info, unit_dp_dc, 2, 0, 0, 30 /* XXX */, 0,
942 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
943 else
944 abort();
945
946 default:
947 abort();
948 }
949
950 case SHADER_OPCODE_GEN4_SCRATCH_READ:
951 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
952 case SHADER_OPCODE_GEN7_SCRATCH_READ:
953 return calculate_desc(info, unit_dp_dc, 2, 0, 0, 0, 8 /* XXX */,
954 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
955
956 case VEC4_OPCODE_UNTYPED_ATOMIC:
957 if (devinfo->gen >= 7)
958 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
959 30 /* XXX */, 400 /* XXX */,
960 10 /* XXX */, 100 /* XXX */, 0, 0,
961 0, 400 /* XXX */);
962 else
963 abort();
964
965 case VEC4_OPCODE_UNTYPED_SURFACE_READ:
966 case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
967 if (devinfo->gen >= 7)
968 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
969 0, 20 /* XXX */,
970 10 /* XXX */, 100 /* XXX */, 0, 0,
971 0, 0);
972 else
973 abort();
974
975 case FS_OPCODE_FB_WRITE:
976 case FS_OPCODE_FB_READ:
977 case FS_OPCODE_REP_FB_WRITE:
978 return calculate_desc(info, unit_dp_rc, 2, 0, 0, 0, 450 /* XXX */,
979 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
980
981 case GS_OPCODE_SVB_WRITE:
982 if (devinfo->gen >= 6)
983 return calculate_desc(info, unit_dp_rc, 2 /* XXX */, 0, 0,
984 0, 450 /* XXX */,
985 10 /* XXX */, 300 /* XXX */, 0, 0,
986 0, 0);
987 else
988 abort();
989
990 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
991 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
992 return calculate_desc(info, unit_dp_cc, 2, 0, 0, 0, 16 /* XXX */,
993 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
994
995 case VS_OPCODE_PULL_CONSTANT_LOAD:
996 case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
997 return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16,
998 8, 750, 0, 0, 2, 0);
999
1000 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1001 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1002 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1003 if (devinfo->gen >= 7)
1004 return calculate_desc(info, unit_pi, 2, 0, 0, 14 /* XXX */, 0,
1005 0, 90 /* XXX */, 0, 0, 0, 0);
1006 else
1007 abort();
1008
1009 case SHADER_OPCODE_BARRIER:
1010 if (devinfo->gen >= 7)
1011 return calculate_desc(info, unit_gateway, 90 /* XXX */, 0, 0,
1012 0 /* XXX */, 0,
1013 0, 0, 0, 0, 0, 0);
1014 else
1015 abort();
1016
1017 case CS_OPCODE_CS_TERMINATE:
1018 if (devinfo->gen >= 7)
1019 return calculate_desc(info, unit_spawner, 2, 0, 0, 0 /* XXX */, 0,
1020 10 /* XXX */, 0, 0, 0, 0, 0);
1021 else
1022 abort();
1023
1024 case SHADER_OPCODE_SEND:
1025 switch (info.sfid) {
1026 case GEN6_SFID_DATAPORT_RENDER_CACHE:
1027 if (devinfo->gen >= 7) {
1028 switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1029 case GEN7_DATAPORT_RC_TYPED_ATOMIC_OP:
1030 return calculate_desc(info, unit_dp_rc, 2, 0, 0,
1031 30 /* XXX */, 450 /* XXX */,
1032 10 /* XXX */, 100 /* XXX */,
1033 0, 0, 0, 400 /* XXX */);
1034 default:
1035 return calculate_desc(info, unit_dp_rc, 2, 0, 0,
1036 0, 450 /* XXX */,
1037 10 /* XXX */, 300 /* XXX */, 0, 0,
1038 0, 0);
1039 }
1040 } else if (devinfo->gen >= 6) {
1041 return calculate_desc(info, unit_dp_rc, 2 /* XXX */, 0, 0,
1042 0, 450 /* XXX */,
1043 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
1044 } else {
1045 abort();
1046 }
1047 case BRW_SFID_SAMPLER: {
1048 if (devinfo->gen >= 6)
1049 return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16,
1050 8, 750, 0, 0, 2, 0);
1051 else
1052 abort();
1053 }
1054 case GEN7_SFID_DATAPORT_DATA_CACHE:
1055 case HSW_SFID_DATAPORT_DATA_CACHE_1:
1056 if (devinfo->gen >= 8 || devinfo->is_haswell) {
1057 switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1058 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
1059 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
1060 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
1061 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
1062 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1063 30 /* XXX */, 400 /* XXX */,
1064 10 /* XXX */, 100 /* XXX */, 0, 0,
1065 0, 400 /* XXX */);
1066
1067 default:
1068 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1069 0, 20 /* XXX */,
1070 10 /* XXX */, 100 /* XXX */, 0, 0,
1071 0, 0);
1072 }
1073 } else if (devinfo->gen >= 7) {
1074 switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1075 case GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
1076 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1077 30 /* XXX */, 400 /* XXX */,
1078 10 /* XXX */, 100 /* XXX */,
1079 0, 0, 0, 400 /* XXX */);
1080 default:
1081 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1082 0, 20 /* XXX */,
1083 10 /* XXX */, 100 /* XXX */, 0, 0,
1084 0, 0);
1085 }
1086 } else {
1087 abort();
1088 }
1089 default:
1090 abort();
1091 }
1092
1093 case SHADER_OPCODE_UNDEF:
1094 case FS_OPCODE_PLACEHOLDER_HALT:
1095 case FS_OPCODE_SCHEDULING_FENCE:
1096 return calculate_desc(info, unit_null, 0, 0, 0, 0, 0,
1097 0, 0, 0, 0, 0, 0);
1098
1099 default:
1100 abort();
1101 }
1102 }
1103
1104 /**
1105 * Model the performance behavior of a stall on the specified dependency
1106 * ID.
1107 */
1108 void
stall_on_dependency(state & st,dependency_id id)1109 stall_on_dependency(state &st, dependency_id id)
1110 {
1111 if (id < ARRAY_SIZE(st.dep_ready))
1112 st.unit_ready[unit_fe] = MAX2(st.unit_ready[unit_fe],
1113 st.dep_ready[id]);
1114 }
1115
1116 /**
1117 * Model the performance behavior of the front-end and back-end while
1118 * executing an instruction with the specified timing information, assuming
1119 * all dependencies are already clear.
1120 */
1121 void
execute_instruction(state & st,const perf_desc & perf)1122 execute_instruction(state &st, const perf_desc &perf)
1123 {
1124 /* Compute the time at which the front-end will be ready to execute the
1125 * next instruction.
1126 */
1127 st.unit_ready[unit_fe] += perf.df;
1128
1129 if (perf.u < num_units) {
1130 /* Wait for the back-end to be ready to execute this instruction. */
1131 st.unit_ready[unit_fe] = MAX2(st.unit_ready[unit_fe],
1132 st.unit_ready[perf.u]);
1133
1134 /* Compute the time at which the back-end will be ready to execute
1135 * the next instruction, and update the back-end utilization.
1136 */
1137 st.unit_ready[perf.u] = st.unit_ready[unit_fe] + perf.db;
1138 st.unit_busy[perf.u] += perf.db * st.weight;
1139 }
1140 }
1141
1142 /**
1143 * Model the performance behavior of a read dependency provided by an
1144 * instruction.
1145 */
1146 void
mark_read_dependency(state & st,const perf_desc & perf,dependency_id id)1147 mark_read_dependency(state &st, const perf_desc &perf, dependency_id id)
1148 {
1149 if (id < ARRAY_SIZE(st.dep_ready))
1150 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.ls;
1151 }
1152
1153 /**
1154 * Model the performance behavior of a write dependency provided by an
1155 * instruction.
1156 */
1157 void
mark_write_dependency(state & st,const perf_desc & perf,dependency_id id)1158 mark_write_dependency(state &st, const perf_desc &perf, dependency_id id)
1159 {
1160 if (id >= dependency_id_accum0 && id < dependency_id_flag0)
1161 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.la;
1162 else if (id >= dependency_id_flag0 && id < dependency_id_sbid_wr0)
1163 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.lf;
1164 else if (id < ARRAY_SIZE(st.dep_ready))
1165 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.ld;
1166 }
1167
1168 /**
1169 * Return the dependency ID of a backend_reg, offset by \p delta GRFs.
1170 */
1171 dependency_id
reg_dependency_id(const gen_device_info * devinfo,const backend_reg & r,const int delta)1172 reg_dependency_id(const gen_device_info *devinfo, const backend_reg &r,
1173 const int delta)
1174 {
1175 if (r.file == VGRF) {
1176 const unsigned i = r.nr + r.offset / REG_SIZE + delta;
1177 assert(i < dependency_id_mrf0 - dependency_id_grf0);
1178 return dependency_id(dependency_id_grf0 + i);
1179
1180 } else if (r.file == FIXED_GRF) {
1181 const unsigned i = r.nr + delta;
1182 assert(i < dependency_id_mrf0 - dependency_id_grf0);
1183 return dependency_id(dependency_id_grf0 + i);
1184
1185 } else if (r.file == MRF && devinfo->gen >= 7) {
1186 const unsigned i = GEN7_MRF_HACK_START +
1187 r.nr + r.offset / REG_SIZE + delta;
1188 assert(i < dependency_id_mrf0 - dependency_id_grf0);
1189 return dependency_id(dependency_id_grf0 + i);
1190
1191 } else if (r.file == MRF && devinfo->gen < 7) {
1192 const unsigned i = (r.nr & ~BRW_MRF_COMPR4) +
1193 r.offset / REG_SIZE + delta;
1194 assert(i < dependency_id_addr0 - dependency_id_mrf0);
1195 return dependency_id(dependency_id_mrf0 + i);
1196
1197 } else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS &&
1198 r.nr < BRW_ARF_ACCUMULATOR) {
1199 assert(delta == 0);
1200 return dependency_id_addr0;
1201
1202 } else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR &&
1203 r.nr < BRW_ARF_FLAG) {
1204 const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta;
1205 assert(i < dependency_id_flag0 - dependency_id_accum0);
1206 return dependency_id(dependency_id_accum0 + i);
1207
1208 } else {
1209 return num_dependency_ids;
1210 }
1211 }
1212
/**
 * Return the dependency ID of the flag register starting at offset \p i.
 *
 * Flag IDs occupy the [dependency_id_flag0, dependency_id_sbid_wr0) range
 * of the dependency ID space.
 */
dependency_id
flag_dependency_id(unsigned i)
{
   assert(i < dependency_id_sbid_wr0 - dependency_id_flag0);
   return dependency_id(dependency_id_flag0 + i);
}
1222
1223 /**
1224 * Return the dependency ID corresponding to the SBID read completion
1225 * condition of a Gen12+ SWSB.
1226 */
1227 dependency_id
tgl_swsb_rd_dependency_id(tgl_swsb swsb)1228 tgl_swsb_rd_dependency_id(tgl_swsb swsb)
1229 {
1230 if (swsb.mode) {
1231 assert(swsb.sbid < num_dependency_ids - dependency_id_sbid_rd0);
1232 return dependency_id(dependency_id_sbid_rd0 + swsb.sbid);
1233 } else {
1234 return num_dependency_ids;
1235 }
1236 }
1237
1238 /**
1239 * Return the dependency ID corresponding to the SBID write completion
1240 * condition of a Gen12+ SWSB.
1241 */
1242 dependency_id
tgl_swsb_wr_dependency_id(tgl_swsb swsb)1243 tgl_swsb_wr_dependency_id(tgl_swsb swsb)
1244 {
1245 if (swsb.mode) {
1246 assert(swsb.sbid < dependency_id_sbid_rd0 - dependency_id_sbid_wr0);
1247 return dependency_id(dependency_id_sbid_wr0 + swsb.sbid);
1248 } else {
1249 return num_dependency_ids;
1250 }
1251 }
1252
1253 /**
1254 * Return the implicit accumulator register accessed by channel \p i of the
1255 * instruction.
1256 */
1257 unsigned
accum_reg_of_channel(const gen_device_info * devinfo,const backend_instruction * inst,brw_reg_type tx,unsigned i)1258 accum_reg_of_channel(const gen_device_info *devinfo,
1259 const backend_instruction *inst,
1260 brw_reg_type tx, unsigned i)
1261 {
1262 assert(inst->reads_accumulator_implicitly() ||
1263 inst->writes_accumulator_implicitly(devinfo));
1264 const unsigned offset = (inst->group + i) * type_sz(tx) *
1265 (devinfo->gen < 7 || brw_reg_type_is_floating_point(tx) ? 1 : 2);
1266 return offset / REG_SIZE % 2;
1267 }
1268
/**
 * Model the performance behavior of an FS back-end instruction.
 *
 * The sequence is: stall the front-end on every outstanding dependency the
 * instruction consumes or overwrites, execute it, then record the cycle at
 * which each dependency it provides will become ready.
 */
void
issue_fs_inst(state &st, const gen_device_info *devinfo,
              const backend_instruction *be_inst)
{
   const fs_inst *inst = static_cast<const fs_inst *>(be_inst);
   const instruction_info info(devinfo, inst);
   const perf_desc perf = instruction_desc(info);

   /* Stall on any source dependencies. */
   for (unsigned i = 0; i < inst->sources; i++) {
      for (unsigned j = 0; j < regs_read(inst, i); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, inst->src[i], j));
   }

   /* Implicit accumulator reads depend on the accumulator registers
    * covered by the instruction's execution channels.
    */
   if (inst->reads_accumulator_implicitly()) {
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
   }

   /* SEND messages with an MRF payload read base_mrf..base_mrf+mlen-1. */
   if (is_send(inst) && inst->base_mrf != -1) {
      for (unsigned j = 0; j < inst->mlen; j++)
         stall_on_dependency(
            st, reg_dependency_id(
               devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
   }

   /* Stall on every flag subregister the instruction reads. */
   if (const unsigned mask = inst->flags_read(devinfo)) {
      for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
         if (mask & (1 << i))
            stall_on_dependency(st, flag_dependency_id(i));
      }
   }

   /* Stall on any write dependencies, unless the instruction is marked
    * no_dd_check, which suppresses the destination dependency check.
    */
   if (!inst->no_dd_check) {
      if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, inst->dst, j));
      }

      if (inst->writes_accumulator_implicitly(devinfo)) {
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
      }

      if (const unsigned mask = inst->flags_written()) {
         for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
            if (mask & (1 << i))
               stall_on_dependency(st, flag_dependency_id(i));
         }
      }
   }

   /* Stall on any SBID dependencies (Gen12+ software scoreboard). */
   if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST))
      stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched));
   else if (inst->sched.mode & TGL_SBID_SRC)
      stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched));

   /* Execute the instruction. */
   execute_instruction(st, perf);

   /* Mark any source dependencies: payload sources of a send stay live
    * until the message is dispatched.
    */
   if (inst->is_send_from_grf()) {
      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->is_payload(i)) {
            for (unsigned j = 0; j < regs_read(inst, i); j++)
               mark_read_dependency(
                  st, perf, reg_dependency_id(devinfo, inst->src[i], j));
         }
      }
   }

   if (is_send(inst) && inst->base_mrf != -1) {
      for (unsigned j = 0; j < inst->mlen; j++)
         mark_read_dependency(st, perf,
                              reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
   }

   /* Mark any destination dependencies. */
   if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
      for (unsigned j = 0; j < regs_written(inst); j++) {
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, inst->dst, j));
      }
   }

   if (inst->writes_accumulator_implicitly(devinfo)) {
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, brw_acc_reg(8), j));
   }

   if (const unsigned mask = inst->flags_written()) {
      for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
         if (mask & (1 << i))
            mark_write_dependency(st, perf, flag_dependency_id(i));
      }
   }

   /* Mark any SBID dependencies: a SET provides both the read and write
    * completion conditions of the SBID.
    */
   if (inst->sched.mode & TGL_SBID_SET) {
      mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched));
      mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched));
   }
}
1388
/**
 * Model the performance behavior of a VEC4 back-end instruction.
 *
 * Mirrors issue_fs_inst() but uses the VEC4 IR's fixed source array, MRF
 * message payloads and single flag register.
 */
void
issue_vec4_instruction(state &st, const gen_device_info *devinfo,
                       const backend_instruction *be_inst)
{
   const vec4_instruction *inst =
      static_cast<const vec4_instruction *>(be_inst);
   const instruction_info info(devinfo, inst);
   const perf_desc perf = instruction_desc(info);

   /* Stall on any source dependencies. */
   for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
      for (unsigned j = 0; j < regs_read(inst, i); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, inst->src[i], j));
   }

   /* Implicit accumulator reads depend on the accumulator registers
    * covered by the instruction's execution channels.
    */
   if (inst->reads_accumulator_implicitly()) {
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
   }

   /* Message payload in base_mrf..base_mrf+mlen-1, if any. */
   if (inst->base_mrf != -1) {
      for (unsigned j = 0; j < inst->mlen; j++)
         stall_on_dependency(
            st, reg_dependency_id(
               devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
   }

   /* The VEC4 back-end tracks a single flag dependency. */
   if (inst->reads_flag())
      stall_on_dependency(st, dependency_id_flag0);

   /* Stall on any write dependencies, unless the instruction is marked
    * no_dd_check, which suppresses the destination dependency check.
    */
   if (!inst->no_dd_check) {
      if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, inst->dst, j));
      }

      if (inst->writes_accumulator_implicitly(devinfo)) {
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
      }

      if (inst->writes_flag())
         stall_on_dependency(st, dependency_id_flag0);
   }

   /* Execute the instruction. */
   execute_instruction(st, perf);

   /* Mark any source dependencies: send payload sources stay live until
    * the message is dispatched.
    */
   if (inst->is_send_from_grf()) {
      for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
         for (unsigned j = 0; j < regs_read(inst, i); j++)
            mark_read_dependency(
               st, perf, reg_dependency_id(devinfo, inst->src[i], j));
      }
   }

   if (inst->base_mrf != -1) {
      for (unsigned j = 0; j < inst->mlen; j++)
         mark_read_dependency(st, perf,
                              reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
   }

   /* Mark any destination dependencies. */
   if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
      for (unsigned j = 0; j < regs_written(inst); j++) {
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, inst->dst, j));
      }
   }

   if (inst->writes_accumulator_implicitly(devinfo)) {
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, brw_acc_reg(8), j));
   }

   if (inst->writes_flag())
      mark_write_dependency(st, perf, dependency_id_flag0);
}
1483
1484 /**
1485 * Calculate the maximum possible throughput of the program compatible with
1486 * the cycle-count utilization estimated for each asynchronous unit, in
1487 * threads-per-cycle units.
1488 */
1489 float
calculate_thread_throughput(const state & st,float busy)1490 calculate_thread_throughput(const state &st, float busy)
1491 {
1492 for (unsigned i = 0; i < num_units; i++)
1493 busy = MAX2(busy, st.unit_busy[i]);
1494
1495 return 1.0 / busy;
1496 }
1497
/**
 * Estimate the performance of the specified shader.
 *
 * Walks the CFG in order, feeding each instruction to \p issue_instruction,
 * accumulating the weighted front-end cycle count per block and the overall
 * latency and throughput estimates into \p p.
 */
void
calculate_performance(performance &p, const backend_shader *s,
                      void (*issue_instruction)(
                         state &, const gen_device_info *,
                         const backend_instruction *),
                      unsigned dispatch_width)
{
   /* XXX - Note that the previous version of this code used worst-case
    *       scenario estimation of branching divergence for SIMD32 shaders,
    *       but this heuristic was removed to improve performance in common
    *       scenarios. Wider shader variants are less optimal when divergence
    *       is high, e.g. when application renders complex scene on a small
    *       surface. It is assumed that such renders are short, so their
    *       time doesn't matter and when it comes to the overall performance,
    *       they are dominated by more optimal larger renders.
    *
    *       It's possible that we could do better with divergence analysis
    *       by isolating branches which are 100% uniform.
    *
    *       Plumbing the trip counts from NIR loop analysis would allow us
    *       to do a better job regarding the loop weights.
    *
    *       In the meantime use values that roughly match the control flow
    *       weights used elsewhere in the compiler back-end.
    *
    *       Note that we provide slightly more pessimistic weights on
    *       Gen12+ for SIMD32, since the effective warp size on that
    *       platform is 2x the SIMD width due to EU fusion, which increases
    *       the likelihood of divergent control flow in comparison to
    *       previous generations, giving narrower SIMD modes a performance
    *       advantage in several test-cases with non-uniform discard jumps.
    */
   const float discard_weight = (dispatch_width > 16 || s->devinfo->gen < 12 ?
                                 1.0 : 0.5);
   const float loop_weight = 10;
   unsigned discard_count = 0;
   unsigned elapsed = 0;
   state st;

   foreach_block(block, s->cfg) {
      const unsigned elapsed0 = elapsed;

      foreach_inst_in_block(backend_instruction, inst, block) {
         /* Front-end clock before issuing, so we can charge this
          * instruction for exactly the cycles it added.
          */
         const unsigned clock0 = st.unit_ready[unit_fe];

         issue_instruction(st, s->devinfo, inst);

         /* PLACEHOLDER_HALT is where discarded channels re-converge --
          * undo the discard weighting from this point on.
          */
         if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT && discard_count)
            st.weight /= discard_weight;

         elapsed += (st.unit_ready[unit_fe] - clock0) * st.weight;

         /* Weight loop bodies up by loop_weight per nesting level, and
          * weight code after the first discard jump by discard_weight
          * (applied only once, hence the !discard_count++ check).
          */
         if (inst->opcode == BRW_OPCODE_DO)
            st.weight *= loop_weight;
         else if (inst->opcode == BRW_OPCODE_WHILE)
            st.weight /= loop_weight;
         else if (inst->opcode == FS_OPCODE_DISCARD_JUMP && !discard_count++)
            st.weight *= discard_weight;
      }

      p.block_latency[block->num] = elapsed - elapsed0;
   }

   p.latency = elapsed;
   p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
}
1567 }
1568
/* Estimate the performance of an FS shader, using its actual dispatch
 * width and the FS instruction model.
 */
brw::performance::performance(const fs_visitor *v) :
   block_latency(new unsigned[v->cfg->num_blocks])
{
   calculate_performance(*this, v, issue_fs_inst, v->dispatch_width);
}
1574
/* Estimate the performance of a VEC4 shader, which always executes with
 * an 8-wide dispatch.
 */
brw::performance::performance(const vec4_visitor *v) :
   block_latency(new unsigned[v->cfg->num_blocks])
{
   calculate_performance(*this, v, issue_vec4_instruction, 8);
}
1580
/* Release the per-block latency array allocated by the constructors. */
brw::performance::~performance()
{
   delete[] block_latency;
}
1585