1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include "brw_eu.h"
32 #include "brw_fs.h"
33 #include "brw_builder.h"
34 #include "brw_fs_live_variables.h"
35 #include "brw_nir.h"
36 #include "brw_cfg.h"
37 #include "brw_private.h"
38 #include "intel_nir.h"
39 #include "shader_enums.h"
40 #include "dev/intel_debug.h"
41 #include "dev/intel_wa.h"
42 #include "compiler/glsl_types.h"
43 #include "compiler/nir/nir_builder.h"
44 #include "util/u_math.h"
45
46 using namespace brw;
47
48 static void
49 initialize_sources(fs_inst *inst, const brw_reg src[], uint8_t num_sources);
50
51 void
init(enum opcode opcode,uint8_t exec_size,const brw_reg & dst,const brw_reg * src,unsigned sources)52 fs_inst::init(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
53 const brw_reg *src, unsigned sources)
54 {
55 memset((void*)this, 0, sizeof(*this));
56
57 initialize_sources(this, src, sources);
58
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->exec_size = exec_size;
65
66 assert(dst.file != IMM && dst.file != UNIFORM);
67
68 assert(this->exec_size != 0);
69
70 this->conditional_mod = BRW_CONDITIONAL_NONE;
71
72 /* This will be the case for almost all instructions. */
73 switch (dst.file) {
74 case VGRF:
75 case ADDRESS:
76 case ARF:
77 case FIXED_GRF:
78 case ATTR:
79 this->size_written = dst.component_size(exec_size);
80 break;
81 case BAD_FILE:
82 this->size_written = 0;
83 break;
84 case IMM:
85 case UNIFORM:
86 unreachable("Invalid destination register file");
87 }
88
89 this->writes_accumulator = false;
90 }
91
/* Default constructor: a SIMD8 NOP.
 *
 * NOTE(review): `dst` here is this instruction's own, still-uninitialized
 * member passed by reference; init() memsets the whole object before reading
 * it, so the reference observes the zeroed register — subtle but appears
 * intentional; confirm before changing.
 */
fs_inst::fs_inst()
{
   init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
}
96
/* Zero-source instruction with an undefined destination. */
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
{
   init(opcode, exec_size, reg_undef, NULL, 0);
}
101
/* Zero-source instruction writing \p dst. */
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst)
{
   init(opcode, exec_size, dst, NULL, 0);
}
106
fs_inst(enum opcode opcode,uint8_t exec_size,const brw_reg & dst,const brw_reg & src0)107 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
108 const brw_reg &src0)
109 {
110 const brw_reg src[1] = { src0 };
111 init(opcode, exec_size, dst, src, 1);
112 }
113
fs_inst(enum opcode opcode,uint8_t exec_size,const brw_reg & dst,const brw_reg & src0,const brw_reg & src1)114 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
115 const brw_reg &src0, const brw_reg &src1)
116 {
117 const brw_reg src[2] = { src0, src1 };
118 init(opcode, exec_size, dst, src, 2);
119 }
120
fs_inst(enum opcode opcode,uint8_t exec_size,const brw_reg & dst,const brw_reg & src0,const brw_reg & src1,const brw_reg & src2)121 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
122 const brw_reg &src0, const brw_reg &src1, const brw_reg &src2)
123 {
124 const brw_reg src[3] = { src0, src1, src2 };
125 init(opcode, exec_size, dst, src, 3);
126 }
127
/* General constructor taking an arbitrary array of sources. */
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const brw_reg &dst,
                 const brw_reg src[], unsigned sources)
{
   init(opcode, exec_width, dst, src, sources);
}
133
/* Copy constructor: bitwise-copies all fields, then rebuilds the source
 * array so that this->src points at our own storage (builtin or a fresh
 * heap allocation) instead of aliasing `that`'s.
 */
fs_inst::fs_inst(const fs_inst &that)
{
   memcpy((void*)this, &that, sizeof(that));
   initialize_sources(this, that.src, that.sources);
}
139
fs_inst::~fs_inst()
{
   /* The source array is heap-allocated only when the source count exceeds
    * the built-in storage; otherwise src aliases builtin_src and must not
    * be freed.
    */
   if (this->src != this->builtin_src)
      delete[] this->src;
}
145
146 static void
initialize_sources(fs_inst * inst,const brw_reg src[],uint8_t num_sources)147 initialize_sources(fs_inst *inst, const brw_reg src[], uint8_t num_sources)
148 {
149 if (num_sources > ARRAY_SIZE(inst->builtin_src))
150 inst->src = new brw_reg[num_sources];
151 else
152 inst->src = inst->builtin_src;
153
154 for (unsigned i = 0; i < num_sources; i++)
155 inst->src[i] = src[i];
156
157 inst->sources = num_sources;
158 }
159
/* Change the number of sources of this instruction, preserving the first
 * min(old, new) source registers.  Handles all transitions between the
 * built-in source storage and a heap allocation, freeing the old heap
 * array when it is abandoned.
 */
void
fs_inst::resize_sources(uint8_t num_sources)
{
   if (this->sources == num_sources)
      return;

   brw_reg *old_src = this->src;
   brw_reg *new_src;

   const unsigned builtin_size = ARRAY_SIZE(this->builtin_src);

   if (old_src == this->builtin_src) {
      if (num_sources > builtin_size) {
         /* Growing out of the built-in storage: allocate and copy. */
         new_src = new brw_reg[num_sources];
         for (unsigned i = 0; i < this->sources; i++)
            new_src[i] = old_src[i];

      } else {
         /* Still fits in the built-in storage: nothing to move. */
         new_src = old_src;
      }
   } else {
      if (num_sources <= builtin_size) {
         /* Shrinking back into the built-in storage.  A heap array implies
          * sources > builtin_size >= num_sources, hence the assert.
          */
         new_src = this->builtin_src;
         assert(this->sources > num_sources);
         for (unsigned i = 0; i < num_sources; i++)
            new_src[i] = old_src[i];

      } else if (num_sources < this->sources) {
         /* Shrinking but still larger than builtin: reuse the allocation. */
         new_src = old_src;

      } else {
         /* Growing an already-heap-allocated array. */
         new_src = new brw_reg[num_sources];
         for (unsigned i = 0; i < this->sources; i++)
            new_src[i] = old_src[i];
      }

      if (old_src != new_src)
         delete[] old_src;
   }

   this->sources = num_sources;
   this->src = new_src;
}
203
204 bool
is_send_from_grf() const205 fs_inst::is_send_from_grf() const
206 {
207 switch (opcode) {
208 case SHADER_OPCODE_SEND:
209 case SHADER_OPCODE_SEND_GATHER:
210 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
211 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
212 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
213 case SHADER_OPCODE_INTERLOCK:
214 case SHADER_OPCODE_MEMORY_FENCE:
215 case SHADER_OPCODE_BARRIER:
216 return true;
217 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
218 return src[1].file == VGRF;
219 default:
220 return false;
221 }
222 }
223
/* Whether source \p arg carries control information (descriptors, component
 * counts, indices, ...) rather than per-channel data, so optimization passes
 * must not treat it as an ordinary data operand.
 */
bool
fs_inst::is_control_source(unsigned arg) const
{
   switch (opcode) {
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
      return arg == 0;

   case SHADER_OPCODE_BROADCAST:
   case SHADER_OPCODE_SHUFFLE:
   case SHADER_OPCODE_QUAD_SWIZZLE:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
      return arg == 1;

   case SHADER_OPCODE_MOV_INDIRECT:
   case SHADER_OPCODE_CLUSTER_BROADCAST:
      return arg == 1 || arg == 2;

   case SHADER_OPCODE_SEND:
   case SHADER_OPCODE_SEND_GATHER:
      /* desc / ex_desc sources. */
      return arg == 0 || arg == 1;

   case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
   case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
   case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
      /* Everything except binding, address and the data operands. */
      return arg != MEMORY_LOGICAL_BINDING &&
             arg != MEMORY_LOGICAL_ADDRESS &&
             arg != MEMORY_LOGICAL_DATA0 &&
             arg != MEMORY_LOGICAL_DATA1;

   case SHADER_OPCODE_QUAD_SWAP:
   case SHADER_OPCODE_INCLUSIVE_SCAN:
   case SHADER_OPCODE_EXCLUSIVE_SCAN:
   case SHADER_OPCODE_VOTE_ANY:
   case SHADER_OPCODE_VOTE_ALL:
   case SHADER_OPCODE_REDUCE:
      /* Source 0 is the only data operand. */
      return arg != 0;

   default:
      return false;
   }
}
267
268 bool
is_payload(unsigned arg) const269 fs_inst::is_payload(unsigned arg) const
270 {
271 switch (opcode) {
272 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
273 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
274 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
275 case SHADER_OPCODE_INTERLOCK:
276 case SHADER_OPCODE_MEMORY_FENCE:
277 case SHADER_OPCODE_BARRIER:
278 return arg == 0;
279
280 case SHADER_OPCODE_SEND:
281 return arg == 2 || arg == 3;
282
283 case SHADER_OPCODE_SEND_GATHER:
284 return arg >= 2;
285
286 default:
287 return false;
288 }
289 }
290
/* Whether this instruction can accept source modifiers (negate/abs) on its
 * operands.  Send-like instructions and a number of integer/subgroup
 * opcodes cannot.
 */
bool
fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
{
   if (is_send_from_grf())
      return false;

   /* From TGL PRM Vol 2a Pg. 1053 and Pg. 1069 MAD and MUL Instructions:
    *
    * "When multiplying a DW and any lower precision integer, source modifier
    *  is not supported."
    */
   if (devinfo->ver >= 12 && (opcode == BRW_OPCODE_MUL ||
                              opcode == BRW_OPCODE_MAD)) {
      /* MAD multiplies src1*src2; MUL multiplies src0*src1 — compare the
       * narrowest multiplicand against the execution type.
       */
      const brw_reg_type exec_type = get_exec_type(this);
      const unsigned min_brw_type_size_bytes = opcode == BRW_OPCODE_MAD ?
         MIN2(brw_type_size_bytes(src[1].type), brw_type_size_bytes(src[2].type)) :
         MIN2(brw_type_size_bytes(src[0].type), brw_type_size_bytes(src[1].type));

      if (brw_type_is_int(exec_type) &&
          brw_type_size_bytes(exec_type) >= 4 &&
          brw_type_size_bytes(exec_type) != min_brw_type_size_bytes)
         return false;
   }

   switch (opcode) {
   case BRW_OPCODE_ADDC:
   case BRW_OPCODE_BFE:
   case BRW_OPCODE_BFI1:
   case BRW_OPCODE_BFI2:
   case BRW_OPCODE_BFREV:
   case BRW_OPCODE_CBIT:
   case BRW_OPCODE_FBH:
   case BRW_OPCODE_FBL:
   case BRW_OPCODE_ROL:
   case BRW_OPCODE_ROR:
   case BRW_OPCODE_SUBB:
   case BRW_OPCODE_DP4A:
   case BRW_OPCODE_DPAS:
   case SHADER_OPCODE_BROADCAST:
   case SHADER_OPCODE_CLUSTER_BROADCAST:
   case SHADER_OPCODE_MOV_INDIRECT:
   case SHADER_OPCODE_SHUFFLE:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
   case SHADER_OPCODE_REDUCE:
   case SHADER_OPCODE_INCLUSIVE_SCAN:
   case SHADER_OPCODE_EXCLUSIVE_SCAN:
   case SHADER_OPCODE_VOTE_ANY:
   case SHADER_OPCODE_VOTE_ALL:
   case SHADER_OPCODE_VOTE_EQUAL:
   case SHADER_OPCODE_BALLOT:
   case SHADER_OPCODE_QUAD_SWAP:
   case SHADER_OPCODE_READ_FROM_LIVE_CHANNEL:
   case SHADER_OPCODE_READ_FROM_CHANNEL:
      return false;
   default:
      return true;
   }
}
350
/* Whether a conditional modifier may be added to this instruction. */
bool
fs_inst::can_do_cmod() const
{
   /* Only these opcodes support a conditional modifier at all. */
   switch (opcode) {
   case BRW_OPCODE_ADD:
   case BRW_OPCODE_ADD3:
   case BRW_OPCODE_ADDC:
   case BRW_OPCODE_AND:
   case BRW_OPCODE_ASR:
   case BRW_OPCODE_AVG:
   case BRW_OPCODE_CMP:
   case BRW_OPCODE_CMPN:
   case BRW_OPCODE_DP2:
   case BRW_OPCODE_DP3:
   case BRW_OPCODE_DP4:
   case BRW_OPCODE_DPH:
   case BRW_OPCODE_FRC:
   case BRW_OPCODE_LINE:
   case BRW_OPCODE_LRP:
   case BRW_OPCODE_LZD:
   case BRW_OPCODE_MAC:
   case BRW_OPCODE_MACH:
   case BRW_OPCODE_MAD:
   case BRW_OPCODE_MOV:
   case BRW_OPCODE_MUL:
   case BRW_OPCODE_NOT:
   case BRW_OPCODE_OR:
   case BRW_OPCODE_PLN:
   case BRW_OPCODE_RNDD:
   case BRW_OPCODE_RNDE:
   case BRW_OPCODE_RNDU:
   case BRW_OPCODE_RNDZ:
   case BRW_OPCODE_SHL:
   case BRW_OPCODE_SHR:
   case BRW_OPCODE_SUBB:
   case BRW_OPCODE_XOR:
      break;
   default:
      return false;
   }

   /* The accumulator result appears to get used for the conditional modifier
    * generation. When negating a UD value, there is a 33rd bit generated for
    * the sign in the accumulator value, so now you can't check, for example,
    * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
    */
   for (unsigned i = 0; i < sources; i++) {
      if (brw_type_is_uint(src[i].type) && src[i].negate)
         return false;
   }

   /* Immediate moves into the scalar register cannot carry a cmod. */
   if (dst.file == ARF && dst.nr == BRW_ARF_SCALAR && src[0].file == IMM)
      return false;

   return true;
}
407
408 bool
can_change_types() const409 fs_inst::can_change_types() const
410 {
411 return dst.type == src[0].type &&
412 !src[0].abs && !src[0].negate && !saturate && src[0].file != ATTR &&
413 (opcode == BRW_OPCODE_MOV ||
414 (opcode == SHADER_OPCODE_LOAD_PAYLOAD && sources == 1) ||
415 (opcode == BRW_OPCODE_SEL &&
416 dst.type == src[1].type &&
417 predicate != BRW_PREDICATE_NONE &&
418 !src[1].abs && !src[1].negate && src[1].file != ATTR));
419 }
420
421 void
vfail(const char * format,va_list va)422 fs_visitor::vfail(const char *format, va_list va)
423 {
424 char *msg;
425
426 if (failed)
427 return;
428
429 failed = true;
430
431 msg = ralloc_vasprintf(mem_ctx, format, va);
432 msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
433 dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg);
434
435 this->fail_msg = msg;
436
437 if (unlikely(debug_enabled)) {
438 fprintf(stderr, "%s", msg);
439 }
440 }
441
/* Variadic convenience wrapper around vfail(). */
void
fs_visitor::fail(const char *format, ...)
{
   va_list va;

   va_start(va, format);
   vfail(format, va);
   va_end(va);
}
451
452 /**
453 * Mark this program as impossible to compile with dispatch width greater
454 * than n.
455 *
456 * During the SIMD8 compile (which happens first), we can detect and flag
457 * things that are unsupported in SIMD16+ mode, so the compiler can skip the
458 * SIMD16+ compile altogether.
459 *
460 * During a compile of dispatch width greater than n (if one happens anyway),
461 * this just calls fail().
462 */
463 void
limit_dispatch_width(unsigned n,const char * msg)464 fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
465 {
466 if (dispatch_width > n) {
467 fail("%s", msg);
468 } else {
469 max_dispatch_width = MIN2(max_dispatch_width, n);
470 brw_shader_perf_log(compiler, log_data,
471 "Shader dispatch width limited to SIMD%d: %s\n",
472 n, msg);
473 }
474 }
475
476 /**
477 * Returns true if the instruction has a flag that means it won't
478 * update an entire destination register.
479 *
480 * For example, dead code elimination and live variable analysis want to know
481 * when a write to a variable screens off any preceding values that were in
482 * it.
483 */
484 bool
is_partial_write() const485 fs_inst::is_partial_write() const
486 {
487 if (this->predicate && !this->predicate_trivial &&
488 this->opcode != BRW_OPCODE_SEL)
489 return true;
490
491 if (!this->dst.is_contiguous())
492 return true;
493
494 if (this->dst.offset % REG_SIZE != 0)
495 return true;
496
497 return this->size_written % REG_SIZE != 0;
498 }
499
500 unsigned
components_read(unsigned i) const501 fs_inst::components_read(unsigned i) const
502 {
503 /* Return zero if the source is not present. */
504 if (src[i].file == BAD_FILE)
505 return 0;
506
507 switch (opcode) {
508 case BRW_OPCODE_PLN:
509 return i == 0 ? 1 : 2;
510
511 case FS_OPCODE_PIXEL_X:
512 case FS_OPCODE_PIXEL_Y:
513 assert(i < 2);
514 if (i == 0)
515 return 2;
516 else
517 return 1;
518
519 case FS_OPCODE_FB_WRITE_LOGICAL:
520 assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
521 /* First/second FB write color. */
522 if (i < 2)
523 return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
524 else
525 return 1;
526
527 case SHADER_OPCODE_TEX_LOGICAL:
528 case SHADER_OPCODE_TXD_LOGICAL:
529 case SHADER_OPCODE_TXF_LOGICAL:
530 case SHADER_OPCODE_TXL_LOGICAL:
531 case SHADER_OPCODE_TXS_LOGICAL:
532 case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
533 case FS_OPCODE_TXB_LOGICAL:
534 case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
535 case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
536 case SHADER_OPCODE_TXF_MCS_LOGICAL:
537 case SHADER_OPCODE_LOD_LOGICAL:
538 case SHADER_OPCODE_TG4_LOGICAL:
539 case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
540 case SHADER_OPCODE_TG4_BIAS_LOGICAL:
541 case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
542 case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
543 case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
544 case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
545 case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
546 assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
547 src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
548 src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
549 /* Texture coordinates. */
550 if (i == TEX_LOGICAL_SRC_COORDINATE)
551 return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
552 /* Texture derivatives. */
553 else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
554 opcode == SHADER_OPCODE_TXD_LOGICAL)
555 return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
556 /* Texture offset. */
557 else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
558 return 2;
559 /* MCS */
560 else if (i == TEX_LOGICAL_SRC_MCS) {
561 if (opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
562 return 2;
563 else if (opcode == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL)
564 return 4;
565 else
566 return 1;
567 } else
568 return 1;
569
570 case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
571 if (i == MEMORY_LOGICAL_DATA0 || i == MEMORY_LOGICAL_DATA0)
572 return 0;
573 /* fallthrough */
574 case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
575 if (i == MEMORY_LOGICAL_DATA1)
576 return 0;
577 /* fallthrough */
578 case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
579 if (i == MEMORY_LOGICAL_DATA0 || i == MEMORY_LOGICAL_DATA1)
580 return src[MEMORY_LOGICAL_COMPONENTS].ud;
581 else if (i == MEMORY_LOGICAL_ADDRESS)
582 return src[MEMORY_LOGICAL_COORD_COMPONENTS].ud;
583 else
584 return 1;
585
586 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
587 return (i == 0 ? 2 : 1);
588
589 case SHADER_OPCODE_URB_WRITE_LOGICAL:
590 assert(src[URB_LOGICAL_SRC_COMPONENTS].file == IMM);
591
592 if (i == URB_LOGICAL_SRC_DATA)
593 return src[URB_LOGICAL_SRC_COMPONENTS].ud;
594 else
595 return 1;
596
597 case BRW_OPCODE_DPAS:
598 unreachable("Do not use components_read() for DPAS.");
599
600 default:
601 return 1;
602 }
603 }
604
/* Number of bytes read from source \p arg, taking message lengths and
 * per-opcode payload rules into account.
 */
unsigned
fs_inst::size_read(const struct intel_device_info *devinfo, int arg) const
{
   switch (opcode) {
   case SHADER_OPCODE_SEND:
      /* Payload sizes come from the message length fields, not the
       * register's own size.
       */
      if (arg == 2) {
         return mlen * REG_SIZE;
      } else if (arg == 3) {
         return ex_mlen * REG_SIZE;
      }
      break;

   case SHADER_OPCODE_SEND_GATHER:
      if (arg >= 3) {
         /* SEND_GATHER is Xe3+, so no need to pass devinfo around. */
         const unsigned reg_unit = 2;
         return REG_SIZE * reg_unit;
      }
      break;

   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      if (arg == 0)
         return mlen * REG_SIZE;
      break;

   case BRW_OPCODE_PLN:
      if (arg == 0)
         return 16;
      break;

   case SHADER_OPCODE_LOAD_PAYLOAD:
      /* Header sources are always a single physical register's worth. */
      if (arg < this->header_size)
         return retype(src[arg], BRW_TYPE_UD).component_size(8);
      break;

   case SHADER_OPCODE_BARRIER:
      return REG_SIZE;

   case SHADER_OPCODE_MOV_INDIRECT:
      /* src[2] is the immediate byte length of the indirect region read. */
      if (arg == 0) {
         assert(src[2].file == IMM);
         return src[2].ud;
      }
      break;

   case BRW_OPCODE_DPAS: {
      /* This is a little bit sketchy. There's no way to get at devinfo from
       * here, so the regular reg_unit() cannot be used. However, on
       * reg_unit() == 1 platforms, DPAS exec_size must be 8, and on known
       * reg_unit() == 2 platforms, DPAS exec_size must be 16. This is not a
       * coincidence, so this isn't so bad.
       */
      const unsigned reg_unit = this->exec_size / 8;

      switch (arg) {
      case 0:
         if (src[0].type == BRW_TYPE_HF) {
            return rcount * reg_unit * REG_SIZE / 2;
         } else {
            return rcount * reg_unit * REG_SIZE;
         }
      case 1:
         return sdepth * reg_unit * REG_SIZE;
      case 2:
         /* This is simpler than the formula described in the Bspec, but it
          * covers all of the cases that we support.  Each inner sdepth
          * iteration of the DPAS consumes a single dword for int8, uint8, or
          * float16 types.  These are the one source types currently
          * supportable through Vulkan.  This is independent of reg_unit.
          */
         return rcount * sdepth * 4;
      default:
         unreachable("Invalid source number.");
      }
      break;
   }

   default:
      break;
   }

   /* Generic case: component count times component size. */
   switch (src[arg].file) {
   case UNIFORM:
   case IMM:
      return components_read(arg) * brw_type_size_bytes(src[arg].type);
   case BAD_FILE:
   case ADDRESS:
   case ARF:
   case FIXED_GRF:
   case VGRF:
   case ATTR:
      /* Regardless of exec_size, values marked as scalar are SIMD8. */
      return components_read(arg) *
             src[arg].component_size(src[arg].is_scalar ? 8 * reg_unit(devinfo) : exec_size);
   }
   return 0;
}
703
namespace {
   /* Number of channels grouped together by the given predication mode
    * (e.g. ANY4H/ALL4H evaluate the flag over groups of 4 channels).
    * Xe2+ dropped the grouped modes, so everything is width 1 there.
    */
   unsigned
   predicate_width(const intel_device_info *devinfo, brw_predicate predicate)
   {
      if (devinfo->ver >= 20) {
         return 1;
      } else {
         switch (predicate) {
         case BRW_PREDICATE_NONE:            return 1;
         case BRW_PREDICATE_NORMAL:          return 1;
         case BRW_PREDICATE_ALIGN1_ANY2H:    return 2;
         case BRW_PREDICATE_ALIGN1_ALL2H:    return 2;
         case BRW_PREDICATE_ALIGN1_ANY4H:    return 4;
         case BRW_PREDICATE_ALIGN1_ALL4H:    return 4;
         case BRW_PREDICATE_ALIGN1_ANY8H:    return 8;
         case BRW_PREDICATE_ALIGN1_ALL8H:    return 8;
         case BRW_PREDICATE_ALIGN1_ANY16H:   return 16;
         case BRW_PREDICATE_ALIGN1_ALL16H:   return 16;
         case BRW_PREDICATE_ALIGN1_ANY32H:   return 32;
         case BRW_PREDICATE_ALIGN1_ALL32H:   return 32;
         default: unreachable("Unsupported predicate");
         }
      }
   }
}
729
730 unsigned
flags_read(const intel_device_info * devinfo) const731 fs_inst::flags_read(const intel_device_info *devinfo) const
732 {
733 if (devinfo->ver < 20 && (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
734 predicate == BRW_PREDICATE_ALIGN1_ALLV)) {
735 /* The vertical predication modes combine corresponding bits from
736 * f0.0 and f1.0 on Gfx7+.
737 */
738 const unsigned shift = 4;
739 return brw_fs_flag_mask(this, 1) << shift | brw_fs_flag_mask(this, 1);
740 } else if (predicate) {
741 return brw_fs_flag_mask(this, predicate_width(devinfo, predicate));
742 } else {
743 unsigned mask = 0;
744 for (int i = 0; i < sources; i++) {
745 mask |= brw_fs_flag_mask(src[i], size_read(devinfo, i));
746 }
747 return mask;
748 }
749 }
750
/* Bitmask of flag-register bits this instruction writes. */
unsigned
fs_inst::flags_written(const intel_device_info *devinfo) const
{
   /* SEL/CSEL/IF/WHILE use the conditional modifier to read flags rather
    * than to generate them, so they are excluded here.
    */
   if (conditional_mod && (opcode != BRW_OPCODE_SEL &&
                           opcode != BRW_OPCODE_CSEL &&
                           opcode != BRW_OPCODE_IF &&
                           opcode != BRW_OPCODE_WHILE)) {
      return brw_fs_flag_mask(this, 1);
   } else if (opcode == FS_OPCODE_LOAD_LIVE_CHANNELS ||
              opcode == SHADER_OPCODE_BALLOT ||
              opcode == SHADER_OPCODE_VOTE_ANY ||
              opcode == SHADER_OPCODE_VOTE_ALL ||
              opcode == SHADER_OPCODE_VOTE_EQUAL) {
      /* These produce a full 32-bit flag result. */
      return brw_fs_flag_mask(this, 32);
   } else {
      /* Otherwise flags are written only if the destination is a flag reg. */
      return brw_fs_flag_mask(dst, size_written);
   }
}
769
/* Whether this sampler-message instruction requests a residency
 * (sparse feedback) word in its response.
 */
bool
fs_inst::has_sampler_residency() const
{
   switch (opcode) {
   case SHADER_OPCODE_TEX_LOGICAL:
   case FS_OPCODE_TXB_LOGICAL:
   case SHADER_OPCODE_TXL_LOGICAL:
   case SHADER_OPCODE_TXD_LOGICAL:
   case SHADER_OPCODE_TXF_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
   case SHADER_OPCODE_TXS_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
   case SHADER_OPCODE_TG4_LOGICAL:
   case SHADER_OPCODE_TG4_BIAS_LOGICAL:
   case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
      assert(src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
      return src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
   default:
      return false;
   }
}
795
796 /* \sa inst_is_raw_move in brw_eu_validate. */
797 bool
is_raw_move() const798 fs_inst::is_raw_move() const
799 {
800 if (opcode != BRW_OPCODE_MOV)
801 return false;
802
803 if (src[0].file == IMM) {
804 if (brw_type_is_vector_imm(src[0].type))
805 return false;
806 } else if (src[0].negate || src[0].abs) {
807 return false;
808 }
809
810 if (saturate)
811 return false;
812
813 return src[0].type == dst.type ||
814 (brw_type_is_int(src[0].type) &&
815 brw_type_is_int(dst.type) &&
816 brw_type_size_bits(src[0].type) == brw_type_size_bits(dst.type));
817 }
818
819 bool
uses_address_register_implicitly() const820 fs_inst::uses_address_register_implicitly() const
821 {
822 switch (opcode) {
823 case SHADER_OPCODE_BROADCAST:
824 case SHADER_OPCODE_SHUFFLE:
825 case SHADER_OPCODE_MOV_INDIRECT:
826 return true;
827 default:
828 return false;
829 }
830 }
831
/* For SIMD16, we need to follow on from the uniform setup of the SIMD8
 * dispatch.  This imports those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   /* Reuse the uniform count established by the earlier (SIMD8) visitor. */
   this->uniforms = v->uniforms;
}
840
/* Map a NIR load_barycentric_* intrinsic to the matching
 * intel_barycentric_mode, honoring the interpolation qualifier and the
 * dynamic-persample key setting.
 */
enum intel_barycentric_mode
brw_barycentric_mode(const struct brw_wm_prog_key *key,
                     nir_intrinsic_instr *intr)
{
   const glsl_interp_mode mode =
      (enum glsl_interp_mode) nir_intrinsic_interp_mode(intr);

   /* Barycentric modes don't make sense for flat inputs. */
   assert(mode != INTERP_MODE_FLAT);

   unsigned bary;
   switch (intr->intrinsic) {
   case nir_intrinsic_load_barycentric_pixel:
   case nir_intrinsic_load_barycentric_at_offset:
      /* When per sample interpolation is dynamic, assume sample
       * interpolation. We'll dynamically remap things so that the FS thread
       * payload is not affected.
       */
      bary = key->persample_interp == INTEL_SOMETIMES ?
             INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE :
             INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL;
      break;
   case nir_intrinsic_load_barycentric_centroid:
      bary = INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID;
      break;
   case nir_intrinsic_load_barycentric_sample:
   case nir_intrinsic_load_barycentric_at_sample:
      bary = INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE;
      break;
   default:
      unreachable("invalid intrinsic");
   }

   /* The NONPERSPECTIVE modes are laid out 3 entries after their
    * PERSPECTIVE counterparts in the enum.
    */
   if (mode == INTERP_MODE_NOPERSPECTIVE)
      bary += 3;

   return (enum intel_barycentric_mode) bary;
}
879
880 /**
881 * Walk backwards from the end of the program looking for a URB write that
882 * isn't in control flow, and mark it with EOT.
883 *
884 * Return true if successful or false if a separate EOT write is needed.
885 */
bool
fs_visitor::mark_last_urb_write_with_eot()
{
   /* Scan backwards so we find the final URB write first. */
   foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
      if (prev->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) {
         prev->eot = true;

         /* Delete now dead instructions. */
         foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
            if (dead == prev)
               break;
            dead->remove();
         }
         return true;
      } else if (prev->is_control_flow() || prev->has_side_effects()) {
         /* Can't hoist EOT past control flow or side effects — the caller
          * must emit a separate EOT write instead.
          */
         break;
      }
   }

   return false;
}
907
908 static unsigned
round_components_to_whole_registers(const intel_device_info * devinfo,unsigned c)909 round_components_to_whole_registers(const intel_device_info *devinfo,
910 unsigned c)
911 {
912 return DIV_ROUND_UP(c, 8 * reg_unit(devinfo)) * reg_unit(devinfo);
913 }
914
/* Lay out the push-constant (CURB) space and rewrite all UNIFORM-file
 * sources into fixed hardware registers.  Also emits the Gfx12.5+ compute
 * prolog that pulls push constants via A32 stateless LSC loads, and the
 * optional zeroing of out-of-bounds push registers.
 */
void
fs_visitor::assign_curb_setup()
{
   unsigned uniform_push_length =
      round_components_to_whole_registers(devinfo, prog_data->nr_params);

   /* UBO push ranges are packed immediately after the plain uniforms;
    * ubo_push_start[] is recorded in 32-bit units (8 per register).
    */
   unsigned ubo_push_length = 0;
   unsigned ubo_push_start[4];
   for (int i = 0; i < 4; i++) {
      ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
      ubo_push_length += prog_data->ubo_ranges[i].length;

      assert(ubo_push_start[i] % (8 * reg_unit(devinfo)) == 0);
      assert(ubo_push_length % (1 * reg_unit(devinfo)) == 0);
   }

   prog_data->curb_read_length = uniform_push_length + ubo_push_length;
   if (stage == MESA_SHADER_FRAGMENT &&
       ((struct brw_wm_prog_key *)key)->null_push_constant_tbimr_workaround)
      prog_data->curb_read_length = MAX2(1, prog_data->curb_read_length);

   uint64_t used = 0;
   bool is_compute = gl_shader_stage_is_compute(stage);

   if (is_compute && devinfo->verx10 >= 125 && uniform_push_length > 0) {
      assert(devinfo->has_lsc);
      brw_builder ubld = brw_builder(this, 1).exec_all().at(
         cfg->first_block(), cfg->first_block()->start());

      /* The base offset for our push data is passed in as R0.0[31:6]. We have
       * to mask off the bottom 6 bits.
       */
      brw_reg base_addr =
         ubld.AND(retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
                  brw_imm_ud(INTEL_MASK(31, 6)));

      /* On Gfx12-HP we load constants at the start of the program using A32
       * stateless messages.
       */
      for (unsigned i = 0; i < uniform_push_length;) {
         /* Limit ourselves to LSC HW limit of 8 GRFs (256bytes D32V64). */
         unsigned num_regs = MIN2(uniform_push_length - i, 8);
         assert(num_regs > 0);
         num_regs = 1 << util_logbase2(num_regs);

         /* This pass occurs after all of the optimization passes, so don't
          * emit an 'ADD addr, base_addr, 0' instruction.
          */
         brw_reg addr = i == 0 ? base_addr :
            ubld.ADD(base_addr, brw_imm_ud(i * REG_SIZE));

         brw_reg srcs[4] = {
            brw_imm_ud(0), /* desc */
            brw_imm_ud(0), /* ex_desc */
            addr,          /* payload */
            brw_reg(),     /* payload2 */
         };

         brw_reg dest = retype(brw_vec8_grf(payload().num_regs + i, 0),
                               BRW_TYPE_UD);
         fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4);

         send->sfid = GFX12_SFID_UGM;
         uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
                                      LSC_ADDR_SURFTYPE_FLAT,
                                      LSC_ADDR_SIZE_A32,
                                      LSC_DATA_SIZE_D32,
                                      num_regs * 8 /* num_channels */,
                                      true /* transpose */,
                                      LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
         send->header_size = 0;
         send->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, 1);
         send->size_written =
            lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, num_regs * 8) * REG_SIZE;
         send->send_is_volatile = true;

         send->src[0] = brw_imm_ud(desc |
                                   brw_message_desc(devinfo,
                                                    send->mlen,
                                                    send->size_written / REG_SIZE,
                                                    send->header_size));

         i += num_regs;
      }

      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (unsigned int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == UNIFORM) {
            int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
            int constant_nr;
            if (inst->src[i].nr >= UBO_START) {
               /* constant_nr is in 32-bit units, the rest are in bytes */
               constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
                             inst->src[i].offset / 4;
            } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
               constant_nr = uniform_nr;
            } else {
               /* Section 5.11 of the OpenGL 4.1 spec says:
                * "Out-of-bounds reads return undefined values, which include
                *  values from other variables of the active program or zero."
                * Just return the first push constant.
                */
               constant_nr = 0;
            }

            assert(constant_nr / 8 < 64);
            used |= BITFIELD64_BIT(constant_nr / 8);

            struct brw_reg brw_reg = brw_vec1_grf(payload().num_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);
            brw_reg.abs = inst->src[i].abs;
            brw_reg.negate = inst->src[i].negate;

            /* The combination of is_scalar for load_uniform, copy prop, and
             * lower_btd_logical_send can generate a MOV from a UNIFORM with
             * exec size 2 and stride of 1.
             */
            assert(inst->src[i].stride == 0 || inst->exec_size == 2);
            inst->src[i] = byte_offset(
               retype(brw_reg, inst->src[i].type),
               inst->src[i].offset % 4);
         }
      }
   }

   /* Zero out push registers the API declared but did not provide, so
    * stale data is never observed.
    */
   uint64_t want_zero = used & prog_data->zero_push_reg;
   if (want_zero) {
      brw_builder ubld = brw_builder(this, 8).exec_all().at(
         cfg->first_block(), cfg->first_block()->start());

      /* push_reg_mask_param is in 32-bit units */
      unsigned mask_param = prog_data->push_reg_mask_param;
      struct brw_reg mask = brw_vec1_grf(payload().num_regs + mask_param / 8,
                                         mask_param % 8);

      /* Every 16 registers, expand the next 16 mask bits into a vector of
       * 0/~0 dwords (b32) that can be ANDed against each push register.
       */
      brw_reg b32;
      for (unsigned i = 0; i < 64; i++) {
         if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
            brw_reg shifted = ubld.vgrf(BRW_TYPE_W, 2);
            ubld.SHL(horiz_offset(shifted, 8),
                     byte_offset(retype(mask, BRW_TYPE_W), i / 8),
                     brw_imm_v(0x01234567));
            ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));

            brw_builder ubld16 = ubld.group(16, 0);
            b32 = ubld16.vgrf(BRW_TYPE_D);
            ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
         }

         if (want_zero & BITFIELD64_BIT(i)) {
            assert(i < prog_data->curb_read_length);
            struct brw_reg push_reg =
               retype(brw_vec8_grf(payload().num_regs + i, 0), BRW_TYPE_D);

            ubld.AND(push_reg, push_reg, component(b32, i % 16));
         }
      }

      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
   }

   /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
   this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length;
}
1084
1085 /*
1086 * Build up an array of indices into the urb_setup array that
1087 * references the active entries of the urb_setup array.
1088 * Used to accelerate walking the active entries of the urb_setup array
1089 * on each upload.
1090 */
1091 void
brw_compute_urb_setup_index(struct brw_wm_prog_data * wm_prog_data)1092 brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
1093 {
1094 /* TODO(mesh): Review usage of this in the context of Mesh, we may want to
1095 * skip per-primitive attributes here.
1096 */
1097
1098 /* Make sure uint8_t is sufficient */
1099 STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
1100 uint8_t index = 0;
1101 for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
1102 if (wm_prog_data->urb_setup[attr] >= 0) {
1103 wm_prog_data->urb_setup_attribs[index++] = attr;
1104 }
1105 }
1106 wm_prog_data->urb_setup_attribs_count = index;
1107 }
1108
1109 void
convert_attr_sources_to_hw_regs(fs_inst * inst)1110 fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
1111 {
1112 for (int i = 0; i < inst->sources; i++) {
1113 if (inst->src[i].file == ATTR) {
1114 assert(inst->src[i].nr == 0);
1115 int grf = payload().num_regs +
1116 prog_data->curb_read_length +
1117 inst->src[i].offset / REG_SIZE;
1118
1119 /* As explained at brw_reg_from_fs_reg, From the Haswell PRM:
1120 *
1121 * VertStride must be used to cross GRF register boundaries. This
1122 * rule implies that elements within a 'Width' cannot cross GRF
1123 * boundaries.
1124 *
1125 * So, for registers that are large enough, we have to split the exec
1126 * size in two and trust the compression state to sort it out.
1127 */
1128 unsigned total_size = inst->exec_size *
1129 inst->src[i].stride *
1130 brw_type_size_bytes(inst->src[i].type);
1131
1132 assert(total_size <= 2 * REG_SIZE);
1133 const unsigned exec_size =
1134 (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
1135
1136 unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
1137 struct brw_reg reg =
1138 stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
1139 inst->src[i].offset % REG_SIZE),
1140 exec_size * inst->src[i].stride,
1141 width, inst->src[i].stride);
1142 reg.abs = inst->src[i].abs;
1143 reg.negate = inst->src[i].negate;
1144
1145 inst->src[i] = reg;
1146 }
1147 }
1148 }
1149
1150 int
brw_get_subgroup_id_param_index(const intel_device_info * devinfo,const brw_stage_prog_data * prog_data)1151 brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
1152 const brw_stage_prog_data *prog_data)
1153 {
1154 if (prog_data->nr_params == 0)
1155 return -1;
1156
1157 if (devinfo->verx10 >= 125)
1158 return -1;
1159
1160 /* The local thread id is always the last parameter in the list */
1161 uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
1162 if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
1163 return prog_data->nr_params - 1;
1164
1165 return -1;
1166 }
1167
1168 uint32_t
brw_fb_write_msg_control(const fs_inst * inst,const struct brw_wm_prog_data * prog_data)1169 brw_fb_write_msg_control(const fs_inst *inst,
1170 const struct brw_wm_prog_data *prog_data)
1171 {
1172 uint32_t mctl;
1173
1174 if (prog_data->dual_src_blend) {
1175 assert(inst->exec_size < 32);
1176
1177 if (inst->group % 16 == 0)
1178 mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
1179 else if (inst->group % 16 == 8)
1180 mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
1181 else
1182 unreachable("Invalid dual-source FB write instruction group");
1183 } else {
1184 assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
1185
1186 if (inst->exec_size == 16)
1187 mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
1188 else if (inst->exec_size == 8)
1189 mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
1190 else if (inst->exec_size == 32)
1191 mctl = XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE;
1192 else
1193 unreachable("Invalid FB write execution size");
1194 }
1195
1196 return mctl;
1197 }
1198
register_pressure(const fs_visitor * v)1199 brw::register_pressure::register_pressure(const fs_visitor *v)
1200 {
1201 const fs_live_variables &live = v->live_analysis.require();
1202 const unsigned num_instructions = v->cfg->num_blocks ?
1203 v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;
1204
1205 regs_live_at_ip = new unsigned[num_instructions]();
1206
1207 for (unsigned reg = 0; reg < v->alloc.count; reg++) {
1208 for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
1209 regs_live_at_ip[ip] += v->alloc.sizes[reg];
1210 }
1211
1212 const unsigned payload_count = v->first_non_payload_grf;
1213
1214 int *payload_last_use_ip = new int[payload_count];
1215 v->calculate_payload_ranges(true, payload_count, payload_last_use_ip);
1216
1217 for (unsigned reg = 0; reg < payload_count; reg++) {
1218 for (int ip = 0; ip < payload_last_use_ip[reg]; ip++)
1219 ++regs_live_at_ip[ip];
1220 }
1221
1222 delete[] payload_last_use_ip;
1223 }
1224
brw::register_pressure::~register_pressure()
{
   /* Release the per-IP pressure array allocated in the constructor. */
   delete[] regs_live_at_ip;
}
1229
/* Propagate an IR-change notification to every cached analysis so that
 * each one can decide whether its results are stale for dependency
 * class \p c and must be recomputed on the next require().
 */
void
fs_visitor::invalidate_analysis(brw::analysis_dependency_class c)
{
   live_analysis.invalidate(c);
   regpressure_analysis.invalidate(c);
   performance_analysis.invalidate(c);
   idom_analysis.invalidate(c);
   def_analysis.invalidate(c);
}
1239
1240 void
debug_optimizer(const nir_shader * nir,const char * pass_name,int iteration,int pass_num) const1241 fs_visitor::debug_optimizer(const nir_shader *nir,
1242 const char *pass_name,
1243 int iteration, int pass_num) const
1244 {
1245 if (!brw_should_print_shader(nir, DEBUG_OPTIMIZER))
1246 return;
1247
1248 char *filename;
1249 int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
1250 debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
1251 _mesa_shader_stage_to_abbrev(stage), dispatch_width, nir->info.name,
1252 iteration, pass_num, pass_name);
1253 if (ret == -1)
1254 return;
1255
1256 FILE *file = stderr;
1257 if (__normal_user()) {
1258 file = fopen(filename, "w");
1259 if (!file)
1260 file = stderr;
1261 }
1262
1263 brw_print_instructions(*this, file);
1264
1265 if (file != stderr)
1266 fclose(file);
1267
1268 free(filename);
1269 }
1270
1271 static uint32_t
brw_compute_max_register_pressure(fs_visitor & s)1272 brw_compute_max_register_pressure(fs_visitor &s)
1273 {
1274 const register_pressure &rp = s.regpressure_analysis.require();
1275 uint32_t ip = 0, max_pressure = 0;
1276 foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
1277 max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
1278 ip++;
1279 }
1280 return max_pressure;
1281 }
1282
1283 static fs_inst **
save_instruction_order(const struct cfg_t * cfg)1284 save_instruction_order(const struct cfg_t *cfg)
1285 {
1286 /* Before we schedule anything, stash off the instruction order as an array
1287 * of fs_inst *. This way, we can reset it between scheduling passes to
1288 * prevent dependencies between the different scheduling modes.
1289 */
1290 int num_insts = cfg->last_block()->end_ip + 1;
1291 fs_inst **inst_arr = new fs_inst * [num_insts];
1292
1293 int ip = 0;
1294 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1295 assert(ip >= block->start_ip && ip <= block->end_ip);
1296 inst_arr[ip++] = inst;
1297 }
1298 assert(ip == num_insts);
1299
1300 return inst_arr;
1301 }
1302
1303 static void
restore_instruction_order(struct cfg_t * cfg,fs_inst ** inst_arr)1304 restore_instruction_order(struct cfg_t *cfg, fs_inst **inst_arr)
1305 {
1306 ASSERTED int num_insts = cfg->last_block()->end_ip + 1;
1307
1308 int ip = 0;
1309 foreach_block (block, cfg) {
1310 block->instructions.make_empty();
1311
1312 assert(ip == block->start_ip);
1313 for (; ip <= block->end_ip; ip++)
1314 block->instructions.push_tail(inst_arr[ip]);
1315 }
1316 assert(ip == num_insts);
1317 }
1318
1319 /* Per-thread scratch space is a power-of-two multiple of 1KB. */
1320 static inline unsigned
brw_get_scratch_size(int size)1321 brw_get_scratch_size(int size)
1322 {
1323 return MAX2(1024, util_next_power_of_two(size));
1324 }
1325
/* Drive register allocation: try progressively less aggressive pre-RA
 * scheduling heuristics until one allocates without spilling; if none
 * does, fall back to the lowest-pressure schedule and allow spills.
 * Afterwards run the post-RA passes (bank conflicts, post scheduling,
 * VGRF lowering, scratch sizing, scoreboard lowering).
 */
void
brw_allocate_registers(fs_visitor &s, bool allow_spilling)
{
   const struct intel_device_info *devinfo = s.devinfo;
   const nir_shader *nir = s.nir;
   bool allocated;

   /* Scheduling modes to attempt, ordered by decreasing expected
    * performance but increasing likelihood of successful allocation.
    */
   static const enum brw_instruction_scheduler_mode pre_modes[] = {
      BRW_SCHEDULE_PRE,
      BRW_SCHEDULE_PRE_NON_LIFO,
      BRW_SCHEDULE_NONE,
      BRW_SCHEDULE_PRE_LIFO,
   };

   static const char *scheduler_mode_name[] = {
      [BRW_SCHEDULE_PRE] = "top-down",
      [BRW_SCHEDULE_PRE_NON_LIFO] = "non-lifo",
      [BRW_SCHEDULE_PRE_LIFO] = "lifo",
      [BRW_SCHEDULE_POST] = "post",
      [BRW_SCHEDULE_NONE] = "none",
   };

   uint32_t best_register_pressure = UINT32_MAX;
   enum brw_instruction_scheduler_mode best_sched = BRW_SCHEDULE_NONE;

   brw_opt_compact_virtual_grfs(s);

   if (s.needs_register_pressure)
      s.shader_stats.max_register_pressure = brw_compute_max_register_pressure(s);

   s.debug_optimizer(nir, "pre_register_allocate", 90, 90);

   /* Force spilling of everything for debug purposes. */
   bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);

   /* Before we schedule anything, stash off the instruction order as an array
    * of fs_inst *. This way, we can reset it between scheduling passes to
    * prevent dependencies between the different scheduling modes.
    */
   fs_inst **orig_order = save_instruction_order(s.cfg);
   fs_inst **best_pressure_order = NULL;

   void *scheduler_ctx = ralloc_context(NULL);
   brw_instruction_scheduler *sched = brw_prepare_scheduler(s, scheduler_ctx);

   /* Try each scheduling heuristic to see if it can successfully register
    * allocate without spilling. They should be ordered by decreasing
    * performance but increasing likelihood of allocating.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
      enum brw_instruction_scheduler_mode sched_mode = pre_modes[i];

      brw_schedule_instructions_pre_ra(s, sched, sched_mode);
      s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];

      s.debug_optimizer(nir, s.shader_stats.scheduler_mode, 95, i);

      /* Debug escape hatch: flip to `if (1)` to bypass the real allocator. */
      if (0) {
         brw_assign_regs_trivial(s);
         allocated = true;
         break;
      }

      /* We should only spill registers on the last scheduling. */
      assert(!s.spilled_any_registers);

      allocated = brw_assign_regs(s, false, spill_all);
      if (allocated)
         break;

      /* Save the maximum register pressure */
      uint32_t this_pressure = brw_compute_max_register_pressure(s);

      /* Debug escape hatch: flip to `if (1)` to log per-mode pressure. */
      if (0) {
         fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
                 scheduler_mode_name[sched_mode], this_pressure);
      }

      /* Remember the schedule with the lowest pressure so the spilling
       * fallback below starts from the best candidate.
       */
      if (this_pressure < best_register_pressure) {
         best_register_pressure = this_pressure;
         best_sched = sched_mode;
         delete[] best_pressure_order;
         best_pressure_order = save_instruction_order(s.cfg);
      }

      /* Reset back to the original order before trying the next mode */
      restore_instruction_order(s.cfg, orig_order);
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
   }

   ralloc_free(scheduler_ctx);

   if (!allocated) {
      /* Debug escape hatch: flip to `if (1)` to log the fallback mode. */
      if (0) {
         fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
                 scheduler_mode_name[best_sched]);
      }
      restore_instruction_order(s.cfg, best_pressure_order);
      s.shader_stats.scheduler_mode = scheduler_mode_name[best_sched];

      /* Last attempt: spilling is now permitted. */
      allocated = brw_assign_regs(s, allow_spilling, spill_all);
   }

   delete[] orig_order;
   delete[] best_pressure_order;

   if (!allocated) {
      s.fail("Failure to register allocate.  Reduce number of "
             "live scalar values to avoid this.");
   } else if (s.spilled_any_registers) {
      brw_shader_perf_log(s.compiler, s.log_data,
                          "%s shader triggered register spilling.  "
                          "Try reducing the number of live scalar "
                          "values to improve performance.\n",
                          _mesa_shader_stage_to_string(s.stage));
   }

   if (s.failed)
      return;

   int pass_num = 0;

   s.debug_optimizer(nir, "post_ra_alloc", 96, pass_num++);

   brw_opt_bank_conflicts(s);

   s.debug_optimizer(nir, "bank_conflict", 96, pass_num++);

   brw_schedule_instructions_post_ra(s);

   s.debug_optimizer(nir, "post_ra_alloc_scheduling", 96, pass_num++);

   /* Lowering VGRF to FIXED_GRF is currently done as a separate pass instead
    * of part of assign_regs since both bank conflicts optimization and post
    * RA scheduling take advantage of distinguishing references to registers
    * that were allocated from references that were already fixed.
    *
    * TODO: Change the passes above, then move this lowering to be part of
    * assign_regs.
    */
   brw_lower_vgrfs_to_fixed_grfs(s);

   s.debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, pass_num++);

   if (s.devinfo->ver >= 30) {
      brw_lower_send_gather(s);
      s.debug_optimizer(nir, "lower_send_gather", 96, pass_num++);
   }

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_REGALLOC);

   if (s.last_scratch > 0) {
      /* We currently only support up to 2MB of scratch space.  If we
       * need to support more eventually, the documentation suggests
       * that we could allocate a larger buffer, and partition it out
       * ourselves.  We'd just have to undo the hardware's address
       * calculation by subtracting (FFTID * Per Thread Scratch Space)
       * and then add FFTID * (Larger Per Thread Scratch Space).
       *
       * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
       * Thread Group Tracking > Local Memory/Scratch Space.
       */
      if (s.last_scratch <= devinfo->max_scratch_size_per_thread) {
         /* Take the max of any previously compiled variant of the shader. In the
          * case of bindless shaders with return parts, this will also take the
          * max of all parts.
          */
         s.prog_data->total_scratch = MAX2(brw_get_scratch_size(s.last_scratch),
                                           s.prog_data->total_scratch);
      } else {
         s.fail("Scratch space required is larger than supported");
      }
   }

   if (s.failed)
      return;

   brw_lower_scoreboard(s);

   s.debug_optimizer(nir, "scoreboard", 96, pass_num++);
}
1506
1507 unsigned
brw_cs_push_const_total_size(const struct brw_cs_prog_data * cs_prog_data,unsigned threads)1508 brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
1509 unsigned threads)
1510 {
1511 assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
1512 assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
1513 return cs_prog_data->push.per_thread.size * threads +
1514 cs_prog_data->push.cross_thread.size;
1515 }
1516
1517 struct intel_cs_dispatch_info
brw_cs_get_dispatch_info(const struct intel_device_info * devinfo,const struct brw_cs_prog_data * prog_data,const unsigned * override_local_size)1518 brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
1519 const struct brw_cs_prog_data *prog_data,
1520 const unsigned *override_local_size)
1521 {
1522 struct intel_cs_dispatch_info info = {};
1523
1524 const unsigned *sizes =
1525 override_local_size ? override_local_size :
1526 prog_data->local_size;
1527
1528 const int simd = brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
1529 assert(simd >= 0 && simd < 3);
1530
1531 info.group_size = sizes[0] * sizes[1] * sizes[2];
1532 info.simd_size = 8u << simd;
1533 info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);
1534
1535 const uint32_t remainder = info.group_size & (info.simd_size - 1);
1536 if (remainder > 0)
1537 info.right_mask = ~0u >> (32 - remainder);
1538 else
1539 info.right_mask = ~0u >> (32 - info.simd_size);
1540
1541 return info;
1542 }
1543
/* Advance the shader to the next compilation phase and re-run the IR
 * validator against the new phase's invariants.
 */
void
brw_shader_phase_update(fs_visitor &s, enum brw_shader_phase phase)
{
   /* Phases form a strict linear sequence; skipping one is a pass-ordering
    * bug.
    */
   assert(phase == s.phase + 1);
   s.phase = phase;
   brw_validate(s);
}
1551
brw_should_print_shader(const nir_shader * shader,uint64_t debug_flag)1552 bool brw_should_print_shader(const nir_shader *shader, uint64_t debug_flag)
1553 {
1554 return INTEL_DEBUG(debug_flag) && (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
1555 }
1556