1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include "brw_eu.h"
32 #include "brw_fs.h"
33 #include "brw_fs_builder.h"
34 #include "brw_fs_live_variables.h"
35 #include "brw_nir.h"
36 #include "brw_cfg.h"
37 #include "brw_private.h"
38 #include "intel_nir.h"
39 #include "shader_enums.h"
40 #include "dev/intel_debug.h"
41 #include "dev/intel_wa.h"
42 #include "compiler/glsl_types.h"
43 #include "compiler/nir/nir_builder.h"
44 #include "util/u_math.h"
45
46 using namespace brw;
47
48 static void
49 initialize_sources(fs_inst *inst, const brw_reg src[], uint8_t num_sources);
50
51 void
52 fs_inst::init(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
53 const brw_reg *src, unsigned sources)
54 {
55 memset((void*)this, 0, sizeof(*this));
56
57 initialize_sources(this, src, sources);
58
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->exec_size = exec_size;
65
66 assert(dst.file != IMM && dst.file != UNIFORM);
67
68 assert(this->exec_size != 0);
69
70 this->conditional_mod = BRW_CONDITIONAL_NONE;
71
72 /* This will be the case for almost all instructions. */
73 switch (dst.file) {
74 case VGRF:
75 case ADDRESS:
76 case ARF:
77 case FIXED_GRF:
78 case ATTR:
79 this->size_written = dst.component_size(exec_size);
80 break;
81 case BAD_FILE:
82 this->size_written = 0;
83 break;
84 case IMM:
85 case UNIFORM:
86 unreachable("Invalid destination register file");
87 }
88
89 this->writes_accumulator = false;
90 }
91
92 fs_inst::fs_inst()
93 {
94 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
95 }
96
97 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
98 {
99 init(opcode, exec_size, reg_undef, NULL, 0);
100 }
101
102 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst)
103 {
104 init(opcode, exec_size, dst, NULL, 0);
105 }
106
107 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
108 const brw_reg &src0)
109 {
110 const brw_reg src[1] = { src0 };
111 init(opcode, exec_size, dst, src, 1);
112 }
113
114 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
115 const brw_reg &src0, const brw_reg &src1)
116 {
117 const brw_reg src[2] = { src0, src1 };
118 init(opcode, exec_size, dst, src, 2);
119 }
120
121 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
122 const brw_reg &src0, const brw_reg &src1, const brw_reg &src2)
123 {
124 const brw_reg src[3] = { src0, src1, src2 };
125 init(opcode, exec_size, dst, src, 3);
126 }
127
128 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const brw_reg &dst,
129 const brw_reg src[], unsigned sources)
130 {
131 init(opcode, exec_width, dst, src, sources);
132 }
133
134 fs_inst::fs_inst(const fs_inst &that)
135 {
136 memcpy((void*)this, &that, sizeof(that));
137 initialize_sources(this, that.src, that.sources);
138 }
139
140 fs_inst::~fs_inst()
141 {
142 if (this->src != this->builtin_src)
143 delete[] this->src;
144 }
145
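/* Point inst->src at suitable storage for num_sources operands -- the inline
 * builtin_src array when it is large enough, otherwise a heap allocation --
 * and copy the given sources into it.
 */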
146 static void
147 initialize_sources(fs_inst *inst, const brw_reg src[], uint8_t num_sources)
148 {
149 if (num_sources > ARRAY_SIZE(inst->builtin_src))
150 inst->src = new brw_reg[num_sources];
151 else
152 inst->src = inst->builtin_src;
153
154 for (unsigned i = 0; i < num_sources; i++)
155 inst->src[i] = src[i];
156
157 inst->sources = num_sources;
158 }
159
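/* Change the number of sources to num_sources, preserving the sources that
 * remain in range and moving between the inline builtin_src storage and a
 * heap allocation as needed.
 */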
160 void
161 fs_inst::resize_sources(uint8_t num_sources)
162 {
163 if (this->sources == num_sources)
164 return;
165
166 brw_reg *old_src = this->src;
167 brw_reg *new_src;
168
169 const unsigned builtin_size = ARRAY_SIZE(this->builtin_src);
170
171 if (old_src == this->builtin_src) {
172 if (num_sources > builtin_size) {
173 new_src = new brw_reg[num_sources];
174 for (unsigned i = 0; i < this->sources; i++)
175 new_src[i] = old_src[i];
176
177 } else {
178 new_src = old_src;
179 }
180 } else {
181 if (num_sources <= builtin_size) {
182 new_src = this->builtin_src;
183 assert(this->sources > num_sources);
184 for (unsigned i = 0; i < num_sources; i++)
185 new_src[i] = old_src[i];
186
187 } else if (num_sources < this->sources) {
188 new_src = old_src;
189
190 } else {
191 new_src = new brw_reg[num_sources];
192 for (unsigned i = 0; i < this->sources; i++)
193 new_src[i] = old_src[i];
194 }
195
196 if (old_src != new_src)
197 delete[] old_src;
198 }
199
200 this->sources = num_sources;
201 this->src = new_src;
202 }
203
204 bool
205 fs_inst::is_send_from_grf() const
206 {
207 switch (opcode) {
208 case SHADER_OPCODE_SEND:
209 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
210 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
211 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
212 case SHADER_OPCODE_INTERLOCK:
213 case SHADER_OPCODE_MEMORY_FENCE:
214 case SHADER_OPCODE_BARRIER:
215 return true;
216 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
217 return src[1].file == VGRF;
218 default:
219 return false;
220 }
221 }
222
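/* Return true if source \p arg carries control information for the operation
 * (e.g. descriptors, component counts or indices) rather than per-channel
 * payload data.
 */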
223 bool
224 fs_inst::is_control_source(unsigned arg) const
225 {
226 switch (opcode) {
227 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
228 return arg == 0;
229
230 case SHADER_OPCODE_BROADCAST:
231 case SHADER_OPCODE_SHUFFLE:
232 case SHADER_OPCODE_QUAD_SWIZZLE:
233 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
234 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
235 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
236 return arg == 1;
237
238 case SHADER_OPCODE_MOV_INDIRECT:
239 case SHADER_OPCODE_CLUSTER_BROADCAST:
240 return arg == 1 || arg == 2;
241
242 case SHADER_OPCODE_SEND:
243 return arg == 0 || arg == 1;
244
245 case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
246 case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
247 case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
248 return arg != MEMORY_LOGICAL_BINDING &&
249 arg != MEMORY_LOGICAL_ADDRESS &&
250 arg != MEMORY_LOGICAL_DATA0 &&
251 arg != MEMORY_LOGICAL_DATA1;
252
253 case SHADER_OPCODE_QUAD_SWAP:
254 case SHADER_OPCODE_INCLUSIVE_SCAN:
255 case SHADER_OPCODE_EXCLUSIVE_SCAN:
256 case SHADER_OPCODE_VOTE_ANY:
257 case SHADER_OPCODE_VOTE_ALL:
258 case SHADER_OPCODE_REDUCE:
259 return arg != 0;
260
261 default:
262 return false;
263 }
264 }
265
266 bool
267 fs_inst::is_payload(unsigned arg) const
268 {
269 switch (opcode) {
270 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
271 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
272 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
273 case SHADER_OPCODE_INTERLOCK:
274 case SHADER_OPCODE_MEMORY_FENCE:
275 case SHADER_OPCODE_BARRIER:
276 return arg == 0;
277
278 case SHADER_OPCODE_SEND:
279 return arg == 2 || arg == 3;
280
281 default:
282 return false;
283 }
284 }
285
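/* Return true if source \p arg is a message payload that is passed to the
 * hardware as whole registers rather than read as an ordinary per-channel
 * operand.
 */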
286 bool
287 fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
288 {
289 if (is_send_from_grf())
290 return false;
291
292 /* From TGL PRM Vol 2a Pg. 1053 and Pg. 1069 MAD and MUL Instructions:
293 *
294 * "When multiplying a DW and any lower precision integer, source modifier
295 * is not supported."
296 */
297 if (devinfo->ver >= 12 && (opcode == BRW_OPCODE_MUL ||
298 opcode == BRW_OPCODE_MAD)) {
299 const brw_reg_type exec_type = get_exec_type(this);
300 const unsigned min_brw_type_size_bytes = opcode == BRW_OPCODE_MAD ?
301 MIN2(brw_type_size_bytes(src[1].type), brw_type_size_bytes(src[2].type)) :
302 MIN2(brw_type_size_bytes(src[0].type), brw_type_size_bytes(src[1].type));
303
304 if (brw_type_is_int(exec_type) &&
305 brw_type_size_bytes(exec_type) >= 4 &&
306 brw_type_size_bytes(exec_type) != min_brw_type_size_bytes)
307 return false;
308 }
309
310 switch (opcode) {
311 case BRW_OPCODE_ADDC:
312 case BRW_OPCODE_BFE:
313 case BRW_OPCODE_BFI1:
314 case BRW_OPCODE_BFI2:
315 case BRW_OPCODE_BFREV:
316 case BRW_OPCODE_CBIT:
317 case BRW_OPCODE_FBH:
318 case BRW_OPCODE_FBL:
319 case BRW_OPCODE_ROL:
320 case BRW_OPCODE_ROR:
321 case BRW_OPCODE_SUBB:
322 case BRW_OPCODE_DP4A:
323 case BRW_OPCODE_DPAS:
324 case SHADER_OPCODE_BROADCAST:
325 case SHADER_OPCODE_CLUSTER_BROADCAST:
326 case SHADER_OPCODE_MOV_INDIRECT:
327 case SHADER_OPCODE_SHUFFLE:
328 case SHADER_OPCODE_INT_QUOTIENT:
329 case SHADER_OPCODE_INT_REMAINDER:
330 case SHADER_OPCODE_REDUCE:
331 case SHADER_OPCODE_INCLUSIVE_SCAN:
332 case SHADER_OPCODE_EXCLUSIVE_SCAN:
333 case SHADER_OPCODE_VOTE_ANY:
334 case SHADER_OPCODE_VOTE_ALL:
335 case SHADER_OPCODE_VOTE_EQUAL:
336 case SHADER_OPCODE_BALLOT:
337 case SHADER_OPCODE_QUAD_SWAP:
338 case SHADER_OPCODE_READ_FROM_LIVE_CHANNEL:
339 case SHADER_OPCODE_READ_FROM_CHANNEL:
340 return false;
341 default:
342 return true;
343 }
344 }
345
346 bool
347 fs_inst::can_do_cmod() const
348 {
349 switch (opcode) {
350 case BRW_OPCODE_ADD:
351 case BRW_OPCODE_ADD3:
352 case BRW_OPCODE_ADDC:
353 case BRW_OPCODE_AND:
354 case BRW_OPCODE_ASR:
355 case BRW_OPCODE_AVG:
356 case BRW_OPCODE_CMP:
357 case BRW_OPCODE_CMPN:
358 case BRW_OPCODE_DP2:
359 case BRW_OPCODE_DP3:
360 case BRW_OPCODE_DP4:
361 case BRW_OPCODE_DPH:
362 case BRW_OPCODE_FRC:
363 case BRW_OPCODE_LINE:
364 case BRW_OPCODE_LRP:
365 case BRW_OPCODE_LZD:
366 case BRW_OPCODE_MAC:
367 case BRW_OPCODE_MACH:
368 case BRW_OPCODE_MAD:
369 case BRW_OPCODE_MOV:
370 case BRW_OPCODE_MUL:
371 case BRW_OPCODE_NOT:
372 case BRW_OPCODE_OR:
373 case BRW_OPCODE_PLN:
374 case BRW_OPCODE_RNDD:
375 case BRW_OPCODE_RNDE:
376 case BRW_OPCODE_RNDU:
377 case BRW_OPCODE_RNDZ:
378 case BRW_OPCODE_SHL:
379 case BRW_OPCODE_SHR:
380 case BRW_OPCODE_SUBB:
381 case BRW_OPCODE_XOR:
382 break;
383 default:
384 return false;
385 }
386
387 /* The accumulator result appears to get used for the conditional modifier
388 * generation. When negating a UD value, there is a 33rd bit generated for
389 * the sign in the accumulator value, so now you can't check, for example,
390 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
391 */
392 for (unsigned i = 0; i < sources; i++) {
393 if (brw_type_is_uint(src[i].type) && src[i].negate)
394 return false;
395 }
396
397 if (dst.file == ARF && dst.nr == BRW_ARF_SCALAR && src[0].file == IMM)
398 return false;
399
400 return true;
401 }
402
403 bool
404 fs_inst::can_change_types() const
405 {
406 return dst.type == src[0].type &&
407 !src[0].abs && !src[0].negate && !saturate && src[0].file != ATTR &&
408 (opcode == BRW_OPCODE_MOV ||
409 (opcode == SHADER_OPCODE_LOAD_PAYLOAD && sources == 1) ||
410 (opcode == BRW_OPCODE_SEL &&
411 dst.type == src[1].type &&
412 predicate != BRW_PREDICATE_NONE &&
413 !src[1].abs && !src[1].negate && src[1].file != ATTR));
414 }
415
416 void
417 fs_visitor::vfail(const char *format, va_list va)
418 {
419 char *msg;
420
421 if (failed)
422 return;
423
424 failed = true;
425
426 msg = ralloc_vasprintf(mem_ctx, format, va);
427 msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
428 dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg);
429
430 this->fail_msg = msg;
431
432 if (unlikely(debug_enabled)) {
433 fprintf(stderr, "%s", msg);
434 }
435 }
436
437 void
438 fs_visitor::fail(const char *format, ...)
439 {
440 va_list va;
441
442 va_start(va, format);
443 vfail(format, va);
444 va_end(va);
445 }
446
447 /**
448 * Mark this program as impossible to compile with dispatch width greater
449 * than n.
450 *
451 * During the SIMD8 compile (which happens first), we can detect and flag
452 * things that are unsupported in SIMD16+ mode, so the compiler can skip the
453 * SIMD16+ compile altogether.
454 *
455 * During a compile of dispatch width greater than n (if one happens anyway),
456 * this just calls fail().
457 */
458 void
459 fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
460 {
461 if (dispatch_width > n) {
462 fail("%s", msg);
463 } else {
464 max_dispatch_width = MIN2(max_dispatch_width, n);
465 brw_shader_perf_log(compiler, log_data,
466 "Shader dispatch width limited to SIMD%d: %s\n",
467 n, msg);
468 }
469 }
470
471 /**
472 * Returns true if the instruction has a flag that means it won't
473 * update an entire destination register.
474 *
475 * For example, dead code elimination and live variable analysis want to know
476 * when a write to a variable screens off any preceding values that were in
477 * it.
478 */
479 bool
480 fs_inst::is_partial_write() const
481 {
482 if (this->predicate && !this->predicate_trivial &&
483 this->opcode != BRW_OPCODE_SEL)
484 return true;
485
486 if (!this->dst.is_contiguous())
487 return true;
488
489 if (this->dst.offset % REG_SIZE != 0)
490 return true;
491
492 return this->size_written % REG_SIZE != 0;
493 }
494
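/* Return the number of logical components read from source \p i, e.g. the
 * number of coordinate components consumed by a sampler message.
 */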
495 unsigned
496 fs_inst::components_read(unsigned i) const
497 {
498 /* Return zero if the source is not present. */
499 if (src[i].file == BAD_FILE)
500 return 0;
501
502 switch (opcode) {
503 case BRW_OPCODE_PLN:
504 return i == 0 ? 1 : 2;
505
506 case FS_OPCODE_PIXEL_X:
507 case FS_OPCODE_PIXEL_Y:
508 assert(i < 2);
509 if (i == 0)
510 return 2;
511 else
512 return 1;
513
514 case FS_OPCODE_FB_WRITE_LOGICAL:
515 assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
516 /* First/second FB write color. */
517 if (i < 2)
518 return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
519 else
520 return 1;
521
522 case SHADER_OPCODE_TEX_LOGICAL:
523 case SHADER_OPCODE_TXD_LOGICAL:
524 case SHADER_OPCODE_TXF_LOGICAL:
525 case SHADER_OPCODE_TXL_LOGICAL:
526 case SHADER_OPCODE_TXS_LOGICAL:
527 case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
528 case FS_OPCODE_TXB_LOGICAL:
529 case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
530 case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
531 case SHADER_OPCODE_TXF_MCS_LOGICAL:
532 case SHADER_OPCODE_LOD_LOGICAL:
533 case SHADER_OPCODE_TG4_LOGICAL:
534 case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
535 case SHADER_OPCODE_TG4_BIAS_LOGICAL:
536 case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
537 case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
538 case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
539 case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
540 case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
541 assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
542 src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
543 src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
544 /* Texture coordinates. */
545 if (i == TEX_LOGICAL_SRC_COORDINATE)
546 return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
547 /* Texture derivatives. */
548 else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
549 opcode == SHADER_OPCODE_TXD_LOGICAL)
550 return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
551 /* Texture offset. */
552 else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
553 return 2;
554 /* MCS */
555 else if (i == TEX_LOGICAL_SRC_MCS) {
556 if (opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
557 return 2;
558 else if (opcode == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL)
559 return 4;
560 else
561 return 1;
562 } else
563 return 1;
564
565 case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
566 if (i == MEMORY_LOGICAL_DATA0 || i == MEMORY_LOGICAL_DATA1)
567 return 0;
568 /* fallthrough */
569 case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
570 if (i == MEMORY_LOGICAL_DATA1)
571 return 0;
572 /* fallthrough */
573 case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
574 if (i == MEMORY_LOGICAL_DATA0 || i == MEMORY_LOGICAL_DATA1)
575 return src[MEMORY_LOGICAL_COMPONENTS].ud;
576 else if (i == MEMORY_LOGICAL_ADDRESS)
577 return src[MEMORY_LOGICAL_COORD_COMPONENTS].ud;
578 else
579 return 1;
580
581 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
582 return (i == 0 ? 2 : 1);
583
584 case SHADER_OPCODE_URB_WRITE_LOGICAL:
585 assert(src[URB_LOGICAL_SRC_COMPONENTS].file == IMM);
586
587 if (i == URB_LOGICAL_SRC_DATA)
588 return src[URB_LOGICAL_SRC_COMPONENTS].ud;
589 else
590 return 1;
591
592 case BRW_OPCODE_DPAS:
593 unreachable("Do not use components_read() for DPAS.");
594
595 default:
596 return 1;
597 }
598 }
599
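/* Return the number of bytes read from source \p arg, accounting for message
 * lengths on sends and for the execution size of ordinary operands.
 */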
600 unsigned
601 fs_inst::size_read(const struct intel_device_info *devinfo, int arg) const
602 {
603 switch (opcode) {
604 case SHADER_OPCODE_SEND:
605 if (arg == 2) {
606 return mlen * REG_SIZE;
607 } else if (arg == 3) {
608 return ex_mlen * REG_SIZE;
609 }
610 break;
611
612 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
613 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
614 if (arg == 0)
615 return mlen * REG_SIZE;
616 break;
617
618 case BRW_OPCODE_PLN:
619 if (arg == 0)
620 return 16;
621 break;
622
623 case SHADER_OPCODE_LOAD_PAYLOAD:
624 if (arg < this->header_size)
625 return retype(src[arg], BRW_TYPE_UD).component_size(8);
626 break;
627
628 case SHADER_OPCODE_BARRIER:
629 return REG_SIZE;
630
631 case SHADER_OPCODE_MOV_INDIRECT:
632 if (arg == 0) {
633 assert(src[2].file == IMM);
634 return src[2].ud;
635 }
636 break;
637
638 case BRW_OPCODE_DPAS: {
639 /* This is a little bit sketchy. There's no way to get at devinfo from
640 * here, so the regular reg_unit() cannot be used. However, on
641 * reg_unit() == 1 platforms, DPAS exec_size must be 8, and on known
642 * reg_unit() == 2 platforms, DPAS exec_size must be 16. This is not a
643 * coincidence, so this isn't so bad.
644 */
645 const unsigned reg_unit = this->exec_size / 8;
646
647 switch (arg) {
648 case 0:
649 if (src[0].type == BRW_TYPE_HF) {
650 return rcount * reg_unit * REG_SIZE / 2;
651 } else {
652 return rcount * reg_unit * REG_SIZE;
653 }
654 case 1:
655 return sdepth * reg_unit * REG_SIZE;
656 case 2:
657 /* This is simpler than the formula described in the Bspec, but it
658 * covers all of the cases that we support. Each inner sdepth
659 * iteration of the DPAS consumes a single dword for int8, uint8, or
660 * float16 types. These are the only source types currently
661 * supported through Vulkan. This is independent of reg_unit.
662 */
663 return rcount * sdepth * 4;
664 default:
665 unreachable("Invalid source number.");
666 }
667 break;
668 }
669
670 default:
671 break;
672 }
673
674 switch (src[arg].file) {
675 case UNIFORM:
676 case IMM:
677 return components_read(arg) * brw_type_size_bytes(src[arg].type);
678 case BAD_FILE:
679 case ADDRESS:
680 case ARF:
681 case FIXED_GRF:
682 case VGRF:
683 case ATTR:
684 /* Regardless of exec_size, values marked as scalar are SIMD8. */
685 return components_read(arg) *
686 src[arg].component_size(src[arg].is_scalar ? 8 * reg_unit(devinfo) : exec_size);
687 }
688 return 0;
689 }
690
691 namespace {
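/* Number of channels grouped together by the given predication mode, i.e.
 * the granularity at which flag bits are consumed (e.g. ANY4H/ALL4H look at
 * groups of four channels).
 */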
692 unsigned
693 predicate_width(const intel_device_info *devinfo, brw_predicate predicate)
694 {
695 if (devinfo->ver >= 20) {
696 return 1;
697 } else {
698 switch (predicate) {
699 case BRW_PREDICATE_NONE: return 1;
700 case BRW_PREDICATE_NORMAL: return 1;
701 case BRW_PREDICATE_ALIGN1_ANY2H: return 2;
702 case BRW_PREDICATE_ALIGN1_ALL2H: return 2;
703 case BRW_PREDICATE_ALIGN1_ANY4H: return 4;
704 case BRW_PREDICATE_ALIGN1_ALL4H: return 4;
705 case BRW_PREDICATE_ALIGN1_ANY8H: return 8;
706 case BRW_PREDICATE_ALIGN1_ALL8H: return 8;
707 case BRW_PREDICATE_ALIGN1_ANY16H: return 16;
708 case BRW_PREDICATE_ALIGN1_ALL16H: return 16;
709 case BRW_PREDICATE_ALIGN1_ANY32H: return 32;
710 case BRW_PREDICATE_ALIGN1_ALL32H: return 32;
711 default: unreachable("Unsupported predicate");
712 }
713 }
714 }
715 }
716
717 unsigned
718 fs_inst::flags_read(const intel_device_info *devinfo) const
719 {
720 if (devinfo->ver < 20 && (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
721 predicate == BRW_PREDICATE_ALIGN1_ALLV)) {
722 /* The vertical predication modes combine corresponding bits from
723 * f0.0 and f1.0 on Gfx7+.
724 */
725 const unsigned shift = 4;
726 return brw_fs_flag_mask(this, 1) << shift | brw_fs_flag_mask(this, 1);
727 } else if (predicate) {
728 return brw_fs_flag_mask(this, predicate_width(devinfo, predicate));
729 } else {
730 unsigned mask = 0;
731 for (int i = 0; i < sources; i++) {
732 mask |= brw_fs_flag_mask(src[i], size_read(devinfo, i));
733 }
734 return mask;
735 }
736 }
737
738 unsigned
739 fs_inst::flags_written(const intel_device_info *devinfo) const
740 {
741 if (conditional_mod && (opcode != BRW_OPCODE_SEL &&
742 opcode != BRW_OPCODE_CSEL &&
743 opcode != BRW_OPCODE_IF &&
744 opcode != BRW_OPCODE_WHILE)) {
745 return brw_fs_flag_mask(this, 1);
746 } else if (opcode == FS_OPCODE_LOAD_LIVE_CHANNELS) {
747 return brw_fs_flag_mask(this, 32);
748 } else {
749 return brw_fs_flag_mask(dst, size_written);
750 }
751 }
752
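/* Return true if this sampler message was asked to return a sparse residency
 * word in addition to the texel data.
 */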
753 bool
754 fs_inst::has_sampler_residency() const
755 {
756 switch (opcode) {
757 case SHADER_OPCODE_TEX_LOGICAL:
758 case FS_OPCODE_TXB_LOGICAL:
759 case SHADER_OPCODE_TXL_LOGICAL:
760 case SHADER_OPCODE_TXD_LOGICAL:
761 case SHADER_OPCODE_TXF_LOGICAL:
762 case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
763 case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
764 case SHADER_OPCODE_TXS_LOGICAL:
765 case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
766 case SHADER_OPCODE_TG4_LOGICAL:
767 case SHADER_OPCODE_TG4_BIAS_LOGICAL:
768 case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
769 case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
770 case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
771 case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
772 assert(src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
773 return src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
774 default:
775 return false;
776 }
777 }
778
779 /* \sa inst_is_raw_move in brw_eu_validate. */
780 bool
781 fs_inst::is_raw_move() const
782 {
783 if (opcode != BRW_OPCODE_MOV)
784 return false;
785
786 if (src[0].file == IMM) {
787 if (brw_type_is_vector_imm(src[0].type))
788 return false;
789 } else if (src[0].negate || src[0].abs) {
790 return false;
791 }
792
793 if (saturate)
794 return false;
795
796 return src[0].type == dst.type ||
797 (brw_type_is_int(src[0].type) &&
798 brw_type_is_int(dst.type) &&
799 brw_type_size_bits(src[0].type) == brw_type_size_bits(dst.type));
800 }
801
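/* Return true if generating code for the opcode uses the address register
 * even though it does not appear as an explicit source or destination.
 */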
802 bool
803 fs_inst::uses_address_register_implicitly() const
804 {
805 switch (opcode) {
806 case SHADER_OPCODE_BROADCAST:
807 case SHADER_OPCODE_SHUFFLE:
808 case SHADER_OPCODE_MOV_INDIRECT:
809 return true;
810 default:
811 return false;
812 }
813 }
814
815 /* For SIMD16, we need to inherit the uniform setup of the SIMD8 dispatch.
816 * This brings in those uniform definitions.
817 */
818 void
819 fs_visitor::import_uniforms(fs_visitor *v)
820 {
821 this->uniforms = v->uniforms;
822 }
823
824 enum intel_barycentric_mode
825 brw_barycentric_mode(const struct brw_wm_prog_key *key,
826 nir_intrinsic_instr *intr)
827 {
828 const glsl_interp_mode mode =
829 (enum glsl_interp_mode) nir_intrinsic_interp_mode(intr);
830
831 /* Barycentric modes don't make sense for flat inputs. */
832 assert(mode != INTERP_MODE_FLAT);
833
834 unsigned bary;
835 switch (intr->intrinsic) {
836 case nir_intrinsic_load_barycentric_pixel:
837 case nir_intrinsic_load_barycentric_at_offset:
838 /* When per sample interpolation is dynamic, assume sample
839 * interpolation. We'll dynamically remap things so that the FS thread
840 * payload is not affected.
841 */
842 bary = key->persample_interp == INTEL_SOMETIMES ?
843 INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE :
844 INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL;
845 break;
846 case nir_intrinsic_load_barycentric_centroid:
847 bary = INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID;
848 break;
849 case nir_intrinsic_load_barycentric_sample:
850 case nir_intrinsic_load_barycentric_at_sample:
851 bary = INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE;
852 break;
853 default:
854 unreachable("invalid intrinsic");
855 }
856
857 if (mode == INTERP_MODE_NOPERSPECTIVE)
858 bary += 3;
859
860 return (enum intel_barycentric_mode) bary;
861 }
862
863 /**
864 * Walk backwards from the end of the program looking for a URB write that
865 * isn't in control flow, and mark it with EOT.
866 *
867 * Return true if successful or false if a separate EOT write is needed.
868 */
869 bool
870 fs_visitor::mark_last_urb_write_with_eot()
871 {
872 foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
873 if (prev->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) {
874 prev->eot = true;
875
876 /* Delete now dead instructions. */
877 foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
878 if (dead == prev)
879 break;
880 dead->remove();
881 }
882 return true;
883 } else if (prev->is_control_flow() || prev->has_side_effects()) {
884 break;
885 }
886 }
887
888 return false;
889 }
890
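/* Round a component count (in 32-bit units) up to a whole number of
 * registers, e.g. with reg_unit() == 1, 20 components round up to 3
 * registers of 8 components each.
 */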
891 static unsigned
892 round_components_to_whole_registers(const intel_device_info *devinfo,
893 unsigned c)
894 {
895 return DIV_ROUND_UP(c, 8 * reg_unit(devinfo)) * reg_unit(devinfo);
896 }
897
898 void
899 fs_visitor::assign_curb_setup()
900 {
901 unsigned uniform_push_length =
902 round_components_to_whole_registers(devinfo, prog_data->nr_params);
903
904 unsigned ubo_push_length = 0;
905 unsigned ubo_push_start[4];
906 for (int i = 0; i < 4; i++) {
907 ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
908 ubo_push_length += prog_data->ubo_ranges[i].length;
909
910 assert(ubo_push_start[i] % (8 * reg_unit(devinfo)) == 0);
911 assert(ubo_push_length % (1 * reg_unit(devinfo)) == 0);
912 }
913
914 prog_data->curb_read_length = uniform_push_length + ubo_push_length;
915 if (stage == MESA_SHADER_FRAGMENT &&
916 ((struct brw_wm_prog_key *)key)->null_push_constant_tbimr_workaround)
917 prog_data->curb_read_length = MAX2(1, prog_data->curb_read_length);
918
919 uint64_t used = 0;
920 bool is_compute = gl_shader_stage_is_compute(stage);
921
922 if (is_compute && devinfo->verx10 >= 125 && uniform_push_length > 0) {
923 assert(devinfo->has_lsc);
924 fs_builder ubld = fs_builder(this, 1).exec_all().at(
925 cfg->first_block(), cfg->first_block()->start());
926
927 /* The base offset for our push data is passed in as R0.0[31:6]. We have
928 * to mask off the bottom 6 bits.
929 */
930 brw_reg base_addr =
931 ubld.AND(retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
932 brw_imm_ud(INTEL_MASK(31, 6)));
933
934 /* On Gfx12-HP we load constants at the start of the program using A32
935 * stateless messages.
936 */
937 for (unsigned i = 0; i < uniform_push_length;) {
938 /* Limit ourselves to LSC HW limit of 8 GRFs (256bytes D32V64). */
939 unsigned num_regs = MIN2(uniform_push_length - i, 8);
940 assert(num_regs > 0);
941 num_regs = 1 << util_logbase2(num_regs);
942
943 /* This pass occurs after all of the optimization passes, so don't
944 * emit an 'ADD addr, base_addr, 0' instruction.
945 */
946 brw_reg addr = i == 0 ? base_addr :
947 ubld.ADD(base_addr, brw_imm_ud(i * REG_SIZE));
948
949 brw_reg srcs[4] = {
950 brw_imm_ud(0), /* desc */
951 brw_imm_ud(0), /* ex_desc */
952 addr, /* payload */
953 brw_reg(), /* payload2 */
954 };
955
956 brw_reg dest = retype(brw_vec8_grf(payload().num_regs + i, 0),
957 BRW_TYPE_UD);
958 fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4);
959
960 send->sfid = GFX12_SFID_UGM;
961 uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
962 LSC_ADDR_SURFTYPE_FLAT,
963 LSC_ADDR_SIZE_A32,
964 LSC_DATA_SIZE_D32,
965 num_regs * 8 /* num_channels */,
966 true /* transpose */,
967 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
968 send->header_size = 0;
969 send->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, 1);
970 send->size_written =
971 lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, num_regs * 8) * REG_SIZE;
972 send->send_is_volatile = true;
973
974 send->src[0] = brw_imm_ud(desc |
975 brw_message_desc(devinfo,
976 send->mlen,
977 send->size_written / REG_SIZE,
978 send->header_size));
979
980 i += num_regs;
981 }
982
983 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
984 }
985
986 /* Map the offsets in the UNIFORM file to fixed HW regs. */
987 foreach_block_and_inst(block, fs_inst, inst, cfg) {
988 for (unsigned int i = 0; i < inst->sources; i++) {
989 if (inst->src[i].file == UNIFORM) {
990 int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
991 int constant_nr;
992 if (inst->src[i].nr >= UBO_START) {
993 /* constant_nr is in 32-bit units, the rest are in bytes */
994 constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
995 inst->src[i].offset / 4;
996 } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
997 constant_nr = uniform_nr;
998 } else {
999 /* Section 5.11 of the OpenGL 4.1 spec says:
1000 * "Out-of-bounds reads return undefined values, which include
1001 * values from other variables of the active program or zero."
1002 * Just return the first push constant.
1003 */
1004 constant_nr = 0;
1005 }
1006
1007 assert(constant_nr / 8 < 64);
1008 used |= BITFIELD64_BIT(constant_nr / 8);
1009
1010 struct brw_reg brw_reg = brw_vec1_grf(payload().num_regs +
1011 constant_nr / 8,
1012 constant_nr % 8);
1013 brw_reg.abs = inst->src[i].abs;
1014 brw_reg.negate = inst->src[i].negate;
1015
1016 /* The combination of is_scalar for load_uniform, copy prop, and
1017 * lower_btd_logical_send can generate a MOV from a UNIFORM with
1018 * exec size 2 and stride of 1.
1019 */
1020 assert(inst->src[i].stride == 0 || inst->exec_size == 2);
1021 inst->src[i] = byte_offset(
1022 retype(brw_reg, inst->src[i].type),
1023 inst->src[i].offset % 4);
1024 }
1025 }
1026 }
1027
1028 uint64_t want_zero = used & prog_data->zero_push_reg;
1029 if (want_zero) {
1030 fs_builder ubld = fs_builder(this, 8).exec_all().at(
1031 cfg->first_block(), cfg->first_block()->start());
1032
1033 /* push_reg_mask_param is in 32-bit units */
1034 unsigned mask_param = prog_data->push_reg_mask_param;
1035 struct brw_reg mask = brw_vec1_grf(payload().num_regs + mask_param / 8,
1036 mask_param % 8);
1037
1038 brw_reg b32;
1039 for (unsigned i = 0; i < 64; i++) {
1040 if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
1041 brw_reg shifted = ubld.vgrf(BRW_TYPE_W, 2);
1042 ubld.SHL(horiz_offset(shifted, 8),
1043 byte_offset(retype(mask, BRW_TYPE_W), i / 8),
1044 brw_imm_v(0x01234567));
1045 ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));
1046
1047 fs_builder ubld16 = ubld.group(16, 0);
1048 b32 = ubld16.vgrf(BRW_TYPE_D);
1049 ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
1050 }
1051
1052 if (want_zero & BITFIELD64_BIT(i)) {
1053 assert(i < prog_data->curb_read_length);
1054 struct brw_reg push_reg =
1055 retype(brw_vec8_grf(payload().num_regs + i, 0), BRW_TYPE_D);
1056
1057 ubld.AND(push_reg, push_reg, component(b32, i % 16));
1058 }
1059 }
1060
1061 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1062 }
1063
1064 /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
1065 this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length;
1066 }
1067
1068 /*
1069 * Build up an array of indices into the urb_setup array that
1070 * references its active entries.
1071 * Used to accelerate walking those active entries
1072 * on each upload.
1073 */
1074 void
1075 brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
1076 {
1077 /* TODO(mesh): Review usage of this in the context of Mesh, we may want to
1078 * skip per-primitive attributes here.
1079 */
1080
1081 /* Make sure uint8_t is sufficient */
1082 STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
1083 uint8_t index = 0;
1084 for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
1085 if (wm_prog_data->urb_setup[attr] >= 0) {
1086 wm_prog_data->urb_setup_attribs[index++] = attr;
1087 }
1088 }
1089 wm_prog_data->urb_setup_attribs_count = index;
1090 }
1091
1092 void
1093 fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
1094 {
1095 for (int i = 0; i < inst->sources; i++) {
1096 if (inst->src[i].file == ATTR) {
1097 assert(inst->src[i].nr == 0);
1098 int grf = payload().num_regs +
1099 prog_data->curb_read_length +
1100 inst->src[i].offset / REG_SIZE;
1101
1102 /* As explained at brw_reg_from_fs_reg, from the Haswell PRM:
1103 *
1104 * VertStride must be used to cross GRF register boundaries. This
1105 * rule implies that elements within a 'Width' cannot cross GRF
1106 * boundaries.
1107 *
1108 * So, for registers that are large enough, we have to split the exec
1109 * size in two and trust the compression state to sort it out.
1110 */
1111 unsigned total_size = inst->exec_size *
1112 inst->src[i].stride *
1113 brw_type_size_bytes(inst->src[i].type);
1114
1115 assert(total_size <= 2 * REG_SIZE);
1116 const unsigned exec_size =
1117 (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
1118
1119 unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
1120 struct brw_reg reg =
1121 stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
1122 inst->src[i].offset % REG_SIZE),
1123 exec_size * inst->src[i].stride,
1124 width, inst->src[i].stride);
1125 reg.abs = inst->src[i].abs;
1126 reg.negate = inst->src[i].negate;
1127
1128 inst->src[i] = reg;
1129 }
1130 }
1131 }
1132
1133 int
1134 brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
1135 const brw_stage_prog_data *prog_data)
1136 {
1137 if (prog_data->nr_params == 0)
1138 return -1;
1139
1140 if (devinfo->verx10 >= 125)
1141 return -1;
1142
1143 /* The local thread id is always the last parameter in the list */
1144 uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
1145 if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
1146 return prog_data->nr_params - 1;
1147
1148 return -1;
1149 }
1150
1151 /**
1152 * Get the mask of SIMD channels enabled during dispatch and not yet disabled
1153 * by discard. Due to the layout of the sample mask in the fragment shader
1154 * thread payload, \p bld is required to have a dispatch_width() not greater
1155 * than 16 for fragment shaders.
1156 */
1157 brw_reg
1158 brw_sample_mask_reg(const fs_builder &bld)
1159 {
1160 const fs_visitor &s = *bld.shader;
1161
1162 if (s.stage != MESA_SHADER_FRAGMENT) {
1163 return brw_imm_ud(0xffffffff);
1164 } else if (s.devinfo->ver >= 20 ||
1165 brw_wm_prog_data(s.prog_data)->uses_kill) {
1166 return brw_flag_subreg(sample_mask_flag_subreg(s) + bld.group() / 16);
1167 } else {
1168 assert(bld.dispatch_width() <= 16);
1169 assert(s.devinfo->ver < 20);
1170 return retype(brw_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
1171 BRW_TYPE_UW);
1172 }
1173 }
1174
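/* Return the render target write message control value matching the
 * execution size and group of the given FB write, for single or dual source
 * blending.
 */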
1175 uint32_t
1176 brw_fb_write_msg_control(const fs_inst *inst,
1177 const struct brw_wm_prog_data *prog_data)
1178 {
1179 uint32_t mctl;
1180
1181 if (prog_data->dual_src_blend) {
1182 assert(inst->exec_size < 32);
1183
1184 if (inst->group % 16 == 0)
1185 mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
1186 else if (inst->group % 16 == 8)
1187 mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
1188 else
1189 unreachable("Invalid dual-source FB write instruction group");
1190 } else {
1191 assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
1192
1193 if (inst->exec_size == 16)
1194 mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
1195 else if (inst->exec_size == 8)
1196 mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
1197 else if (inst->exec_size == 32)
1198 mctl = XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE;
1199 else
1200 unreachable("Invalid FB write execution size");
1201 }
1202
1203 return mctl;
1204 }
1205
1206 /**
1207 * Predicate the specified instruction on the sample mask.
1208 */
1209 void
1210 brw_emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst)
1211 {
1212 assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
1213 bld.group() == inst->group &&
1214 bld.dispatch_width() == inst->exec_size);
1215
1216 const fs_visitor &s = *bld.shader;
1217 const brw_reg sample_mask = brw_sample_mask_reg(bld);
1218 const unsigned subreg = sample_mask_flag_subreg(s);
1219
1220 if (s.devinfo->ver >= 20 || brw_wm_prog_data(s.prog_data)->uses_kill) {
1221 assert(sample_mask.file == ARF &&
1222 sample_mask.nr == brw_flag_subreg(subreg).nr &&
1223 sample_mask.subnr == brw_flag_subreg(
1224 subreg + inst->group / 16).subnr);
1225 } else {
1226 bld.group(1, 0).exec_all()
1227 .MOV(brw_flag_subreg(subreg + inst->group / 16), sample_mask);
1228 }
1229
1230 if (inst->predicate) {
1231 assert(inst->predicate == BRW_PREDICATE_NORMAL);
1232 assert(!inst->predicate_inverse);
1233 assert(inst->flag_subreg == 0);
1234 assert(s.devinfo->ver < 20);
1235 /* Combine the sample mask with the existing predicate by using a
1236 * vertical predication mode.
1237 */
1238 inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
1239 } else {
1240 inst->flag_subreg = subreg;
1241 inst->predicate = BRW_PREDICATE_NORMAL;
1242 inst->predicate_inverse = false;
1243 }
1244 }
1245
1246 brw::register_pressure::register_pressure(const fs_visitor *v)
1247 {
1248 const fs_live_variables &live = v->live_analysis.require();
1249 const unsigned num_instructions = v->cfg->num_blocks ?
1250 v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;
1251
1252 regs_live_at_ip = new unsigned[num_instructions]();
1253
1254 for (unsigned reg = 0; reg < v->alloc.count; reg++) {
1255 for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
1256 regs_live_at_ip[ip] += v->alloc.sizes[reg];
1257 }
1258
1259 const unsigned payload_count = v->first_non_payload_grf;
1260
1261 int *payload_last_use_ip = new int[payload_count];
1262 v->calculate_payload_ranges(true, payload_count, payload_last_use_ip);
1263
1264 for (unsigned reg = 0; reg < payload_count; reg++) {
1265 for (int ip = 0; ip < payload_last_use_ip[reg]; ip++)
1266 ++regs_live_at_ip[ip];
1267 }
1268
1269 delete[] payload_last_use_ip;
1270 }
1271
1272 brw::register_pressure::~register_pressure()
1273 {
1274 delete[] regs_live_at_ip;
1275 }
1276
1277 void
1278 fs_visitor::invalidate_analysis(brw::analysis_dependency_class c)
1279 {
1280 live_analysis.invalidate(c);
1281 regpressure_analysis.invalidate(c);
1282 performance_analysis.invalidate(c);
1283 idom_analysis.invalidate(c);
1284 def_analysis.invalidate(c);
1285 }
1286
1287 void
1288 fs_visitor::debug_optimizer(const nir_shader *nir,
1289 const char *pass_name,
1290 int iteration, int pass_num) const
1291 {
1292 if (!brw_should_print_shader(nir, DEBUG_OPTIMIZER))
1293 return;
1294
1295 char *filename;
1296 int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
1297 debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
1298 _mesa_shader_stage_to_abbrev(stage), dispatch_width, nir->info.name,
1299 iteration, pass_num, pass_name);
1300 if (ret == -1)
1301 return;
1302
1303 FILE *file = stderr;
1304 if (__normal_user()) {
1305 file = fopen(filename, "w");
1306 if (!file)
1307 file = stderr;
1308 }
1309
1310 brw_print_instructions(*this, file);
1311
1312 if (file != stderr)
1313 fclose(file);
1314
1315 free(filename);
1316 }
1317
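/* Walk the program and return the largest number of GRFs simultaneously live
 * at any instruction, according to the register pressure analysis.
 */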
1318 static uint32_t
1319 brw_compute_max_register_pressure(fs_visitor &s)
1320 {
1321 const register_pressure &rp = s.regpressure_analysis.require();
1322 uint32_t ip = 0, max_pressure = 0;
1323 foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
1324 max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
1325 ip++;
1326 }
1327 return max_pressure;
1328 }
1329
1330 static fs_inst **
1331 save_instruction_order(const struct cfg_t *cfg)
1332 {
1333 /* Before we schedule anything, stash off the instruction order as an array
1334 * of fs_inst *. This way, we can reset it between scheduling passes to
1335 * prevent dependencies between the different scheduling modes.
1336 */
1337 int num_insts = cfg->last_block()->end_ip + 1;
1338 fs_inst **inst_arr = new fs_inst * [num_insts];
1339
1340 int ip = 0;
1341 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1342 assert(ip >= block->start_ip && ip <= block->end_ip);
1343 inst_arr[ip++] = inst;
1344 }
1345 assert(ip == num_insts);
1346
1347 return inst_arr;
1348 }
1349
1350 static void
1351 restore_instruction_order(struct cfg_t *cfg, fs_inst **inst_arr)
1352 {
1353 ASSERTED int num_insts = cfg->last_block()->end_ip + 1;
1354
1355 int ip = 0;
1356 foreach_block (block, cfg) {
1357 block->instructions.make_empty();
1358
1359 assert(ip == block->start_ip);
1360 for (; ip <= block->end_ip; ip++)
1361 block->instructions.push_tail(inst_arr[ip]);
1362 }
1363 assert(ip == num_insts);
1364 }
1365
1366 /* Per-thread scratch space is a power-of-two multiple of 1KB. */
1367 static inline unsigned
1368 brw_get_scratch_size(int size)
1369 {
1370 return MAX2(1024, util_next_power_of_two(size));
1371 }
1372
1373 void
1374 brw_allocate_registers(fs_visitor &s, bool allow_spilling)
1375 {
1376 const struct intel_device_info *devinfo = s.devinfo;
1377 const nir_shader *nir = s.nir;
1378 bool allocated;
1379
1380 static const enum instruction_scheduler_mode pre_modes[] = {
1381 SCHEDULE_PRE,
1382 SCHEDULE_PRE_NON_LIFO,
1383 SCHEDULE_NONE,
1384 SCHEDULE_PRE_LIFO,
1385 };
1386
1387 static const char *scheduler_mode_name[] = {
1388 [SCHEDULE_PRE] = "top-down",
1389 [SCHEDULE_PRE_NON_LIFO] = "non-lifo",
1390 [SCHEDULE_PRE_LIFO] = "lifo",
1391 [SCHEDULE_POST] = "post",
1392 [SCHEDULE_NONE] = "none",
1393 };
1394
1395 uint32_t best_register_pressure = UINT32_MAX;
1396 enum instruction_scheduler_mode best_sched = SCHEDULE_NONE;
1397
1398 brw_opt_compact_virtual_grfs(s);
1399
1400 if (s.needs_register_pressure)
1401 s.shader_stats.max_register_pressure = brw_compute_max_register_pressure(s);
1402
1403 s.debug_optimizer(nir, "pre_register_allocate", 90, 90);
1404
1405 bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
1406
1407 /* Before we schedule anything, stash off the instruction order as an array
1408 * of fs_inst *. This way, we can reset it between scheduling passes to
1409 * prevent dependencies between the different scheduling modes.
1410 */
1411 fs_inst **orig_order = save_instruction_order(s.cfg);
1412 fs_inst **best_pressure_order = NULL;
1413
1414 void *scheduler_ctx = ralloc_context(NULL);
1415 instruction_scheduler *sched = brw_prepare_scheduler(s, scheduler_ctx);
1416
1417 /* Try each scheduling heuristic to see if it can successfully register
1418 * allocate without spilling. They should be ordered by decreasing
1419 * performance but increasing likelihood of allocating.
1420 */
1421 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
1422 enum instruction_scheduler_mode sched_mode = pre_modes[i];
1423
1424 brw_schedule_instructions_pre_ra(s, sched, sched_mode);
1425 s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
1426
1427 s.debug_optimizer(nir, s.shader_stats.scheduler_mode, 95, i);
1428
1429 if (0) {
1430 brw_assign_regs_trivial(s);
1431 allocated = true;
1432 break;
1433 }
1434
1435 /* We should only spill registers on the last scheduling. */
1436 assert(!s.spilled_any_registers);
1437
1438 allocated = brw_assign_regs(s, false, spill_all);
1439 if (allocated)
1440 break;
1441
1442 /* Save the maximum register pressure */
1443 uint32_t this_pressure = brw_compute_max_register_pressure(s);
1444
1445 if (0) {
1446 fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
1447 scheduler_mode_name[sched_mode], this_pressure);
1448 }
1449
1450 if (this_pressure < best_register_pressure) {
1451 best_register_pressure = this_pressure;
1452 best_sched = sched_mode;
1453 delete[] best_pressure_order;
1454 best_pressure_order = save_instruction_order(s.cfg);
1455 }
1456
1457 /* Reset back to the original order before trying the next mode */
1458 restore_instruction_order(s.cfg, orig_order);
1459 s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1460 }
1461
1462 ralloc_free(scheduler_ctx);
1463
1464 if (!allocated) {
1465 if (0) {
1466 fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
1467 scheduler_mode_name[best_sched]);
1468 }
1469 restore_instruction_order(s.cfg, best_pressure_order);
1470 s.shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
1471
1472 allocated = brw_assign_regs(s, allow_spilling, spill_all);
1473 }
1474
1475 delete[] orig_order;
1476 delete[] best_pressure_order;
1477
1478 if (!allocated) {
1479 s.fail("Failure to register allocate. Reduce number of "
1480 "live scalar values to avoid this.");
1481 } else if (s.spilled_any_registers) {
1482 brw_shader_perf_log(s.compiler, s.log_data,
1483 "%s shader triggered register spilling. "
1484 "Try reducing the number of live scalar "
1485 "values to improve performance.\n",
1486 _mesa_shader_stage_to_string(s.stage));
1487 }
1488
1489 if (s.failed)
1490 return;
1491
1492 int pass_num = 0;
1493
1494 s.debug_optimizer(nir, "post_ra_alloc", 96, pass_num++);
1495
1496 brw_opt_bank_conflicts(s);
1497
1498 s.debug_optimizer(nir, "bank_conflict", 96, pass_num++);
1499
1500 brw_schedule_instructions_post_ra(s);
1501
1502 s.debug_optimizer(nir, "post_ra_alloc_scheduling", 96, pass_num++);
1503
1504 /* Lowering VGRF to FIXED_GRF is currently done as a separate pass instead
1505 * of part of assign_regs since both bank conflicts optimization and post
1506 * RA scheduling take advantage of distinguishing references to registers
1507 * that were allocated from references that were already fixed.
1508 *
1509 * TODO: Change the passes above, then move this lowering to be part of
1510 * assign_regs.
1511 */
1512 brw_lower_vgrfs_to_fixed_grfs(s);
1513
1514 s.debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, pass_num++);
1515
1516 brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_REGALLOC);
1517
1518 if (s.last_scratch > 0) {
1519 /* We currently only support up to 2MB of scratch space. If we
1520 * need to support more eventually, the documentation suggests
1521 * that we could allocate a larger buffer, and partition it out
1522 * ourselves. We'd just have to undo the hardware's address
1523 * calculation by subtracting (FFTID * Per Thread Scratch Space)
1524 * and then add FFTID * (Larger Per Thread Scratch Space).
1525 *
1526 * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
1527 * Thread Group Tracking > Local Memory/Scratch Space.
1528 */
1529 if (s.last_scratch <= devinfo->max_scratch_size_per_thread) {
1530 /* Take the max of any previously compiled variant of the shader. In the
1531 * case of bindless shaders with return parts, this will also take the
1532 * max of all parts.
1533 */
1534 s.prog_data->total_scratch = MAX2(brw_get_scratch_size(s.last_scratch),
1535 s.prog_data->total_scratch);
1536 } else {
1537 s.fail("Scratch space required is larger than supported");
1538 }
1539 }
1540
1541 if (s.failed)
1542 return;
1543
1544 brw_lower_scoreboard(s);
1545
1546 s.debug_optimizer(nir, "scoreboard", 96, pass_num++);
1547 }
1548
1549 unsigned
1550 brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
1551 unsigned threads)
1552 {
1553 assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
1554 assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
1555 return cs_prog_data->push.per_thread.size * threads +
1556 cs_prog_data->push.cross_thread.size;
1557 }
1558
1559 struct intel_cs_dispatch_info
1560 brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
1561 const struct brw_cs_prog_data *prog_data,
1562 const unsigned *override_local_size)
1563 {
1564 struct intel_cs_dispatch_info info = {};
1565
1566 const unsigned *sizes =
1567 override_local_size ? override_local_size :
1568 prog_data->local_size;
1569
1570 const int simd = brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
1571 assert(simd >= 0 && simd < 3);
1572
1573 info.group_size = sizes[0] * sizes[1] * sizes[2];
1574 info.simd_size = 8u << simd;
1575 info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);
1576
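/* Execution mask for the last, possibly partial, thread of the workgroup:
 * either the low 'remainder' channels or all simd_size channels.
 */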
1577 const uint32_t remainder = info.group_size & (info.simd_size - 1);
1578 if (remainder > 0)
1579 info.right_mask = ~0u >> (32 - remainder);
1580 else
1581 info.right_mask = ~0u >> (32 - info.simd_size);
1582
1583 return info;
1584 }
1585
1586 void
1587 brw_shader_phase_update(fs_visitor &s, enum brw_shader_phase phase)
1588 {
1589 assert(phase == s.phase + 1);
1590 s.phase = phase;
1591 brw_validate(s);
1592 }
1593
1594 bool brw_should_print_shader(const nir_shader *shader, uint64_t debug_flag)
1595 {
1596 return INTEL_DEBUG(debug_flag) && (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
1597 }
1598
1599 namespace brw {
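/* Copy n payload components starting at the hardware GRFs listed in regs[]
 * into a virtual register of the given type, using a LOAD_PAYLOAD to stitch
 * the halves together when the dispatch width exceeds 16 channels.
 */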
1600 brw_reg
1601 fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2],
1602 brw_reg_type type, unsigned n)
1603 {
1604 if (!regs[0])
1605 return brw_reg();
1606
1607 if (bld.dispatch_width() > 16) {
1608 const brw_reg tmp = bld.vgrf(type, n);
1609 const brw::fs_builder hbld = bld.exec_all().group(16, 0);
1610 const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
1611 brw_reg *const components = new brw_reg[m * n];
1612
1613 for (unsigned c = 0; c < n; c++) {
1614 for (unsigned g = 0; g < m; g++)
1615 components[c * m + g] =
1616 offset(retype(brw_vec8_grf(regs[g], 0), type), hbld, c);
1617 }
1618
1619 hbld.LOAD_PAYLOAD(tmp, components, m * n, 0);
1620
1621 delete[] components;
1622 return tmp;
1623
1624 } else {
1625 return brw_reg(retype(brw_vec8_grf(regs[0], 0), type));
1626 }
1627 }
1628
1629 brw_reg
1630 fetch_barycentric_reg(const brw::fs_builder &bld, uint8_t regs[2])
1631 {
1632 if (!regs[0])
1633 return brw_reg();
1634 else if (bld.shader->devinfo->ver >= 20)
1635 return fetch_payload_reg(bld, regs, BRW_TYPE_F, 2);
1636
1637 const brw_reg tmp = bld.vgrf(BRW_TYPE_F, 2);
1638 const brw::fs_builder hbld = bld.exec_all().group(8, 0);
1639 const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
1640 brw_reg *const components = new brw_reg[2 * m];
1641
1642 for (unsigned c = 0; c < 2; c++) {
1643 for (unsigned g = 0; g < m; g++)
1644 components[c * m + g] = offset(brw_vec8_grf(regs[g / 2], 0),
1645 hbld, c + 2 * (g % 2));
1646 }
1647
1648 hbld.LOAD_PAYLOAD(tmp, components, 2 * m, 0);
1649
1650 delete[] components;
1651 return tmp;
1652 }
1653
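/* Emit an AND of the dynamic MSAA flags with the given flag, setting the
 * conditional modifier so that a following predicated instruction executes
 * only when the flag is present.
 */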
1654 void
1655 check_dynamic_msaa_flag(const fs_builder &bld,
1656 const struct brw_wm_prog_data *wm_prog_data,
1657 enum intel_msaa_flags flag)
1658 {
1659 fs_inst *inst = bld.AND(bld.null_reg_ud(),
1660 dynamic_msaa_flags(wm_prog_data),
1661 brw_imm_ud(flag));
1662 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1663 }
1664 }
1665