1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include "brw_eu.h"
32 #include "brw_fs.h"
33 #include "brw_fs_builder.h"
34 #include "brw_fs_live_variables.h"
35 #include "brw_nir.h"
36 #include "brw_cfg.h"
37 #include "brw_dead_control_flow.h"
38 #include "brw_private.h"
39 #include "intel_nir.h"
40 #include "shader_enums.h"
41 #include "dev/intel_debug.h"
42 #include "dev/intel_wa.h"
43 #include "compiler/glsl_types.h"
44 #include "compiler/nir/nir_builder.h"
45 #include "util/u_math.h"
46
47 #include <memory>
48
49 using namespace brw;
50
51 void
52 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
53 const fs_reg *src, unsigned sources)
54 {
55 memset((void*)this, 0, sizeof(*this));
56
57 this->src = new fs_reg[MAX2(sources, 3)];
58 for (unsigned i = 0; i < sources; i++)
59 this->src[i] = src[i];
60
61 this->opcode = opcode;
62 this->dst = dst;
63 this->sources = sources;
64 this->exec_size = exec_size;
65
66 assert(dst.file != IMM && dst.file != UNIFORM);
67
68 assert(this->exec_size != 0);
69
70 this->conditional_mod = BRW_CONDITIONAL_NONE;
71
72 /* This will be the case for almost all instructions. */
73 switch (dst.file) {
74 case VGRF:
75 case ARF:
76 case FIXED_GRF:
77 case ATTR:
78 this->size_written = dst.component_size(exec_size);
79 break;
80 case BAD_FILE:
81 this->size_written = 0;
82 break;
83 case IMM:
84 case UNIFORM:
85 unreachable("Invalid destination register file");
86 }
87
88 this->writes_accumulator = false;
89 }
90
91 fs_inst::fs_inst()
92 {
93 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
94 }
95
96 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
97 {
98 init(opcode, exec_size, reg_undef, NULL, 0);
99 }
100
101 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
102 {
103 init(opcode, exec_size, dst, NULL, 0);
104 }
105
106 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
107 const fs_reg &src0)
108 {
109 const fs_reg src[1] = { src0 };
110 init(opcode, exec_size, dst, src, 1);
111 }
112
113 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
114 const fs_reg &src0, const fs_reg &src1)
115 {
116 const fs_reg src[2] = { src0, src1 };
117 init(opcode, exec_size, dst, src, 2);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
121 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
122 {
123 const fs_reg src[3] = { src0, src1, src2 };
124 init(opcode, exec_size, dst, src, 3);
125 }
126
127 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
128 const fs_reg src[], unsigned sources)
129 {
130 init(opcode, exec_width, dst, src, sources);
131 }
132
133 fs_inst::fs_inst(const fs_inst &that)
134 {
135 memcpy((void*)this, &that, sizeof(that));
136
137 this->src = new fs_reg[MAX2(that.sources, 3)];
138
139 for (unsigned i = 0; i < that.sources; i++)
140 this->src[i] = that.src[i];
141 }
142
143 fs_inst::~fs_inst()
144 {
145 delete[] this->src;
146 }
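/* Illustrative sketch (editorial note, not from the original source): the
 * constructors above all funnel into init(), so building a two-source
 * instruction by hand looks roughly like the following, assuming "dst",
 * "src0" and "src1" are existing VGRF fs_regs.  In practice instructions are
 * almost always created through the fs_builder helpers rather than by direct
 * construction.
 *
 *    fs_inst *add = new fs_inst(BRW_OPCODE_ADD, 8, dst, src0, src1);
 *    // add->sources == 2, add->exec_size == 8,
 *    // add->size_written == dst.component_size(8)
 */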
147
148 void
149 fs_inst::resize_sources(uint8_t num_sources)
150 {
151 if (this->sources != num_sources) {
152 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
153
154 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
155 src[i] = this->src[i];
156
157 delete[] this->src;
158 this->src = src;
159 this->sources = num_sources;
160 }
161 }
162
163 void
164 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
165 const fs_reg &dst,
166 const fs_reg &surface,
167 const fs_reg &surface_handle,
168 const fs_reg &varying_offset,
169 uint32_t const_offset,
170 uint8_t alignment,
171 unsigned components)
172 {
173 assert(components <= 4);
174
175 /* We have our constant surface use a pitch of 4 bytes, so our index can
176 * be any component of a vector, and then we load 4 contiguous
177 * components starting from that. TODO: Support loading fewer than 4.
178 */
179 fs_reg total_offset = vgrf(glsl_uint_type());
180 bld.ADD(total_offset, varying_offset, brw_imm_ud(const_offset));
181
182 /* The pull load message will load a vec4 (16 bytes). If we are loading
183 * a double this means we are only loading 2 elements worth of data.
184 * We also want to use a 32-bit data type for the dst of the load operation
185 * so other parts of the driver don't get confused about the size of the
186 * result.
187 */
188 fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
189
190 fs_reg srcs[PULL_VARYING_CONSTANT_SRCS];
191 srcs[PULL_VARYING_CONSTANT_SRC_SURFACE] = surface;
192 srcs[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
193 srcs[PULL_VARYING_CONSTANT_SRC_OFFSET] = total_offset;
194 srcs[PULL_VARYING_CONSTANT_SRC_ALIGNMENT] = brw_imm_ud(alignment);
195
196 fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
197 vec4_result, srcs, PULL_VARYING_CONSTANT_SRCS);
198 inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
199
200 shuffle_from_32bit_read(bld, dst, vec4_result, 0, components);
201 }
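/* Worked sizing example for the function above (editorial note, not from the
 * original source): with a SIMD8 builder and a 32-bit result type,
 * vec4_result.component_size(8) is 8 * 4 = 32 bytes, so inst->size_written
 * becomes 4 * 32 = 128 bytes, i.e. four 32-byte GRFs of vec4 data that
 * shuffle_from_32bit_read() then repacks into "dst" component by component.
 */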
202
203 bool
204 fs_inst::is_send_from_grf() const
205 {
206 switch (opcode) {
207 case SHADER_OPCODE_SEND:
208 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
209 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
210 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
211 case SHADER_OPCODE_INTERLOCK:
212 case SHADER_OPCODE_MEMORY_FENCE:
213 case SHADER_OPCODE_BARRIER:
214 return true;
215 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
216 return src[1].file == VGRF;
217 case FS_OPCODE_FB_READ:
218 return src[0].file == VGRF;
219 default:
220 return false;
221 }
222 }
223
224 bool
225 fs_inst::is_control_source(unsigned arg) const
226 {
227 switch (opcode) {
228 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
229 return arg == 0;
230
231 case SHADER_OPCODE_BROADCAST:
232 case SHADER_OPCODE_SHUFFLE:
233 case SHADER_OPCODE_QUAD_SWIZZLE:
234 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
235 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
236 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
237 return arg == 1;
238
239 case SHADER_OPCODE_MOV_INDIRECT:
240 case SHADER_OPCODE_CLUSTER_BROADCAST:
241 case SHADER_OPCODE_TEX:
242 case FS_OPCODE_TXB:
243 case SHADER_OPCODE_TXD:
244 case SHADER_OPCODE_TXF:
245 case SHADER_OPCODE_TXF_LZ:
246 case SHADER_OPCODE_TXF_CMS:
247 case SHADER_OPCODE_TXF_CMS_W:
248 case SHADER_OPCODE_TXF_UMS:
249 case SHADER_OPCODE_TXF_MCS:
250 case SHADER_OPCODE_TXL:
251 case SHADER_OPCODE_TXL_LZ:
252 case SHADER_OPCODE_TXS:
253 case SHADER_OPCODE_LOD:
254 case SHADER_OPCODE_TG4:
255 case SHADER_OPCODE_TG4_OFFSET:
256 case SHADER_OPCODE_TG4_BIAS:
257 case SHADER_OPCODE_TG4_EXPLICIT_LOD:
258 case SHADER_OPCODE_TG4_IMPLICIT_LOD:
259 case SHADER_OPCODE_TG4_OFFSET_LOD:
260 case SHADER_OPCODE_TG4_OFFSET_BIAS:
261 case SHADER_OPCODE_SAMPLEINFO:
262 return arg == 1 || arg == 2;
263
264 case SHADER_OPCODE_SEND:
265 return arg == 0 || arg == 1;
266
267 default:
268 return false;
269 }
270 }
271
272 bool
273 fs_inst::is_payload(unsigned arg) const
274 {
275 switch (opcode) {
276 case FS_OPCODE_FB_READ:
277 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
278 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
279 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
280 case SHADER_OPCODE_INTERLOCK:
281 case SHADER_OPCODE_MEMORY_FENCE:
282 case SHADER_OPCODE_BARRIER:
283 case SHADER_OPCODE_TEX:
284 case FS_OPCODE_TXB:
285 case SHADER_OPCODE_TXD:
286 case SHADER_OPCODE_TXF:
287 case SHADER_OPCODE_TXF_LZ:
288 case SHADER_OPCODE_TXF_CMS:
289 case SHADER_OPCODE_TXF_CMS_W:
290 case SHADER_OPCODE_TXF_UMS:
291 case SHADER_OPCODE_TXF_MCS:
292 case SHADER_OPCODE_TXL:
293 case SHADER_OPCODE_TXL_LZ:
294 case SHADER_OPCODE_TXS:
295 case SHADER_OPCODE_LOD:
296 case SHADER_OPCODE_TG4:
297 case SHADER_OPCODE_TG4_OFFSET:
298 case SHADER_OPCODE_TG4_BIAS:
299 case SHADER_OPCODE_TG4_EXPLICIT_LOD:
300 case SHADER_OPCODE_TG4_IMPLICIT_LOD:
301 case SHADER_OPCODE_TG4_OFFSET_LOD:
302 case SHADER_OPCODE_TG4_OFFSET_BIAS:
303 case SHADER_OPCODE_SAMPLEINFO:
304 return arg == 0;
305
306 case SHADER_OPCODE_SEND:
307 return arg == 2 || arg == 3;
308
309 default:
310 return false;
311 }
312 }
313
314 /**
315 * Returns true if this instruction's sources and destinations cannot
316 * safely be the same register.
317 *
318 * In most cases, a register can be written over safely by the same
319 * instruction that is its last use. For a single instruction, the
320 * sources are dereferenced before writing of the destination starts
321 * (naturally).
322 *
323 * However, there are a few cases where this can be problematic:
324 *
325 * - Virtual opcodes that translate to multiple instructions in the
326 * code generator: if src == dst and one instruction writes the
327 * destination before a later instruction reads the source, then
328 * src will have been clobbered.
329 *
330 * - SIMD16 compressed instructions with certain regioning (see below).
331 *
332 * The register allocator uses this information to set up conflicts between
333 * GRF sources and the destination.
334 */
335 bool
336 fs_inst::has_source_and_destination_hazard() const
337 {
338 switch (opcode) {
339 case FS_OPCODE_PACK_HALF_2x16_SPLIT:
340 /* Multiple partial writes to the destination */
341 return true;
342 case SHADER_OPCODE_SHUFFLE:
343 /* This instruction returns an arbitrary channel from the source and
344 * gets split into smaller instructions in the generator. It's possible
345 * that one of the instructions will read from a channel corresponding
346 * to an earlier instruction.
347 */
348 case SHADER_OPCODE_SEL_EXEC:
349 /* This is implemented as
350 *
351 * mov(16) g4<1>D 0D { align1 WE_all 1H };
352 * mov(16) g4<1>D g5<8,8,1>D { align1 1H }
353 *
354 * Because the source is only read in the second instruction, the first
355 * may stomp all over it.
356 */
357 return true;
358 case SHADER_OPCODE_QUAD_SWIZZLE:
359 switch (src[1].ud) {
360 case BRW_SWIZZLE_XXXX:
361 case BRW_SWIZZLE_YYYY:
362 case BRW_SWIZZLE_ZZZZ:
363 case BRW_SWIZZLE_WWWW:
364 case BRW_SWIZZLE_XXZZ:
365 case BRW_SWIZZLE_YYWW:
366 case BRW_SWIZZLE_XYXY:
367 case BRW_SWIZZLE_ZWZW:
368 /* These can be implemented as a single Align1 region on all
369 * platforms, so there's never a hazard between source and
370 * destination. C.f. fs_generator::generate_quad_swizzle().
371 */
372 return false;
373 default:
374 return !is_uniform(src[0]);
375 }
376 case BRW_OPCODE_DPAS:
377 /* This is overly conservative. The actual hazard is more complicated to
378 * describe. When the repeat count is N, the single instruction behaves
379 * like N instructions with a repeat count of one, but the destination
380 * and source registers are incremented (in somewhat complex ways) for
381 * each instruction.
382 *
383 * This means the source and destination registers are actually ranges of
384 * registers. The hazard exists if an earlier iteration would write a
385 * register that should be read by a later iteration.
386 *
387 * There may be some advantage to properly modeling this, but for now,
388 * be overly conservative.
389 */
390 return rcount > 1;
391 default:
392 /* The SIMD16 compressed instruction
393 *
394 * add(16) g4<1>F g4<8,8,1>F g6<8,8,1>F
395 *
396 * is actually decoded in hardware as:
397 *
398 * add(8) g4<1>F g4<8,8,1>F g6<8,8,1>F
399 * add(8) g5<1>F g5<8,8,1>F g7<8,8,1>F
400 *
401 * Which is safe. However, if we have uniform accesses
402 * happening, we get into trouble:
403 *
404 * add(8) g4<1>F g4<0,1,0>F g6<8,8,1>F
405 * add(8) g5<1>F g4<0,1,0>F g7<8,8,1>F
406 *
407 * Now our destination for the first instruction overwrote the
408 * second instruction's src0, and we get garbage for those 8
409 * pixels. There's a similar issue for the pre-gfx6
410 * pixel_x/pixel_y, which are registers of 16-bit values and thus
411 * would get stomped by the first decode as well.
412 */
413 if (exec_size == 16) {
414 for (int i = 0; i < sources; i++) {
415 if (src[i].file == VGRF && (src[i].stride == 0 ||
416 src[i].type == BRW_REGISTER_TYPE_UW ||
417 src[i].type == BRW_REGISTER_TYPE_W ||
418 src[i].type == BRW_REGISTER_TYPE_UB ||
419 src[i].type == BRW_REGISTER_TYPE_B)) {
420 return true;
421 }
422 }
423 }
424 return false;
425 }
426 }
427
428 bool
429 fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
430 {
431 if (is_send_from_grf())
432 return false;
433
434 /* From Wa_1604601757:
435 *
436 * "When multiplying a DW and any lower precision integer, source modifier
437 * is not supported."
438 */
439 if (devinfo->ver >= 12 && (opcode == BRW_OPCODE_MUL ||
440 opcode == BRW_OPCODE_MAD)) {
441 const brw_reg_type exec_type = get_exec_type(this);
442 const unsigned min_type_sz = opcode == BRW_OPCODE_MAD ?
443 MIN2(type_sz(src[1].type), type_sz(src[2].type)) :
444 MIN2(type_sz(src[0].type), type_sz(src[1].type));
445
446 if (brw_reg_type_is_integer(exec_type) &&
447 type_sz(exec_type) >= 4 &&
448 type_sz(exec_type) != min_type_sz)
449 return false;
450 }
451
452 if (!backend_instruction::can_do_source_mods())
453 return false;
454
455 return true;
456 }
457
458 bool
459 fs_inst::can_do_cmod()
460 {
461 if (!backend_instruction::can_do_cmod())
462 return false;
463
464 /* The accumulator result appears to get used for the conditional modifier
465 * generation. When negating a UD value, there is a 33rd bit generated for
466 * the sign in the accumulator value, so now you can't check, for example,
467 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
468 */
469 for (unsigned i = 0; i < sources; i++) {
470 if (brw_reg_type_is_unsigned_integer(src[i].type) && src[i].negate)
471 return false;
472 }
473
474 return true;
475 }
476
477 bool
478 fs_inst::can_change_types() const
479 {
480 return dst.type == src[0].type &&
481 !src[0].abs && !src[0].negate && !saturate && src[0].file != ATTR &&
482 (opcode == BRW_OPCODE_MOV ||
483 (opcode == BRW_OPCODE_SEL &&
484 dst.type == src[1].type &&
485 predicate != BRW_PREDICATE_NONE &&
486 !src[1].abs && !src[1].negate && src[1].file != ATTR));
487 }
488
489 void
490 fs_reg::init()
491 {
492 memset((void*)this, 0, sizeof(*this));
493 type = BRW_REGISTER_TYPE_UD;
494 stride = 1;
495 }
496
497 /** Generic unset register constructor. */
498 fs_reg::fs_reg()
499 {
500 init();
501 this->file = BAD_FILE;
502 }
503
504 fs_reg::fs_reg(struct ::brw_reg reg) :
505 backend_reg(reg)
506 {
507 this->offset = 0;
508 this->stride = 1;
509 if (this->file == IMM &&
510 (this->type != BRW_REGISTER_TYPE_V &&
511 this->type != BRW_REGISTER_TYPE_UV &&
512 this->type != BRW_REGISTER_TYPE_VF)) {
513 this->stride = 0;
514 }
515 }
516
517 bool
518 fs_reg::equals(const fs_reg &r) const
519 {
520 return (this->backend_reg::equals(r) &&
521 stride == r.stride);
522 }
523
524 bool
525 fs_reg::negative_equals(const fs_reg &r) const
526 {
527 return (this->backend_reg::negative_equals(r) &&
528 stride == r.stride);
529 }
530
531 bool
532 fs_reg::is_contiguous() const
533 {
534 switch (file) {
535 case ARF:
536 case FIXED_GRF:
537 return hstride == BRW_HORIZONTAL_STRIDE_1 &&
538 vstride == width + hstride;
539 case VGRF:
540 case ATTR:
541 return stride == 1;
542 case UNIFORM:
543 case IMM:
544 case BAD_FILE:
545 return true;
546 }
547
548 unreachable("Invalid register file");
549 }
550
551 unsigned
552 fs_reg::component_size(unsigned width) const
553 {
554 if (file == ARF || file == FIXED_GRF) {
555 const unsigned w = MIN2(width, 1u << this->width);
556 const unsigned h = width >> this->width;
557 const unsigned vs = vstride ? 1 << (vstride - 1) : 0;
558 const unsigned hs = hstride ? 1 << (hstride - 1) : 0;
559 assert(w > 0);
560 return ((MAX2(1, h) - 1) * vs + (w - 1) * hs + 1) * type_sz(type);
561 } else {
562 return MAX2(width * stride, 1) * type_sz(type);
563 }
564 }
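/* Worked example for component_size() (editorial sketch, assuming the usual
 * log2 hardware encodings of width/vstride/hstride): for a FIXED_GRF
 * <8;8,1>:F region queried with width == 16 we get w = 8, h = 2, vs = 8 and
 * hs = 1, so the size is ((2 - 1) * 8 + (8 - 1) * 1 + 1) * 4 = 64 bytes.
 * For a VGRF float with stride 1 and width == 16 the else-branch gives
 * 16 * 1 * 4 = 64 bytes as well.
 */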
565
566 void
567 fs_visitor::vfail(const char *format, va_list va)
568 {
569 char *msg;
570
571 if (failed)
572 return;
573
574 failed = true;
575
576 msg = ralloc_vasprintf(mem_ctx, format, va);
577 msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
578 dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg);
579
580 this->fail_msg = msg;
581
582 if (unlikely(debug_enabled)) {
583 fprintf(stderr, "%s", msg);
584 }
585 }
586
587 void
588 fs_visitor::fail(const char *format, ...)
589 {
590 va_list va;
591
592 va_start(va, format);
593 vfail(format, va);
594 va_end(va);
595 }
596
597 /**
598 * Mark this program as impossible to compile with dispatch width greater
599 * than n.
600 *
601 * During the SIMD8 compile (which happens first), we can detect and flag
602 * things that are unsupported in SIMD16+ mode, so the compiler can skip the
603 * SIMD16+ compile altogether.
604 *
605 * During a compile of dispatch width greater than n (if one happens anyway),
606 * this just calls fail().
607 */
608 void
609 fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
610 {
611 if (dispatch_width > n) {
612 fail("%s", msg);
613 } else {
614 max_dispatch_width = MIN2(max_dispatch_width, n);
615 brw_shader_perf_log(compiler, log_data,
616 "Shader dispatch width limited to SIMD%d: %s\n",
617 n, msg);
618 }
619 }
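/* Illustrative usage sketch (hypothetical condition and message, not taken
 * from this file): during the SIMD8 compile a pass that knows some feature
 * cannot work in wider modes would do
 *
 *    limit_dispatch_width(16, "feature X unsupported in SIMD32 mode");
 *
 * which clamps max_dispatch_width to 16 here, and turns into a plain fail()
 * if a wider compile is attempted anyway.
 */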
620
621 /**
622 * Returns true if the instruction has a flag that means it won't
623 * update an entire destination register.
624 *
625 * For example, dead code elimination and live variable analysis want to know
626 * when a write to a variable screens off any preceding values that were in
627 * it.
628 */
629 bool
630 fs_inst::is_partial_write() const
631 {
632 if (this->predicate && !this->predicate_trivial &&
633 this->opcode != BRW_OPCODE_SEL)
634 return true;
635
636 if (this->dst.offset % REG_SIZE != 0)
637 return true;
638
639 /* SEND instructions always write whole registers */
640 if (this->opcode == SHADER_OPCODE_SEND)
641 return false;
642
643 /* Special case UNDEF since a lot of places in the backend do things like this:
644 *
645 * fs_builder ubld = bld.exec_all().group(1, 0);
646 * fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
647 * ubld.UNDEF(tmp); <- partial write, even though the whole register is affected
648 */
649 if (this->opcode == SHADER_OPCODE_UNDEF) {
650 assert(this->dst.is_contiguous());
651 return this->size_written < 32;
652 }
653
654 return this->exec_size * type_sz(this->dst.type) < 32 ||
655 !this->dst.is_contiguous();
656 }
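/* Worked example (editorial note): an unpredicated SIMD8 MOV to a contiguous,
 * register-aligned 32-bit VGRF destination writes 8 * 4 = 32 bytes, a full
 * register, so it is not a partial write.  The same MOV to a 16-bit
 * destination writes only 8 * 2 = 16 bytes and therefore is one, which is
 * what dead code elimination and liveness care about as described above.
 */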
657
658 unsigned
659 fs_inst::components_read(unsigned i) const
660 {
661 /* Return zero if the source is not present. */
662 if (src[i].file == BAD_FILE)
663 return 0;
664
665 switch (opcode) {
666 case FS_OPCODE_LINTERP:
667 if (i == 0)
668 return 2;
669 else
670 return 1;
671
672 case FS_OPCODE_PIXEL_X:
673 case FS_OPCODE_PIXEL_Y:
674 assert(i < 2);
675 if (i == 0)
676 return 2;
677 else
678 return 1;
679
680 case FS_OPCODE_FB_WRITE_LOGICAL:
681 assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
682 /* First/second FB write color. */
683 if (i < 2)
684 return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
685 else
686 return 1;
687
688 case SHADER_OPCODE_TEX_LOGICAL:
689 case SHADER_OPCODE_TXD_LOGICAL:
690 case SHADER_OPCODE_TXF_LOGICAL:
691 case SHADER_OPCODE_TXL_LOGICAL:
692 case SHADER_OPCODE_TXS_LOGICAL:
693 case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
694 case FS_OPCODE_TXB_LOGICAL:
695 case SHADER_OPCODE_TXF_CMS_LOGICAL:
696 case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
697 case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
698 case SHADER_OPCODE_TXF_UMS_LOGICAL:
699 case SHADER_OPCODE_TXF_MCS_LOGICAL:
700 case SHADER_OPCODE_LOD_LOGICAL:
701 case SHADER_OPCODE_TG4_LOGICAL:
702 case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
703 case SHADER_OPCODE_TG4_BIAS_LOGICAL:
704 case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
705 case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
706 case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
707 case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
708 case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
709 assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
710 src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
711 src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
712 /* Texture coordinates. */
713 if (i == TEX_LOGICAL_SRC_COORDINATE)
714 return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
715 /* Texture derivatives. */
716 else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
717 opcode == SHADER_OPCODE_TXD_LOGICAL)
718 return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
719 /* Texture offset. */
720 else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
721 return 2;
722 /* MCS */
723 else if (i == TEX_LOGICAL_SRC_MCS) {
724 if (opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
725 return 2;
726 else if (opcode == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL)
727 return 4;
728 else
729 return 1;
730 } else
731 return 1;
732
733 case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
734 case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
735 assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM);
736 /* Surface coordinates. */
737 if (i == SURFACE_LOGICAL_SRC_ADDRESS)
738 return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
739 /* Surface operation source (ignored for reads). */
740 else if (i == SURFACE_LOGICAL_SRC_DATA)
741 return 0;
742 else
743 return 1;
744
745 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
746 case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
747 assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
748 src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
749 /* Surface coordinates. */
750 if (i == SURFACE_LOGICAL_SRC_ADDRESS)
751 return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
752 /* Surface operation source. */
753 else if (i == SURFACE_LOGICAL_SRC_DATA)
754 return src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
755 else
756 return 1;
757
758 case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
759 case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
760 case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
761 assert(src[A64_LOGICAL_ARG].file == IMM);
762 return 1;
763
764 case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
765 assert(src[A64_LOGICAL_ARG].file == IMM);
766 if (i == A64_LOGICAL_SRC) { /* data to write */
767 const unsigned comps = src[A64_LOGICAL_ARG].ud / exec_size;
768 assert(comps > 0);
769 return comps;
770 } else {
771 return 1;
772 }
773
774 case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
775 assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
776 return 1;
777
778 case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
779 assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
780 if (i == SURFACE_LOGICAL_SRC_DATA) {
781 const unsigned comps = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud / exec_size;
782 assert(comps > 0);
783 return comps;
784 } else {
785 return 1;
786 }
787
788 case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
789 assert(src[A64_LOGICAL_ARG].file == IMM);
790 return i == A64_LOGICAL_SRC ? src[A64_LOGICAL_ARG].ud : 1;
791
792 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
793 assert(src[A64_LOGICAL_ARG].file == IMM);
794 return i == A64_LOGICAL_SRC ?
795 lsc_op_num_data_values(src[A64_LOGICAL_ARG].ud) : 1;
796
797 case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
798 case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
799 /* Scattered logical opcodes use the following params:
800 * src[0] Surface coordinates
801 * src[1] Surface operation source (ignored for reads)
802 * src[2] Surface
803 * src[3] IMM, always with 1 dimension.
804 * src[4] IMM with the bitsize (8, 16, or 32) for the scattered read/write
805 */
806 assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
807 src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
808 return i == SURFACE_LOGICAL_SRC_DATA ? 0 : 1;
809
810 case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
811 case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
812 assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
813 src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
814 return 1;
815
816 case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
817 case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
818 assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
819 src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
820 const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
821 /* Surface coordinates. */
822 if (i == SURFACE_LOGICAL_SRC_ADDRESS)
823 return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
824 /* Surface operation source. */
825 else if (i == SURFACE_LOGICAL_SRC_DATA)
826 return lsc_op_num_data_values(op);
827 else
828 return 1;
829 }
830 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
831 return (i == 0 ? 2 : 1);
832
833 case SHADER_OPCODE_URB_WRITE_LOGICAL:
834 assert(src[URB_LOGICAL_SRC_COMPONENTS].file == IMM);
835
836 if (i == URB_LOGICAL_SRC_DATA)
837 return src[URB_LOGICAL_SRC_COMPONENTS].ud;
838 else
839 return 1;
840
841 case BRW_OPCODE_DPAS:
842 unreachable("Do not use components_read() for DPAS.");
843
844 default:
845 return 1;
846 }
847 }
848
849 unsigned
850 fs_inst::size_read(int arg) const
851 {
852 switch (opcode) {
853 case SHADER_OPCODE_SEND:
854 if (arg == 2) {
855 return mlen * REG_SIZE;
856 } else if (arg == 3) {
857 return ex_mlen * REG_SIZE;
858 }
859 break;
860
861 case FS_OPCODE_FB_READ:
862 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
863 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
864 if (arg == 0)
865 return mlen * REG_SIZE;
866 break;
867
868 case FS_OPCODE_LINTERP:
869 if (arg == 1)
870 return 16;
871 break;
872
873 case SHADER_OPCODE_LOAD_PAYLOAD:
874 if (arg < this->header_size)
875 return retype(src[arg], BRW_REGISTER_TYPE_UD).component_size(8);
876 break;
877
878 case CS_OPCODE_CS_TERMINATE:
879 case SHADER_OPCODE_BARRIER:
880 return REG_SIZE;
881
882 case SHADER_OPCODE_MOV_INDIRECT:
883 if (arg == 0) {
884 assert(src[2].file == IMM);
885 return src[2].ud;
886 }
887 break;
888
889 case BRW_OPCODE_DPAS:
890 switch (arg) {
891 case 0:
892 if (src[0].type == BRW_REGISTER_TYPE_HF) {
893 return rcount * REG_SIZE / 2;
894 } else {
895 return rcount * REG_SIZE;
896 }
897 case 1:
898 return sdepth * REG_SIZE;
899 case 2:
900 /* This is simpler than the formula described in the Bspec, but it
901 * covers all of the cases that we support on DG2.
902 */
903 return rcount * REG_SIZE;
904 default:
905 unreachable("Invalid source number.");
906 }
907 break;
908
909 case SHADER_OPCODE_TEX:
910 case FS_OPCODE_TXB:
911 case SHADER_OPCODE_TXD:
912 case SHADER_OPCODE_TXF:
913 case SHADER_OPCODE_TXF_LZ:
914 case SHADER_OPCODE_TXF_CMS:
915 case SHADER_OPCODE_TXF_CMS_W:
916 case SHADER_OPCODE_TXF_UMS:
917 case SHADER_OPCODE_TXF_MCS:
918 case SHADER_OPCODE_TXL:
919 case SHADER_OPCODE_TXL_LZ:
920 case SHADER_OPCODE_TXS:
921 case SHADER_OPCODE_LOD:
922 case SHADER_OPCODE_TG4:
923 case SHADER_OPCODE_TG4_OFFSET:
924 case SHADER_OPCODE_TG4_BIAS:
925 case SHADER_OPCODE_TG4_EXPLICIT_LOD:
926 case SHADER_OPCODE_TG4_IMPLICIT_LOD:
927 case SHADER_OPCODE_TG4_OFFSET_LOD:
928 case SHADER_OPCODE_TG4_OFFSET_BIAS:
929 case SHADER_OPCODE_SAMPLEINFO:
930 if (arg == 0 && src[0].file == VGRF)
931 return mlen * REG_SIZE;
932 break;
933
934 default:
935 break;
936 }
937
938 switch (src[arg].file) {
939 case UNIFORM:
940 case IMM:
941 return components_read(arg) * type_sz(src[arg].type);
942 case BAD_FILE:
943 case ARF:
944 case FIXED_GRF:
945 case VGRF:
946 case ATTR:
947 return components_read(arg) * src[arg].component_size(exec_size);
948 }
949 return 0;
950 }
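/* Worked example (editorial note): for an ordinary ALU instruction the
 * default path at the bottom applies, so a SIMD16 float VGRF source with
 * stride 1 reads 1 * 16 * 4 = 64 bytes, while a UNIFORM or IMM source of the
 * same type reads just components_read(arg) * 4 = 4 bytes.
 */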
951
952 namespace {
953 unsigned
954 predicate_width(const intel_device_info *devinfo, brw_predicate predicate)
955 {
956 if (devinfo->ver >= 20) {
957 return 1;
958 } else {
959 switch (predicate) {
960 case BRW_PREDICATE_NONE: return 1;
961 case BRW_PREDICATE_NORMAL: return 1;
962 case BRW_PREDICATE_ALIGN1_ANY2H: return 2;
963 case BRW_PREDICATE_ALIGN1_ALL2H: return 2;
964 case BRW_PREDICATE_ALIGN1_ANY4H: return 4;
965 case BRW_PREDICATE_ALIGN1_ALL4H: return 4;
966 case BRW_PREDICATE_ALIGN1_ANY8H: return 8;
967 case BRW_PREDICATE_ALIGN1_ALL8H: return 8;
968 case BRW_PREDICATE_ALIGN1_ANY16H: return 16;
969 case BRW_PREDICATE_ALIGN1_ALL16H: return 16;
970 case BRW_PREDICATE_ALIGN1_ANY32H: return 32;
971 case BRW_PREDICATE_ALIGN1_ALL32H: return 32;
972 default: unreachable("Unsupported predicate");
973 }
974 }
975 }
976 }
977
978 unsigned
979 fs_inst::flags_read(const intel_device_info *devinfo) const
980 {
981 if (devinfo->ver < 20 && (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
982 predicate == BRW_PREDICATE_ALIGN1_ALLV)) {
983 /* The vertical predication modes combine corresponding bits from
984 * f0.0 and f1.0 on Gfx7+.
985 */
986 const unsigned shift = 4;
987 return brw_fs_flag_mask(this, 1) << shift | brw_fs_flag_mask(this, 1);
988 } else if (predicate) {
989 return brw_fs_flag_mask(this, predicate_width(devinfo, predicate));
990 } else {
991 unsigned mask = 0;
992 for (int i = 0; i < sources; i++) {
993 mask |= brw_fs_flag_mask(src[i], size_read(i));
994 }
995 return mask;
996 }
997 }
998
999 unsigned
1000 fs_inst::flags_written(const intel_device_info *devinfo) const
1001 {
1002 if (conditional_mod && (opcode != BRW_OPCODE_SEL &&
1003 opcode != BRW_OPCODE_CSEL &&
1004 opcode != BRW_OPCODE_IF &&
1005 opcode != BRW_OPCODE_WHILE)) {
1006 return brw_fs_flag_mask(this, 1);
1007 } else if (opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL ||
1008 opcode == SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL ||
1009 opcode == FS_OPCODE_LOAD_LIVE_CHANNELS) {
1010 return brw_fs_flag_mask(this, 32);
1011 } else {
1012 return brw_fs_flag_mask(dst, size_written);
1013 }
1014 }
1015
1016 bool
1017 fs_inst::has_sampler_residency() const
1018 {
1019 switch (opcode) {
1020 case SHADER_OPCODE_TEX_LOGICAL:
1021 case FS_OPCODE_TXB_LOGICAL:
1022 case SHADER_OPCODE_TXL_LOGICAL:
1023 case SHADER_OPCODE_TXD_LOGICAL:
1024 case SHADER_OPCODE_TXF_LOGICAL:
1025 case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
1026 case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
1027 case SHADER_OPCODE_TXF_CMS_LOGICAL:
1028 case SHADER_OPCODE_TXS_LOGICAL:
1029 case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
1030 case SHADER_OPCODE_TG4_LOGICAL:
1031 case SHADER_OPCODE_TG4_BIAS_LOGICAL:
1032 case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
1033 case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
1034 case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
1035 case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
1036 assert(src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
1037 return src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
1038 default:
1039 return false;
1040 }
1041 }
1042
1043 fs_reg
1044 fs_visitor::vgrf(const glsl_type *const type)
1045 {
1046 int reg_width = dispatch_width / 8;
1047 return fs_reg(VGRF,
1048 alloc.allocate(glsl_count_dword_slots(type, false) * reg_width),
1049 brw_type_for_base_type(type));
1050 }
1051
1052 fs_reg::fs_reg(enum brw_reg_file file, unsigned nr)
1053 {
1054 init();
1055 this->file = file;
1056 this->nr = nr;
1057 this->type = BRW_REGISTER_TYPE_F;
1058 this->stride = (file == UNIFORM ? 0 : 1);
1059 }
1060
1061 fs_reg::fs_reg(enum brw_reg_file file, unsigned nr, enum brw_reg_type type)
1062 {
1063 init();
1064 this->file = file;
1065 this->nr = nr;
1066 this->type = type;
1067 this->stride = (file == UNIFORM ? 0 : 1);
1068 }
1069
1070 /* For SIMD16, we need to follow the uniform setup of SIMD8 dispatch.
1071 * This brings in those uniform definitions.
1072 */
1073 void
1074 fs_visitor::import_uniforms(fs_visitor *v)
1075 {
1076 this->push_constant_loc = v->push_constant_loc;
1077 this->uniforms = v->uniforms;
1078 }
1079
1080 enum brw_barycentric_mode
1081 brw_barycentric_mode(nir_intrinsic_instr *intr)
1082 {
1083 const glsl_interp_mode mode =
1084 (enum glsl_interp_mode) nir_intrinsic_interp_mode(intr);
1085
1086 /* Barycentric modes don't make sense for flat inputs. */
1087 assert(mode != INTERP_MODE_FLAT);
1088
1089 unsigned bary;
1090 switch (intr->intrinsic) {
1091 case nir_intrinsic_load_barycentric_pixel:
1092 case nir_intrinsic_load_barycentric_at_offset:
1093 bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
1094 break;
1095 case nir_intrinsic_load_barycentric_centroid:
1096 bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
1097 break;
1098 case nir_intrinsic_load_barycentric_sample:
1099 case nir_intrinsic_load_barycentric_at_sample:
1100 bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
1101 break;
1102 default:
1103 unreachable("invalid intrinsic");
1104 }
1105
1106 if (mode == INTERP_MODE_NOPERSPECTIVE)
1107 bary += 3;
1108
1109 return (enum brw_barycentric_mode) bary;
1110 }
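/* Editorial note on the "bary += 3" above: it relies on the
 * brw_barycentric_mode enum (assumed to be laid out as in brw_compiler.h)
 * placing the three NONPERSPECTIVE modes immediately after their
 * PERSPECTIVE counterparts in the same order, e.g.
 *
 *    BRW_BARYCENTRIC_PERSPECTIVE_PIXEL + 3 ==
 *    BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL
 *
 * The same layout is what lets centroid_to_pixel() below simply subtract 1.
 */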
1111
1112 /**
1113 * Turn one of the two CENTROID barycentric modes into PIXEL mode.
1114 */
1115 static enum brw_barycentric_mode
1116 centroid_to_pixel(enum brw_barycentric_mode bary)
1117 {
1118 assert(bary == BRW_BARYCENTRIC_PERSPECTIVE_CENTROID ||
1119 bary == BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
1120 return (enum brw_barycentric_mode) ((unsigned) bary - 1);
1121 }
1122
1123 /**
1124 * Walk backwards from the end of the program looking for a URB write that
1125 * isn't in control flow, and mark it with EOT.
1126 *
1127 * Return true if successful or false if a separate EOT write is needed.
1128 */
1129 bool
1130 fs_visitor::mark_last_urb_write_with_eot()
1131 {
1132 foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
1133 if (prev->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) {
1134 prev->eot = true;
1135
1136 /* Delete now dead instructions. */
1137 foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
1138 if (dead == prev)
1139 break;
1140 dead->remove();
1141 }
1142 return true;
1143 } else if (prev->is_control_flow() || prev->has_side_effects()) {
1144 break;
1145 }
1146 }
1147
1148 return false;
1149 }
1150
1151 void
1152 fs_visitor::emit_gs_thread_end()
1153 {
1154 assert(stage == MESA_SHADER_GEOMETRY);
1155
1156 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1157
1158 if (gs_compile->control_data_header_size_bits > 0) {
1159 emit_gs_control_data_bits(this->final_gs_vertex_count);
1160 }
1161
1162 const fs_builder abld = fs_builder(this).at_end().annotate("thread end");
1163 fs_inst *inst;
1164
1165 if (gs_prog_data->static_vertex_count != -1) {
1166 /* Try to tag the last URB write with EOT instead of emitting a whole
1167 * separate write just to finish the thread.
1168 */
1169 if (mark_last_urb_write_with_eot())
1170 return;
1171
1172 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1173 srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
1174 srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(0);
1175 inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
1176 srcs, ARRAY_SIZE(srcs));
1177 } else {
1178 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1179 srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
1180 srcs[URB_LOGICAL_SRC_DATA] = this->final_gs_vertex_count;
1181 srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1);
1182 inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
1183 srcs, ARRAY_SIZE(srcs));
1184 }
1185 inst->eot = true;
1186 inst->offset = 0;
1187 }
1188
1189 void
1190 fs_visitor::assign_curb_setup()
1191 {
1192 unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
1193
1194 unsigned ubo_push_length = 0;
1195 unsigned ubo_push_start[4];
1196 for (int i = 0; i < 4; i++) {
1197 ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
1198 ubo_push_length += stage_prog_data->ubo_ranges[i].length;
1199 }
1200
1201 prog_data->curb_read_length = uniform_push_length + ubo_push_length;
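   /* Worked example (editorial note): with nr_params == 20 the uniform push
    * length is DIV_ROUND_UP(20, 8) = 3 registers, and UBO range lengths of
    * {2, 1, 0, 0} give ubo_push_start[] = {24, 40, 48, 48} in 32-bit units,
    * ubo_push_length == 3, and curb_read_length == 6 registers.
    */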
1202
1203 uint64_t used = 0;
1204 bool is_compute = gl_shader_stage_is_compute(stage);
1205
1206 if (is_compute && brw_cs_prog_data(prog_data)->uses_inline_data) {
1207 /* With COMPUTE_WALKER, we can push up to one register worth of data via
1208 * the inline data parameter in the COMPUTE_WALKER command itself.
1209 *
1210 * TODO: Support inline data and push at the same time.
1211 */
1212 assert(devinfo->verx10 >= 125);
1213 assert(uniform_push_length <= reg_unit(devinfo));
1214 } else if (is_compute && devinfo->verx10 >= 125) {
1215 assert(devinfo->has_lsc);
1216 fs_builder ubld = fs_builder(this, 1).exec_all().at(
1217 cfg->first_block(), cfg->first_block()->start());
1218
1219 /* The base offset for our push data is passed in as R0.0[31:6]. We have
1220 * to mask off the bottom 6 bits.
1221 */
1222 fs_reg base_addr = ubld.vgrf(BRW_REGISTER_TYPE_UD);
1223 ubld.AND(base_addr,
1224 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
1225 brw_imm_ud(INTEL_MASK(31, 6)));
1226
1227 /* On Gfx12-HP we load constants at the start of the program using A32
1228 * stateless messages.
1229 */
1230 for (unsigned i = 0; i < uniform_push_length;) {
1231 /* Limit ourselves to LSC HW limit of 8 GRFs (256bytes D32V64). */
1232 unsigned num_regs = MIN2(uniform_push_length - i, 8);
1233 assert(num_regs > 0);
1234 num_regs = 1 << util_logbase2(num_regs);
1235
1236 fs_reg addr = ubld.vgrf(BRW_REGISTER_TYPE_UD);
1237 ubld.ADD(addr, base_addr, brw_imm_ud(i * REG_SIZE));
1238
1239 fs_reg srcs[4] = {
1240 brw_imm_ud(0), /* desc */
1241 brw_imm_ud(0), /* ex_desc */
1242 addr, /* payload */
1243 fs_reg(), /* payload2 */
1244 };
1245
1246 fs_reg dest = retype(brw_vec8_grf(payload().num_regs + i, 0),
1247 BRW_REGISTER_TYPE_UD);
1248 fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4);
1249
1250 send->sfid = GFX12_SFID_UGM;
1251 send->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
1252 1 /* exec_size */,
1253 LSC_ADDR_SURFTYPE_FLAT,
1254 LSC_ADDR_SIZE_A32,
1255 1 /* num_coordinates */,
1256 LSC_DATA_SIZE_D32,
1257 num_regs * 8 /* num_channels */,
1258 true /* transpose */,
1259 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
1260 true /* has_dest */);
1261 send->header_size = 0;
1262 send->mlen = lsc_msg_desc_src0_len(devinfo, send->desc);
1263 send->size_written =
1264 lsc_msg_desc_dest_len(devinfo, send->desc) * REG_SIZE;
1265 send->send_is_volatile = true;
1266
1267 i += num_regs;
1268 }
1269
1270 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1271 }
1272
1273 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1274 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1275 for (unsigned int i = 0; i < inst->sources; i++) {
1276 if (inst->src[i].file == UNIFORM) {
1277 int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
1278 int constant_nr;
1279 if (inst->src[i].nr >= UBO_START) {
1280 /* constant_nr is in 32-bit units, the rest are in bytes */
1281 constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
1282 inst->src[i].offset / 4;
1283 } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1284 constant_nr = push_constant_loc[uniform_nr];
1285 } else {
1286 /* Section 5.11 of the OpenGL 4.1 spec says:
1287 * "Out-of-bounds reads return undefined values, which include
1288 * values from other variables of the active program or zero."
1289 * Just return the first push constant.
1290 */
1291 constant_nr = 0;
1292 }
1293
1294 assert(constant_nr / 8 < 64);
1295 used |= BITFIELD64_BIT(constant_nr / 8);
1296
1297 struct brw_reg brw_reg = brw_vec1_grf(payload().num_regs +
1298 constant_nr / 8,
1299 constant_nr % 8);
1300 brw_reg.abs = inst->src[i].abs;
1301 brw_reg.negate = inst->src[i].negate;
1302
1303 assert(inst->src[i].stride == 0);
1304 inst->src[i] = byte_offset(
1305 retype(brw_reg, inst->src[i].type),
1306 inst->src[i].offset % 4);
1307 }
1308 }
1309 }
1310
1311 uint64_t want_zero = used & stage_prog_data->zero_push_reg;
1312 if (want_zero) {
1313 fs_builder ubld = fs_builder(this, 8).exec_all().at(
1314 cfg->first_block(), cfg->first_block()->start());
1315
1316 /* push_reg_mask_param is in 32-bit units */
1317 unsigned mask_param = stage_prog_data->push_reg_mask_param;
1318 struct brw_reg mask = brw_vec1_grf(payload().num_regs + mask_param / 8,
1319 mask_param % 8);
1320
1321 fs_reg b32;
1322 for (unsigned i = 0; i < 64; i++) {
1323 if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
1324 fs_reg shifted = ubld.vgrf(BRW_REGISTER_TYPE_W, 2);
1325 ubld.SHL(horiz_offset(shifted, 8),
1326 byte_offset(retype(mask, BRW_REGISTER_TYPE_W), i / 8),
1327 brw_imm_v(0x01234567));
1328 ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));
1329
1330 fs_builder ubld16 = ubld.group(16, 0);
1331 b32 = ubld16.vgrf(BRW_REGISTER_TYPE_D);
1332 ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
1333 }
1334
1335 if (want_zero & BITFIELD64_BIT(i)) {
1336 assert(i < prog_data->curb_read_length);
1337 struct brw_reg push_reg =
1338 retype(brw_vec8_grf(payload().num_regs + i, 0),
1339 BRW_REGISTER_TYPE_D);
1340
1341 ubld.AND(push_reg, push_reg, component(b32, i % 16));
1342 }
1343 }
1344
1345 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1346 }
1347
1348 /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
1349 this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length;
1350 }
1351
1352 /*
1353 * Build up an array of indices into the urb_setup array that
1354 * references the active entries of the urb_setup array.
1355 * Used to accelerate walking the active entries of the urb_setup array
1356 * on each upload.
1357 */
1358 void
1359 brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
1360 {
1361 /* TODO(mesh): Review usage of this in the context of Mesh, we may want to
1362 * skip per-primitive attributes here.
1363 */
1364
1365 /* Make sure uint8_t is sufficient */
1366 STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
1367 uint8_t index = 0;
1368 for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
1369 if (wm_prog_data->urb_setup[attr] >= 0) {
1370 wm_prog_data->urb_setup_attribs[index++] = attr;
1371 }
1372 }
1373 wm_prog_data->urb_setup_attribs_count = index;
1374 }
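/* Illustrative example (editorial note): if urb_setup[] only has
 * non-negative entries for VARYING_SLOT_POS and VARYING_SLOT_TEX0, the loop
 * above produces urb_setup_attribs = { VARYING_SLOT_POS, VARYING_SLOT_TEX0 }
 * and urb_setup_attribs_count == 2, so later uploads can walk just those two
 * entries instead of scanning all VARYING_SLOT_MAX slots.
 */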
1375
1376 static void
1377 calculate_urb_setup(const struct intel_device_info *devinfo,
1378 const struct brw_wm_prog_key *key,
1379 struct brw_wm_prog_data *prog_data,
1380 const nir_shader *nir,
1381 const struct brw_mue_map *mue_map)
1382 {
1383 memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
1384 memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel));
1385
1386 int urb_next = 0; /* in vec4s */
1387
1388 const uint64_t inputs_read =
1389 nir->info.inputs_read & ~nir->info.per_primitive_inputs;
1390
1391 /* Figure out where each of the incoming setup attributes lands. */
1392 if (key->mesh_input != BRW_NEVER) {
1393 /* Per-Primitive Attributes are laid out by Hardware before the regular
1394 * attributes, so order them like this to make it easier later to map the
1395 * setup into real HW registers.
1396 */
1397 if (nir->info.per_primitive_inputs) {
1398 uint64_t per_prim_inputs_read =
1399 nir->info.inputs_read & nir->info.per_primitive_inputs;
1400
1401 /* In Mesh, PRIMITIVE_SHADING_RATE, VIEWPORT and LAYER slots
1402 * are always at the beginning, because they come from MUE
1403 * Primitive Header, not Per-Primitive Attributes.
1404 */
1405 const uint64_t primitive_header_bits = VARYING_BIT_VIEWPORT |
1406 VARYING_BIT_LAYER |
1407 VARYING_BIT_PRIMITIVE_SHADING_RATE;
1408
1409 if (mue_map) {
1410 unsigned per_prim_start_dw = mue_map->per_primitive_start_dw;
1411 unsigned per_prim_size_dw = mue_map->per_primitive_pitch_dw;
1412
1413 bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0;
1414
1415 if (reads_header || mue_map->user_data_in_primitive_header) {
1416 /* Primitive Shading Rate, Layer and Viewport live in the same
1417 * 4-dwords slot (psr is dword 0, layer is dword 1, and viewport
1418 * is dword 2).
1419 */
1420 if (per_prim_inputs_read & VARYING_BIT_PRIMITIVE_SHADING_RATE)
1421 prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 0;
1422
1423 if (per_prim_inputs_read & VARYING_BIT_LAYER)
1424 prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
1425
1426 if (per_prim_inputs_read & VARYING_BIT_VIEWPORT)
1427 prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = 0;
1428
1429 per_prim_inputs_read &= ~primitive_header_bits;
1430 } else {
1431 /* If the FS doesn't need the primitive header, then it won't be made
1432 * available through SBE_MESH, so we have to skip it when
1433 * calculating the offset from the start of per-prim data.
1434 */
1435 per_prim_start_dw += mue_map->per_primitive_header_size_dw;
1436 per_prim_size_dw -= mue_map->per_primitive_header_size_dw;
1437 }
1438
1439 u_foreach_bit64(i, per_prim_inputs_read) {
1440 int start = mue_map->start_dw[i];
1441
1442 assert(start >= 0);
1443 assert(mue_map->len_dw[i] > 0);
1444
1445 assert(unsigned(start) >= per_prim_start_dw);
1446 unsigned pos_dw = unsigned(start) - per_prim_start_dw;
1447
1448 prog_data->urb_setup[i] = urb_next + pos_dw / 4;
1449 prog_data->urb_setup_channel[i] = pos_dw % 4;
1450 }
1451
1452 urb_next = per_prim_size_dw / 4;
1453 } else {
1454 /* With no MUE map, we never read the primitive header, and
1455 * per-primitive attributes won't be packed either, so just lay
1456 * them out in varying order.
1457 */
1458 per_prim_inputs_read &= ~primitive_header_bits;
1459
1460 for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
1461 if (per_prim_inputs_read & BITFIELD64_BIT(i)) {
1462 prog_data->urb_setup[i] = urb_next++;
1463 }
1464 }
1465
1466 /* The actual setup attributes later must be aligned to a full GRF. */
1467 urb_next = ALIGN(urb_next, 2);
1468 }
1469
1470 prog_data->num_per_primitive_inputs = urb_next;
1471 }
1472
1473 const uint64_t clip_dist_bits = VARYING_BIT_CLIP_DIST0 |
1474 VARYING_BIT_CLIP_DIST1;
1475
1476 uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK;
1477
1478 if (inputs_read & clip_dist_bits) {
1479 assert(!mue_map || mue_map->per_vertex_header_size_dw > 8);
1480 unique_fs_attrs &= ~clip_dist_bits;
1481 }
1482
1483 if (mue_map) {
1484 unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw;
1485 unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw;
1486
1487 /* The Per-Vertex header is available to the fragment shader only if there's
1488 * user data there.
1489 */
1490 if (!mue_map->user_data_in_vertex_header) {
1491 per_vertex_start_dw += 8;
1492 per_vertex_size_dw -= 8;
1493 }
1494
1495 /* In Mesh, CLIP_DIST slots are always at the beginning, because
1496 * they come from MUE Vertex Header, not Per-Vertex Attributes.
1497 */
1498 if (inputs_read & clip_dist_bits) {
1499 prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next;
1500 prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next + 1;
1501 } else if (mue_map && mue_map->per_vertex_header_size_dw > 8) {
1502 /* Clip distances are in MUE, but we are not reading them in FS. */
1503 per_vertex_start_dw += 8;
1504 per_vertex_size_dw -= 8;
1505 }
1506
1507 /* Per-Vertex attributes are laid out in order. Because we always link
1508 * Mesh and Fragment shaders, which slots are written and read by
1509 * each of them will match. */
1510 u_foreach_bit64(i, unique_fs_attrs) {
1511 int start = mue_map->start_dw[i];
1512
1513 assert(start >= 0);
1514 assert(mue_map->len_dw[i] > 0);
1515
1516 assert(unsigned(start) >= per_vertex_start_dw);
1517 unsigned pos_dw = unsigned(start) - per_vertex_start_dw;
1518
1519 prog_data->urb_setup[i] = urb_next + pos_dw / 4;
1520 prog_data->urb_setup_channel[i] = pos_dw % 4;
1521 }
1522
1523 urb_next += per_vertex_size_dw / 4;
1524 } else {
1525 /* If we don't have an MUE map, just lay down the inputs the FS reads
1526 * in varying order, as we do for the legacy pipeline.
1527 */
1528 if (inputs_read & clip_dist_bits) {
1529 prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next++;
1530 prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next++;
1531 }
1532
1533 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1534 if (unique_fs_attrs & BITFIELD64_BIT(i))
1535 prog_data->urb_setup[i] = urb_next++;
1536 }
1537 }
1538 } else {
1539 assert(!nir->info.per_primitive_inputs);
1540
1541 uint64_t vue_header_bits =
1542 VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT;
1543
1544 uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK;
1545
1546 /* VUE header fields all live in the same URB slot, so we pass them
1547 * as a single FS input attribute. We want to only count them once.
1548 */
1549 if (inputs_read & vue_header_bits) {
1550 unique_fs_attrs &= ~vue_header_bits;
1551 unique_fs_attrs |= VARYING_BIT_PSIZ;
1552 }
1553
1554 if (util_bitcount64(unique_fs_attrs) <= 16) {
1555 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1556 * first 16 varying inputs, so we can put them wherever we want.
1557 * Just put them in order.
1558 *
1559 * This is useful because it means that (a) inputs not used by the
1560 * fragment shader won't take up valuable register space, and (b) we
1561 * won't have to recompile the fragment shader if it gets paired with
1562 * a different vertex (or geometry) shader.
1563 *
1564 * VUE header fields share the same FS input attribute.
1565 */
1566 if (inputs_read & vue_header_bits) {
1567 if (inputs_read & VARYING_BIT_PSIZ)
1568 prog_data->urb_setup[VARYING_SLOT_PSIZ] = urb_next;
1569 if (inputs_read & VARYING_BIT_LAYER)
1570 prog_data->urb_setup[VARYING_SLOT_LAYER] = urb_next;
1571 if (inputs_read & VARYING_BIT_VIEWPORT)
1572 prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = urb_next;
1573
1574 urb_next++;
1575 }
1576
1577 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1578 if (inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits &
1579 BITFIELD64_BIT(i)) {
1580 prog_data->urb_setup[i] = urb_next++;
1581 }
1582 }
1583 } else {
1584 /* We have enough input varyings that the SF/SBE pipeline stage can't
1585 * arbitrarily rearrange them to suit our whim; we have to put them
1586 * in an order that matches the output of the previous pipeline stage
1587 * (geometry or vertex shader).
1588 */
1589
1590 /* Re-compute the VUE map here in the case that the one coming from
1591 * geometry has more than one position slot (used for Primitive
1592 * Replication).
1593 */
1594 struct intel_vue_map prev_stage_vue_map;
1595 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1596 key->input_slots_valid,
1597 nir->info.separate_shader, 1);
1598
1599 int first_slot =
1600 brw_compute_first_urb_slot_required(inputs_read,
1601 &prev_stage_vue_map);
1602
1603 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1604 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1605 slot++) {
1606 int varying = prev_stage_vue_map.slot_to_varying[slot];
1607 if (varying != BRW_VARYING_SLOT_PAD &&
1608 (inputs_read & BRW_FS_VARYING_INPUT_MASK &
1609 BITFIELD64_BIT(varying))) {
1610 prog_data->urb_setup[varying] = slot - first_slot;
1611 }
1612 }
1613 urb_next = prev_stage_vue_map.num_slots - first_slot;
1614 }
1615 }
1616
1617 prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs;
1618 prog_data->inputs = inputs_read;
1619
1620 brw_compute_urb_setup_index(prog_data);
1621 }
1622
1623 void
1624 fs_visitor::assign_urb_setup()
1625 {
1626 assert(stage == MESA_SHADER_FRAGMENT);
1627 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
1628
1629 int urb_start = payload().num_regs + prog_data->base.curb_read_length;
1630
1631 /* Offset all the urb_setup[] index by the actual position of the
1632 * setup regs, now that the location of the constants has been chosen.
1633 */
1634 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1635 for (int i = 0; i < inst->sources; i++) {
1636 if (inst->src[i].file == ATTR) {
1637 /* ATTR fs_reg::nr in the FS is in units of logical scalar
1638 * inputs each of which consumes 16B on Gfx4-Gfx12. In
1639 * single polygon mode this leads to the following layout
1640 * of the vertex setup plane parameters in the ATTR
1641 * register file:
1642 *
1643 * fs_reg::nr Input Comp0 Comp1 Comp2 Comp3
1644 * 0 Attr0.x a1-a0 a2-a0 N/A a0
1645 * 1 Attr0.y a1-a0 a2-a0 N/A a0
1646 * 2 Attr0.z a1-a0 a2-a0 N/A a0
1647 * 3 Attr0.w a1-a0 a2-a0 N/A a0
1648 * 4 Attr1.x a1-a0 a2-a0 N/A a0
1649 * ...
1650 *
1651 * In multipolygon mode that no longer works since
1652 * different channels may be processing polygons with
1653 * different plane parameters, so each parameter above is
1654 * represented as a dispatch_width-wide vector:
1655 *
1656 * fs_reg::nr fs_reg::offset Input Comp0 ... CompN
1657 * 0 0 Attr0.x a1[0]-a0[0] ... a1[N]-a0[N]
1658 * 0 4 * dispatch_width Attr0.x a2[0]-a0[0] ... a2[N]-a0[N]
1659 * 0 8 * dispatch_width Attr0.x N/A ... N/A
1660 * 0 12 * dispatch_width Attr0.x a0[0] ... a0[N]
1661 * 1 0 Attr0.y a1[0]-a0[0] ... a1[N]-a0[N]
1662 * ...
1663 *
1664 * Note that many of the components on a single row above
1665 * are likely to be replicated multiple times (if, say, a
1666 * single SIMD thread is only processing 2 different
1667 * polygons), so plane parameters aren't actually stored
1668 * in GRF memory with that layout to avoid wasting space.
1669 * Instead we compose ATTR register regions with a 2D
1670 * region that walks through the parameters of each
1671 * polygon with the correct stride, reading the parameter
1672 * corresponding to each channel directly from the PS
1673 * thread payload.
1674 *
1675 * The latter layout corresponds to a param_width equal to
1676 * dispatch_width, while the former (scalar parameter)
1677 * layout has a param_width of 1.
1678 *
1679 * Gfx20+ represent plane parameters in a format similar
1680 * to the above, except the parameters are packed in 12B
1681 * and ordered like "a0, a1-a0, a2-a0" instead of the
1682 * above vec4 representation with a missing component.
1683 */
1684 const unsigned param_width = (max_polygons > 1 ? dispatch_width : 1);
1685
1686 /* Size of a single scalar component of a plane parameter
1687 * in bytes.
1688 */
1689 const unsigned chan_sz = 4;
1690 struct brw_reg reg;
1691 assert(max_polygons > 0);
1692
1693 /* Calculate the base register on the thread payload of
1694 * either the block of vertex setup data or the block of
1695 * per-primitive constant data depending on whether we're
1696 * accessing a primitive or vertex input. Also calculate
1697 * the index of the input within that block.
1698 */
1699 const bool per_prim = inst->src[i].nr < prog_data->num_per_primitive_inputs;
1700 const unsigned base = urb_start +
1701 (per_prim ? 0 :
1702 ALIGN(prog_data->num_per_primitive_inputs / 2,
1703 reg_unit(devinfo)) * max_polygons);
1704 const unsigned idx = per_prim ? inst->src[i].nr :
1705 inst->src[i].nr - prog_data->num_per_primitive_inputs;
1706
1707 /* Translate the offset within the param_width-wide
1708 * representation described above into an offset and a
1709 * grf, which contains the plane parameters for the first
1710 * polygon processed by the thread.
1711 */
1712 if (devinfo->ver >= 20 && !per_prim) {
1713 /* Gfx20+ is able to pack 5 logical input components
1714 * per 64B register for vertex setup data.
1715 */
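               /* Illustrative arithmetic only: with param_width == 1,
                * idx == 7 and a 4B source offset this yields
                * grf = base + 2 * max_polygons and delta = 2 * 12 + 4 = 28
                * bytes into that register pair.
                */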
1716 const unsigned grf = base + idx / 5 * 2 * max_polygons;
1717 assert(inst->src[i].offset / param_width < 12);
1718 const unsigned delta = idx % 5 * 12 +
1719 inst->src[i].offset / (param_width * chan_sz) * chan_sz +
1720 inst->src[i].offset % chan_sz;
1721 reg = byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
1722 delta);
1723 } else {
1724 /* Earlier platforms and per-primitive block pack 2 logical
1725 * input components per 32B register.
1726 */
1727 const unsigned grf = base + idx / 2 * max_polygons;
1728 assert(inst->src[i].offset / param_width < REG_SIZE / 2);
1729 const unsigned delta = (idx % 2) * (REG_SIZE / 2) +
1730 inst->src[i].offset / (param_width * chan_sz) * chan_sz +
1731 inst->src[i].offset % chan_sz;
1732 reg = byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
1733 delta);
1734 }
1735
1736 if (max_polygons > 1) {
1737 assert(devinfo->ver >= 12);
1738 /* Misaligned channel strides that would lead to
1739 * cross-channel access in the representation above are
1740 * disallowed.
1741 */
1742 assert(inst->src[i].stride * type_sz(inst->src[i].type) == chan_sz);
1743
1744 /* Number of channels processing the same polygon. */
1745 const unsigned poly_width = dispatch_width / max_polygons;
1746 assert(dispatch_width % max_polygons == 0);
1747
1748 /* Accessing a subset of channels of a parameter vector
1749 * starting from "chan" is necessary to handle
1750 * SIMD-lowered instructions though.
1751 */
1752 const unsigned chan = inst->src[i].offset %
1753 (param_width * chan_sz) / chan_sz;
1754 assert(chan < dispatch_width);
1755 assert(chan % poly_width == 0);
1756 const unsigned reg_size = reg_unit(devinfo) * REG_SIZE;
1757 reg = byte_offset(reg, chan / poly_width * reg_size);
1758
1759 if (inst->exec_size > poly_width) {
1760 /* Accessing the parameters for multiple polygons.
1761 * Corresponding parameters for different polygons
1762 * are stored a GRF apart on the thread payload, so
1763 * use that as vertical stride.
1764 */
1765 const unsigned vstride = reg_size / type_sz(inst->src[i].type);
1766 assert(vstride <= 32);
1767 assert(chan % poly_width == 0);
1768 reg = stride(reg, vstride, poly_width, 0);
1769 } else {
1770 /* Accessing one parameter for a single polygon --
1771 * Translate to a scalar region.
1772 */
1773 assert(chan % poly_width + inst->exec_size <= poly_width);
1774 reg = stride(reg, 0, 1, 0);
1775 }
1776
1777 } else {
1778 const unsigned width = inst->src[i].stride == 0 ?
1779 1 : MIN2(inst->exec_size, 8);
1780 reg = stride(reg, width * inst->src[i].stride,
1781 width, inst->src[i].stride);
1782 }
1783
1784 reg.abs = inst->src[i].abs;
1785 reg.negate = inst->src[i].negate;
1786 inst->src[i] = reg;
1787 }
1788 }
1789 }
1790
1791 /* Each attribute is 4 setup channels, each of which is half a reg,
1792 * but they may be replicated multiple times for multipolygon
1793 * dispatch.
1794 */
1795 this->first_non_payload_grf += prog_data->num_varying_inputs * 2 * max_polygons;
1796
1797 /* Unlike regular attributes, per-primitive attributes have all 4 channels
1798 * in the same slot, so each GRF can store two slots.
1799 */
1800 assert(prog_data->num_per_primitive_inputs % 2 == 0);
1801 this->first_non_payload_grf += prog_data->num_per_primitive_inputs / 2 * max_polygons;
1802 }
1803
1804 void
1805 fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
1806 {
1807 for (int i = 0; i < inst->sources; i++) {
1808 if (inst->src[i].file == ATTR) {
1809 assert(inst->src[i].nr == 0);
1810 int grf = payload().num_regs +
1811 prog_data->curb_read_length +
1812 inst->src[i].offset / REG_SIZE;
1813
1814          /* As explained at brw_reg_from_fs_reg, from the Haswell PRM:
1815 *
1816 * VertStride must be used to cross GRF register boundaries. This
1817 * rule implies that elements within a 'Width' cannot cross GRF
1818 * boundaries.
1819 *
1820 * So, for registers that are large enough, we have to split the exec
1821 * size in two and trust the compression state to sort it out.
1822 */
1823 unsigned total_size = inst->exec_size *
1824 inst->src[i].stride *
1825 type_sz(inst->src[i].type);
1826
1827 assert(total_size <= 2 * REG_SIZE);
1828 const unsigned exec_size =
1829 (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
1830
1831 unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
1832 struct brw_reg reg =
1833 stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
1834 inst->src[i].offset % REG_SIZE),
1835 exec_size * inst->src[i].stride,
1836 width, inst->src[i].stride);
1837 reg.abs = inst->src[i].abs;
1838 reg.negate = inst->src[i].negate;
1839
1840 inst->src[i] = reg;
1841 }
1842 }
1843 }
1844
1845 void
1846 fs_visitor::assign_vs_urb_setup()
1847 {
1848 struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);
1849
1850 assert(stage == MESA_SHADER_VERTEX);
1851
1852 /* Each attribute is 4 regs. */
1853 this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;
1854
1855 assert(vs_prog_data->base.urb_read_length <= 15);
1856
1857 /* Rewrite all ATTR file references to the hw grf that they land in. */
1858 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1859 convert_attr_sources_to_hw_regs(inst);
1860 }
1861 }
1862
1863 void
1864 fs_visitor::assign_tcs_urb_setup()
1865 {
1866 assert(stage == MESA_SHADER_TESS_CTRL);
1867
1868 /* Rewrite all ATTR file references to HW_REGs. */
1869 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1870 convert_attr_sources_to_hw_regs(inst);
1871 }
1872 }
1873
1874 void
1875 fs_visitor::assign_tes_urb_setup()
1876 {
1877 assert(stage == MESA_SHADER_TESS_EVAL);
1878
1879 struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
1880
1881 first_non_payload_grf += 8 * vue_prog_data->urb_read_length;
1882
1883 /* Rewrite all ATTR file references to HW_REGs. */
1884 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1885 convert_attr_sources_to_hw_regs(inst);
1886 }
1887 }
1888
1889 void
1890 fs_visitor::assign_gs_urb_setup()
1891 {
1892 assert(stage == MESA_SHADER_GEOMETRY);
1893
1894 struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
1895
1896 first_non_payload_grf +=
1897 8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
1898
1899 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1900 /* Rewrite all ATTR file references to GRFs. */
1901 convert_attr_sources_to_hw_regs(inst);
1902 }
1903 }
1904
1905 int
1906 brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
1907 const brw_stage_prog_data *prog_data)
1908 {
1909 if (prog_data->nr_params == 0)
1910 return -1;
1911
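   /* Gfx12.5+ does not push the subgroup ID as a regular parameter, so there
    * is never a parameter slot to report for it.
    */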
1912 if (devinfo->verx10 >= 125)
1913 return -1;
1914
1915 /* The local thread id is always the last parameter in the list */
1916 uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
1917 if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
1918 return prog_data->nr_params - 1;
1919
1920 return -1;
1921 }
1922
1923 /**
1924 * Assign UNIFORM file registers to either push constants or pull constants.
1925 *
1926 * We allow a fragment shader to have more than the specified minimum
1927 * maximum number of fragment shader uniform components (64). If
1928  * there are too many of these, they'd fill up all of the register space.
1929 * So, this will push some of them out to the pull constant buffer and
1930 * update the program to load them.
1931 */
1932 void
1933 fs_visitor::assign_constant_locations()
1934 {
1935 /* Only the first compile gets to decide on locations. */
1936 if (push_constant_loc)
1937 return;
1938
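   /* All regular uniforms are pushed with an identity mapping; the real work
    * below is trimming the UBO push ranges to the 3DSTATE_CONSTANT limit.
    */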
1939 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1940 for (unsigned u = 0; u < uniforms; u++)
1941 push_constant_loc[u] = u;
1942
1943 /* Now that we know how many regular uniforms we'll push, reduce the
1944 * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
1945 *
1946 * If changing this value, note the limitation about total_regs in
1947 * brw_curbe.c/crocus_state.c
1948 */
1949 const unsigned max_push_length = 64;
1950 unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
1951 for (int i = 0; i < 4; i++) {
1952 struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
1953
1954 if (push_length + range->length > max_push_length)
1955 range->length = max_push_length - push_length;
1956
1957 push_length += range->length;
1958 }
1959 assert(push_length <= max_push_length);
1960 }
1961
1962 bool
1963 fs_visitor::get_pull_locs(const fs_reg &src,
1964 unsigned *out_surf_index,
1965 unsigned *out_pull_index)
1966 {
1967 assert(src.file == UNIFORM);
1968
1969 if (src.nr < UBO_START)
1970 return false;
1971
1972 const struct brw_ubo_range *range =
1973 &prog_data->ubo_ranges[src.nr - UBO_START];
1974
1975 /* If this access is in our (reduced) range, use the push data. */
1976 if (src.offset / 32 < range->length)
1977 return false;
1978
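   /* The UBO range start and length are expressed in 32B units while
    * src.offset is in bytes; the pull index returned to the caller is in
    * dwords.
    */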
1979 *out_surf_index = range->block;
1980 *out_pull_index = (32 * range->start + src.offset) / 4;
1981
1982 prog_data->has_ubo_pull = true;
1983
1984 return true;
1985 }
1986
1987 /**
1988 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
1989 * instructions to FS_OPCODE_REP_FB_WRITE.
1990 */
1991 void
1992 fs_visitor::emit_repclear_shader()
1993 {
1994 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1995 fs_inst *write = NULL;
1996
1997 assert(uniforms == 0);
1998 assume(key->nr_color_regions > 0);
1999
2000 fs_reg color_output = retype(brw_vec4_grf(127, 0), BRW_REGISTER_TYPE_UD);
2001 fs_reg header = retype(brw_vec8_grf(125, 0), BRW_REGISTER_TYPE_UD);
2002
2003 /* We pass the clear color as a flat input. Copy it to the output. */
2004 fs_reg color_input =
2005 brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_UD,
2006 BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
2007 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2008
2009 const fs_builder bld = fs_builder(this).at_end();
2010 bld.exec_all().group(4, 0).MOV(color_output, color_input);
2011
2012 if (key->nr_color_regions > 1) {
2013 /* Copy g0..g1 as the message header */
2014 bld.exec_all().group(16, 0)
2015 .MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2016 }
2017
2018 for (int i = 0; i < key->nr_color_regions; ++i) {
2019 if (i > 0)
2020 bld.exec_all().group(1, 0).MOV(component(header, 2), brw_imm_ud(i));
2021
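      /* Construct the render target write SEND by hand, using the SIMD16
       * replicated-data message so a single color value is replicated across
       * all the pixels being cleared.
       */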
2022 write = bld.emit(SHADER_OPCODE_SEND);
2023 write->resize_sources(3);
2024 write->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
2025 write->src[0] = brw_imm_ud(0);
2026 write->src[1] = brw_imm_ud(0);
2027 write->src[2] = i == 0 ? color_output : header;
2028 write->check_tdr = true;
2029 write->send_has_side_effects = true;
2030 write->desc = brw_fb_write_desc(devinfo, i,
2031 BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED,
2032 i == key->nr_color_regions - 1, false);
2033
2034 /* We can use a headerless message for the first render target */
2035 write->header_size = i == 0 ? 0 : 2;
2036 write->mlen = 1 + write->header_size;
2037 }
2038 write->eot = true;
2039 write->last_rt = true;
2040
2041 calculate_cfg();
2042
2043 this->first_non_payload_grf = payload().num_regs;
2044
2045 brw_fs_lower_scoreboard(*this);
2046 }
2047
2048 /**
2049 * Get the mask of SIMD channels enabled during dispatch and not yet disabled
2050 * by discard. Due to the layout of the sample mask in the fragment shader
2051 * thread payload, \p bld is required to have a dispatch_width() not greater
2052 * than 16 for fragment shaders.
2053 */
2054 fs_reg
2055 brw_sample_mask_reg(const fs_builder &bld)
2056 {
2057 const fs_visitor &s = *bld.shader;
2058
2059 if (s.stage != MESA_SHADER_FRAGMENT) {
2060 return brw_imm_ud(0xffffffff);
2061 } else if (brw_wm_prog_data(s.stage_prog_data)->uses_kill) {
2062 assert(bld.dispatch_width() <= 16);
2063 return brw_flag_subreg(sample_mask_flag_subreg(s) + bld.group() / 16);
2064 } else {
2065 assert(bld.dispatch_width() <= 16);
2066 assert(s.devinfo->ver < 20);
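      /* Without discards the sample mask is simply the dispatch mask from the
       * thread payload: R1.7 for the first SIMD16 half and R2.7 for the
       * second (pre-Gfx20 layout).
       */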
2067 return retype(brw_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
2068 BRW_REGISTER_TYPE_UW);
2069 }
2070 }
2071
2072 uint32_t
2073 brw_fb_write_msg_control(const fs_inst *inst,
2074 const struct brw_wm_prog_data *prog_data)
2075 {
2076 uint32_t mctl;
2077
2078 if (prog_data->dual_src_blend) {
2079 assert(inst->exec_size == 8);
2080
2081 if (inst->group % 16 == 0)
2082 mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
2083 else if (inst->group % 16 == 8)
2084 mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
2085 else
2086 unreachable("Invalid dual-source FB write instruction group");
2087 } else {
2088 assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
2089
2090 if (inst->exec_size == 16)
2091 mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
2092 else if (inst->exec_size == 8)
2093 mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
2094 else
2095 unreachable("Invalid FB write execution size");
2096 }
2097
2098 return mctl;
2099 }
2100
2101 /**
2102 * Predicate the specified instruction on the sample mask.
2103 */
2104 void
2105 brw_emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst)
2106 {
2107 assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
2108 bld.group() == inst->group &&
2109 bld.dispatch_width() == inst->exec_size);
2110
2111 const fs_visitor &s = *bld.shader;
2112 const fs_reg sample_mask = brw_sample_mask_reg(bld);
2113 const unsigned subreg = sample_mask_flag_subreg(s);
2114
2115 if (brw_wm_prog_data(s.stage_prog_data)->uses_kill) {
2116 assert(sample_mask.file == ARF &&
2117 sample_mask.nr == brw_flag_subreg(subreg).nr &&
2118 sample_mask.subnr == brw_flag_subreg(
2119 subreg + inst->group / 16).subnr);
2120 } else {
2121 bld.group(1, 0).exec_all()
2122 .MOV(brw_flag_subreg(subreg + inst->group / 16), sample_mask);
2123 }
2124
2125 if (inst->predicate) {
2126 assert(inst->predicate == BRW_PREDICATE_NORMAL);
2127 assert(!inst->predicate_inverse);
2128 assert(inst->flag_subreg == 0);
2129 assert(s.devinfo->ver < 20);
2130 /* Combine the sample mask with the existing predicate by using a
2131 * vertical predication mode.
2132 */
2133 inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
2134 } else {
2135 inst->flag_subreg = subreg;
2136 inst->predicate = BRW_PREDICATE_NORMAL;
2137 inst->predicate_inverse = false;
2138 }
2139 }
2140
2141 void
2142 fs_visitor::dump_instructions_to_file(FILE *file) const
2143 {
2144 if (cfg) {
2145 const register_pressure &rp = regpressure_analysis.require();
2146 unsigned ip = 0, max_pressure = 0;
2147 unsigned cf_count = 0;
2148 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
2149 if (inst->is_control_flow_end())
2150 cf_count -= 1;
2151
2152 max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
2153 fprintf(file, "{%3d} %4d: ", rp.regs_live_at_ip[ip], ip);
2154 for (unsigned i = 0; i < cf_count; i++)
2155 fprintf(file, " ");
2156 dump_instruction(inst, file);
2157 ip++;
2158
2159 if (inst->is_control_flow_begin())
2160 cf_count += 1;
2161 }
2162 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
2163 } else {
2164 int ip = 0;
2165 foreach_in_list(backend_instruction, inst, &instructions) {
2166 fprintf(file, "%4d: ", ip++);
2167 dump_instruction(inst, file);
2168 }
2169 }
2170 }
2171
2172 void
2173 fs_visitor::dump_instruction_to_file(const backend_instruction *be_inst, FILE *file) const
2174 {
2175 const fs_inst *inst = (const fs_inst *)be_inst;
2176
2177 if (inst->predicate) {
2178 fprintf(file, "(%cf%d.%d) ",
2179 inst->predicate_inverse ? '-' : '+',
2180 inst->flag_subreg / 2,
2181 inst->flag_subreg % 2);
2182 }
2183
2184 fprintf(file, "%s", brw_instruction_name(&compiler->isa, inst->opcode));
2185 if (inst->saturate)
2186 fprintf(file, ".sat");
2187 if (inst->conditional_mod) {
2188 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
2189 if (!inst->predicate &&
2190 (inst->opcode != BRW_OPCODE_SEL &&
2191 inst->opcode != BRW_OPCODE_CSEL &&
2192 inst->opcode != BRW_OPCODE_IF &&
2193 inst->opcode != BRW_OPCODE_WHILE)) {
2194 fprintf(file, ".f%d.%d", inst->flag_subreg / 2,
2195 inst->flag_subreg % 2);
2196 }
2197 }
2198 fprintf(file, "(%d) ", inst->exec_size);
2199
2200 if (inst->mlen) {
2201 fprintf(file, "(mlen: %d) ", inst->mlen);
2202 }
2203
2204 if (inst->ex_mlen) {
2205 fprintf(file, "(ex_mlen: %d) ", inst->ex_mlen);
2206 }
2207
2208 if (inst->eot) {
2209 fprintf(file, "(EOT) ");
2210 }
2211
2212 switch (inst->dst.file) {
2213 case VGRF:
2214 fprintf(file, "vgrf%d", inst->dst.nr);
2215 break;
2216 case FIXED_GRF:
2217 fprintf(file, "g%d", inst->dst.nr);
2218 break;
2219 case BAD_FILE:
2220 fprintf(file, "(null)");
2221 break;
2222 case UNIFORM:
2223 fprintf(file, "***u%d***", inst->dst.nr);
2224 break;
2225 case ATTR:
2226 fprintf(file, "***attr%d***", inst->dst.nr);
2227 break;
2228 case ARF:
2229 switch (inst->dst.nr) {
2230 case BRW_ARF_NULL:
2231 fprintf(file, "null");
2232 break;
2233 case BRW_ARF_ADDRESS:
2234 fprintf(file, "a0.%d", inst->dst.subnr);
2235 break;
2236 case BRW_ARF_ACCUMULATOR:
2237 fprintf(file, "acc%d", inst->dst.subnr);
2238 break;
2239 case BRW_ARF_FLAG:
2240 fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
2241 break;
2242 default:
2243 fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
2244 break;
2245 }
2246 break;
2247 case IMM:
2248 unreachable("not reached");
2249 }
2250
2251 if (inst->dst.offset ||
2252 (inst->dst.file == VGRF &&
2253 alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
2254 const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
2255 fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
2256 inst->dst.offset % reg_size);
2257 }
2258
2259 if (inst->dst.stride != 1)
2260 fprintf(file, "<%u>", inst->dst.stride);
2261 fprintf(file, ":%s, ", brw_reg_type_to_letters(inst->dst.type));
2262
2263 for (int i = 0; i < inst->sources; i++) {
2264 if (inst->src[i].negate)
2265 fprintf(file, "-");
2266 if (inst->src[i].abs)
2267 fprintf(file, "|");
2268 switch (inst->src[i].file) {
2269 case VGRF:
2270 fprintf(file, "vgrf%d", inst->src[i].nr);
2271 break;
2272 case FIXED_GRF:
2273 fprintf(file, "g%d", inst->src[i].nr);
2274 break;
2275 case ATTR:
2276 fprintf(file, "attr%d", inst->src[i].nr);
2277 break;
2278 case UNIFORM:
2279 fprintf(file, "u%d", inst->src[i].nr);
2280 break;
2281 case BAD_FILE:
2282 fprintf(file, "(null)");
2283 break;
2284 case IMM:
2285 switch (inst->src[i].type) {
2286 case BRW_REGISTER_TYPE_HF:
2287 fprintf(file, "%-ghf", _mesa_half_to_float(inst->src[i].ud & 0xffff));
2288 break;
2289 case BRW_REGISTER_TYPE_F:
2290 fprintf(file, "%-gf", inst->src[i].f);
2291 break;
2292 case BRW_REGISTER_TYPE_DF:
2293 fprintf(file, "%fdf", inst->src[i].df);
2294 break;
2295 case BRW_REGISTER_TYPE_W:
2296 case BRW_REGISTER_TYPE_D:
2297 fprintf(file, "%dd", inst->src[i].d);
2298 break;
2299 case BRW_REGISTER_TYPE_UW:
2300 case BRW_REGISTER_TYPE_UD:
2301 fprintf(file, "%uu", inst->src[i].ud);
2302 break;
2303 case BRW_REGISTER_TYPE_Q:
2304 fprintf(file, "%" PRId64 "q", inst->src[i].d64);
2305 break;
2306 case BRW_REGISTER_TYPE_UQ:
2307 fprintf(file, "%" PRIu64 "uq", inst->src[i].u64);
2308 break;
2309 case BRW_REGISTER_TYPE_VF:
2310 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
2311 brw_vf_to_float((inst->src[i].ud >> 0) & 0xff),
2312 brw_vf_to_float((inst->src[i].ud >> 8) & 0xff),
2313 brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
2314 brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
2315 break;
2316 case BRW_REGISTER_TYPE_V:
2317 case BRW_REGISTER_TYPE_UV:
2318 fprintf(file, "%08x%s", inst->src[i].ud,
2319 inst->src[i].type == BRW_REGISTER_TYPE_V ? "V" : "UV");
2320 break;
2321 default:
2322 fprintf(file, "???");
2323 break;
2324 }
2325 break;
2326 case ARF:
2327 switch (inst->src[i].nr) {
2328 case BRW_ARF_NULL:
2329 fprintf(file, "null");
2330 break;
2331 case BRW_ARF_ADDRESS:
2332 fprintf(file, "a0.%d", inst->src[i].subnr);
2333 break;
2334 case BRW_ARF_ACCUMULATOR:
2335 fprintf(file, "acc%d", inst->src[i].subnr);
2336 break;
2337 case BRW_ARF_FLAG:
2338 fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
2339 break;
2340 default:
2341 fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
2342 break;
2343 }
2344 break;
2345 }
2346
2347 if (inst->src[i].offset ||
2348 (inst->src[i].file == VGRF &&
2349 alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
2350 const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
2351 fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
2352 inst->src[i].offset % reg_size);
2353 }
2354
2355 if (inst->src[i].abs)
2356 fprintf(file, "|");
2357
2358 if (inst->src[i].file != IMM) {
2359 unsigned stride;
2360 if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
2361 unsigned hstride = inst->src[i].hstride;
2362 stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
2363 } else {
2364 stride = inst->src[i].stride;
2365 }
2366 if (stride != 1)
2367 fprintf(file, "<%u>", stride);
2368
2369 fprintf(file, ":%s", brw_reg_type_to_letters(inst->src[i].type));
2370 }
2371
2372 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
2373 fprintf(file, ", ");
2374 }
2375
2376 fprintf(file, " ");
2377
2378 if (inst->force_writemask_all)
2379 fprintf(file, "NoMask ");
2380
2381 if (inst->exec_size != dispatch_width)
2382 fprintf(file, "group%d ", inst->group);
2383
2384 fprintf(file, "\n");
2385 }
2386
2387 brw::register_pressure::register_pressure(const fs_visitor *v)
2388 {
2389 const fs_live_variables &live = v->live_analysis.require();
2390 const unsigned num_instructions = v->cfg->num_blocks ?
2391 v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;
2392
2393 regs_live_at_ip = new unsigned[num_instructions]();
2394
2395 for (unsigned reg = 0; reg < v->alloc.count; reg++) {
2396 for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
2397 regs_live_at_ip[ip] += v->alloc.sizes[reg];
2398 }
2399
2400 const unsigned payload_count = v->first_non_payload_grf;
2401
2402 int *payload_last_use_ip = new int[payload_count];
2403 v->calculate_payload_ranges(payload_count, payload_last_use_ip);
2404
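   /* Payload registers are live from the start of the program until their
    * last use.
    */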
2405 for (unsigned reg = 0; reg < payload_count; reg++) {
2406 for (int ip = 0; ip < payload_last_use_ip[reg]; ip++)
2407 ++regs_live_at_ip[ip];
2408 }
2409
2410 delete[] payload_last_use_ip;
2411 }
2412
2413 brw::register_pressure::~register_pressure()
2414 {
2415 delete[] regs_live_at_ip;
2416 }
2417
2418 void
2419 fs_visitor::invalidate_analysis(brw::analysis_dependency_class c)
2420 {
2421 backend_shader::invalidate_analysis(c);
2422 live_analysis.invalidate(c);
2423 regpressure_analysis.invalidate(c);
2424 }
2425
2426 void
2427 fs_visitor::debug_optimizer(const nir_shader *nir,
2428 const char *pass_name,
2429 int iteration, int pass_num) const
2430 {
2431 if (!brw_should_print_shader(nir, DEBUG_OPTIMIZER))
2432 return;
2433
2434 char *filename;
2435 int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
2436 debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
2437 _mesa_shader_stage_to_abbrev(stage), dispatch_width, nir->info.name,
2438 iteration, pass_num, pass_name);
2439 if (ret == -1)
2440 return;
2441 dump_instructions(filename);
2442 free(filename);
2443 }
2444
2445 uint32_t
2446 fs_visitor::compute_max_register_pressure()
2447 {
2448 const register_pressure &rp = regpressure_analysis.require();
2449 uint32_t ip = 0, max_pressure = 0;
2450 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
2451 max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
2452 ip++;
2453 }
2454 return max_pressure;
2455 }
2456
2457 static fs_inst **
2458 save_instruction_order(const struct cfg_t *cfg)
2459 {
2460 /* Before we schedule anything, stash off the instruction order as an array
2461 * of fs_inst *. This way, we can reset it between scheduling passes to
2462 * prevent dependencies between the different scheduling modes.
2463 */
2464 int num_insts = cfg->last_block()->end_ip + 1;
2465 fs_inst **inst_arr = new fs_inst * [num_insts];
2466
2467 int ip = 0;
2468 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2469 assert(ip >= block->start_ip && ip <= block->end_ip);
2470 inst_arr[ip++] = inst;
2471 }
2472 assert(ip == num_insts);
2473
2474 return inst_arr;
2475 }
2476
2477 static void
2478 restore_instruction_order(struct cfg_t *cfg, fs_inst **inst_arr)
2479 {
2480 ASSERTED int num_insts = cfg->last_block()->end_ip + 1;
2481
2482 int ip = 0;
2483 foreach_block (block, cfg) {
2484 block->instructions.make_empty();
2485
2486 assert(ip == block->start_ip);
2487 for (; ip <= block->end_ip; ip++)
2488 block->instructions.push_tail(inst_arr[ip]);
2489 }
2490 assert(ip == num_insts);
2491 }
2492
2493 void
2494 fs_visitor::allocate_registers(bool allow_spilling)
2495 {
2496 bool allocated;
2497
2498 static const enum instruction_scheduler_mode pre_modes[] = {
2499 SCHEDULE_PRE,
2500 SCHEDULE_PRE_NON_LIFO,
2501 SCHEDULE_NONE,
2502 SCHEDULE_PRE_LIFO,
2503 };
2504
2505 static const char *scheduler_mode_name[] = {
2506 [SCHEDULE_PRE] = "top-down",
2507 [SCHEDULE_PRE_NON_LIFO] = "non-lifo",
2508 [SCHEDULE_PRE_LIFO] = "lifo",
2509 [SCHEDULE_POST] = "post",
2510 [SCHEDULE_NONE] = "none",
2511 };
2512
2513 uint32_t best_register_pressure = UINT32_MAX;
2514 enum instruction_scheduler_mode best_sched = SCHEDULE_NONE;
2515
2516 brw_fs_opt_compact_virtual_grfs(*this);
2517
2518 if (needs_register_pressure)
2519 shader_stats.max_register_pressure = compute_max_register_pressure();
2520
2521 debug_optimizer(nir, "pre_register_allocate", 90, 90);
2522
2523 bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
2524
2525 /* Before we schedule anything, stash off the instruction order as an array
2526 * of fs_inst *. This way, we can reset it between scheduling passes to
2527 * prevent dependencies between the different scheduling modes.
2528 */
2529 fs_inst **orig_order = save_instruction_order(cfg);
2530 fs_inst **best_pressure_order = NULL;
2531
2532 void *scheduler_ctx = ralloc_context(NULL);
2533 fs_instruction_scheduler *sched = prepare_scheduler(scheduler_ctx);
2534
2535 /* Try each scheduling heuristic to see if it can successfully register
2536 * allocate without spilling. They should be ordered by decreasing
2537 * performance but increasing likelihood of allocating.
2538 */
2539 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
2540 enum instruction_scheduler_mode sched_mode = pre_modes[i];
2541
2542 schedule_instructions_pre_ra(sched, sched_mode);
2543 this->shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
2544
2545 debug_optimizer(nir, shader_stats.scheduler_mode, 95, i);
2546
2547 if (0) {
2548 assign_regs_trivial();
2549 allocated = true;
2550 break;
2551 }
2552
2553 /* We should only spill registers on the last scheduling. */
2554 assert(!spilled_any_registers);
2555
2556 allocated = assign_regs(false, spill_all);
2557 if (allocated)
2558 break;
2559
2560 /* Save the maximum register pressure */
2561 uint32_t this_pressure = compute_max_register_pressure();
2562
2563 if (0) {
2564 fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
2565 scheduler_mode_name[sched_mode], this_pressure);
2566 }
2567
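      /* Remember the schedule with the lowest register pressure so we can
       * fall back to it if every scheduling mode ends up spilling.
       */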
2568 if (this_pressure < best_register_pressure) {
2569 best_register_pressure = this_pressure;
2570 best_sched = sched_mode;
2571 delete[] best_pressure_order;
2572 best_pressure_order = save_instruction_order(cfg);
2573 }
2574
2575 /* Reset back to the original order before trying the next mode */
2576 restore_instruction_order(cfg, orig_order);
2577 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
2578 }
2579
2580 ralloc_free(scheduler_ctx);
2581
2582 if (!allocated) {
2583 if (0) {
2584 fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
2585 scheduler_mode_name[best_sched]);
2586 }
2587 restore_instruction_order(cfg, best_pressure_order);
2588 shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
2589
2590 allocated = assign_regs(allow_spilling, spill_all);
2591 }
2592
2593 delete[] orig_order;
2594 delete[] best_pressure_order;
2595
2596 if (!allocated) {
2597 fail("Failure to register allocate. Reduce number of "
2598 "live scalar values to avoid this.");
2599 } else if (spilled_any_registers) {
2600 brw_shader_perf_log(compiler, log_data,
2601 "%s shader triggered register spilling. "
2602 "Try reducing the number of live scalar "
2603 "values to improve performance.\n",
2604 _mesa_shader_stage_to_string(stage));
2605 }
2606
2607 if (failed)
2608 return;
2609
2610 brw_fs_opt_bank_conflicts(*this);
2611
2612 schedule_instructions_post_ra();
2613
2614 if (last_scratch > 0) {
2615 ASSERTED unsigned max_scratch_size = 2 * 1024 * 1024;
2616
2617 /* Take the max of any previously compiled variant of the shader. In the
2618 * case of bindless shaders with return parts, this will also take the
2619 * max of all parts.
2620 */
2621 prog_data->total_scratch = MAX2(brw_get_scratch_size(last_scratch),
2622 prog_data->total_scratch);
2623
2624 /* We currently only support up to 2MB of scratch space. If we
2625 * need to support more eventually, the documentation suggests
2626 * that we could allocate a larger buffer, and partition it out
2627 * ourselves. We'd just have to undo the hardware's address
2628 * calculation by subtracting (FFTID * Per Thread Scratch Space)
2629 * and then add FFTID * (Larger Per Thread Scratch Space).
2630 *
2631 * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
2632 * Thread Group Tracking > Local Memory/Scratch Space.
2633 */
2634 assert(prog_data->total_scratch < max_scratch_size);
2635 }
2636
2637 brw_fs_lower_scoreboard(*this);
2638 }
2639
2640 bool
2641 fs_visitor::run_vs()
2642 {
2643 assert(stage == MESA_SHADER_VERTEX);
2644
2645 payload_ = new vs_thread_payload(*this);
2646
2647 nir_to_brw(this);
2648
2649 if (failed)
2650 return false;
2651
2652 emit_urb_writes();
2653
2654 calculate_cfg();
2655
2656 brw_fs_optimize(*this);
2657
2658 assign_curb_setup();
2659 assign_vs_urb_setup();
2660
2661 brw_fs_lower_3src_null_dest(*this);
2662 brw_fs_workaround_memory_fence_before_eot(*this);
2663 brw_fs_workaround_emit_dummy_mov_instruction(*this);
2664
2665 allocate_registers(true /* allow_spilling */);
2666
2667 return !failed;
2668 }
2669
2670 void
2671 fs_visitor::set_tcs_invocation_id()
2672 {
2673 struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
2674 struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
2675 const fs_builder bld = fs_builder(this).at_end();
2676
2677 const unsigned instance_id_mask =
2678 (devinfo->verx10 >= 125) ? INTEL_MASK(7, 0) :
2679 (devinfo->ver >= 11) ? INTEL_MASK(22, 16) :
2680 INTEL_MASK(23, 17);
2681 const unsigned instance_id_shift =
2682 (devinfo->verx10 >= 125) ? 0 : (devinfo->ver >= 11) ? 16 : 17;
2683
2684 /* Get instance number from g0.2 bits:
2685 * * 7:0 on DG2+
2686 * * 22:16 on gfx11+
2687 * * 23:17 otherwise
2688 */
2689 fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
2690 bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
2691 brw_imm_ud(instance_id_mask));
2692
2693 invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
2694
2695 if (vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH) {
2696 /* gl_InvocationID is just the thread number */
2697 bld.SHR(invocation_id, t, brw_imm_ud(instance_id_shift));
2698 return;
2699 }
2700
2701 assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH);
2702
2703 fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW);
2704 fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD);
2705 bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));
2706 bld.MOV(channels_ud, channels_uw);
2707
2708 if (tcs_prog_data->instances == 1) {
2709 invocation_id = channels_ud;
2710 } else {
2711 fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD);
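      /* Shifting right by (instance_id_shift - 3) extracts the instance
       * number and multiplies it by 8 in a single operation.
       */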
2712 bld.SHR(instance_times_8, t, brw_imm_ud(instance_id_shift - 3));
2713 bld.ADD(invocation_id, instance_times_8, channels_ud);
2714 }
2715 }
2716
2717 void
2718 fs_visitor::emit_tcs_thread_end()
2719 {
2720 /* Try and tag the last URB write with EOT instead of emitting a whole
2721 * separate write just to finish the thread. There isn't guaranteed to
2722 * be one, so this may not succeed.
2723 */
2724 if (mark_last_urb_write_with_eot())
2725 return;
2726
2727 const fs_builder bld = fs_builder(this).at_end();
2728
2729 /* Emit a URB write to end the thread. On Broadwell, we use this to write
2730 * zero to the "TR DS Cache Disable" bit (we haven't implemented a fancy
2731 * algorithm to set it optimally). On other platforms, we simply write
2732 * zero to a reserved/MBZ patch header DWord which has no consequence.
2733 */
2734 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2735 srcs[URB_LOGICAL_SRC_HANDLE] = tcs_payload().patch_urb_output;
2736 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(WRITEMASK_X << 16);
2737 srcs[URB_LOGICAL_SRC_DATA] = brw_imm_ud(0);
2738 srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1);
2739 fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
2740 reg_undef, srcs, ARRAY_SIZE(srcs));
2741 inst->eot = true;
2742 }
2743
2744 bool
2745 fs_visitor::run_tcs()
2746 {
2747 assert(stage == MESA_SHADER_TESS_CTRL);
2748
2749 struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
2750 const fs_builder bld = fs_builder(this).at_end();
2751
2752 assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH ||
2753 vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH);
2754
2755 payload_ = new tcs_thread_payload(*this);
2756
2757 /* Initialize gl_InvocationID */
2758 set_tcs_invocation_id();
2759
2760 const bool fix_dispatch_mask =
2761 vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH &&
2762 (nir->info.tess.tcs_vertices_out % 8) != 0;
2763
2764    /* Fix the dispatch mask: disable the channels whose invocation ID is
2764     * beyond the number of output vertices in the patch.
2764     */
2765 if (fix_dispatch_mask) {
2766 bld.CMP(bld.null_reg_ud(), invocation_id,
2767 brw_imm_ud(nir->info.tess.tcs_vertices_out), BRW_CONDITIONAL_L);
2768 bld.IF(BRW_PREDICATE_NORMAL);
2769 }
2770
2771 nir_to_brw(this);
2772
2773 if (fix_dispatch_mask) {
2774 bld.emit(BRW_OPCODE_ENDIF);
2775 }
2776
2777 emit_tcs_thread_end();
2778
2779 if (failed)
2780 return false;
2781
2782 calculate_cfg();
2783
2784 brw_fs_optimize(*this);
2785
2786 assign_curb_setup();
2787 assign_tcs_urb_setup();
2788
2789 brw_fs_lower_3src_null_dest(*this);
2790 brw_fs_workaround_memory_fence_before_eot(*this);
2791 brw_fs_workaround_emit_dummy_mov_instruction(*this);
2792
2793 allocate_registers(true /* allow_spilling */);
2794
2795 return !failed;
2796 }
2797
2798 bool
2799 fs_visitor::run_tes()
2800 {
2801 assert(stage == MESA_SHADER_TESS_EVAL);
2802
2803 payload_ = new tes_thread_payload(*this);
2804
2805 nir_to_brw(this);
2806
2807 if (failed)
2808 return false;
2809
2810 emit_urb_writes();
2811
2812 calculate_cfg();
2813
2814 brw_fs_optimize(*this);
2815
2816 assign_curb_setup();
2817 assign_tes_urb_setup();
2818
2819 brw_fs_lower_3src_null_dest(*this);
2820 brw_fs_workaround_memory_fence_before_eot(*this);
2821 brw_fs_workaround_emit_dummy_mov_instruction(*this);
2822
2823 allocate_registers(true /* allow_spilling */);
2824
2825 return !failed;
2826 }
2827
2828 bool
2829 fs_visitor::run_gs()
2830 {
2831 assert(stage == MESA_SHADER_GEOMETRY);
2832
2833 payload_ = new gs_thread_payload(*this);
2834
2835 this->final_gs_vertex_count = vgrf(glsl_uint_type());
2836
2837 if (gs_compile->control_data_header_size_bits > 0) {
2838 /* Create a VGRF to store accumulated control data bits. */
2839 this->control_data_bits = vgrf(glsl_uint_type());
2840
2841 /* If we're outputting more than 32 control data bits, then EmitVertex()
2842 * will set control_data_bits to 0 after emitting the first vertex.
2843 * Otherwise, we need to initialize it to 0 here.
2844 */
2845 if (gs_compile->control_data_header_size_bits <= 32) {
2846 const fs_builder bld = fs_builder(this).at_end();
2847 const fs_builder abld = bld.annotate("initialize control data bits");
2848 abld.MOV(this->control_data_bits, brw_imm_ud(0u));
2849 }
2850 }
2851
2852 nir_to_brw(this);
2853
2854 emit_gs_thread_end();
2855
2856 if (failed)
2857 return false;
2858
2859 calculate_cfg();
2860
2861 brw_fs_optimize(*this);
2862
2863 assign_curb_setup();
2864 assign_gs_urb_setup();
2865
2866 brw_fs_lower_3src_null_dest(*this);
2867 brw_fs_workaround_memory_fence_before_eot(*this);
2868 brw_fs_workaround_emit_dummy_mov_instruction(*this);
2869
2870 allocate_registers(true /* allow_spilling */);
2871
2872 return !failed;
2873 }
2874
2875 /* From the SKL PRM, Volume 16, Workarounds:
2876 *
2877 * 0877 3D Pixel Shader Hang possible when pixel shader dispatched with
2878 * only header phases (R0-R2)
2879 *
2880 * WA: Enable a non-header phase (e.g. push constant) when dispatch would
2881 * have been header only.
2882 *
2883 * Instead of enabling push constants one can alternatively enable one of the
2884 * inputs. Here one simply chooses "layer" which shouldn't impose much
2885 * overhead.
2886 */
2887 static void
2888 gfx9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data)
2889 {
2890 if (wm_prog_data->num_varying_inputs)
2891 return;
2892
2893 if (wm_prog_data->base.curb_read_length)
2894 return;
2895
2896 wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
2897 wm_prog_data->num_varying_inputs = 1;
2898
2899 brw_compute_urb_setup_index(wm_prog_data);
2900 }
2901
2902 bool
2903 fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
2904 {
2905 struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
2906 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
2907 const fs_builder bld = fs_builder(this).at_end();
2908
2909 assert(stage == MESA_SHADER_FRAGMENT);
2910
2911 payload_ = new fs_thread_payload(*this, source_depth_to_render_target);
2912
2913 if (do_rep_send) {
2914 assert(dispatch_width == 16);
2915 emit_repclear_shader();
2916 } else {
2917 if (nir->info.inputs_read > 0 ||
2918 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
2919 (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
2920 emit_interpolation_setup();
2921 }
2922
2923 /* We handle discards by keeping track of the still-live pixels in f0.1.
2924 * Initialize it with the dispatched pixels.
2925 */
2926 if (wm_prog_data->uses_kill) {
2927 const unsigned lower_width = MIN2(dispatch_width, 16);
2928 for (unsigned i = 0; i < dispatch_width / lower_width; i++) {
2929 /* According to the "PS Thread Payload for Normal
2930 * Dispatch" pages on the BSpec, the dispatch mask is
2931 * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on
2932 * gfx6+.
2933 */
2934 const fs_reg dispatch_mask =
2935 devinfo->ver >= 20 ? xe2_vec1_grf(i, 15) :
2936 brw_vec1_grf(i + 1, 7);
2937 bld.exec_all().group(1, 0)
2938 .MOV(brw_sample_mask_reg(bld.group(lower_width, i)),
2939 retype(dispatch_mask, BRW_REGISTER_TYPE_UW));
2940 }
2941 }
2942
2943 if (nir->info.writes_memory)
2944 wm_prog_data->has_side_effects = true;
2945
2946 nir_to_brw(this);
2947
2948 if (failed)
2949 return false;
2950
2951 emit_fb_writes();
2952
2953 calculate_cfg();
2954
2955 brw_fs_optimize(*this);
2956
2957 assign_curb_setup();
2958
2959 if (devinfo->ver == 9)
2960 gfx9_ps_header_only_workaround(wm_prog_data);
2961
2962 assign_urb_setup();
2963
2964 brw_fs_lower_3src_null_dest(*this);
2965 brw_fs_workaround_memory_fence_before_eot(*this);
2966 brw_fs_workaround_emit_dummy_mov_instruction(*this);
2967
2968 allocate_registers(allow_spilling);
2969 }
2970
2971 return !failed;
2972 }
2973
2974 bool
2975 fs_visitor::run_cs(bool allow_spilling)
2976 {
2977 assert(gl_shader_stage_is_compute(stage));
2978 const fs_builder bld = fs_builder(this).at_end();
2979
2980 payload_ = new cs_thread_payload(*this);
2981
2982 if (devinfo->platform == INTEL_PLATFORM_HSW && prog_data->total_shared > 0) {
2983 /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
2984 const fs_builder abld = bld.exec_all().group(1, 0);
2985 abld.MOV(retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW),
2986 suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1));
2987 }
2988
2989 nir_to_brw(this);
2990
2991 if (failed)
2992 return false;
2993
2994 emit_cs_terminate();
2995
2996 calculate_cfg();
2997
2998 brw_fs_optimize(*this);
2999
3000 assign_curb_setup();
3001
3002 brw_fs_lower_3src_null_dest(*this);
3003 brw_fs_workaround_memory_fence_before_eot(*this);
3004 brw_fs_workaround_emit_dummy_mov_instruction(*this);
3005
3006 allocate_registers(allow_spilling);
3007
3008 return !failed;
3009 }
3010
3011 bool
3012 fs_visitor::run_bs(bool allow_spilling)
3013 {
3014 assert(stage >= MESA_SHADER_RAYGEN && stage <= MESA_SHADER_CALLABLE);
3015
3016 payload_ = new bs_thread_payload(*this);
3017
3018 nir_to_brw(this);
3019
3020 if (failed)
3021 return false;
3022
3023 /* TODO(RT): Perhaps rename this? */
3024 emit_cs_terminate();
3025
3026 calculate_cfg();
3027
3028 brw_fs_optimize(*this);
3029
3030 assign_curb_setup();
3031
3032 brw_fs_lower_3src_null_dest(*this);
3033 brw_fs_workaround_memory_fence_before_eot(*this);
3034 brw_fs_workaround_emit_dummy_mov_instruction(*this);
3035
3036 allocate_registers(allow_spilling);
3037
3038 return !failed;
3039 }
3040
3041 bool
3042 fs_visitor::run_task(bool allow_spilling)
3043 {
3044 assert(stage == MESA_SHADER_TASK);
3045
3046 payload_ = new task_mesh_thread_payload(*this);
3047
3048 nir_to_brw(this);
3049
3050 if (failed)
3051 return false;
3052
3053 emit_urb_fence();
3054
3055 emit_cs_terminate();
3056
3057 calculate_cfg();
3058
3059 brw_fs_optimize(*this);
3060
3061 assign_curb_setup();
3062
3063 brw_fs_lower_3src_null_dest(*this);
3064 brw_fs_workaround_memory_fence_before_eot(*this);
3065 brw_fs_workaround_emit_dummy_mov_instruction(*this);
3066
3067 allocate_registers(allow_spilling);
3068
3069 return !failed;
3070 }
3071
3072 bool
3073 fs_visitor::run_mesh(bool allow_spilling)
3074 {
3075 assert(stage == MESA_SHADER_MESH);
3076
3077 payload_ = new task_mesh_thread_payload(*this);
3078
3079 nir_to_brw(this);
3080
3081 if (failed)
3082 return false;
3083
3084 emit_urb_fence();
3085
3086 emit_cs_terminate();
3087
3088 calculate_cfg();
3089
3090 brw_fs_optimize(*this);
3091
3092 assign_curb_setup();
3093
3094 brw_fs_lower_3src_null_dest(*this);
3095 brw_fs_workaround_memory_fence_before_eot(*this);
3096 brw_fs_workaround_emit_dummy_mov_instruction(*this);
3097
3098 allocate_registers(allow_spilling);
3099
3100 return !failed;
3101 }
3102
3103 static bool
3104 is_used_in_not_interp_frag_coord(nir_def *def)
3105 {
3106 nir_foreach_use_including_if(src, def) {
3107 if (nir_src_is_if(src))
3108 return true;
3109
3110 if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic)
3111 return true;
3112
3113 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(nir_src_parent_instr(src));
3114 if (intrin->intrinsic != nir_intrinsic_load_frag_coord)
3115 return true;
3116 }
3117
3118 return false;
3119 }
3120
3121 /**
3122 * Return a bitfield where bit n is set if barycentric interpolation mode n
3123 * (see enum brw_barycentric_mode) is needed by the fragment shader.
3124 *
3125 * We examine the load_barycentric intrinsics rather than looking at input
3126 * variables so that we catch interpolateAtCentroid() messages too, which
3127 * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
3128 */
3129 static unsigned
3130 brw_compute_barycentric_interp_modes(const struct intel_device_info *devinfo,
3131 const nir_shader *shader)
3132 {
3133 unsigned barycentric_interp_modes = 0;
3134
3135 nir_foreach_function_impl(impl, shader) {
3136 nir_foreach_block(block, impl) {
3137 nir_foreach_instr(instr, block) {
3138 if (instr->type != nir_instr_type_intrinsic)
3139 continue;
3140
3141 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
3142 switch (intrin->intrinsic) {
3143 case nir_intrinsic_load_barycentric_pixel:
3144 case nir_intrinsic_load_barycentric_centroid:
3145 case nir_intrinsic_load_barycentric_sample:
3146 case nir_intrinsic_load_barycentric_at_sample:
3147 case nir_intrinsic_load_barycentric_at_offset:
3148 break;
3149 default:
3150 continue;
3151 }
3152
3153 /* Ignore WPOS; it doesn't require interpolation. */
3154 if (!is_used_in_not_interp_frag_coord(&intrin->def))
3155 continue;
3156
3157 nir_intrinsic_op bary_op = intrin->intrinsic;
3158 enum brw_barycentric_mode bary =
3159 brw_barycentric_mode(intrin);
3160
3161 barycentric_interp_modes |= 1 << bary;
3162
3163 if (devinfo->needs_unlit_centroid_workaround &&
3164 bary_op == nir_intrinsic_load_barycentric_centroid)
3165 barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
3166 }
3167 }
3168 }
3169
3170 return barycentric_interp_modes;
3171 }
3172
3173 static void
3174 brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
3175 const nir_shader *shader)
3176 {
3177 prog_data->flat_inputs = 0;
3178
3179 nir_foreach_shader_in_variable(var, shader) {
3180 /* flat shading */
3181 if (var->data.interpolation != INTERP_MODE_FLAT)
3182 continue;
3183
3184 if (var->data.per_primitive)
3185 continue;
3186
3187 unsigned slots = glsl_count_attribute_slots(var->type, false);
3188 for (unsigned s = 0; s < slots; s++) {
3189 int input_index = prog_data->urb_setup[var->data.location + s];
3190
3191 if (input_index >= 0)
3192 prog_data->flat_inputs |= 1 << input_index;
3193 }
3194 }
3195 }
3196
3197 static uint8_t
3198 computed_depth_mode(const nir_shader *shader)
3199 {
3200 if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3201 switch (shader->info.fs.depth_layout) {
3202 case FRAG_DEPTH_LAYOUT_NONE:
3203 case FRAG_DEPTH_LAYOUT_ANY:
3204 return BRW_PSCDEPTH_ON;
3205 case FRAG_DEPTH_LAYOUT_GREATER:
3206 return BRW_PSCDEPTH_ON_GE;
3207 case FRAG_DEPTH_LAYOUT_LESS:
3208 return BRW_PSCDEPTH_ON_LE;
3209 case FRAG_DEPTH_LAYOUT_UNCHANGED:
3210 /* We initially set this to OFF, but having the shader write the
3211 * depth means we allocate register space in the SEND message. The
3212 * difference between the SEND register count and the OFF state
3213 * programming makes the HW hang.
3214 *
3215 * Removing the depth writes also leads to test failures. So use
3216 * LesserThanOrEqual, which fits writing the same value
3217 * (unchanged/equal).
3218 *
3219 */
3220 return BRW_PSCDEPTH_ON_LE;
3221 }
3222 }
3223 return BRW_PSCDEPTH_OFF;
3224 }
3225
3226 /**
3227 * Move load_interpolated_input with simple (payload-based) barycentric modes
3228 * to the top of the program so we don't emit multiple PLNs for the same input.
3229 *
3230 * This works around CSE not being able to handle non-dominating cases
3231 * such as:
3232 *
3233 * if (...) {
3234 * interpolate input
3235 * } else {
3236 * interpolate the same exact input
3237 * }
3238 *
3239 * This should be replaced by global value numbering someday.
3240 */
3241 bool
3242 brw_nir_move_interpolation_to_top(nir_shader *nir)
3243 {
3244 bool progress = false;
3245
3246 nir_foreach_function_impl(impl, nir) {
3247 nir_block *top = nir_start_block(impl);
3248 nir_cursor cursor = nir_before_instr(nir_block_first_instr(top));
3249 bool impl_progress = false;
3250
3251 for (nir_block *block = nir_block_cf_tree_next(top);
3252 block != NULL;
3253 block = nir_block_cf_tree_next(block)) {
3254
3255 nir_foreach_instr_safe(instr, block) {
3256 if (instr->type != nir_instr_type_intrinsic)
3257 continue;
3258
3259 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
3260 if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
3261 continue;
3262 nir_intrinsic_instr *bary_intrinsic =
3263 nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
3264 nir_intrinsic_op op = bary_intrinsic->intrinsic;
3265
3266 /* Leave interpolateAtSample/Offset() where they are. */
3267 if (op == nir_intrinsic_load_barycentric_at_sample ||
3268 op == nir_intrinsic_load_barycentric_at_offset)
3269 continue;
3270
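            /* Move the barycentric intrinsic, the offset source and the load
             * itself as a unit so every use stays dominated by its
             * definition.
             */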
3271 nir_instr *move[3] = {
3272 &bary_intrinsic->instr,
3273 intrin->src[1].ssa->parent_instr,
3274 instr
3275 };
3276
3277 for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
3278 if (move[i]->block != top) {
3279 nir_instr_move(cursor, move[i]);
3280 impl_progress = true;
3281 }
3282 }
3283 }
3284 }
3285
3286 progress = progress || impl_progress;
3287
3288 nir_metadata_preserve(impl, impl_progress ? (nir_metadata_block_index |
3289 nir_metadata_dominance)
3290 : nir_metadata_all);
3291 }
3292
3293 return progress;
3294 }
3295
3296 static void
3297 brw_nir_populate_wm_prog_data(nir_shader *shader,
3298 const struct intel_device_info *devinfo,
3299 const struct brw_wm_prog_key *key,
3300 struct brw_wm_prog_data *prog_data,
3301 const struct brw_mue_map *mue_map)
3302 {
3303 prog_data->uses_kill = shader->info.fs.uses_discard ||
3304 shader->info.fs.uses_demote;
3305 prog_data->uses_omask = !key->ignore_sample_mask_out &&
3306 (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
3307 prog_data->color_outputs_written = key->color_outputs_valid;
3308 prog_data->max_polygons = 1;
3309 prog_data->computed_depth_mode = computed_depth_mode(shader);
3310 prog_data->computed_stencil =
3311 shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
3312
3313 prog_data->sample_shading =
3314 shader->info.fs.uses_sample_shading ||
3315 shader->info.outputs_read;
3316
3317 assert(key->multisample_fbo != BRW_NEVER ||
3318 key->persample_interp == BRW_NEVER);
3319
3320 prog_data->persample_dispatch = key->persample_interp;
3321 if (prog_data->sample_shading)
3322 prog_data->persample_dispatch = BRW_ALWAYS;
3323
3324 /* We can only persample dispatch if we have a multisample FBO */
3325 prog_data->persample_dispatch = MIN2(prog_data->persample_dispatch,
3326 key->multisample_fbo);
3327
3328 /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If
3329 * persample_dispatch & multisample_fbo are not dynamic, Anv should be able
3330 * to definitively tell whether alpha_to_coverage is on or off.
3331 */
3332 prog_data->alpha_to_coverage = key->alpha_to_coverage;
3333 assert(prog_data->alpha_to_coverage != BRW_SOMETIMES ||
3334 prog_data->persample_dispatch == BRW_SOMETIMES);
3335
3336 prog_data->uses_sample_mask =
3337 BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);
3338
3339 /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
3340 *
3341 * "MSDISPMODE_PERSAMPLE is required in order to select
3342 * POSOFFSET_SAMPLE"
3343 *
3344 * So we can only really get sample positions if we are doing real
3345 * per-sample dispatch. If we need gl_SamplePosition and we don't have
3346 * persample dispatch, we hard-code it to 0.5.
3347 */
3348 prog_data->uses_pos_offset =
3349 prog_data->persample_dispatch != BRW_NEVER &&
3350 (BITSET_TEST(shader->info.system_values_read,
3351 SYSTEM_VALUE_SAMPLE_POS) ||
3352 BITSET_TEST(shader->info.system_values_read,
3353 SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));
3354
3355 prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
3356 prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
3357 prog_data->inner_coverage = shader->info.fs.inner_coverage;
3358
3359 prog_data->barycentric_interp_modes =
3360 brw_compute_barycentric_interp_modes(devinfo, shader);
3361
3362 /* From the BDW PRM documentation for 3DSTATE_WM:
3363 *
3364 * "MSDISPMODE_PERSAMPLE is required in order to select Perspective
3365    *    Sample or Non-perspective Sample barycentric coordinates."
3366 *
3367 * So cleanup any potentially set sample barycentric mode when not in per
3368 * sample dispatch.
3369 */
3370 if (prog_data->persample_dispatch == BRW_NEVER) {
3371 prog_data->barycentric_interp_modes &=
3372 ~BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE);
3373 }
3374
3375 prog_data->uses_nonperspective_interp_modes |=
3376 (prog_data->barycentric_interp_modes &
3377 BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0;
3378
3379 /* The current VK_EXT_graphics_pipeline_library specification requires
3380    * coarse to be specified at compile time. But per-sample interpolation can be
3381 * dynamic. So we should never be in a situation where coarse &
3382 * persample_interp are both respectively true & BRW_ALWAYS.
3383 *
3384    * Coarse will be dynamically turned off when persample_interp is active.
3385 */
3386 assert(!key->coarse_pixel || key->persample_interp != BRW_ALWAYS);
3387
3388 prog_data->coarse_pixel_dispatch =
3389 brw_sometimes_invert(prog_data->persample_dispatch);
3390 if (!key->coarse_pixel ||
3391 prog_data->uses_omask ||
3392 prog_data->sample_shading ||
3393 prog_data->uses_sample_mask ||
3394 (prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) ||
3395 prog_data->computed_stencil) {
3396 prog_data->coarse_pixel_dispatch = BRW_NEVER;
3397 }
3398
3399 /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater,
3400 * Message Descriptor :
3401 *
3402 * "Message Type. Specifies the type of message being sent when
3403 * pixel-rate evaluation is requested :
3404 *
3405 * Format = U2
3406 * 0: Per Message Offset (eval_snapped with immediate offset)
3407 * 1: Sample Position Offset (eval_sindex)
3408 * 2: Centroid Position Offset (eval_centroid)
3409 * 3: Per Slot Offset (eval_snapped with register offset)
3410 *
3411 * Message Type. Specifies the type of message being sent when
3412 * coarse-rate evaluation is requested :
3413 *
3414 * Format = U2
3415 * 0: Coarse to Pixel Mapping Message (internal message)
3416 * 1: Reserved
3417 * 2: Coarse Centroid Position (eval_centroid)
3418 * 3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)"
3419 *
3420 * The Sample Position Offset is marked as reserved for coarse rate
3421 * evaluation and leads to hangs if we try to use it. So disable coarse
3422 * pixel shading if we have any intrinsic that will result in a pixel
3423 * interpolater message at sample.
3424 */
3425 if (intel_nir_pulls_at_sample(shader))
3426 prog_data->coarse_pixel_dispatch = BRW_NEVER;
3427
3428 /* We choose to always enable VMask prior to XeHP, as it would cause
3429 * us to lose out on the eliminate_find_live_channel() optimization.
3430 */
3431 prog_data->uses_vmask = devinfo->verx10 < 125 ||
3432 shader->info.fs.needs_quad_helper_invocations ||
3433 shader->info.uses_wide_subgroup_intrinsics ||
3434 prog_data->coarse_pixel_dispatch != BRW_NEVER;
3435
3436 prog_data->uses_src_w =
3437 BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
3438 prog_data->uses_src_depth =
3439 BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
3440 prog_data->coarse_pixel_dispatch != BRW_ALWAYS;
3441 prog_data->uses_depth_w_coefficients =
3442 BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
3443 prog_data->coarse_pixel_dispatch != BRW_NEVER;
3444
3445 calculate_urb_setup(devinfo, key, prog_data, shader, mue_map);
3446 brw_compute_flat_inputs(prog_data, shader);
3447 }
3448
3449 /**
3450 * Pre-gfx6, the register file of the EUs was shared between threads,
3451 * and each thread used some subset allocated on a 16-register block
3452 * granularity. The unit states wanted these block counts.
3453 */
3454 static inline int
3455 brw_register_blocks(int reg_count)
3456 {
3457 return ALIGN(reg_count, 16) / 16 - 1;
3458 }
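/* Illustrative sketch with hypothetical register counts:
 *
 *    brw_register_blocks(16) == 0   (a single 16-register block)
 *    brw_register_blocks(48) == 2   (three blocks, encoded as count minus one)
 *    brw_register_blocks(50) == 3   (rounds up to four blocks)
 */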
3459
3460 const unsigned *
3461 brw_compile_fs(const struct brw_compiler *compiler,
3462 struct brw_compile_fs_params *params)
3463 {
3464 struct nir_shader *nir = params->base.nir;
3465 const struct brw_wm_prog_key *key = params->key;
3466 struct brw_wm_prog_data *prog_data = params->prog_data;
3467 bool allow_spilling = params->allow_spilling;
3468 const bool debug_enabled =
3469 brw_should_print_shader(nir, params->base.debug_flag ?
3470 params->base.debug_flag : DEBUG_WM);
3471
3472 prog_data->base.stage = MESA_SHADER_FRAGMENT;
3473 prog_data->base.ray_queries = nir->info.ray_queries;
3474 prog_data->base.total_scratch = 0;
3475
3476 const struct intel_device_info *devinfo = compiler->devinfo;
3477 const unsigned max_subgroup_size = 32;
3478
3479 brw_nir_apply_key(nir, compiler, &key->base, max_subgroup_size);
3480 brw_nir_lower_fs_inputs(nir, devinfo, key);
3481 brw_nir_lower_fs_outputs(nir);
3482
3483 /* From the SKL PRM, Volume 7, "Alpha Coverage":
3484 * "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in
3485 * hardware, regardless of the state setting for this feature."
3486 */
3487 if (key->alpha_to_coverage != BRW_NEVER) {
3488 /* Run the constant folding optimization in order to get the correct
3489 * source offset needed to identify the render target 0 store
3490 * instruction in the emit_alpha_to_coverage pass.
3491 */
3492 NIR_PASS(_, nir, nir_opt_constant_folding);
3493 NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage, key, prog_data);
3494 }
3495
3496 NIR_PASS(_, nir, brw_nir_move_interpolation_to_top);
3497 brw_postprocess_nir(nir, compiler, debug_enabled,
3498 key->base.robust_flags);
3499
3500 brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data,
3501 params->mue_map);
3502
3503 std::unique_ptr<fs_visitor> v8, v16, v32, vmulti;
3504 cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL,
3505 *multi_cfg = NULL;
3506 float throughput = 0;
3507 bool has_spilled = false;
3508
3509 if (devinfo->ver < 20) {
3510 v8 = std::make_unique<fs_visitor>(compiler, &params->base, key,
3511 prog_data, nir, 8, 1,
3512 params->base.stats != NULL,
3513 debug_enabled);
3514 if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) {
3515 params->base.error_str = ralloc_strdup(params->base.mem_ctx,
3516 v8->fail_msg);
3517 return NULL;
3518 } else if (INTEL_SIMD(FS, 8)) {
3519 simd8_cfg = v8->cfg;
3520
3521 assert(v8->payload().num_regs % reg_unit(devinfo) == 0);
3522 prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs / reg_unit(devinfo);
3523
3524 prog_data->reg_blocks_8 = brw_register_blocks(v8->grf_used);
3525 const performance &perf = v8->performance_analysis.require();
3526 throughput = MAX2(throughput, perf.throughput);
3527 has_spilled = v8->spilled_any_registers;
3528 allow_spilling = false;
3529 }
3530 }
3531
3532 if (key->coarse_pixel && devinfo->ver < 20) {
3533 if (prog_data->dual_src_blend) {
3534 v8->limit_dispatch_width(8, "SIMD16 coarse pixel shading cannot"
3535 " use SIMD8 messages.\n");
3536 }
3537 v8->limit_dispatch_width(16, "SIMD32 not supported with coarse"
3538 " pixel shading.\n");
3539 }
3540
3541 if (nir->info.ray_queries > 0 && v8)
3542 v8->limit_dispatch_width(16, "SIMD32 with ray queries.\n");
3543
3544 if (!has_spilled &&
3545 (!v8 || v8->max_dispatch_width >= 16) &&
3546 (INTEL_SIMD(FS, 16) || params->use_rep_send)) {
3547 /* Try a SIMD16 compile */
3548 v16 = std::make_unique<fs_visitor>(compiler, &params->base, key,
3549 prog_data, nir, 16, 1,
3550 params->base.stats != NULL,
3551 debug_enabled);
3552 if (v8)
3553 v16->import_uniforms(v8.get());
3554 if (!v16->run_fs(allow_spilling, params->use_rep_send)) {
3555 brw_shader_perf_log(compiler, params->base.log_data,
3556 "SIMD16 shader failed to compile: %s\n",
3557 v16->fail_msg);
3558 } else {
3559 simd16_cfg = v16->cfg;
3560
3561 assert(v16->payload().num_regs % reg_unit(devinfo) == 0);
3562 prog_data->dispatch_grf_start_reg_16 = v16->payload().num_regs / reg_unit(devinfo);
3563
3564 prog_data->reg_blocks_16 = brw_register_blocks(v16->grf_used);
3565 const performance &perf = v16->performance_analysis.require();
3566 throughput = MAX2(throughput, perf.throughput);
3567 has_spilled = v16->spilled_any_registers;
3568 allow_spilling = false;
3569 }
3570 }
3571
3572 const bool simd16_failed = v16 && !simd16_cfg;
3573
3574 /* Currently, the compiler only supports SIMD32 on SNB+ */
3575 if (!has_spilled &&
3576 (!v8 || v8->max_dispatch_width >= 32) &&
3577 (!v16 || v16->max_dispatch_width >= 32) && !params->use_rep_send &&
3578 !simd16_failed &&
3579 INTEL_SIMD(FS, 32)) {
3580 /* Try a SIMD32 compile */
3581 v32 = std::make_unique<fs_visitor>(compiler, &params->base, key,
3582 prog_data, nir, 32, 1,
3583 params->base.stats != NULL,
3584 debug_enabled);
3585 if (v8)
3586 v32->import_uniforms(v8.get());
3587 else if (v16)
3588 v32->import_uniforms(v16.get());
3589
3590 if (!v32->run_fs(allow_spilling, false)) {
3591 brw_shader_perf_log(compiler, params->base.log_data,
3592 "SIMD32 shader failed to compile: %s\n",
3593 v32->fail_msg);
3594 } else {
3595 const performance &perf = v32->performance_analysis.require();
3596
3597 if (!INTEL_DEBUG(DEBUG_DO32) && throughput >= perf.throughput) {
3598 brw_shader_perf_log(compiler, params->base.log_data,
3599 "SIMD32 shader inefficient\n");
3600 } else {
3601 simd32_cfg = v32->cfg;
3602
3603 assert(v32->payload().num_regs % reg_unit(devinfo) == 0);
3604 prog_data->dispatch_grf_start_reg_32 = v32->payload().num_regs / reg_unit(devinfo);
3605
3606 prog_data->reg_blocks_32 = brw_register_blocks(v32->grf_used);
3607 throughput = MAX2(throughput, perf.throughput);
3608 }
3609 }
3610 }
3611
3612 if (devinfo->ver >= 12 && !has_spilled &&
3613 params->max_polygons >= 2 && !key->coarse_pixel) {
3614 fs_visitor *vbase = v8 ? v8.get() : v16 ? v16.get() : v32.get();
3615 assert(vbase);
3616
3617 if (devinfo->ver >= 20 &&
3618 params->max_polygons >= 4 &&
3619 vbase->max_dispatch_width >= 32 &&
3620 4 * prog_data->num_varying_inputs <= MAX_VARYING &&
3621 INTEL_SIMD(FS, 4X8)) {
3622 /* Try a quad-SIMD8 compile */
3623 vmulti = std::make_unique<fs_visitor>(compiler, &params->base, key,
3624 prog_data, nir, 32, 4,
3625 params->base.stats != NULL,
3626 debug_enabled);
3627 vmulti->import_uniforms(vbase);
3628 if (!vmulti->run_fs(false, params->use_rep_send)) {
3629 brw_shader_perf_log(compiler, params->base.log_data,
3630 "Quad-SIMD8 shader failed to compile: %s\n",
3631 vmulti->fail_msg);
3632 } else {
3633 multi_cfg = vmulti->cfg;
3634 assert(!vmulti->spilled_any_registers);
3635 }
3636 }
3637
3638 if (!multi_cfg && devinfo->ver >= 20 &&
3639 vbase->max_dispatch_width >= 32 &&
3640 2 * prog_data->num_varying_inputs <= MAX_VARYING &&
3641 INTEL_SIMD(FS, 2X16)) {
3642 /* Try a dual-SIMD16 compile */
3643 vmulti = std::make_unique<fs_visitor>(compiler, &params->base, key,
3644 prog_data, nir, 32, 2,
3645 params->base.stats != NULL,
3646 debug_enabled);
3647 vmulti->import_uniforms(vbase);
3648 if (!vmulti->run_fs(false, params->use_rep_send)) {
3649 brw_shader_perf_log(compiler, params->base.log_data,
3650 "Dual-SIMD16 shader failed to compile: %s\n",
3651 vmulti->fail_msg);
3652 } else {
3653 multi_cfg = vmulti->cfg;
3654 assert(!vmulti->spilled_any_registers);
3655 }
3656 }
3657
3658 if (!multi_cfg && vbase->max_dispatch_width >= 16 &&
3659 2 * prog_data->num_varying_inputs <= MAX_VARYING &&
3660 INTEL_SIMD(FS, 2X8)) {
3661 /* Try a dual-SIMD8 compile */
3662 vmulti = std::make_unique<fs_visitor>(compiler, &params->base, key,
3663 prog_data, nir, 16, 2,
3664 params->base.stats != NULL,
3665 debug_enabled);
3666 vmulti->import_uniforms(vbase);
3667 if (!vmulti->run_fs(allow_spilling, params->use_rep_send)) {
3668 brw_shader_perf_log(compiler, params->base.log_data,
3669 "Dual-SIMD8 shader failed to compile: %s\n",
3670 vmulti->fail_msg);
3671 } else {
3672 multi_cfg = vmulti->cfg;
3673 }
3674 }
3675
3676 if (multi_cfg) {
3677 assert(vmulti->payload().num_regs % reg_unit(devinfo) == 0);
3678 prog_data->base.dispatch_grf_start_reg = vmulti->payload().num_regs / reg_unit(devinfo);
3679
3680 prog_data->reg_blocks_8 = brw_register_blocks(vmulti->grf_used);
3681 }
3682 }
3683
3684 /* When the caller requests a repclear shader, they want SIMD16-only */
3685 if (params->use_rep_send)
3686 simd8_cfg = NULL;
3687
3688 fs_generator g(compiler, &params->base, &prog_data->base,
3689 MESA_SHADER_FRAGMENT);
3690
3691 if (unlikely(debug_enabled)) {
3692 g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
3693 "%s fragment shader %s",
3694 nir->info.label ?
3695 nir->info.label : "unnamed",
3696 nir->info.name));
3697 }
3698
3699 struct brw_compile_stats *stats = params->base.stats;
3700 uint32_t max_dispatch_width = 0;
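/* Note: params->base.stats, when provided, is treated as an array with one
 * entry per generated SIMD variant. Each generate_code() call below fills
 * one entry and advances the stats pointer; the loop after code generation
 * back-fills max_dispatch_width into every entry that was written.
 */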
3701
3702 if (multi_cfg) {
3703 prog_data->dispatch_multi = vmulti->dispatch_width;
3704 prog_data->max_polygons = vmulti->max_polygons;
3705 g.generate_code(multi_cfg, vmulti->dispatch_width, vmulti->shader_stats,
3706 vmulti->performance_analysis.require(),
3707 stats, vmulti->max_polygons);
3708 stats = stats ? stats + 1 : NULL;
3709 max_dispatch_width = vmulti->dispatch_width;
3710
3711 } else if (simd8_cfg) {
3712 prog_data->dispatch_8 = true;
3713 g.generate_code(simd8_cfg, 8, v8->shader_stats,
3714 v8->performance_analysis.require(), stats, 1);
3715 stats = stats ? stats + 1 : NULL;
3716 max_dispatch_width = 8;
3717 }
3718
3719 if (simd16_cfg) {
3720 prog_data->dispatch_16 = true;
3721 prog_data->prog_offset_16 = g.generate_code(
3722 simd16_cfg, 16, v16->shader_stats,
3723 v16->performance_analysis.require(), stats, 1);
3724 stats = stats ? stats + 1 : NULL;
3725 max_dispatch_width = 16;
3726 }
3727
3728 if (simd32_cfg) {
3729 prog_data->dispatch_32 = true;
3730 prog_data->prog_offset_32 = g.generate_code(
3731 simd32_cfg, 32, v32->shader_stats,
3732 v32->performance_analysis.require(), stats, 1);
3733 stats = stats ? stats + 1 : NULL;
3734 max_dispatch_width = 32;
3735 }
3736
3737 for (struct brw_compile_stats *s = params->base.stats; s != NULL && s != stats; s++)
3738 s->max_dispatch_width = max_dispatch_width;
3739
3740 g.add_const_data(nir->constant_data, nir->constant_data_size);
3741 return g.get_assembly();
3742 }
3743
3744 unsigned
3745 brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
3746 unsigned threads)
3747 {
3748 assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
3749 assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
3750 return cs_prog_data->push.per_thread.size * threads +
3751 cs_prog_data->push.cross_thread.size;
3752 }
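/* Illustrative example with hypothetical sizes: a 64-byte per-thread block,
 * a 96-byte cross-thread block and 8 threads give a total push constant
 * size of 64 * 8 + 96 = 608 bytes.
 */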
3753
3754 static void
3755 fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords)
3756 {
3757 block->dwords = dwords;
3758 block->regs = DIV_ROUND_UP(dwords, 8);
3759 block->size = block->regs * 32;
3760 }
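/* Illustrative example: 20 push constant dwords take DIV_ROUND_UP(20, 8) = 3
 * registers, i.e. block->size = 3 * 32 = 96 bytes, since each 32-byte GRF
 * holds 8 dwords.
 */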
3761
3762 static void
3763 cs_fill_push_const_info(const struct intel_device_info *devinfo,
3764 struct brw_cs_prog_data *cs_prog_data)
3765 {
3766 const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
3767 int subgroup_id_index = brw_get_subgroup_id_param_index(devinfo, prog_data);
3768
3769 /* The thread ID should be stored in the last param dword */
3770 assert(subgroup_id_index == -1 ||
3771 subgroup_id_index == (int)prog_data->nr_params - 1);
3772
3773 unsigned cross_thread_dwords, per_thread_dwords;
3774 if (subgroup_id_index >= 0) {
3775 /* Fill all but the last register with cross-thread payload */
3776 cross_thread_dwords = 8 * (subgroup_id_index / 8);
3777 per_thread_dwords = prog_data->nr_params - cross_thread_dwords;
3778 assert(per_thread_dwords > 0 && per_thread_dwords <= 8);
3779 } else {
3780 /* Fill all data using cross-thread payload */
3781 cross_thread_dwords = prog_data->nr_params;
3782 per_thread_dwords = 0u;
3783 }
3784
3785 fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);
3786 fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);
3787
3788 assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
3789 cs_prog_data->push.per_thread.size == 0);
3790 assert(cs_prog_data->push.cross_thread.dwords +
3791 cs_prog_data->push.per_thread.dwords ==
3792 prog_data->nr_params);
3793 }
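/* Illustrative example with hypothetical values: for nr_params == 18 and the
 * subgroup ID stored in the last param (subgroup_id_index == 17), the
 * cross-thread block gets 8 * (17 / 8) = 16 dwords and the per-thread block
 * gets the remaining 18 - 16 = 2 dwords, which include the subgroup ID.
 */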
3794
3795 static bool
3796 filter_simd(const nir_instr *instr, const void * /* options */)
3797 {
3798 if (instr->type != nir_instr_type_intrinsic)
3799 return false;
3800
3801 switch (nir_instr_as_intrinsic(instr)->intrinsic) {
3802 case nir_intrinsic_load_simd_width_intel:
3803 case nir_intrinsic_load_subgroup_id:
3804 return true;
3805
3806 default:
3807 return false;
3808 }
3809 }
3810
3811 static nir_def *
3812 lower_simd(nir_builder *b, nir_instr *instr, void *options)
3813 {
3814 uintptr_t simd_width = (uintptr_t)options;
3815
3816 switch (nir_instr_as_intrinsic(instr)->intrinsic) {
3817 case nir_intrinsic_load_simd_width_intel:
3818 return nir_imm_int(b, simd_width);
3819
3820 case nir_intrinsic_load_subgroup_id:
3821 /* If the whole workgroup fits in one thread, we can lower subgroup_id
3822 * to a constant zero.
3823 */
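/* For example (illustrative sizes): a fixed 8x4x1 workgroup has 32
 * invocations, so when lowering for a SIMD32 dispatch the whole group is a
 * single subgroup and load_subgroup_id folds to 0, while for SIMD16 it is
 * left alone.
 */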
3824 if (!b->shader->info.workgroup_size_variable) {
3825 unsigned local_workgroup_size = b->shader->info.workgroup_size[0] *
3826 b->shader->info.workgroup_size[1] *
3827 b->shader->info.workgroup_size[2];
3828 if (local_workgroup_size <= simd_width)
3829 return nir_imm_int(b, 0);
3830 }
3831 return NULL;
3832
3833 default:
3834 return NULL;
3835 }
3836 }
3837
3838 bool
3839 brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)
3840 {
3841 return nir_shader_lower_instructions(nir, filter_simd, lower_simd,
3842 (void *)(uintptr_t)dispatch_width);
3843 }
3844
3845 const unsigned *
3846 brw_compile_cs(const struct brw_compiler *compiler,
3847 struct brw_compile_cs_params *params)
3848 {
3849 const nir_shader *nir = params->base.nir;
3850 const struct brw_cs_prog_key *key = params->key;
3851 struct brw_cs_prog_data *prog_data = params->prog_data;
3852
3853 const bool debug_enabled =
3854 brw_should_print_shader(nir, params->base.debug_flag ?
3855 params->base.debug_flag : DEBUG_CS);
3856
3857 prog_data->base.stage = MESA_SHADER_COMPUTE;
3858 prog_data->base.total_shared = nir->info.shared_size;
3859 prog_data->base.ray_queries = nir->info.ray_queries;
3860 prog_data->base.total_scratch = 0;
3861
3862 if (!nir->info.workgroup_size_variable) {
3863 prog_data->local_size[0] = nir->info.workgroup_size[0];
3864 prog_data->local_size[1] = nir->info.workgroup_size[1];
3865 prog_data->local_size[2] = nir->info.workgroup_size[2];
3866 }
3867
3868 brw_simd_selection_state simd_state{
3869 .devinfo = compiler->devinfo,
3870 .prog_data = prog_data,
3871 .required_width = brw_required_dispatch_width(&nir->info),
3872 };
3873
3874 std::unique_ptr<fs_visitor> v[3];
3875
3876 for (unsigned simd = 0; simd < 3; simd++) {
3877 if (!brw_simd_should_compile(simd_state, simd))
3878 continue;
3879
3880 const unsigned dispatch_width = 8u << simd;
3881
3882 nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
3883 brw_nir_apply_key(shader, compiler, &key->base,
3884 dispatch_width);
3885
3886 NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);
3887
3888 /* Clean up after the local index and ID calculations. */
3889 NIR_PASS(_, shader, nir_opt_constant_folding);
3890 NIR_PASS(_, shader, nir_opt_dce);
3891
3892 brw_postprocess_nir(shader, compiler, debug_enabled,
3893 key->base.robust_flags);
3894
3895 v[simd] = std::make_unique<fs_visitor>(compiler, &params->base,
3896 &key->base,
3897 &prog_data->base,
3898 shader, dispatch_width,
3899 params->base.stats != NULL,
3900 debug_enabled);
3901
3902 const int first = brw_simd_first_compiled(simd_state);
3903 if (first >= 0)
3904 v[simd]->import_uniforms(v[first].get());
3905
3906 const bool allow_spilling = first < 0 || nir->info.workgroup_size_variable;
3907
3908 if (v[simd]->run_cs(allow_spilling)) {
3909 cs_fill_push_const_info(compiler->devinfo, prog_data);
3910
3911 brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
3912 } else {
3913 simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
3914 if (simd > 0) {
3915 brw_shader_perf_log(compiler, params->base.log_data,
3916 "SIMD%u shader failed to compile: %s\n",
3917 dispatch_width, v[simd]->fail_msg);
3918 }
3919 }
3920 }
3921
3922 const int selected_simd = brw_simd_select(simd_state);
3923 if (selected_simd < 0) {
3924 params->base.error_str =
3925 ralloc_asprintf(params->base.mem_ctx,
3926 "Can't compile shader: "
3927 "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
3928 simd_state.error[0], simd_state.error[1],
3929 simd_state.error[2]);
3930 return NULL;
3931 }
3932
3933 assert(selected_simd < 3);
3934
3935 if (!nir->info.workgroup_size_variable)
3936 prog_data->prog_mask = 1 << selected_simd;
3937
3938 fs_generator g(compiler, &params->base, &prog_data->base,
3939 MESA_SHADER_COMPUTE);
3940 if (unlikely(debug_enabled)) {
3941 char *name = ralloc_asprintf(params->base.mem_ctx,
3942 "%s compute shader %s",
3943 nir->info.label ?
3944 nir->info.label : "unnamed",
3945 nir->info.name);
3946 g.enable_debug(name);
3947 }
3948
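/* Illustrative example: if the SIMD16 and SIMD32 variants both compiled,
 * prog_mask is 0b110, util_last_bit() returns 3 and the initial
 * max_dispatch_width below is 8u << 2 == 32.
 */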
3949 uint32_t max_dispatch_width = 8u << (util_last_bit(prog_data->prog_mask) - 1);
3950
3951 struct brw_compile_stats *stats = params->base.stats;
3952 for (unsigned simd = 0; simd < 3; simd++) {
3953 if (prog_data->prog_mask & (1u << simd)) {
3954 assert(v[simd]);
3955 prog_data->prog_offset[simd] =
3956 g.generate_code(v[simd]->cfg, 8u << simd, v[simd]->shader_stats,
3957 v[simd]->performance_analysis.require(), stats);
3958 if (stats)
3959 stats->max_dispatch_width = max_dispatch_width;
3960 stats = stats ? stats + 1 : NULL;
3961 max_dispatch_width = 8u << simd;
3962 }
3963 }
3964
3965 g.add_const_data(nir->constant_data, nir->constant_data_size);
3966
3967 return g.get_assembly();
3968 }
3969
3970 struct intel_cs_dispatch_info
3971 brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
3972 const struct brw_cs_prog_data *prog_data,
3973 const unsigned *override_local_size)
3974 {
3975 struct intel_cs_dispatch_info info = {};
3976
3977 const unsigned *sizes =
3978 override_local_size ? override_local_size :
3979 prog_data->local_size;
3980
3981 const int simd = brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
3982 assert(simd >= 0 && simd < 3);
3983
3984 info.group_size = sizes[0] * sizes[1] * sizes[2];
3985 info.simd_size = 8u << simd;
3986 info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);
3987
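/* Illustrative example: a 50-invocation workgroup at SIMD32 needs 2 threads;
 * the last thread only has 50 % 32 = 18 live channels, so right_mask gets
 * the low 18 bits set (0x3ffff). If the group size is a multiple of the
 * SIMD size, all simd_size channels are enabled instead.
 */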
3988 const uint32_t remainder = info.group_size & (info.simd_size - 1);
3989 if (remainder > 0)
3990 info.right_mask = ~0u >> (32 - remainder);
3991 else
3992 info.right_mask = ~0u >> (32 - info.simd_size);
3993
3994 return info;
3995 }
3996
3997 static uint8_t
3998 compile_single_bs(const struct brw_compiler *compiler,
3999 struct brw_compile_bs_params *params,
4000 const struct brw_bs_prog_key *key,
4001 struct brw_bs_prog_data *prog_data,
4002 nir_shader *shader,
4003 fs_generator *g,
4004 struct brw_compile_stats *stats,
4005 int *prog_offset)
4006 {
4007 const bool debug_enabled = brw_should_print_shader(shader, DEBUG_RT);
4008
4009 prog_data->base.stage = shader->info.stage;
4010 prog_data->max_stack_size = MAX2(prog_data->max_stack_size,
4011 shader->scratch_size);
4012
4013 const unsigned max_dispatch_width = 16;
4014 brw_nir_apply_key(shader, compiler, &key->base, max_dispatch_width);
4015 brw_postprocess_nir(shader, compiler, debug_enabled,
4016 key->base.robust_flags);
4017
4018 brw_simd_selection_state simd_state{
4019 .devinfo = compiler->devinfo,
4020 .prog_data = prog_data,
4021
4022 /* Since divergence is a lot more likely in RT than compute, it makes
4023 * sense to limit ourselves to the smallest available SIMD for now.
4024 */
4025 .required_width = compiler->devinfo->ver >= 20 ? 16u : 8u,
4026 };
4027
4028 std::unique_ptr<fs_visitor> v[2];
4029
4030 for (unsigned simd = 0; simd < ARRAY_SIZE(v); simd++) {
4031 if (!brw_simd_should_compile(simd_state, simd))
4032 continue;
4033
4034 const unsigned dispatch_width = 8u << simd;
4035
4036 if (dispatch_width == 8 && compiler->devinfo->ver >= 20)
4037 continue;
4038
4039 v[simd] = std::make_unique<fs_visitor>(compiler, &params->base,
4040 &key->base,
4041 &prog_data->base, shader,
4042 dispatch_width,
4043 stats != NULL,
4044 debug_enabled);
4045
4046 const bool allow_spilling = !brw_simd_any_compiled(simd_state);
4047 if (v[simd]->run_bs(allow_spilling)) {
4048 brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
4049 } else {
4050 simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx,
4051 v[simd]->fail_msg);
4052 if (simd > 0) {
4053 brw_shader_perf_log(compiler, params->base.log_data,
4054 "SIMD%u shader failed to compile: %s",
4055 dispatch_width, v[simd]->fail_msg);
4056 }
4057 }
4058 }
4059
4060 const int selected_simd = brw_simd_select(simd_state);
4061 if (selected_simd < 0) {
4062 params->base.error_str =
4063 ralloc_asprintf(params->base.mem_ctx,
4064 "Can't compile shader: "
4065 "SIMD8 '%s' and SIMD16 '%s'.\n",
4066 simd_state.error[0], simd_state.error[1]);
4067 return 0;
4068 }
4069
4070 assert(selected_simd < int(ARRAY_SIZE(v)));
4071 fs_visitor *selected = v[selected_simd].get();
4072 assert(selected);
4073
4074 const unsigned dispatch_width = selected->dispatch_width;
4075
4076 int offset = g->generate_code(selected->cfg, dispatch_width, selected->shader_stats,
4077 selected->performance_analysis.require(), stats);
4078 if (prog_offset)
4079 *prog_offset = offset;
4080 else
4081 assert(offset == 0);
4082
4083 return dispatch_width;
4084 }
4085
4086 uint64_t
4087 brw_bsr(const struct intel_device_info *devinfo,
4088 uint32_t offset, uint8_t simd_size, uint8_t local_arg_offset)
4089 {
4090 assert(offset % 64 == 0);
4091 assert(simd_size == 8 || simd_size == 16);
4092 assert(local_arg_offset % 8 == 0);
4093
4094 return offset |
4095 SET_BITS(simd_size == 8, 4, 4) |
4096 SET_BITS(local_arg_offset / 8, 2, 0);
4097 }
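/* Illustrative sketch with hypothetical arguments:
 *
 *    brw_bsr(devinfo, 192, 8, 16) == 192 | (1 << 4) | (16 / 8) == 0xd2
 *
 * i.e. the 64-byte-aligned code offset, the SIMD8 bit, and the local
 * argument offset encoded in 8-byte units.
 */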
4098
4099 const unsigned *
4100 brw_compile_bs(const struct brw_compiler *compiler,
4101 struct brw_compile_bs_params *params)
4102 {
4103 nir_shader *shader = params->base.nir;
4104 struct brw_bs_prog_data *prog_data = params->prog_data;
4105 unsigned num_resume_shaders = params->num_resume_shaders;
4106 nir_shader **resume_shaders = params->resume_shaders;
4107 const bool debug_enabled = brw_should_print_shader(shader, DEBUG_RT);
4108
4109 prog_data->base.stage = shader->info.stage;
4110 prog_data->base.ray_queries = shader->info.ray_queries;
4111 prog_data->base.total_scratch = 0;
4112
4113 prog_data->max_stack_size = 0;
4114 prog_data->num_resume_shaders = num_resume_shaders;
4115
4116 fs_generator g(compiler, &params->base, &prog_data->base,
4117 shader->info.stage);
4118 if (unlikely(debug_enabled)) {
4119 char *name = ralloc_asprintf(params->base.mem_ctx,
4120 "%s %s shader %s",
4121 shader->info.label ?
4122 shader->info.label : "unnamed",
4123 gl_shader_stage_name(shader->info.stage),
4124 shader->info.name);
4125 g.enable_debug(name);
4126 }
4127
4128 prog_data->simd_size =
4129 compile_single_bs(compiler, params, params->key, prog_data,
4130 shader, &g, params->base.stats, NULL);
4131 if (prog_data->simd_size == 0)
4132 return NULL;
4133
4134 uint64_t *resume_sbt = ralloc_array(params->base.mem_ctx,
4135 uint64_t, num_resume_shaders);
4136 for (unsigned i = 0; i < num_resume_shaders; i++) {
4137 if (INTEL_DEBUG(DEBUG_RT)) {
4138 char *name = ralloc_asprintf(params->base.mem_ctx,
4139 "%s %s resume(%u) shader %s",
4140 shader->info.label ?
4141 shader->info.label : "unnamed",
4142 gl_shader_stage_name(shader->info.stage),
4143 i, shader->info.name);
4144 g.enable_debug(name);
4145 }
4146
4147 /* TODO: Figure out shader stats etc. for resume shaders */
4148 int offset = 0;
4149 uint8_t simd_size =
4150 compile_single_bs(compiler, params, params->key,
4151 prog_data, resume_shaders[i], &g, NULL, &offset);
4152 if (simd_size == 0)
4153 return NULL;
4154
4155 assert(offset > 0);
4156 resume_sbt[i] = brw_bsr(compiler->devinfo, offset, simd_size, 0);
4157 }
4158
4159 /* We only have one constant data section, so make sure the resume shaders
4160 * all use the same constant data as the main shader.
4161 */
4162 for (unsigned i = 0; i < num_resume_shaders; i++) {
4163 assert(resume_shaders[i]->constant_data_size ==
4164 shader->constant_data_size);
4165 assert(memcmp(resume_shaders[i]->constant_data,
4166 shader->constant_data,
4167 shader->constant_data_size) == 0);
4168 }
4169
4170 g.add_const_data(shader->constant_data, shader->constant_data_size);
4171 g.add_resume_sbt(num_resume_shaders, resume_sbt);
4172
4173 return g.get_assembly();
4174 }
4175
4176 /**
4177 * Test the dispatch mask packing assumptions of
4178 * brw_stage_has_packed_dispatch(). Call this from e.g. the top of
4179 * fs_visitor::emit_nir_code() to cause a GPU hang if any shader invocation is
4180 * executed with an unexpected dispatch mask.
4181 */
4182 static UNUSED void
4183 brw_fs_test_dispatch_packing(const fs_builder &bld)
4184 {
4185 const fs_visitor *shader = static_cast<const fs_visitor *>(bld.shader);
4186 const gl_shader_stage stage = shader->stage;
4187 const bool uses_vmask =
4188 stage == MESA_SHADER_FRAGMENT &&
4189 brw_wm_prog_data(shader->stage_prog_data)->uses_vmask;
4190
4191 if (brw_stage_has_packed_dispatch(shader->devinfo, stage,
4192 shader->max_polygons,
4193 shader->stage_prog_data)) {
4194 const fs_builder ubld = bld.exec_all().group(1, 0);
4195 const fs_reg tmp = component(bld.vgrf(BRW_REGISTER_TYPE_UD), 0);
4196 const fs_reg mask = uses_vmask ? brw_vmask_reg() : brw_dmask_reg();
4197
4198 ubld.ADD(tmp, mask, brw_imm_ud(1));
4199 ubld.AND(tmp, mask, tmp);
4200
4201 /* This will loop forever if the dispatch mask doesn't have the expected
4202 * form '2^n-1', in which case tmp will be non-zero.
4203 */
4204 bld.emit(BRW_OPCODE_DO);
4205 bld.CMP(bld.null_reg_ud(), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
4206 set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE));
4207 }
4208 }
4209
4210 unsigned
4211 fs_visitor::workgroup_size() const
4212 {
4213 assert(gl_shader_stage_uses_workgroup(stage));
4214 const struct brw_cs_prog_data *cs = brw_cs_prog_data(prog_data);
4215 return cs->local_size[0] * cs->local_size[1] * cs->local_size[2];
4216 }
4217
4218 bool brw_should_print_shader(const nir_shader *shader, uint64_t debug_flag)
4219 {
4220 return INTEL_DEBUG(debug_flag) && (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
4221 }
4222
4223 namespace brw {
4224 fs_reg
4225 fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2],
4226 brw_reg_type type, unsigned n)
4227 {
4228 if (!regs[0])
4229 return fs_reg();
4230
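/* For dispatch widths above 16 the payload spans two GRF ranges, regs[0]
 * and regs[1], one per 16-channel half, so the halves are stitched back
 * together into a single contiguous VGRF with LOAD_PAYLOAD.
 */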
4231 if (bld.dispatch_width() > 16) {
4232 const fs_reg tmp = bld.vgrf(type, n);
4233 const brw::fs_builder hbld = bld.exec_all().group(16, 0);
4234 const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
4235 fs_reg *const components = new fs_reg[m * n];
4236
4237 for (unsigned c = 0; c < n; c++) {
4238 for (unsigned g = 0; g < m; g++)
4239 components[c * m + g] =
4240 offset(retype(brw_vec8_grf(regs[g], 0), type), hbld, c);
4241 }
4242
4243 hbld.LOAD_PAYLOAD(tmp, components, m * n, 0);
4244
4245 delete[] components;
4246 return tmp;
4247
4248 } else {
4249 return fs_reg(retype(brw_vec8_grf(regs[0], 0), type));
4250 }
4251 }
4252
4253 fs_reg
4254 fetch_barycentric_reg(const brw::fs_builder &bld, uint8_t regs[2])
4255 {
4256 if (!regs[0])
4257 return fs_reg();
4258 else if (bld.shader->devinfo->ver >= 20)
4259 return fetch_payload_reg(bld, regs, BRW_REGISTER_TYPE_F, 2);
4260
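/* Prior to Xe2 the payload interleaves the two barycentric coordinates per
 * SIMD8 group, so the SIMD8 chunks are reshuffled here into the layout the
 * rest of the compiler expects: all of coordinate 0 first, then all of
 * coordinate 1.
 */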
4261 const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
4262 const brw::fs_builder hbld = bld.exec_all().group(8, 0);
4263 const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
4264 fs_reg *const components = new fs_reg[2 * m];
4265
4266 for (unsigned c = 0; c < 2; c++) {
4267 for (unsigned g = 0; g < m; g++)
4268 components[c * m + g] = offset(brw_vec8_grf(regs[g / 2], 0),
4269 hbld, c + 2 * (g % 2));
4270 }
4271
4272 hbld.LOAD_PAYLOAD(tmp, components, 2 * m, 0);
4273
4274 delete[] components;
4275 return tmp;
4276 }
4277
4278 void
4279 check_dynamic_msaa_flag(const fs_builder &bld,
4280 const struct brw_wm_prog_data *wm_prog_data,
4281 enum intel_msaa_flags flag)
4282 {
4283 fs_inst *inst = bld.AND(bld.null_reg_ud(),
4284 dynamic_msaa_flags(wm_prog_data),
4285 brw_imm_ud(flag));
4286 inst->conditional_mod = BRW_CONDITIONAL_NZ;
4287 }
4288 }
4289