1 /*
2  * Copyright © 2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 /** @file brw_fs.cpp
25  *
26  * This file drives the GLSL IR -> LIR translation, contains the
27  * optimizations on the LIR, and drives the generation of native code
28  * from the LIR.
29  */
30 
31 #include "brw_eu.h"
32 #include "brw_fs.h"
33 #include "brw_fs_builder.h"
34 #include "brw_fs_live_variables.h"
35 #include "brw_nir.h"
36 #include "brw_cfg.h"
37 #include "brw_dead_control_flow.h"
38 #include "brw_private.h"
39 #include "intel_nir.h"
40 #include "shader_enums.h"
41 #include "dev/intel_debug.h"
42 #include "dev/intel_wa.h"
43 #include "compiler/glsl_types.h"
44 #include "compiler/nir/nir_builder.h"
45 #include "util/u_math.h"
46 
47 #include <memory>
48 
49 using namespace brw;
50 
51 void
52 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
53               const fs_reg *src, unsigned sources)
54 {
55    memset((void*)this, 0, sizeof(*this));
56 
57    this->src = new fs_reg[MAX2(sources, 3)];
58    for (unsigned i = 0; i < sources; i++)
59       this->src[i] = src[i];
60 
61    this->opcode = opcode;
62    this->dst = dst;
63    this->sources = sources;
64    this->exec_size = exec_size;
65 
66    assert(dst.file != IMM && dst.file != UNIFORM);
67 
68    assert(this->exec_size != 0);
69 
70    this->conditional_mod = BRW_CONDITIONAL_NONE;
71 
72    /* This will be the case for almost all instructions. */
73    switch (dst.file) {
74    case VGRF:
75    case ARF:
76    case FIXED_GRF:
77    case ATTR:
78       this->size_written = dst.component_size(exec_size);
79       break;
80    case BAD_FILE:
81       this->size_written = 0;
82       break;
83    case IMM:
84    case UNIFORM:
85       unreachable("Invalid destination register file");
86    }
87 
88    this->writes_accumulator = false;
89 }
90 
91 fs_inst::fs_inst()
92 {
93    init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
94 }
95 
96 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
97 {
98    init(opcode, exec_size, reg_undef, NULL, 0);
99 }
100 
101 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
102 {
103    init(opcode, exec_size, dst, NULL, 0);
104 }
105 
106 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
107                  const fs_reg &src0)
108 {
109    const fs_reg src[1] = { src0 };
110    init(opcode, exec_size, dst, src, 1);
111 }
112 
113 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
114                  const fs_reg &src0, const fs_reg &src1)
115 {
116    const fs_reg src[2] = { src0, src1 };
117    init(opcode, exec_size, dst, src, 2);
118 }
119 
120 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
121                  const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
122 {
123    const fs_reg src[3] = { src0, src1, src2 };
124    init(opcode, exec_size, dst, src, 3);
125 }
126 
127 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
128                  const fs_reg src[], unsigned sources)
129 {
130    init(opcode, exec_width, dst, src, sources);
131 }
132 
133 fs_inst::fs_inst(const fs_inst &that)
134 {
135    memcpy((void*)this, &that, sizeof(that));
136 
137    this->src = new fs_reg[MAX2(that.sources, 3)];
138 
139    for (unsigned i = 0; i < that.sources; i++)
140       this->src[i] = that.src[i];
141 }
142 
143 fs_inst::~fs_inst()
144 {
145    delete[] this->src;
146 }
147 
148 void
149 fs_inst::resize_sources(uint8_t num_sources)
150 {
151    if (this->sources != num_sources) {
152       fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
153 
154       for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
155          src[i] = this->src[i];
156 
157       delete[] this->src;
158       this->src = src;
159       this->sources = num_sources;
160    }
161 }
162 
163 void
164 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
165                                        const fs_reg &dst,
166                                        const fs_reg &surface,
167                                        const fs_reg &surface_handle,
168                                        const fs_reg &varying_offset,
169                                        uint32_t const_offset,
170                                        uint8_t alignment,
171                                        unsigned components)
172 {
173    assert(components <= 4);
174 
175    /* We have our constant surface use a pitch of 4 bytes, so our index can
176     * be any component of a vector, and then we load 4 contiguous
177     * components starting from that.  TODO: Support loading fewer than 4.
178     */
179    fs_reg total_offset = vgrf(glsl_uint_type());
180    bld.ADD(total_offset, varying_offset, brw_imm_ud(const_offset));
181 
182    /* The pull load message will load a vec4 (16 bytes). If we are loading
183     * a double this means we are only loading 2 elements worth of data.
184     * We also want to use a 32-bit data type for the dst of the load operation
185     * so other parts of the driver don't get confused about the size of the
186     * result.
187     */
188    fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
189 
190    fs_reg srcs[PULL_VARYING_CONSTANT_SRCS];
191    srcs[PULL_VARYING_CONSTANT_SRC_SURFACE]        = surface;
192    srcs[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
193    srcs[PULL_VARYING_CONSTANT_SRC_OFFSET]         = total_offset;
194    srcs[PULL_VARYING_CONSTANT_SRC_ALIGNMENT]      = brw_imm_ud(alignment);
195 
196    fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
197                             vec4_result, srcs, PULL_VARYING_CONSTANT_SRCS);
198    inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
199 
200    shuffle_from_32bit_read(bld, dst, vec4_result, 0, components);
201 }
202 
203 bool
204 fs_inst::is_send_from_grf() const
205 {
206    switch (opcode) {
207    case SHADER_OPCODE_SEND:
208    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
209    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
210    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
211    case SHADER_OPCODE_INTERLOCK:
212    case SHADER_OPCODE_MEMORY_FENCE:
213    case SHADER_OPCODE_BARRIER:
214       return true;
215    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
216       return src[1].file == VGRF;
217    case FS_OPCODE_FB_READ:
218       return src[0].file == VGRF;
219    default:
220       return false;
221    }
222 }
223 
224 bool
225 fs_inst::is_control_source(unsigned arg) const
226 {
227    switch (opcode) {
228    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
229       return arg == 0;
230 
231    case SHADER_OPCODE_BROADCAST:
232    case SHADER_OPCODE_SHUFFLE:
233    case SHADER_OPCODE_QUAD_SWIZZLE:
234    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
235    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
236    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
237       return arg == 1;
238 
239    case SHADER_OPCODE_MOV_INDIRECT:
240    case SHADER_OPCODE_CLUSTER_BROADCAST:
241    case SHADER_OPCODE_TEX:
242    case FS_OPCODE_TXB:
243    case SHADER_OPCODE_TXD:
244    case SHADER_OPCODE_TXF:
245    case SHADER_OPCODE_TXF_LZ:
246    case SHADER_OPCODE_TXF_CMS:
247    case SHADER_OPCODE_TXF_CMS_W:
248    case SHADER_OPCODE_TXF_UMS:
249    case SHADER_OPCODE_TXF_MCS:
250    case SHADER_OPCODE_TXL:
251    case SHADER_OPCODE_TXL_LZ:
252    case SHADER_OPCODE_TXS:
253    case SHADER_OPCODE_LOD:
254    case SHADER_OPCODE_TG4:
255    case SHADER_OPCODE_TG4_OFFSET:
256    case SHADER_OPCODE_TG4_BIAS:
257    case SHADER_OPCODE_TG4_EXPLICIT_LOD:
258    case SHADER_OPCODE_TG4_IMPLICIT_LOD:
259    case SHADER_OPCODE_TG4_OFFSET_LOD:
260    case SHADER_OPCODE_TG4_OFFSET_BIAS:
261    case SHADER_OPCODE_SAMPLEINFO:
262       return arg == 1 || arg == 2;
263 
264    case SHADER_OPCODE_SEND:
265       return arg == 0 || arg == 1;
266 
267    default:
268       return false;
269    }
270 }
271 
272 bool
273 fs_inst::is_payload(unsigned arg) const
274 {
275    switch (opcode) {
276    case FS_OPCODE_FB_READ:
277    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
278    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
279    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
280    case SHADER_OPCODE_INTERLOCK:
281    case SHADER_OPCODE_MEMORY_FENCE:
282    case SHADER_OPCODE_BARRIER:
283    case SHADER_OPCODE_TEX:
284    case FS_OPCODE_TXB:
285    case SHADER_OPCODE_TXD:
286    case SHADER_OPCODE_TXF:
287    case SHADER_OPCODE_TXF_LZ:
288    case SHADER_OPCODE_TXF_CMS:
289    case SHADER_OPCODE_TXF_CMS_W:
290    case SHADER_OPCODE_TXF_UMS:
291    case SHADER_OPCODE_TXF_MCS:
292    case SHADER_OPCODE_TXL:
293    case SHADER_OPCODE_TXL_LZ:
294    case SHADER_OPCODE_TXS:
295    case SHADER_OPCODE_LOD:
296    case SHADER_OPCODE_TG4:
297    case SHADER_OPCODE_TG4_OFFSET:
298    case SHADER_OPCODE_TG4_BIAS:
299    case SHADER_OPCODE_TG4_EXPLICIT_LOD:
300    case SHADER_OPCODE_TG4_IMPLICIT_LOD:
301    case SHADER_OPCODE_TG4_OFFSET_LOD:
302    case SHADER_OPCODE_TG4_OFFSET_BIAS:
303    case SHADER_OPCODE_SAMPLEINFO:
304       return arg == 0;
305 
306    case SHADER_OPCODE_SEND:
307       return arg == 2 || arg == 3;
308 
309    default:
310       return false;
311    }
312 }
313 
314 /**
315  * Returns true if this instruction's sources and destinations cannot
316  * safely be the same register.
317  *
318  * In most cases, a register can be written over safely by the same
319  * instruction that is its last use.  For a single instruction, the
320  * sources are dereferenced before writing of the destination starts
321  * (naturally).
322  *
323  * However, there are a few cases where this can be problematic:
324  *
325  * - Virtual opcodes that translate to multiple instructions in the
326  *   code generator: if src == dst and one instruction writes the
327  *   destination before a later instruction reads the source, then
328  *   src will have been clobbered.
329  *
330  * - SIMD16 compressed instructions with certain regioning (see below).
331  *
332  * The register allocator uses this information to set up conflicts between
333  * GRF sources and the destination.
334  */
335 bool
336 fs_inst::has_source_and_destination_hazard() const
337 {
338    switch (opcode) {
339    case FS_OPCODE_PACK_HALF_2x16_SPLIT:
340       /* Multiple partial writes to the destination */
341       return true;
342    case SHADER_OPCODE_SHUFFLE:
343       /* This instruction returns an arbitrary channel from the source and
344        * gets split into smaller instructions in the generator.  It's possible
345        * that one of the instructions will read from a channel corresponding
346        * to an earlier instruction.
347        */
348    case SHADER_OPCODE_SEL_EXEC:
349       /* This is implemented as
350        *
351        * mov(16)      g4<1>D      0D            { align1 WE_all 1H };
352        * mov(16)      g4<1>D      g5<8,8,1>D    { align1 1H }
353        *
354        * Because the source is only read in the second instruction, the first
355        * may stomp all over it.
356        */
357       return true;
358    case SHADER_OPCODE_QUAD_SWIZZLE:
359       switch (src[1].ud) {
360       case BRW_SWIZZLE_XXXX:
361       case BRW_SWIZZLE_YYYY:
362       case BRW_SWIZZLE_ZZZZ:
363       case BRW_SWIZZLE_WWWW:
364       case BRW_SWIZZLE_XXZZ:
365       case BRW_SWIZZLE_YYWW:
366       case BRW_SWIZZLE_XYXY:
367       case BRW_SWIZZLE_ZWZW:
368          /* These can be implemented as a single Align1 region on all
369           * platforms, so there's never a hazard between source and
370           * destination.  C.f. fs_generator::generate_quad_swizzle().
371           */
372          return false;
373       default:
374          return !is_uniform(src[0]);
375       }
376    case BRW_OPCODE_DPAS:
377       /* This is overly conservative. The actual hazard is more complicated to
378        * describe. When the repeat count is N, the single instruction behaves
379        * like N instructions with a repeat count of one, but the destination
380        * and source registers are incremented (in somewhat complex ways) for
381        * each instruction.
382        *
383        * This means the source and destination registers are actually ranges of
384        * registers. The hazard exists if an earlier iteration would write a
385        * register that should be read by a later iteration.
386        *
387        * There may be some advantage to properly modeling this, but for now,
388        * be overly conservative.
389        */
390       return rcount > 1;
391    default:
392       /* The SIMD16 compressed instruction
393        *
394        * add(16)      g4<1>F      g4<8,8,1>F   g6<8,8,1>F
395        *
396        * is actually decoded in hardware as:
397        *
398        * add(8)       g4<1>F      g4<8,8,1>F   g6<8,8,1>F
399        * add(8)       g5<1>F      g5<8,8,1>F   g7<8,8,1>F
400        *
401        * Which is safe.  However, if we have uniform accesses
402        * happening, we get into trouble:
403        *
404        * add(8)       g4<1>F      g4<0,1,0>F   g6<8,8,1>F
405        * add(8)       g5<1>F      g4<0,1,0>F   g7<8,8,1>F
406        *
407        * Now our destination for the first instruction overwrote the
408        * second instruction's src0, and we get garbage for those 8
409        * pixels.  There's a similar issue for the pre-gfx6
410        * pixel_x/pixel_y, which are registers of 16-bit values and thus
411        * would get stomped by the first decode as well.
412        */
413       if (exec_size == 16) {
414          for (int i = 0; i < sources; i++) {
415             if (src[i].file == VGRF && (src[i].stride == 0 ||
416                                         src[i].type == BRW_REGISTER_TYPE_UW ||
417                                         src[i].type == BRW_REGISTER_TYPE_W ||
418                                         src[i].type == BRW_REGISTER_TYPE_UB ||
419                                         src[i].type == BRW_REGISTER_TYPE_B)) {
420                return true;
421             }
422          }
423       }
424       return false;
425    }
426 }
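/* A minimal sketch of how a register allocator could consume this predicate
 * (cf. the comment above about setting up conflicts between GRF sources and
 * the destination).  add_interference() and the loop shape are hypothetical,
 * not the actual brw register allocator code:
 *
 *    foreach_block_and_inst(block, fs_inst, inst, cfg) {
 *       if (!inst->has_source_and_destination_hazard())
 *          continue;
 *       for (int i = 0; i < inst->sources; i++) {
 *          if (inst->dst.file == VGRF && inst->src[i].file == VGRF)
 *             add_interference(inst->dst.nr, inst->src[i].nr);
 *       }
 *    }
 */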
427 
428 bool
429 fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
430 {
431    if (is_send_from_grf())
432       return false;
433 
434    /* From Wa_1604601757:
435     *
436     * "When multiplying a DW and any lower precision integer, source modifier
437     *  is not supported."
438     */
439    if (devinfo->ver >= 12 && (opcode == BRW_OPCODE_MUL ||
440                               opcode == BRW_OPCODE_MAD)) {
441       const brw_reg_type exec_type = get_exec_type(this);
442       const unsigned min_type_sz = opcode == BRW_OPCODE_MAD ?
443          MIN2(type_sz(src[1].type), type_sz(src[2].type)) :
444          MIN2(type_sz(src[0].type), type_sz(src[1].type));
445 
446       if (brw_reg_type_is_integer(exec_type) &&
447           type_sz(exec_type) >= 4 &&
448           type_sz(exec_type) != min_type_sz)
449          return false;
450    }
451 
452    if (!backend_instruction::can_do_source_mods())
453       return false;
454 
455    return true;
456 }
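/* A concrete instance of the Wa_1604601757 check above, using made-up
 * register numbers.  For
 *
 *    mul(8)  g10<1>D  g2<8,8,1>D  g4<8,8,1>W
 *
 * the execution type is D (4 bytes) while the smallest source type is W
 * (2 bytes), so min_type_sz != type_sz(exec_type) and source modifiers are
 * refused on Gfx12+.
 */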
457 
458 bool
459 fs_inst::can_do_cmod()
460 {
461    if (!backend_instruction::can_do_cmod())
462       return false;
463 
464    /* The accumulator result appears to get used for the conditional modifier
465     * generation.  When negating a UD value, there is a 33rd bit generated for
466     * the sign in the accumulator value, so now you can't check, for example,
467     * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
468     */
469    for (unsigned i = 0; i < sources; i++) {
470       if (brw_reg_type_is_unsigned_integer(src[i].type) && src[i].negate)
471          return false;
472    }
473 
474    return true;
475 }
476 
477 bool
478 fs_inst::can_change_types() const
479 {
480    return dst.type == src[0].type &&
481           !src[0].abs && !src[0].negate && !saturate && src[0].file != ATTR &&
482           (opcode == BRW_OPCODE_MOV ||
483            (opcode == BRW_OPCODE_SEL &&
484             dst.type == src[1].type &&
485             predicate != BRW_PREDICATE_NONE &&
486             !src[1].abs && !src[1].negate && src[1].file != ATTR));
487 }
488 
489 void
490 fs_reg::init()
491 {
492    memset((void*)this, 0, sizeof(*this));
493    type = BRW_REGISTER_TYPE_UD;
494    stride = 1;
495 }
496 
497 /** Generic unset register constructor. */
498 fs_reg::fs_reg()
499 {
500    init();
501    this->file = BAD_FILE;
502 }
503 
504 fs_reg::fs_reg(struct ::brw_reg reg) :
505    backend_reg(reg)
506 {
507    this->offset = 0;
508    this->stride = 1;
509    if (this->file == IMM &&
510        (this->type != BRW_REGISTER_TYPE_V &&
511         this->type != BRW_REGISTER_TYPE_UV &&
512         this->type != BRW_REGISTER_TYPE_VF)) {
513       this->stride = 0;
514    }
515 }
516 
517 bool
518 fs_reg::equals(const fs_reg &r) const
519 {
520    return (this->backend_reg::equals(r) &&
521            stride == r.stride);
522 }
523 
524 bool
525 fs_reg::negative_equals(const fs_reg &r) const
526 {
527    return (this->backend_reg::negative_equals(r) &&
528            stride == r.stride);
529 }
530 
531 bool
532 fs_reg::is_contiguous() const
533 {
534    switch (file) {
535    case ARF:
536    case FIXED_GRF:
537       return hstride == BRW_HORIZONTAL_STRIDE_1 &&
538              vstride == width + hstride;
539    case VGRF:
540    case ATTR:
541       return stride == 1;
542    case UNIFORM:
543    case IMM:
544    case BAD_FILE:
545       return true;
546    }
547 
548    unreachable("Invalid register file");
549 }
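/* For ARF/FIXED_GRF the check above compares the *encoded* region fields.
 * Assuming the usual log2-style encodings (hstride 1 -> 1, width 8 -> 3,
 * vstride 8 -> 4), a plain <8;8,1> region satisfies
 * vstride == width + hstride (4 == 3 + 1) and is reported contiguous, while
 * a replicated <0;1,0> scalar is not.
 */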
550 
551 unsigned
552 fs_reg::component_size(unsigned width) const
553 {
554    if (file == ARF || file == FIXED_GRF) {
555       const unsigned w = MIN2(width, 1u << this->width);
556       const unsigned h = width >> this->width;
557       const unsigned vs = vstride ? 1 << (vstride - 1) : 0;
558       const unsigned hs = hstride ? 1 << (hstride - 1) : 0;
559       assert(w > 0);
560       return ((MAX2(1, h) - 1) * vs + (w - 1) * hs + 1) * type_sz(type);
561    } else {
562       return MAX2(width * stride, 1) * type_sz(type);
563    }
564 }
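/* A worked example of the ARF/FIXED_GRF branch above, assuming the usual
 * encodings (width 8 -> 3, vstride 8 -> 4, hstride 1 -> 1).  For an
 * <8;8,1>:F region, component_size(16) evaluates to:
 *
 *    w  = MIN2(16, 1 << 3) = 8    elements per row
 *    h  = 16 >> 3          = 2    rows
 *    vs = 1 << (4 - 1)     = 8    elements between rows
 *    hs = 1 << (1 - 1)     = 1    elements between columns
 *
 *    ((2 - 1) * 8 + (8 - 1) * 1 + 1) * 4 = 64 bytes,
 *
 * i.e. two full 32-byte GRFs.
 */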
565 
566 void
567 fs_visitor::vfail(const char *format, va_list va)
568 {
569    char *msg;
570 
571    if (failed)
572       return;
573 
574    failed = true;
575 
576    msg = ralloc_vasprintf(mem_ctx, format, va);
577    msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
578          dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg);
579 
580    this->fail_msg = msg;
581 
582    if (unlikely(debug_enabled)) {
583       fprintf(stderr, "%s",  msg);
584    }
585 }
586 
587 void
588 fs_visitor::fail(const char *format, ...)
589 {
590    va_list va;
591 
592    va_start(va, format);
593    vfail(format, va);
594    va_end(va);
595 }
596 
597 /**
598  * Mark this program as impossible to compile with dispatch width greater
599  * than n.
600  *
601  * During the SIMD8 compile (which happens first), we can detect and flag
602  * things that are unsupported in SIMD16+ mode, so the compiler can skip the
603  * SIMD16+ compile altogether.
604  *
605  * During a compile of dispatch width greater than n (if one happens anyway),
606  * this just calls fail().
607  */
608 void
609 fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
610 {
611    if (dispatch_width > n) {
612       fail("%s", msg);
613    } else {
614       max_dispatch_width = MIN2(max_dispatch_width, n);
615       brw_shader_perf_log(compiler, log_data,
616                           "Shader dispatch width limited to SIMD%d: %s\n",
617                           n, msg);
618    }
619 }
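/* A hypothetical call site (the condition name is invented) illustrating the
 * two paths above: during the SIMD8 compile this only lowers
 * max_dispatch_width, so the wider variants are skipped, while during a
 * SIMD16+ compile of the same shader it turns into fail():
 *
 *    if (uses_feature_unsupported_in_simd16)
 *       limit_dispatch_width(8, "feature X unsupported in SIMD16+\n");
 */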
620 
621 /**
622  * Returns true if the instruction has a flag that means it won't
623  * update an entire destination register.
624  *
625  * For example, dead code elimination and live variable analysis want to know
626  * when a write to a variable screens off any preceding values that were in
627  * it.
628  */
629 bool
630 fs_inst::is_partial_write() const
631 {
632    if (this->predicate && !this->predicate_trivial &&
633        this->opcode != BRW_OPCODE_SEL)
634       return true;
635 
636    if (this->dst.offset % REG_SIZE != 0)
637       return true;
638 
639    /* SEND instructions always write whole registers */
640    if (this->opcode == SHADER_OPCODE_SEND)
641       return false;
642 
643    /* Special case UNDEF since a lot of places in the backend do things like this:
644     *
645     *  fs_builder ubld = bld.exec_all().group(1, 0);
646     *  fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
647     *  ubld.UNDEF(tmp); <- partial write, even though it covers the whole register
648     */
649    if (this->opcode == SHADER_OPCODE_UNDEF) {
650       assert(this->dst.is_contiguous());
651       return this->size_written < 32;
652    }
653 
654    return this->exec_size * type_sz(this->dst.type) < 32 ||
655           !this->dst.is_contiguous();
656 }
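/* Worked examples of the final size check above: a SIMD8 write to a W-typed
 * destination covers 8 * 2 = 16 bytes, below the 32-byte threshold used
 * here, so it is a partial write; a SIMD8 write to a contiguous F-typed
 * destination covers 8 * 4 = 32 bytes and is not.
 */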
657 
658 unsigned
659 fs_inst::components_read(unsigned i) const
660 {
661    /* Return zero if the source is not present. */
662    if (src[i].file == BAD_FILE)
663       return 0;
664 
665    switch (opcode) {
666    case FS_OPCODE_LINTERP:
667       if (i == 0)
668          return 2;
669       else
670          return 1;
671 
672    case FS_OPCODE_PIXEL_X:
673    case FS_OPCODE_PIXEL_Y:
674       assert(i < 2);
675       if (i == 0)
676          return 2;
677       else
678          return 1;
679 
680    case FS_OPCODE_FB_WRITE_LOGICAL:
681       assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
682       /* First/second FB write color. */
683       if (i < 2)
684          return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
685       else
686          return 1;
687 
688    case SHADER_OPCODE_TEX_LOGICAL:
689    case SHADER_OPCODE_TXD_LOGICAL:
690    case SHADER_OPCODE_TXF_LOGICAL:
691    case SHADER_OPCODE_TXL_LOGICAL:
692    case SHADER_OPCODE_TXS_LOGICAL:
693    case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
694    case FS_OPCODE_TXB_LOGICAL:
695    case SHADER_OPCODE_TXF_CMS_LOGICAL:
696    case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
697    case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
698    case SHADER_OPCODE_TXF_UMS_LOGICAL:
699    case SHADER_OPCODE_TXF_MCS_LOGICAL:
700    case SHADER_OPCODE_LOD_LOGICAL:
701    case SHADER_OPCODE_TG4_LOGICAL:
702    case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
703    case SHADER_OPCODE_TG4_BIAS_LOGICAL:
704    case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
705    case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
706    case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
707    case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
708    case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
709       assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
710              src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
711              src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
712       /* Texture coordinates. */
713       if (i == TEX_LOGICAL_SRC_COORDINATE)
714          return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
715       /* Texture derivatives. */
716       else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
717                opcode == SHADER_OPCODE_TXD_LOGICAL)
718          return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
719       /* Texture offset. */
720       else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
721          return 2;
722       /* MCS */
723       else if (i == TEX_LOGICAL_SRC_MCS) {
724          if (opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
725             return 2;
726          else if (opcode == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL)
727             return 4;
728          else
729             return 1;
730       } else
731          return 1;
732 
733    case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
734    case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
735       assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM);
736       /* Surface coordinates. */
737       if (i == SURFACE_LOGICAL_SRC_ADDRESS)
738          return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
739       /* Surface operation source (ignored for reads). */
740       else if (i == SURFACE_LOGICAL_SRC_DATA)
741          return 0;
742       else
743          return 1;
744 
745    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
746    case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
747       assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
748              src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
749       /* Surface coordinates. */
750       if (i == SURFACE_LOGICAL_SRC_ADDRESS)
751          return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
752       /* Surface operation source. */
753       else if (i == SURFACE_LOGICAL_SRC_DATA)
754          return src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
755       else
756          return 1;
757 
758    case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
759    case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
760    case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
761       assert(src[A64_LOGICAL_ARG].file == IMM);
762       return 1;
763 
764    case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
765       assert(src[A64_LOGICAL_ARG].file == IMM);
766       if (i == A64_LOGICAL_SRC) { /* data to write */
767          const unsigned comps = src[A64_LOGICAL_ARG].ud / exec_size;
768          assert(comps > 0);
769          return comps;
770       } else {
771          return 1;
772       }
773 
774    case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
775       assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
776       return 1;
777 
778    case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
779       assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
780       if (i == SURFACE_LOGICAL_SRC_DATA) {
781          const unsigned comps = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud / exec_size;
782          assert(comps > 0);
783          return comps;
784       } else {
785          return 1;
786       }
787 
788    case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
789       assert(src[A64_LOGICAL_ARG].file == IMM);
790       return i == A64_LOGICAL_SRC ? src[A64_LOGICAL_ARG].ud : 1;
791 
792    case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
793       assert(src[A64_LOGICAL_ARG].file == IMM);
794       return i == A64_LOGICAL_SRC ?
795              lsc_op_num_data_values(src[A64_LOGICAL_ARG].ud) : 1;
796 
797    case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
798    case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
799       /* Scattered logical opcodes use the following params:
800        * src[0] Surface coordinates
801        * src[1] Surface operation source (ignored for reads)
802        * src[2] Surface
803        * src[3] IMM with the number of dimensions (always 1).
804        * src[4] IMM with the bit size of the scattered read/write: 8, 16 or 32.
805        */
806       assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
807              src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
808       return i == SURFACE_LOGICAL_SRC_DATA ? 0 : 1;
809 
810    case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
811    case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
812       assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
813              src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
814       return 1;
815 
816    case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
817    case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
818       assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
819              src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
820       const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
821       /* Surface coordinates. */
822       if (i == SURFACE_LOGICAL_SRC_ADDRESS)
823          return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
824       /* Surface operation source. */
825       else if (i == SURFACE_LOGICAL_SRC_DATA)
826          return lsc_op_num_data_values(op);
827       else
828          return 1;
829    }
830    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
831       return (i == 0 ? 2 : 1);
832 
833    case SHADER_OPCODE_URB_WRITE_LOGICAL:
834       assert(src[URB_LOGICAL_SRC_COMPONENTS].file == IMM);
835 
836       if (i == URB_LOGICAL_SRC_DATA)
837          return src[URB_LOGICAL_SRC_COMPONENTS].ud;
838       else
839          return 1;
840 
841    case BRW_OPCODE_DPAS:
842       unreachable("Do not use components_read() for DPAS.");
843 
844    default:
845       return 1;
846    }
847 }
848 
849 unsigned
850 fs_inst::size_read(int arg) const
851 {
852    switch (opcode) {
853    case SHADER_OPCODE_SEND:
854       if (arg == 2) {
855          return mlen * REG_SIZE;
856       } else if (arg == 3) {
857          return ex_mlen * REG_SIZE;
858       }
859       break;
860 
861    case FS_OPCODE_FB_READ:
862    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
863    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
864       if (arg == 0)
865          return mlen * REG_SIZE;
866       break;
867 
868    case FS_OPCODE_LINTERP:
869       if (arg == 1)
870          return 16;
871       break;
872 
873    case SHADER_OPCODE_LOAD_PAYLOAD:
874       if (arg < this->header_size)
875          return retype(src[arg], BRW_REGISTER_TYPE_UD).component_size(8);
876       break;
877 
878    case CS_OPCODE_CS_TERMINATE:
879    case SHADER_OPCODE_BARRIER:
880       return REG_SIZE;
881 
882    case SHADER_OPCODE_MOV_INDIRECT:
883       if (arg == 0) {
884          assert(src[2].file == IMM);
885          return src[2].ud;
886       }
887       break;
888 
889    case BRW_OPCODE_DPAS:
890       switch (arg) {
891       case 0:
892          if (src[0].type == BRW_REGISTER_TYPE_HF) {
893             return rcount * REG_SIZE / 2;
894          } else {
895             return rcount * REG_SIZE;
896          }
897       case 1:
898          return sdepth * REG_SIZE;
899       case 2:
900          /* This is simpler than the formula described in the Bspec, but it
901           * covers all of the cases that we support on DG2.
902           */
903          return rcount * REG_SIZE;
904       default:
905          unreachable("Invalid source number.");
906       }
907       break;
908 
909    case SHADER_OPCODE_TEX:
910    case FS_OPCODE_TXB:
911    case SHADER_OPCODE_TXD:
912    case SHADER_OPCODE_TXF:
913    case SHADER_OPCODE_TXF_LZ:
914    case SHADER_OPCODE_TXF_CMS:
915    case SHADER_OPCODE_TXF_CMS_W:
916    case SHADER_OPCODE_TXF_UMS:
917    case SHADER_OPCODE_TXF_MCS:
918    case SHADER_OPCODE_TXL:
919    case SHADER_OPCODE_TXL_LZ:
920    case SHADER_OPCODE_TXS:
921    case SHADER_OPCODE_LOD:
922    case SHADER_OPCODE_TG4:
923    case SHADER_OPCODE_TG4_OFFSET:
924    case SHADER_OPCODE_TG4_BIAS:
925    case SHADER_OPCODE_TG4_EXPLICIT_LOD:
926    case SHADER_OPCODE_TG4_IMPLICIT_LOD:
927    case SHADER_OPCODE_TG4_OFFSET_LOD:
928    case SHADER_OPCODE_TG4_OFFSET_BIAS:
929    case SHADER_OPCODE_SAMPLEINFO:
930       if (arg == 0 && src[0].file == VGRF)
931          return mlen * REG_SIZE;
932       break;
933 
934    default:
935       break;
936    }
937 
938    switch (src[arg].file) {
939    case UNIFORM:
940    case IMM:
941       return components_read(arg) * type_sz(src[arg].type);
942    case BAD_FILE:
943    case ARF:
944    case FIXED_GRF:
945    case VGRF:
946    case ATTR:
947       return components_read(arg) * src[arg].component_size(exec_size);
948    }
949    return 0;
950 }
951 
952 namespace {
953    unsigned
954    predicate_width(const intel_device_info *devinfo, brw_predicate predicate)
955    {
956       if (devinfo->ver >= 20) {
957          return 1;
958       } else {
959          switch (predicate) {
960          case BRW_PREDICATE_NONE:            return 1;
961          case BRW_PREDICATE_NORMAL:          return 1;
962          case BRW_PREDICATE_ALIGN1_ANY2H:    return 2;
963          case BRW_PREDICATE_ALIGN1_ALL2H:    return 2;
964          case BRW_PREDICATE_ALIGN1_ANY4H:    return 4;
965          case BRW_PREDICATE_ALIGN1_ALL4H:    return 4;
966          case BRW_PREDICATE_ALIGN1_ANY8H:    return 8;
967          case BRW_PREDICATE_ALIGN1_ALL8H:    return 8;
968          case BRW_PREDICATE_ALIGN1_ANY16H:   return 16;
969          case BRW_PREDICATE_ALIGN1_ALL16H:   return 16;
970          case BRW_PREDICATE_ALIGN1_ANY32H:   return 32;
971          case BRW_PREDICATE_ALIGN1_ALL32H:   return 32;
972          default: unreachable("Unsupported predicate");
973          }
974       }
975    }
976 }
977 
978 unsigned
979 fs_inst::flags_read(const intel_device_info *devinfo) const
980 {
981    if (devinfo->ver < 20 && (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
982                              predicate == BRW_PREDICATE_ALIGN1_ALLV)) {
983       /* The vertical predication modes combine corresponding bits from
984        * f0.0 and f1.0 on Gfx7+.
985        */
986       const unsigned shift = 4;
987       return brw_fs_flag_mask(this, 1) << shift | brw_fs_flag_mask(this, 1);
988    } else if (predicate) {
989       return brw_fs_flag_mask(this, predicate_width(devinfo, predicate));
990    } else {
991       unsigned mask = 0;
992       for (int i = 0; i < sources; i++) {
993          mask |= brw_fs_flag_mask(src[i], size_read(i));
994       }
995       return mask;
996    }
997 }
998 
999 unsigned
1000 fs_inst::flags_written(const intel_device_info *devinfo) const
1001 {
1002    if (conditional_mod && (opcode != BRW_OPCODE_SEL &&
1003                            opcode != BRW_OPCODE_CSEL &&
1004                            opcode != BRW_OPCODE_IF &&
1005                            opcode != BRW_OPCODE_WHILE)) {
1006       return brw_fs_flag_mask(this, 1);
1007    } else if (opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL ||
1008               opcode == SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL ||
1009               opcode == FS_OPCODE_LOAD_LIVE_CHANNELS) {
1010       return brw_fs_flag_mask(this, 32);
1011    } else {
1012       return brw_fs_flag_mask(dst, size_written);
1013    }
1014 }
1015 
1016 bool
1017 fs_inst::has_sampler_residency() const
1018 {
1019    switch (opcode) {
1020    case SHADER_OPCODE_TEX_LOGICAL:
1021    case FS_OPCODE_TXB_LOGICAL:
1022    case SHADER_OPCODE_TXL_LOGICAL:
1023    case SHADER_OPCODE_TXD_LOGICAL:
1024    case SHADER_OPCODE_TXF_LOGICAL:
1025    case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
1026    case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
1027    case SHADER_OPCODE_TXF_CMS_LOGICAL:
1028    case SHADER_OPCODE_TXS_LOGICAL:
1029    case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
1030    case SHADER_OPCODE_TG4_LOGICAL:
1031    case SHADER_OPCODE_TG4_BIAS_LOGICAL:
1032    case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
1033    case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
1034    case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
1035    case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
1036       assert(src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
1037       return src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
1038    default:
1039       return false;
1040    }
1041 }
1042 
1043 fs_reg
1044 fs_visitor::vgrf(const glsl_type *const type)
1045 {
1046    int reg_width = dispatch_width / 8;
1047    return fs_reg(VGRF,
1048                  alloc.allocate(glsl_count_dword_slots(type, false) * reg_width),
1049                  brw_type_for_base_type(type));
1050 }
1051 
1052 fs_reg::fs_reg(enum brw_reg_file file, unsigned nr)
1053 {
1054    init();
1055    this->file = file;
1056    this->nr = nr;
1057    this->type = BRW_REGISTER_TYPE_F;
1058    this->stride = (file == UNIFORM ? 0 : 1);
1059 }
1060 
1061 fs_reg::fs_reg(enum brw_reg_file file, unsigned nr, enum brw_reg_type type)
1062 {
1063    init();
1064    this->file = file;
1065    this->nr = nr;
1066    this->type = type;
1067    this->stride = (file == UNIFORM ? 0 : 1);
1068 }
1069 
1070 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1071  * This brings in those uniform definitions.
1072  */
1073 void
1074 fs_visitor::import_uniforms(fs_visitor *v)
1075 {
1076    this->push_constant_loc = v->push_constant_loc;
1077    this->uniforms = v->uniforms;
1078 }
1079 
1080 enum brw_barycentric_mode
1081 brw_barycentric_mode(nir_intrinsic_instr *intr)
1082 {
1083    const glsl_interp_mode mode =
1084       (enum glsl_interp_mode) nir_intrinsic_interp_mode(intr);
1085 
1086    /* Barycentric modes don't make sense for flat inputs. */
1087    assert(mode != INTERP_MODE_FLAT);
1088 
1089    unsigned bary;
1090    switch (intr->intrinsic) {
1091    case nir_intrinsic_load_barycentric_pixel:
1092    case nir_intrinsic_load_barycentric_at_offset:
1093       bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
1094       break;
1095    case nir_intrinsic_load_barycentric_centroid:
1096       bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
1097       break;
1098    case nir_intrinsic_load_barycentric_sample:
1099    case nir_intrinsic_load_barycentric_at_sample:
1100       bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
1101       break;
1102    default:
1103       unreachable("invalid intrinsic");
1104    }
1105 
1106    if (mode == INTERP_MODE_NOPERSPECTIVE)
1107       bary += 3;
1108 
1109    return (enum brw_barycentric_mode) bary;
1110 }
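/* The arithmetic above relies on the layout of enum brw_barycentric_mode:
 * the three perspective modes (PIXEL, CENTROID, SAMPLE) come first and the
 * corresponding non-perspective modes follow in the same order, so "+ 3"
 * switches the interpolation class while keeping the pixel/centroid/sample
 * kind.  The same ordering is what lets centroid_to_pixel() below simply
 * subtract 1.
 */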
1111 
1112 /**
1113  * Turn one of the two CENTROID barycentric modes into PIXEL mode.
1114  */
1115 static enum brw_barycentric_mode
1116 centroid_to_pixel(enum brw_barycentric_mode bary)
1117 {
1118    assert(bary == BRW_BARYCENTRIC_PERSPECTIVE_CENTROID ||
1119           bary == BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
1120    return (enum brw_barycentric_mode) ((unsigned) bary - 1);
1121 }
1122 
1123 /**
1124  * Walk backwards from the end of the program looking for a URB write that
1125  * isn't in control flow, and mark it with EOT.
1126  *
1127  * Return true if successful or false if a separate EOT write is needed.
1128  */
1129 bool
1130 fs_visitor::mark_last_urb_write_with_eot()
1131 {
1132    foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
1133       if (prev->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) {
1134          prev->eot = true;
1135 
1136          /* Delete now dead instructions. */
1137          foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
1138             if (dead == prev)
1139                break;
1140             dead->remove();
1141          }
1142          return true;
1143       } else if (prev->is_control_flow() || prev->has_side_effects()) {
1144          break;
1145       }
1146    }
1147 
1148    return false;
1149 }
1150 
1151 void
1152 fs_visitor::emit_gs_thread_end()
1153 {
1154    assert(stage == MESA_SHADER_GEOMETRY);
1155 
1156    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1157 
1158    if (gs_compile->control_data_header_size_bits > 0) {
1159       emit_gs_control_data_bits(this->final_gs_vertex_count);
1160    }
1161 
1162    const fs_builder abld = fs_builder(this).at_end().annotate("thread end");
1163    fs_inst *inst;
1164 
1165    if (gs_prog_data->static_vertex_count != -1) {
1166       /* Try and tag the last URB write with EOT instead of emitting a whole
1167        * separate write just to finish the thread.
1168        */
1169       if (mark_last_urb_write_with_eot())
1170          return;
1171 
1172       fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1173       srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
1174       srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(0);
1175       inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
1176                        srcs, ARRAY_SIZE(srcs));
1177    } else {
1178       fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1179       srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
1180       srcs[URB_LOGICAL_SRC_DATA] = this->final_gs_vertex_count;
1181       srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1);
1182       inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
1183                        srcs, ARRAY_SIZE(srcs));
1184    }
1185    inst->eot = true;
1186    inst->offset = 0;
1187 }
1188 
1189 void
1190 fs_visitor::assign_curb_setup()
1191 {
1192    unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
1193 
1194    unsigned ubo_push_length = 0;
1195    unsigned ubo_push_start[4];
1196    for (int i = 0; i < 4; i++) {
1197       ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
1198       ubo_push_length += stage_prog_data->ubo_ranges[i].length;
1199    }
1200 
1201    prog_data->curb_read_length = uniform_push_length + ubo_push_length;
1202 
1203    uint64_t used = 0;
1204    bool is_compute = gl_shader_stage_is_compute(stage);
1205 
1206    if (is_compute && brw_cs_prog_data(prog_data)->uses_inline_data) {
1207       /* With COMPUTE_WALKER, we can push up to one register worth of data via
1208        * the inline data parameter in the COMPUTE_WALKER command itself.
1209        *
1210        * TODO: Support inline data and push at the same time.
1211        */
1212       assert(devinfo->verx10 >= 125);
1213       assert(uniform_push_length <= reg_unit(devinfo));
1214    } else if (is_compute && devinfo->verx10 >= 125) {
1215       assert(devinfo->has_lsc);
1216       fs_builder ubld = fs_builder(this, 1).exec_all().at(
1217          cfg->first_block(), cfg->first_block()->start());
1218 
1219       /* The base offset for our push data is passed in as R0.0[31:6]. We have
1220        * to mask off the bottom 6 bits.
1221        */
1222       fs_reg base_addr = ubld.vgrf(BRW_REGISTER_TYPE_UD);
1223       ubld.AND(base_addr,
1224                retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
1225                brw_imm_ud(INTEL_MASK(31, 6)));
1226 
1227       /* On Gfx12-HP we load constants at the start of the program using A32
1228        * stateless messages.
1229        */
1230       for (unsigned i = 0; i < uniform_push_length;) {
1231          /* Limit ourselves to LSC HW limit of 8 GRFs (256bytes D32V64). */
1232          unsigned num_regs = MIN2(uniform_push_length - i, 8);
1233          assert(num_regs > 0);
1234          num_regs = 1 << util_logbase2(num_regs);
1235 
1236          fs_reg addr = ubld.vgrf(BRW_REGISTER_TYPE_UD);
1237          ubld.ADD(addr, base_addr, brw_imm_ud(i * REG_SIZE));
1238 
1239          fs_reg srcs[4] = {
1240             brw_imm_ud(0), /* desc */
1241             brw_imm_ud(0), /* ex_desc */
1242             addr,          /* payload */
1243             fs_reg(),      /* payload2 */
1244          };
1245 
1246          fs_reg dest = retype(brw_vec8_grf(payload().num_regs + i, 0),
1247                               BRW_REGISTER_TYPE_UD);
1248          fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4);
1249 
1250          send->sfid = GFX12_SFID_UGM;
1251          send->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
1252                                    1 /* exec_size */,
1253                                    LSC_ADDR_SURFTYPE_FLAT,
1254                                    LSC_ADDR_SIZE_A32,
1255                                    1 /* num_coordinates */,
1256                                    LSC_DATA_SIZE_D32,
1257                                    num_regs * 8 /* num_channels */,
1258                                    true /* transpose */,
1259                                    LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
1260                                    true /* has_dest */);
1261          send->header_size = 0;
1262          send->mlen = lsc_msg_desc_src0_len(devinfo, send->desc);
1263          send->size_written =
1264             lsc_msg_desc_dest_len(devinfo, send->desc) * REG_SIZE;
1265          send->send_is_volatile = true;
1266 
1267          i += num_regs;
1268       }
1269 
1270       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1271    }
1272 
1273    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1274    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1275       for (unsigned int i = 0; i < inst->sources; i++) {
1276 	 if (inst->src[i].file == UNIFORM) {
1277             int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
1278             int constant_nr;
1279             if (inst->src[i].nr >= UBO_START) {
1280                /* constant_nr is in 32-bit units, the rest are in bytes */
1281                constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
1282                              inst->src[i].offset / 4;
1283             } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1284                constant_nr = push_constant_loc[uniform_nr];
1285             } else {
1286                /* Section 5.11 of the OpenGL 4.1 spec says:
1287                 * "Out-of-bounds reads return undefined values, which include
1288                 *  values from other variables of the active program or zero."
1289                 * Just return the first push constant.
1290                 */
1291                constant_nr = 0;
1292             }
1293 
1294             assert(constant_nr / 8 < 64);
1295             used |= BITFIELD64_BIT(constant_nr / 8);
1296 
1297 	    struct brw_reg brw_reg = brw_vec1_grf(payload().num_regs +
1298 						  constant_nr / 8,
1299 						  constant_nr % 8);
1300             brw_reg.abs = inst->src[i].abs;
1301             brw_reg.negate = inst->src[i].negate;
1302 
1303             assert(inst->src[i].stride == 0);
1304             inst->src[i] = byte_offset(
1305                retype(brw_reg, inst->src[i].type),
1306                inst->src[i].offset % 4);
1307 	 }
1308       }
1309    }
1310 
1311    uint64_t want_zero = used & stage_prog_data->zero_push_reg;
1312    if (want_zero) {
1313       fs_builder ubld = fs_builder(this, 8).exec_all().at(
1314          cfg->first_block(), cfg->first_block()->start());
1315 
1316       /* push_reg_mask_param is in 32-bit units */
1317       unsigned mask_param = stage_prog_data->push_reg_mask_param;
1318       struct brw_reg mask = brw_vec1_grf(payload().num_regs + mask_param / 8,
1319                                                               mask_param % 8);
1320 
1321       fs_reg b32;
1322       for (unsigned i = 0; i < 64; i++) {
1323          if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
1324             fs_reg shifted = ubld.vgrf(BRW_REGISTER_TYPE_W, 2);
1325             ubld.SHL(horiz_offset(shifted, 8),
1326                      byte_offset(retype(mask, BRW_REGISTER_TYPE_W), i / 8),
1327                      brw_imm_v(0x01234567));
1328             ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));
1329 
1330             fs_builder ubld16 = ubld.group(16, 0);
1331             b32 = ubld16.vgrf(BRW_REGISTER_TYPE_D);
1332             ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
1333          }
1334 
1335          if (want_zero & BITFIELD64_BIT(i)) {
1336             assert(i < prog_data->curb_read_length);
1337             struct brw_reg push_reg =
1338                retype(brw_vec8_grf(payload().num_regs + i, 0),
1339                       BRW_REGISTER_TYPE_D);
1340 
1341             ubld.AND(push_reg, push_reg, component(b32, i % 16));
1342          }
1343       }
1344 
1345       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1346    }
1347 
1348    /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
1349    this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length;
1350 }
1351 
1352 /*
1353  * Build up an array of indices into the urb_setup array that
1354  * references the active entries of the urb_setup array.
1355  * Used to accelerate walking the active entries of the urb_setup array
1356  * on each upload.
1357  */
1358 void
1359 brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
1360 {
1361    /* TODO(mesh): Review usage of this in the context of Mesh, we may want to
1362     * skip per-primitive attributes here.
1363     */
1364 
1365    /* Make sure uint8_t is sufficient */
1366    STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
1367    uint8_t index = 0;
1368    for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
1369       if (wm_prog_data->urb_setup[attr] >= 0) {
1370          wm_prog_data->urb_setup_attribs[index++] = attr;
1371       }
1372    }
1373    wm_prog_data->urb_setup_attribs_count = index;
1374 }
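/* A minimal sketch of the accelerated walk mentioned in the comment above;
 * the loop is illustrative, not code from the driver:
 *
 *    for (unsigned i = 0; i < wm_prog_data->urb_setup_attribs_count; i++) {
 *       const uint8_t attr = wm_prog_data->urb_setup_attribs[i];
 *       const int slot = wm_prog_data->urb_setup[attr];
 *       // program SBE/SF so that varying `attr` is read from `slot`
 *    }
 */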
1375 
1376 static void
1377 calculate_urb_setup(const struct intel_device_info *devinfo,
1378                     const struct brw_wm_prog_key *key,
1379                     struct brw_wm_prog_data *prog_data,
1380                     const nir_shader *nir,
1381                     const struct brw_mue_map *mue_map)
1382 {
1383    memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
1384    memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel));
1385 
1386    int urb_next = 0; /* in vec4s */
1387 
1388    const uint64_t inputs_read =
1389       nir->info.inputs_read & ~nir->info.per_primitive_inputs;
1390 
1391    /* Figure out where each of the incoming setup attributes lands. */
1392    if (key->mesh_input != BRW_NEVER) {
1393       /* Per-Primitive Attributes are laid out by the hardware before the regular
1394        * attributes, so order them like this to make it easier later to map the
1395        * setup into real HW registers.
1396        */
1397       if (nir->info.per_primitive_inputs) {
1398          uint64_t per_prim_inputs_read =
1399                nir->info.inputs_read & nir->info.per_primitive_inputs;
1400 
1401          /* In Mesh, PRIMITIVE_SHADING_RATE, VIEWPORT and LAYER slots
1402           * are always at the beginning, because they come from MUE
1403           * Primitive Header, not Per-Primitive Attributes.
1404           */
1405          const uint64_t primitive_header_bits = VARYING_BIT_VIEWPORT |
1406                                                 VARYING_BIT_LAYER |
1407                                                 VARYING_BIT_PRIMITIVE_SHADING_RATE;
1408 
1409          if (mue_map) {
1410             unsigned per_prim_start_dw = mue_map->per_primitive_start_dw;
1411             unsigned per_prim_size_dw = mue_map->per_primitive_pitch_dw;
1412 
1413             bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0;
1414 
1415             if (reads_header || mue_map->user_data_in_primitive_header) {
1416                /* Primitive Shading Rate, Layer and Viewport live in the same
1417                 * 4-dwords slot (psr is dword 0, layer is dword 1, and viewport
1418                 * is dword 2).
1419                 */
1420                if (per_prim_inputs_read & VARYING_BIT_PRIMITIVE_SHADING_RATE)
1421                   prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 0;
1422 
1423                if (per_prim_inputs_read & VARYING_BIT_LAYER)
1424                   prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
1425 
1426                if (per_prim_inputs_read & VARYING_BIT_VIEWPORT)
1427                   prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = 0;
1428 
1429                per_prim_inputs_read &= ~primitive_header_bits;
1430             } else {
1431                /* If the FS doesn't need the primitive header, it won't be made
1432                 * available through SBE_MESH, so we have to skip it when
1433                 * calculating the offset from the start of the per-prim data.
1434                 */
1435                per_prim_start_dw += mue_map->per_primitive_header_size_dw;
1436                per_prim_size_dw -= mue_map->per_primitive_header_size_dw;
1437             }
1438 
1439             u_foreach_bit64(i, per_prim_inputs_read) {
1440                int start = mue_map->start_dw[i];
1441 
1442                assert(start >= 0);
1443                assert(mue_map->len_dw[i] > 0);
1444 
1445                assert(unsigned(start) >= per_prim_start_dw);
1446                unsigned pos_dw = unsigned(start) - per_prim_start_dw;
1447 
1448                prog_data->urb_setup[i] = urb_next + pos_dw / 4;
1449                prog_data->urb_setup_channel[i] = pos_dw % 4;
1450             }
1451 
1452             urb_next = per_prim_size_dw / 4;
1453          } else {
1454             /* With no MUE map, we never read the primitive header, and
1455              * per-primitive attributes won't be packed either, so just lay
1456              * them out in varying order.
1457              */
1458             per_prim_inputs_read &= ~primitive_header_bits;
1459 
1460             for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
1461                if (per_prim_inputs_read & BITFIELD64_BIT(i)) {
1462                   prog_data->urb_setup[i] = urb_next++;
1463                }
1464             }
1465 
1466             /* The actual setup attributes later must be aligned to a full GRF. */
1467             urb_next = ALIGN(urb_next, 2);
1468          }
1469 
1470          prog_data->num_per_primitive_inputs = urb_next;
1471       }
1472 
1473       const uint64_t clip_dist_bits = VARYING_BIT_CLIP_DIST0 |
1474                                       VARYING_BIT_CLIP_DIST1;
1475 
1476       uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK;
1477 
1478       if (inputs_read & clip_dist_bits) {
1479          assert(!mue_map || mue_map->per_vertex_header_size_dw > 8);
1480          unique_fs_attrs &= ~clip_dist_bits;
1481       }
1482 
1483       if (mue_map) {
1484          unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw;
1485          unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw;
1486 
1487          /* The Per-Vertex header is available to the fragment shader only
1488           * if there is user data in it.
1489           */
1490          if (!mue_map->user_data_in_vertex_header) {
1491             per_vertex_start_dw += 8;
1492             per_vertex_size_dw -= 8;
1493          }
1494 
1495          /* In Mesh, CLIP_DIST slots are always at the beginning, because
1496           * they come from MUE Vertex Header, not Per-Vertex Attributes.
1497           */
1498          if (inputs_read & clip_dist_bits) {
1499             prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next;
1500             prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next + 1;
1501          } else if (mue_map && mue_map->per_vertex_header_size_dw > 8) {
1502             /* Clip distances are in MUE, but we are not reading them in FS. */
1503             per_vertex_start_dw += 8;
1504             per_vertex_size_dw -= 8;
1505          }
1506 
1507          /* Per-Vertex attributes are laid out in order.  Because we always
1508           * link Mesh and Fragment shaders, the slots written and read by each
1509           * of them will match. */
1510          u_foreach_bit64(i, unique_fs_attrs) {
1511             int start = mue_map->start_dw[i];
1512 
1513             assert(start >= 0);
1514             assert(mue_map->len_dw[i] > 0);
1515 
1516             assert(unsigned(start) >= per_vertex_start_dw);
1517             unsigned pos_dw = unsigned(start) - per_vertex_start_dw;
1518 
1519             prog_data->urb_setup[i] = urb_next + pos_dw / 4;
1520             prog_data->urb_setup_channel[i] = pos_dw % 4;
1521          }
1522 
1523          urb_next += per_vertex_size_dw / 4;
1524       } else {
1525          /* If we don't have an MUE map, just lay down the inputs the FS reads
1526           * in varying order, as we do for the legacy pipeline.
1527           */
1528          if (inputs_read & clip_dist_bits) {
1529             prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next++;
1530             prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next++;
1531          }
1532 
1533          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1534             if (unique_fs_attrs & BITFIELD64_BIT(i))
1535                prog_data->urb_setup[i] = urb_next++;
1536          }
1537       }
1538    } else {
1539       assert(!nir->info.per_primitive_inputs);
1540 
1541       uint64_t vue_header_bits =
1542          VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT;
1543 
1544       uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK;
1545 
1546       /* VUE header fields all live in the same URB slot, so we pass them
1547        * as a single FS input attribute.  We want to only count them once.
1548        */
1549       if (inputs_read & vue_header_bits) {
1550          unique_fs_attrs &= ~vue_header_bits;
1551          unique_fs_attrs |= VARYING_BIT_PSIZ;
1552       }
1553 
1554       if (util_bitcount64(unique_fs_attrs) <= 16) {
1555          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1556           * first 16 varying inputs, so we can put them wherever we want.
1557           * Just put them in order.
1558           *
1559           * This is useful because it means that (a) inputs not used by the
1560           * fragment shader won't take up valuable register space, and (b) we
1561           * won't have to recompile the fragment shader if it gets paired with
1562           * a different vertex (or geometry) shader.
1563           *
1564           * VUE header fields share the same FS input attribute.
1565           */
1566          if (inputs_read & vue_header_bits) {
1567             if (inputs_read & VARYING_BIT_PSIZ)
1568                prog_data->urb_setup[VARYING_SLOT_PSIZ] = urb_next;
1569             if (inputs_read & VARYING_BIT_LAYER)
1570                prog_data->urb_setup[VARYING_SLOT_LAYER] = urb_next;
1571             if (inputs_read & VARYING_BIT_VIEWPORT)
1572                prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = urb_next;
1573 
1574             urb_next++;
1575          }
1576 
1577          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1578             if (inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits &
1579                 BITFIELD64_BIT(i)) {
1580                prog_data->urb_setup[i] = urb_next++;
1581             }
1582          }
1583       } else {
1584          /* We have enough input varyings that the SF/SBE pipeline stage can't
1585           * arbitrarily rearrange them to suit our whim; we have to put them
1586           * in an order that matches the output of the previous pipeline stage
1587           * (geometry or vertex shader).
1588           */
1589 
1590          /* Re-compute the VUE map here in the case that the one coming from
1591           * geometry has more than one position slot (used for Primitive
1592           * Replication).
1593           */
1594          struct intel_vue_map prev_stage_vue_map;
1595          brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1596                              key->input_slots_valid,
1597                              nir->info.separate_shader, 1);
1598 
1599          int first_slot =
1600             brw_compute_first_urb_slot_required(inputs_read,
1601                                                 &prev_stage_vue_map);
1602 
1603          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1604          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1605               slot++) {
1606             int varying = prev_stage_vue_map.slot_to_varying[slot];
1607             if (varying != BRW_VARYING_SLOT_PAD &&
1608                 (inputs_read & BRW_FS_VARYING_INPUT_MASK &
1609                  BITFIELD64_BIT(varying))) {
1610                prog_data->urb_setup[varying] = slot - first_slot;
1611             }
1612          }
1613          urb_next = prev_stage_vue_map.num_slots - first_slot;
1614       }
1615    }
1616 
1617    prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs;
1618    prog_data->inputs = inputs_read;
1619 
1620    brw_compute_urb_setup_index(prog_data);
1621 }
1622 
1623 void
1624 fs_visitor::assign_urb_setup()
1625 {
1626    assert(stage == MESA_SHADER_FRAGMENT);
1627    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
1628 
1629    int urb_start = payload().num_regs + prog_data->base.curb_read_length;
1630 
1631    /* Offset all the urb_setup[] indices by the actual position of the
1632     * setup regs, now that the location of the constants has been chosen.
1633     */
1634    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1635       for (int i = 0; i < inst->sources; i++) {
1636          if (inst->src[i].file == ATTR) {
1637             /* ATTR fs_reg::nr in the FS is in units of logical scalar
1638              * inputs each of which consumes 16B on Gfx4-Gfx12.  In
1639              * single polygon mode this leads to the following layout
1640              * of the vertex setup plane parameters in the ATTR
1641              * register file:
1642              *
1643              *  fs_reg::nr   Input   Comp0  Comp1  Comp2  Comp3
1644              *      0       Attr0.x  a1-a0  a2-a0   N/A    a0
1645              *      1       Attr0.y  a1-a0  a2-a0   N/A    a0
1646              *      2       Attr0.z  a1-a0  a2-a0   N/A    a0
1647              *      3       Attr0.w  a1-a0  a2-a0   N/A    a0
1648              *      4       Attr1.x  a1-a0  a2-a0   N/A    a0
1649              *     ...
1650              *
1651              * In multipolygon mode that no longer works since
1652              * different channels may be processing polygons with
1653              * different plane parameters, so each parameter above is
1654              * represented as a dispatch_width-wide vector:
1655              *
1656              *  fs_reg::nr     fs_reg::offset    Input      Comp0     ...    CompN
1657              *      0                 0          Attr0.x  a1[0]-a0[0] ... a1[N]-a0[N]
1658              *      0        4 * dispatch_width  Attr0.x  a2[0]-a0[0] ... a2[N]-a0[N]
1659              *      0        8 * dispatch_width  Attr0.x     N/A      ...     N/A
1660              *      0       12 * dispatch_width  Attr0.x    a0[0]     ...    a0[N]
1661              *      1                 0          Attr0.y  a1[0]-a0[0] ... a1[N]-a0[N]
1662              *     ...
1663              *
1664              * Note that many of the components on a single row above
1665              * are likely to be replicated multiple times (if, say, a
1666              * single SIMD thread is only processing 2 different
1667              * polygons), so plane parameters aren't actually stored
1668              * in GRF memory with that layout to avoid wasting space.
1669              * Instead we compose ATTR register regions with a 2D
1670              * region that walks through the parameters of each
1671              * polygon with the correct stride, reading the parameter
1672              * corresponding to each channel directly from the PS
1673              * thread payload.
1674              *
1675              * The latter layout corresponds to a param_width equal to
1676              * dispatch_width, while the former (scalar parameter)
1677              * layout has a param_width of 1.
1678              *
1679              * Gfx20+ represent plane parameters in a format similar
1680              * to the above, except the parameters are packed in 12B
1681              * and ordered like "a0, a1-a0, a2-a0" instead of the
1682              * above vec4 representation with a missing component.
1683              */
1684             const unsigned param_width = (max_polygons > 1 ? dispatch_width : 1);
1685 
1686             /* Size of a single scalar component of a plane parameter
1687              * in bytes.
1688              */
1689             const unsigned chan_sz = 4;
1690             struct brw_reg reg;
1691             assert(max_polygons > 0);
1692 
1693             /* Calculate the base register on the thread payload of
1694              * either the block of vertex setup data or the block of
1695              * per-primitive constant data depending on whether we're
1696              * accessing a primitive or vertex input.  Also calculate
1697              * the index of the input within that block.
1698              */
1699             const bool per_prim = inst->src[i].nr < prog_data->num_per_primitive_inputs;
1700             const unsigned base = urb_start +
1701                (per_prim ? 0 :
1702                 ALIGN(prog_data->num_per_primitive_inputs / 2,
1703                       reg_unit(devinfo)) * max_polygons);
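                 /* The ALIGN() term in "base" above skips the per-primitive
                  * block, which packs two slots per GRF and is replicated once
                  * per polygon, to reach the vertex setup data.
                  */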
1704             const unsigned idx = per_prim ? inst->src[i].nr :
1705                inst->src[i].nr - prog_data->num_per_primitive_inputs;
1706 
1707             /* Translate the offset within the param_width-wide
1708              * representation described above into an offset and a
1709              * grf, which contains the plane parameters for the first
1710              * polygon processed by the thread.
1711              */
1712             if (devinfo->ver >= 20 && !per_prim) {
1713                /* Gfx20+ is able to pack 5 logical input components
1714                 * per 64B register for vertex setup data.
1715                 */
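                    /* Illustrative, assuming a single polygon: idx == 7 with a
                     * 4-byte source offset yields grf = base + 2 and
                     * delta = 2 * 12 + 4 = 28 bytes.
                     */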
1716                const unsigned grf = base + idx / 5 * 2 * max_polygons;
1717                assert(inst->src[i].offset / param_width < 12);
1718                const unsigned delta = idx % 5 * 12 +
1719                   inst->src[i].offset / (param_width * chan_sz) * chan_sz +
1720                   inst->src[i].offset % chan_sz;
1721                reg = byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
1722                                  delta);
1723             } else {
1724                /* Earlier platforms and per-primitive block pack 2 logical
1725                 * input components per 32B register.
1726                 */
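                    /* Illustrative, assuming a single polygon: idx == 3 with a
                     * zero source offset yields grf = base + 1 and delta = 16
                     * bytes, i.e. the second half of that register.
                     */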
1727                const unsigned grf = base + idx / 2 * max_polygons;
1728                assert(inst->src[i].offset / param_width < REG_SIZE / 2);
1729                const unsigned delta = (idx % 2) * (REG_SIZE / 2) +
1730                   inst->src[i].offset / (param_width * chan_sz) * chan_sz +
1731                   inst->src[i].offset % chan_sz;
1732                reg = byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
1733                                  delta);
1734             }
1735 
1736             if (max_polygons > 1) {
1737                assert(devinfo->ver >= 12);
1738                /* Misaligned channel strides that would lead to
1739                 * cross-channel access in the representation above are
1740                 * disallowed.
1741                 */
1742                assert(inst->src[i].stride * type_sz(inst->src[i].type) == chan_sz);
1743 
1744                /* Number of channels processing the same polygon. */
1745                const unsigned poly_width = dispatch_width / max_polygons;
1746                assert(dispatch_width % max_polygons == 0);
1747 
1748                /* Accessing a subset of channels of a parameter vector
1749                 * starting from "chan" is necessary to handle
1750                 * SIMD-lowered instructions though.
1751                 */
1752                const unsigned chan = inst->src[i].offset %
1753                   (param_width * chan_sz) / chan_sz;
1754                assert(chan < dispatch_width);
1755                assert(chan % poly_width == 0);
1756                const unsigned reg_size = reg_unit(devinfo) * REG_SIZE;
1757                reg = byte_offset(reg, chan / poly_width * reg_size);
1758 
1759                if (inst->exec_size > poly_width) {
1760                   /* Accessing the parameters for multiple polygons.
1761                    * Corresponding parameters for different polygons
1762                    * are stored a GRF apart on the thread payload, so
1763                    * use that as vertical stride.
1764                    */
1765                   const unsigned vstride = reg_size / type_sz(inst->src[i].type);
1766                   assert(vstride <= 32);
1767                   assert(chan % poly_width == 0);
1768                   reg = stride(reg, vstride, poly_width, 0);
1769                } else {
1770                   /* Accessing one parameter for a single polygon --
1771                    * Translate to a scalar region.
1772                    */
1773                   assert(chan % poly_width + inst->exec_size <= poly_width);
1774                   reg = stride(reg, 0, 1, 0);
1775                }
1776 
1777             } else {
1778                const unsigned width = inst->src[i].stride == 0 ?
1779                   1 : MIN2(inst->exec_size, 8);
1780                reg = stride(reg, width * inst->src[i].stride,
1781                             width, inst->src[i].stride);
1782             }
1783 
1784             reg.abs = inst->src[i].abs;
1785             reg.negate = inst->src[i].negate;
1786             inst->src[i] = reg;
1787          }
1788       }
1789    }
1790 
1791    /* Each attribute is 4 setup channels, each of which is half a reg,
1792     * but they may be replicated multiple times for multipolygon
1793     * dispatch.
1794     */
1795    this->first_non_payload_grf += prog_data->num_varying_inputs * 2 * max_polygons;
1796 
1797    /* Unlike regular attributes, per-primitive attributes have all 4 channels
1798     * in the same slot, so each GRF can store two slots.
1799     */
1800    assert(prog_data->num_per_primitive_inputs % 2 == 0);
1801    this->first_non_payload_grf += prog_data->num_per_primitive_inputs / 2 * max_polygons;
1802 }
1803 
1804 void
1805 fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
1806 {
1807    for (int i = 0; i < inst->sources; i++) {
1808       if (inst->src[i].file == ATTR) {
1809          assert(inst->src[i].nr == 0);
1810          int grf = payload().num_regs +
1811                    prog_data->curb_read_length +
1812                    inst->src[i].offset / REG_SIZE;
1813 
1814          /* As explained in brw_reg_from_fs_reg, from the Haswell PRM:
1815           *
1816           * VertStride must be used to cross GRF register boundaries. This
1817           * rule implies that elements within a 'Width' cannot cross GRF
1818           * boundaries.
1819           *
1820           * So, for registers that are large enough, we have to split the exec
1821           * size in two and trust the compression state to sort it out.
1822           */
1823          unsigned total_size = inst->exec_size *
1824                                inst->src[i].stride *
1825                                type_sz(inst->src[i].type);
1826 
1827          assert(total_size <= 2 * REG_SIZE);
1828          const unsigned exec_size =
1829             (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
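              /* e.g. a SIMD16 float source with stride 1 spans 64B, i.e. two
               * 32B GRFs, so the region below is built with half the execution
               * size and instruction compression covers the second register.
               */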
1830 
1831          unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
1832          struct brw_reg reg =
1833             stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
1834                                inst->src[i].offset % REG_SIZE),
1835                    exec_size * inst->src[i].stride,
1836                    width, inst->src[i].stride);
1837          reg.abs = inst->src[i].abs;
1838          reg.negate = inst->src[i].negate;
1839 
1840          inst->src[i] = reg;
1841       }
1842    }
1843 }
1844 
1845 void
1846 fs_visitor::assign_vs_urb_setup()
1847 {
1848    struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);
1849 
1850    assert(stage == MESA_SHADER_VERTEX);
1851 
1852    /* Each attribute is 4 regs. */
1853    this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;
1854 
1855    assert(vs_prog_data->base.urb_read_length <= 15);
1856 
1857    /* Rewrite all ATTR file references to the hw grf that they land in. */
1858    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1859       convert_attr_sources_to_hw_regs(inst);
1860    }
1861 }
1862 
1863 void
1864 fs_visitor::assign_tcs_urb_setup()
1865 {
1866    assert(stage == MESA_SHADER_TESS_CTRL);
1867 
1868    /* Rewrite all ATTR file references to HW_REGs. */
1869    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1870       convert_attr_sources_to_hw_regs(inst);
1871    }
1872 }
1873 
1874 void
1875 fs_visitor::assign_tes_urb_setup()
1876 {
1877    assert(stage == MESA_SHADER_TESS_EVAL);
1878 
1879    struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
1880 
1881    first_non_payload_grf += 8 * vue_prog_data->urb_read_length;
1882 
1883    /* Rewrite all ATTR file references to HW_REGs. */
1884    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1885       convert_attr_sources_to_hw_regs(inst);
1886    }
1887 }
1888 
1889 void
1890 fs_visitor::assign_gs_urb_setup()
1891 {
1892    assert(stage == MESA_SHADER_GEOMETRY);
1893 
1894    struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
1895 
1896    first_non_payload_grf +=
1897       8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
1898 
1899    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1900       /* Rewrite all ATTR file references to GRFs. */
1901       convert_attr_sources_to_hw_regs(inst);
1902    }
1903 }
1904 
1905 int
1906 brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
1907                                 const brw_stage_prog_data *prog_data)
1908 {
1909    if (prog_data->nr_params == 0)
1910       return -1;
1911 
1912    if (devinfo->verx10 >= 125)
1913       return -1;
1914 
1915    /* The local thread id is always the last parameter in the list */
1916    uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
1917    if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
1918       return prog_data->nr_params - 1;
1919 
1920    return -1;
1921 }
1922 
1923 /**
1924  * Assign UNIFORM file registers to either push constants or pull constants.
1925  *
1926  * We allow a fragment shader to have more than the specified minimum
1927  * maximum number of fragment shader uniform components (64).  If
1928  * there are too many of these, they'd fill up all of register space.
1929  * So, this will push some of them out to the pull constant buffer and
1930  * update the program to load them.
1931  */
1932 void
1933 fs_visitor::assign_constant_locations()
1934 {
1935    /* Only the first compile gets to decide on locations. */
1936    if (push_constant_loc)
1937       return;
1938 
1939    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1940    for (unsigned u = 0; u < uniforms; u++)
1941       push_constant_loc[u] = u;
1942 
1943    /* Now that we know how many regular uniforms we'll push, reduce the
1944     * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
1945     *
1946     * If changing this value, note the limitation about total_regs in
1947     * brw_curbe.c/crocus_state.c
1948     */
1949    const unsigned max_push_length = 64;
1950    unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
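        /* Illustrative (assumed numbers): a shader pushing 40 uniform dwords
         * starts with push_length = 5 GRFs, leaving at most 59 for the four
         * UBO ranges trimmed below.
         */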
1951    for (int i = 0; i < 4; i++) {
1952       struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
1953 
1954       if (push_length + range->length > max_push_length)
1955          range->length = max_push_length - push_length;
1956 
1957       push_length += range->length;
1958    }
1959    assert(push_length <= max_push_length);
1960 }
1961 
1962 bool
1963 fs_visitor::get_pull_locs(const fs_reg &src,
1964                           unsigned *out_surf_index,
1965                           unsigned *out_pull_index)
1966 {
1967    assert(src.file == UNIFORM);
1968 
1969    if (src.nr < UBO_START)
1970       return false;
1971 
1972    const struct brw_ubo_range *range =
1973       &prog_data->ubo_ranges[src.nr - UBO_START];
1974 
1975    /* If this access is in our (reduced) range, use the push data. */
1976    if (src.offset / 32 < range->length)
1977       return false;
1978 
1979    *out_surf_index = range->block;
1980    *out_pull_index = (32 * range->start + src.offset) / 4;
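        /* range->start is in 32-byte units and src.offset is in bytes, so the
         * pull index ends up as a dword offset into the UBO block.
         */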
1981 
1982    prog_data->has_ubo_pull = true;
1983 
1984    return true;
1985 }
1986 
1987 /**
1988  * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
1989  * instructions to FS_OPCODE_REP_FB_WRITE.
1990  */
1991 void
1992 fs_visitor::emit_repclear_shader()
1993 {
1994    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1995    fs_inst *write = NULL;
1996 
1997    assert(uniforms == 0);
1998    assume(key->nr_color_regions > 0);
1999 
2000    fs_reg color_output = retype(brw_vec4_grf(127, 0), BRW_REGISTER_TYPE_UD);
2001    fs_reg header = retype(brw_vec8_grf(125, 0), BRW_REGISTER_TYPE_UD);
2002 
2003    /* We pass the clear color as a flat input.  Copy it to the output. */
2004    fs_reg color_input =
2005       brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_UD,
2006               BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
2007               BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2008 
2009    const fs_builder bld = fs_builder(this).at_end();
2010    bld.exec_all().group(4, 0).MOV(color_output, color_input);
2011 
2012    if (key->nr_color_regions > 1) {
2013       /* Copy g0..g1 as the message header */
2014       bld.exec_all().group(16, 0)
2015          .MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2016    }
2017 
2018    for (int i = 0; i < key->nr_color_regions; ++i) {
2019       if (i > 0)
2020          bld.exec_all().group(1, 0).MOV(component(header, 2), brw_imm_ud(i));
2021 
2022       write = bld.emit(SHADER_OPCODE_SEND);
2023       write->resize_sources(3);
2024       write->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
2025       write->src[0] = brw_imm_ud(0);
2026       write->src[1] = brw_imm_ud(0);
2027       write->src[2] = i == 0 ? color_output : header;
2028       write->check_tdr = true;
2029       write->send_has_side_effects = true;
2030       write->desc = brw_fb_write_desc(devinfo, i,
2031          BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED,
2032          i == key->nr_color_regions - 1, false);
2033 
2034       /* We can use a headerless message for the first render target */
2035       write->header_size = i == 0 ? 0 : 2;
2036       write->mlen = 1 + write->header_size;
2037    }
2038    write->eot = true;
2039    write->last_rt = true;
2040 
2041    calculate_cfg();
2042 
2043    this->first_non_payload_grf = payload().num_regs;
2044 
2045    brw_fs_lower_scoreboard(*this);
2046 }
2047 
2048 /**
2049  * Get the mask of SIMD channels enabled during dispatch and not yet disabled
2050  * by discard.  Due to the layout of the sample mask in the fragment shader
2051  * thread payload, \p bld is required to have a dispatch_width() not greater
2052  * than 16 for fragment shaders.
2053  */
2054 fs_reg
2055 brw_sample_mask_reg(const fs_builder &bld)
2056 {
2057    const fs_visitor &s = *bld.shader;
2058 
2059    if (s.stage != MESA_SHADER_FRAGMENT) {
2060       return brw_imm_ud(0xffffffff);
2061    } else if (brw_wm_prog_data(s.stage_prog_data)->uses_kill) {
2062       assert(bld.dispatch_width() <= 16);
2063       return brw_flag_subreg(sample_mask_flag_subreg(s) + bld.group() / 16);
2064    } else {
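           /* Without discard, read the dispatch mask straight from the thread
            * payload: g1.7 for the first 16 channels, g2.7 for the next 16.
            */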
2065       assert(bld.dispatch_width() <= 16);
2066       assert(s.devinfo->ver < 20);
2067       return retype(brw_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
2068                     BRW_REGISTER_TYPE_UW);
2069    }
2070 }
2071 
2072 uint32_t
2073 brw_fb_write_msg_control(const fs_inst *inst,
2074                          const struct brw_wm_prog_data *prog_data)
2075 {
2076    uint32_t mctl;
2077 
2078    if (prog_data->dual_src_blend) {
2079       assert(inst->exec_size == 8);
2080 
2081       if (inst->group % 16 == 0)
2082          mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
2083       else if (inst->group % 16 == 8)
2084          mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
2085       else
2086          unreachable("Invalid dual-source FB write instruction group");
2087    } else {
2088       assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
2089 
2090       if (inst->exec_size == 16)
2091          mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
2092       else if (inst->exec_size == 8)
2093          mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
2094       else
2095          unreachable("Invalid FB write execution size");
2096    }
2097 
2098    return mctl;
2099 }
2100 
2101 /**
2102  * Predicate the specified instruction on the sample mask.
2103  */
2104 void
2105 brw_emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst)
2106 {
2107    assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
2108           bld.group() == inst->group &&
2109           bld.dispatch_width() == inst->exec_size);
2110 
2111    const fs_visitor &s = *bld.shader;
2112    const fs_reg sample_mask = brw_sample_mask_reg(bld);
2113    const unsigned subreg = sample_mask_flag_subreg(s);
2114 
2115    if (brw_wm_prog_data(s.stage_prog_data)->uses_kill) {
2116       assert(sample_mask.file == ARF &&
2117              sample_mask.nr == brw_flag_subreg(subreg).nr &&
2118              sample_mask.subnr == brw_flag_subreg(
2119                 subreg + inst->group / 16).subnr);
2120    } else {
2121       bld.group(1, 0).exec_all()
2122          .MOV(brw_flag_subreg(subreg + inst->group / 16), sample_mask);
2123    }
2124 
2125    if (inst->predicate) {
2126       assert(inst->predicate == BRW_PREDICATE_NORMAL);
2127       assert(!inst->predicate_inverse);
2128       assert(inst->flag_subreg == 0);
2129       assert(s.devinfo->ver < 20);
2130       /* Combine the sample mask with the existing predicate by using a
2131        * vertical predication mode.
2132        */
2133       inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
2134    } else {
2135       inst->flag_subreg = subreg;
2136       inst->predicate = BRW_PREDICATE_NORMAL;
2137       inst->predicate_inverse = false;
2138    }
2139 }
2140 
2141 void
2142 fs_visitor::dump_instructions_to_file(FILE *file) const
2143 {
2144    if (cfg) {
2145       const register_pressure &rp = regpressure_analysis.require();
2146       unsigned ip = 0, max_pressure = 0;
2147       unsigned cf_count = 0;
2148       foreach_block_and_inst(block, backend_instruction, inst, cfg) {
2149          if (inst->is_control_flow_end())
2150             cf_count -= 1;
2151 
2152          max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
2153          fprintf(file, "{%3d} %4d: ", rp.regs_live_at_ip[ip], ip);
2154          for (unsigned i = 0; i < cf_count; i++)
2155             fprintf(file, "  ");
2156          dump_instruction(inst, file);
2157          ip++;
2158 
2159          if (inst->is_control_flow_begin())
2160             cf_count += 1;
2161       }
2162       fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
2163    } else {
2164       int ip = 0;
2165       foreach_in_list(backend_instruction, inst, &instructions) {
2166          fprintf(file, "%4d: ", ip++);
2167          dump_instruction(inst, file);
2168       }
2169    }
2170 }
2171 
2172 void
2173 fs_visitor::dump_instruction_to_file(const backend_instruction *be_inst, FILE *file) const
2174 {
2175    const fs_inst *inst = (const fs_inst *)be_inst;
2176 
2177    if (inst->predicate) {
2178       fprintf(file, "(%cf%d.%d) ",
2179               inst->predicate_inverse ? '-' : '+',
2180               inst->flag_subreg / 2,
2181               inst->flag_subreg % 2);
2182    }
2183 
2184    fprintf(file, "%s", brw_instruction_name(&compiler->isa, inst->opcode));
2185    if (inst->saturate)
2186       fprintf(file, ".sat");
2187    if (inst->conditional_mod) {
2188       fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
2189       if (!inst->predicate &&
2190           (inst->opcode != BRW_OPCODE_SEL &&
2191            inst->opcode != BRW_OPCODE_CSEL &&
2192            inst->opcode != BRW_OPCODE_IF &&
2193            inst->opcode != BRW_OPCODE_WHILE)) {
2194          fprintf(file, ".f%d.%d", inst->flag_subreg / 2,
2195                  inst->flag_subreg % 2);
2196       }
2197    }
2198    fprintf(file, "(%d) ", inst->exec_size);
2199 
2200    if (inst->mlen) {
2201       fprintf(file, "(mlen: %d) ", inst->mlen);
2202    }
2203 
2204    if (inst->ex_mlen) {
2205       fprintf(file, "(ex_mlen: %d) ", inst->ex_mlen);
2206    }
2207 
2208    if (inst->eot) {
2209       fprintf(file, "(EOT) ");
2210    }
2211 
2212    switch (inst->dst.file) {
2213    case VGRF:
2214       fprintf(file, "vgrf%d", inst->dst.nr);
2215       break;
2216    case FIXED_GRF:
2217       fprintf(file, "g%d", inst->dst.nr);
2218       break;
2219    case BAD_FILE:
2220       fprintf(file, "(null)");
2221       break;
2222    case UNIFORM:
2223       fprintf(file, "***u%d***", inst->dst.nr);
2224       break;
2225    case ATTR:
2226       fprintf(file, "***attr%d***", inst->dst.nr);
2227       break;
2228    case ARF:
2229       switch (inst->dst.nr) {
2230       case BRW_ARF_NULL:
2231          fprintf(file, "null");
2232          break;
2233       case BRW_ARF_ADDRESS:
2234          fprintf(file, "a0.%d", inst->dst.subnr);
2235          break;
2236       case BRW_ARF_ACCUMULATOR:
2237          fprintf(file, "acc%d", inst->dst.subnr);
2238          break;
2239       case BRW_ARF_FLAG:
2240          fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
2241          break;
2242       default:
2243          fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
2244          break;
2245       }
2246       break;
2247    case IMM:
2248       unreachable("not reached");
2249    }
2250 
2251    if (inst->dst.offset ||
2252        (inst->dst.file == VGRF &&
2253         alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
2254       const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
2255       fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
2256               inst->dst.offset % reg_size);
2257    }
2258 
2259    if (inst->dst.stride != 1)
2260       fprintf(file, "<%u>", inst->dst.stride);
2261    fprintf(file, ":%s, ", brw_reg_type_to_letters(inst->dst.type));
2262 
2263    for (int i = 0; i < inst->sources; i++) {
2264       if (inst->src[i].negate)
2265          fprintf(file, "-");
2266       if (inst->src[i].abs)
2267          fprintf(file, "|");
2268       switch (inst->src[i].file) {
2269       case VGRF:
2270          fprintf(file, "vgrf%d", inst->src[i].nr);
2271          break;
2272       case FIXED_GRF:
2273          fprintf(file, "g%d", inst->src[i].nr);
2274          break;
2275       case ATTR:
2276          fprintf(file, "attr%d", inst->src[i].nr);
2277          break;
2278       case UNIFORM:
2279          fprintf(file, "u%d", inst->src[i].nr);
2280          break;
2281       case BAD_FILE:
2282          fprintf(file, "(null)");
2283          break;
2284       case IMM:
2285          switch (inst->src[i].type) {
2286          case BRW_REGISTER_TYPE_HF:
2287             fprintf(file, "%-ghf", _mesa_half_to_float(inst->src[i].ud & 0xffff));
2288             break;
2289          case BRW_REGISTER_TYPE_F:
2290             fprintf(file, "%-gf", inst->src[i].f);
2291             break;
2292          case BRW_REGISTER_TYPE_DF:
2293             fprintf(file, "%fdf", inst->src[i].df);
2294             break;
2295          case BRW_REGISTER_TYPE_W:
2296          case BRW_REGISTER_TYPE_D:
2297             fprintf(file, "%dd", inst->src[i].d);
2298             break;
2299          case BRW_REGISTER_TYPE_UW:
2300          case BRW_REGISTER_TYPE_UD:
2301             fprintf(file, "%uu", inst->src[i].ud);
2302             break;
2303          case BRW_REGISTER_TYPE_Q:
2304             fprintf(file, "%" PRId64 "q", inst->src[i].d64);
2305             break;
2306          case BRW_REGISTER_TYPE_UQ:
2307             fprintf(file, "%" PRIu64 "uq", inst->src[i].u64);
2308             break;
2309          case BRW_REGISTER_TYPE_VF:
2310             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
2311                     brw_vf_to_float((inst->src[i].ud >>  0) & 0xff),
2312                     brw_vf_to_float((inst->src[i].ud >>  8) & 0xff),
2313                     brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
2314                     brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
2315             break;
2316          case BRW_REGISTER_TYPE_V:
2317          case BRW_REGISTER_TYPE_UV:
2318             fprintf(file, "%08x%s", inst->src[i].ud,
2319                     inst->src[i].type == BRW_REGISTER_TYPE_V ? "V" : "UV");
2320             break;
2321          default:
2322             fprintf(file, "???");
2323             break;
2324          }
2325          break;
2326       case ARF:
2327          switch (inst->src[i].nr) {
2328          case BRW_ARF_NULL:
2329             fprintf(file, "null");
2330             break;
2331          case BRW_ARF_ADDRESS:
2332             fprintf(file, "a0.%d", inst->src[i].subnr);
2333             break;
2334          case BRW_ARF_ACCUMULATOR:
2335             fprintf(file, "acc%d", inst->src[i].subnr);
2336             break;
2337          case BRW_ARF_FLAG:
2338             fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
2339             break;
2340          default:
2341             fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
2342             break;
2343          }
2344          break;
2345       }
2346 
2347       if (inst->src[i].offset ||
2348           (inst->src[i].file == VGRF &&
2349            alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
2350          const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
2351          fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
2352                  inst->src[i].offset % reg_size);
2353       }
2354 
2355       if (inst->src[i].abs)
2356          fprintf(file, "|");
2357 
2358       if (inst->src[i].file != IMM) {
2359          unsigned stride;
2360          if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
2361             unsigned hstride = inst->src[i].hstride;
2362             stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
2363          } else {
2364             stride = inst->src[i].stride;
2365          }
2366          if (stride != 1)
2367             fprintf(file, "<%u>", stride);
2368 
2369          fprintf(file, ":%s", brw_reg_type_to_letters(inst->src[i].type));
2370       }
2371 
2372       if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
2373          fprintf(file, ", ");
2374    }
2375 
2376    fprintf(file, " ");
2377 
2378    if (inst->force_writemask_all)
2379       fprintf(file, "NoMask ");
2380 
2381    if (inst->exec_size != dispatch_width)
2382       fprintf(file, "group%d ", inst->group);
2383 
2384    fprintf(file, "\n");
2385 }
2386 
2387 brw::register_pressure::register_pressure(const fs_visitor *v)
2388 {
2389    const fs_live_variables &live = v->live_analysis.require();
2390    const unsigned num_instructions = v->cfg->num_blocks ?
2391       v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;
2392 
2393    regs_live_at_ip = new unsigned[num_instructions]();
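        /* The trailing () value-initializes the array to zero; the loops below
         * accumulate, per IP, the sizes of all VGRFs and payload registers that
         * are live at that instruction.
         */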
2394 
2395    for (unsigned reg = 0; reg < v->alloc.count; reg++) {
2396       for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
2397          regs_live_at_ip[ip] += v->alloc.sizes[reg];
2398    }
2399 
2400    const unsigned payload_count = v->first_non_payload_grf;
2401 
2402    int *payload_last_use_ip = new int[payload_count];
2403    v->calculate_payload_ranges(payload_count, payload_last_use_ip);
2404 
2405    for (unsigned reg = 0; reg < payload_count; reg++) {
2406       for (int ip = 0; ip < payload_last_use_ip[reg]; ip++)
2407          ++regs_live_at_ip[ip];
2408    }
2409 
2410    delete[] payload_last_use_ip;
2411 }
2412 
2413 brw::register_pressure::~register_pressure()
2414 {
2415    delete[] regs_live_at_ip;
2416 }
2417 
2418 void
2419 fs_visitor::invalidate_analysis(brw::analysis_dependency_class c)
2420 {
2421    backend_shader::invalidate_analysis(c);
2422    live_analysis.invalidate(c);
2423    regpressure_analysis.invalidate(c);
2424 }
2425 
2426 void
2427 fs_visitor::debug_optimizer(const nir_shader *nir,
2428                             const char *pass_name,
2429                             int iteration, int pass_num) const
2430 {
2431    if (!brw_should_print_shader(nir, DEBUG_OPTIMIZER))
2432       return;
2433 
2434    char *filename;
2435    int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
2436                       debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
2437                       _mesa_shader_stage_to_abbrev(stage), dispatch_width, nir->info.name,
2438                       iteration, pass_num, pass_name);
2439    if (ret == -1)
2440       return;
2441    dump_instructions(filename);
2442    free(filename);
2443 }
2444 
2445 uint32_t
2446 fs_visitor::compute_max_register_pressure()
2447 {
2448    const register_pressure &rp = regpressure_analysis.require();
2449    uint32_t ip = 0, max_pressure = 0;
2450    foreach_block_and_inst(block, backend_instruction, inst, cfg) {
2451       max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
2452       ip++;
2453    }
2454    return max_pressure;
2455 }
2456 
2457 static fs_inst **
2458 save_instruction_order(const struct cfg_t *cfg)
2459 {
2460    /* Before we schedule anything, stash off the instruction order as an array
2461     * of fs_inst *.  This way, we can reset it between scheduling passes to
2462     * prevent dependencies between the different scheduling modes.
2463     */
2464    int num_insts = cfg->last_block()->end_ip + 1;
2465    fs_inst **inst_arr = new fs_inst * [num_insts];
2466 
2467    int ip = 0;
2468    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2469       assert(ip >= block->start_ip && ip <= block->end_ip);
2470       inst_arr[ip++] = inst;
2471    }
2472    assert(ip == num_insts);
2473 
2474    return inst_arr;
2475 }
2476 
2477 static void
2478 restore_instruction_order(struct cfg_t *cfg, fs_inst **inst_arr)
2479 {
2480    ASSERTED int num_insts = cfg->last_block()->end_ip + 1;
2481 
2482    int ip = 0;
2483    foreach_block (block, cfg) {
2484       block->instructions.make_empty();
2485 
2486       assert(ip == block->start_ip);
2487       for (; ip <= block->end_ip; ip++)
2488          block->instructions.push_tail(inst_arr[ip]);
2489    }
2490    assert(ip == num_insts);
2491 }
2492 
2493 void
2494 fs_visitor::allocate_registers(bool allow_spilling)
2495 {
2496    bool allocated;
2497 
2498    static const enum instruction_scheduler_mode pre_modes[] = {
2499       SCHEDULE_PRE,
2500       SCHEDULE_PRE_NON_LIFO,
2501       SCHEDULE_NONE,
2502       SCHEDULE_PRE_LIFO,
2503    };
2504 
2505    static const char *scheduler_mode_name[] = {
2506       [SCHEDULE_PRE] = "top-down",
2507       [SCHEDULE_PRE_NON_LIFO] = "non-lifo",
2508       [SCHEDULE_PRE_LIFO] = "lifo",
2509       [SCHEDULE_POST] = "post",
2510       [SCHEDULE_NONE] = "none",
2511    };
2512 
2513    uint32_t best_register_pressure = UINT32_MAX;
2514    enum instruction_scheduler_mode best_sched = SCHEDULE_NONE;
2515 
2516    brw_fs_opt_compact_virtual_grfs(*this);
2517 
2518    if (needs_register_pressure)
2519       shader_stats.max_register_pressure = compute_max_register_pressure();
2520 
2521    debug_optimizer(nir, "pre_register_allocate", 90, 90);
2522 
2523    bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
2524 
2525    /* Before we schedule anything, stash off the instruction order as an array
2526     * of fs_inst *.  This way, we can reset it between scheduling passes to
2527     * prevent dependencies between the different scheduling modes.
2528     */
2529    fs_inst **orig_order = save_instruction_order(cfg);
2530    fs_inst **best_pressure_order = NULL;
2531 
2532    void *scheduler_ctx = ralloc_context(NULL);
2533    fs_instruction_scheduler *sched = prepare_scheduler(scheduler_ctx);
2534 
2535    /* Try each scheduling heuristic to see if it can successfully register
2536     * allocate without spilling.  They should be ordered by decreasing
2537     * performance but increasing likelihood of allocating.
2538     */
2539    for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
2540       enum instruction_scheduler_mode sched_mode = pre_modes[i];
2541 
2542       schedule_instructions_pre_ra(sched, sched_mode);
2543       this->shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
2544 
2545       debug_optimizer(nir, shader_stats.scheduler_mode, 95, i);
2546 
2547       if (0) {
2548          assign_regs_trivial();
2549          allocated = true;
2550          break;
2551       }
2552 
2553       /* We should only spill registers on the last scheduling. */
2554       assert(!spilled_any_registers);
2555 
2556       allocated = assign_regs(false, spill_all);
2557       if (allocated)
2558          break;
2559 
2560       /* Save the maximum register pressure */
2561       uint32_t this_pressure = compute_max_register_pressure();
2562 
2563       if (0) {
2564          fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
2565                  scheduler_mode_name[sched_mode], this_pressure);
2566       }
2567 
2568       if (this_pressure < best_register_pressure) {
2569          best_register_pressure = this_pressure;
2570          best_sched = sched_mode;
2571          delete[] best_pressure_order;
2572          best_pressure_order = save_instruction_order(cfg);
2573       }
2574 
2575       /* Reset back to the original order before trying the next mode */
2576       restore_instruction_order(cfg, orig_order);
2577       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
2578    }
2579 
2580    ralloc_free(scheduler_ctx);
2581 
2582    if (!allocated) {
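           /* None of the pre-RA scheduling heuristics allocated without
            * spilling, so fall back to the order with the lowest register
            * pressure and retry with spilling allowed.
            */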
2583       if (0) {
2584          fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
2585                  scheduler_mode_name[best_sched]);
2586       }
2587       restore_instruction_order(cfg, best_pressure_order);
2588       shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
2589 
2590       allocated = assign_regs(allow_spilling, spill_all);
2591    }
2592 
2593    delete[] orig_order;
2594    delete[] best_pressure_order;
2595 
2596    if (!allocated) {
2597       fail("Failure to register allocate.  Reduce number of "
2598            "live scalar values to avoid this.");
2599    } else if (spilled_any_registers) {
2600       brw_shader_perf_log(compiler, log_data,
2601                           "%s shader triggered register spilling.  "
2602                           "Try reducing the number of live scalar "
2603                           "values to improve performance.\n",
2604                           _mesa_shader_stage_to_string(stage));
2605    }
2606 
2607    if (failed)
2608       return;
2609 
2610    brw_fs_opt_bank_conflicts(*this);
2611 
2612    schedule_instructions_post_ra();
2613 
2614    if (last_scratch > 0) {
2615       ASSERTED unsigned max_scratch_size = 2 * 1024 * 1024;
2616 
2617       /* Take the max of any previously compiled variant of the shader. In the
2618        * case of bindless shaders with return parts, this will also take the
2619        * max of all parts.
2620        */
2621       prog_data->total_scratch = MAX2(brw_get_scratch_size(last_scratch),
2622                                       prog_data->total_scratch);
2623 
2624       /* We currently only support up to 2MB of scratch space.  If we
2625        * need to support more eventually, the documentation suggests
2626        * that we could allocate a larger buffer, and partition it out
2627        * ourselves.  We'd just have to undo the hardware's address
2628        * calculation by subtracting (FFTID * Per Thread Scratch Space)
2629        * and then add FFTID * (Larger Per Thread Scratch Space).
2630        *
2631        * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
2632        * Thread Group Tracking > Local Memory/Scratch Space.
2633        */
2634       assert(prog_data->total_scratch < max_scratch_size);
2635    }
2636 
2637    brw_fs_lower_scoreboard(*this);
2638 }
2639 
2640 bool
2641 fs_visitor::run_vs()
2642 {
2643    assert(stage == MESA_SHADER_VERTEX);
2644 
2645    payload_ = new vs_thread_payload(*this);
2646 
2647    nir_to_brw(this);
2648 
2649    if (failed)
2650       return false;
2651 
2652    emit_urb_writes();
2653 
2654    calculate_cfg();
2655 
2656    brw_fs_optimize(*this);
2657 
2658    assign_curb_setup();
2659    assign_vs_urb_setup();
2660 
2661    brw_fs_lower_3src_null_dest(*this);
2662    brw_fs_workaround_memory_fence_before_eot(*this);
2663    brw_fs_workaround_emit_dummy_mov_instruction(*this);
2664 
2665    allocate_registers(true /* allow_spilling */);
2666 
2667    return !failed;
2668 }
2669 
2670 void
2671 fs_visitor::set_tcs_invocation_id()
2672 {
2673    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
2674    struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
2675    const fs_builder bld = fs_builder(this).at_end();
2676 
2677    const unsigned instance_id_mask =
2678       (devinfo->verx10 >= 125) ? INTEL_MASK(7, 0) :
2679       (devinfo->ver >= 11)     ? INTEL_MASK(22, 16) :
2680                                  INTEL_MASK(23, 17);
2681    const unsigned instance_id_shift =
2682       (devinfo->verx10 >= 125) ? 0 : (devinfo->ver >= 11) ? 16 : 17;
2683 
2684    /* Get instance number from g0.2 bits:
2685     *  * 7:0 on DG2+
2686     *  * 22:16 on gfx11+
2687     *  * 23:17 otherwise
2688     */
2689    fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
2690    bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
2691            brw_imm_ud(instance_id_mask));
2692 
2693    invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
2694 
2695    if (vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH) {
2696       /* gl_InvocationID is just the thread number */
2697       bld.SHR(invocation_id, t, brw_imm_ud(instance_id_shift));
2698       return;
2699    }
2700 
2701    assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH);
2702 
2703    fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW);
2704    fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD);
2705    bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));
2706    bld.MOV(channels_ud, channels_uw);
2707 
2708    if (tcs_prog_data->instances == 1) {
2709       invocation_id = channels_ud;
2710    } else {
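           /* Shifting right by (instance_id_shift - 3) leaves the instance
            * number already multiplied by 8, so adding the 0..7 channel values
            * yields invocation_id = instance * 8 + channel.
            */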
2711       fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD);
2712       bld.SHR(instance_times_8, t, brw_imm_ud(instance_id_shift - 3));
2713       bld.ADD(invocation_id, instance_times_8, channels_ud);
2714    }
2715 }
2716 
2717 void
2718 fs_visitor::emit_tcs_thread_end()
2719 {
2720    /* Try and tag the last URB write with EOT instead of emitting a whole
2721     * separate write just to finish the thread.  There isn't guaranteed to
2722     * be one, so this may not succeed.
2723     */
2724    if (mark_last_urb_write_with_eot())
2725       return;
2726 
2727    const fs_builder bld = fs_builder(this).at_end();
2728 
2729    /* Emit a URB write to end the thread.  On Broadwell, we use this to write
2730     * zero to the "TR DS Cache Disable" bit (we haven't implemented a fancy
2731     * algorithm to set it optimally).  On other platforms, we simply write
2732     * zero to a reserved/MBZ patch header DWord which has no consequence.
2733     */
2734    fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2735    srcs[URB_LOGICAL_SRC_HANDLE] = tcs_payload().patch_urb_output;
2736    srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(WRITEMASK_X << 16);
2737    srcs[URB_LOGICAL_SRC_DATA] = brw_imm_ud(0);
2738    srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1);
2739    fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
2740                             reg_undef, srcs, ARRAY_SIZE(srcs));
2741    inst->eot = true;
2742 }
2743 
2744 bool
2745 fs_visitor::run_tcs()
2746 {
2747    assert(stage == MESA_SHADER_TESS_CTRL);
2748 
2749    struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
2750    const fs_builder bld = fs_builder(this).at_end();
2751 
2752    assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH ||
2753           vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH);
2754 
2755    payload_ = new tcs_thread_payload(*this);
2756 
2757    /* Initialize gl_InvocationID */
2758    set_tcs_invocation_id();
2759 
2760    const bool fix_dispatch_mask =
2761       vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH &&
2762       (nir->info.tess.tcs_vertices_out % 8) != 0;
2763 
2764    /* Fix the dispatch mask */
2765    if (fix_dispatch_mask) {
2766       bld.CMP(bld.null_reg_ud(), invocation_id,
2767               brw_imm_ud(nir->info.tess.tcs_vertices_out), BRW_CONDITIONAL_L);
2768       bld.IF(BRW_PREDICATE_NORMAL);
2769    }
2770 
2771    nir_to_brw(this);
2772 
2773    if (fix_dispatch_mask) {
2774       bld.emit(BRW_OPCODE_ENDIF);
2775    }
2776 
2777    emit_tcs_thread_end();
2778 
2779    if (failed)
2780       return false;
2781 
2782    calculate_cfg();
2783 
2784    brw_fs_optimize(*this);
2785 
2786    assign_curb_setup();
2787    assign_tcs_urb_setup();
2788 
2789    brw_fs_lower_3src_null_dest(*this);
2790    brw_fs_workaround_memory_fence_before_eot(*this);
2791    brw_fs_workaround_emit_dummy_mov_instruction(*this);
2792 
2793    allocate_registers(true /* allow_spilling */);
2794 
2795    return !failed;
2796 }
2797 
2798 bool
2799 fs_visitor::run_tes()
2800 {
2801    assert(stage == MESA_SHADER_TESS_EVAL);
2802 
2803    payload_ = new tes_thread_payload(*this);
2804 
2805    nir_to_brw(this);
2806 
2807    if (failed)
2808       return false;
2809 
2810    emit_urb_writes();
2811 
2812    calculate_cfg();
2813 
2814    brw_fs_optimize(*this);
2815 
2816    assign_curb_setup();
2817    assign_tes_urb_setup();
2818 
2819    brw_fs_lower_3src_null_dest(*this);
2820    brw_fs_workaround_memory_fence_before_eot(*this);
2821    brw_fs_workaround_emit_dummy_mov_instruction(*this);
2822 
2823    allocate_registers(true /* allow_spilling */);
2824 
2825    return !failed;
2826 }
2827 
2828 bool
2829 fs_visitor::run_gs()
2830 {
2831    assert(stage == MESA_SHADER_GEOMETRY);
2832 
2833    payload_ = new gs_thread_payload(*this);
2834 
2835    this->final_gs_vertex_count = vgrf(glsl_uint_type());
2836 
2837    if (gs_compile->control_data_header_size_bits > 0) {
2838       /* Create a VGRF to store accumulated control data bits. */
2839       this->control_data_bits = vgrf(glsl_uint_type());
2840 
2841       /* If we're outputting more than 32 control data bits, then EmitVertex()
2842        * will set control_data_bits to 0 after emitting the first vertex.
2843        * Otherwise, we need to initialize it to 0 here.
2844        */
2845       if (gs_compile->control_data_header_size_bits <= 32) {
2846          const fs_builder bld = fs_builder(this).at_end();
2847          const fs_builder abld = bld.annotate("initialize control data bits");
2848          abld.MOV(this->control_data_bits, brw_imm_ud(0u));
2849       }
2850    }
2851 
2852    nir_to_brw(this);
2853 
2854    emit_gs_thread_end();
2855 
2856    if (failed)
2857       return false;
2858 
2859    calculate_cfg();
2860 
2861    brw_fs_optimize(*this);
2862 
2863    assign_curb_setup();
2864    assign_gs_urb_setup();
2865 
2866    brw_fs_lower_3src_null_dest(*this);
2867    brw_fs_workaround_memory_fence_before_eot(*this);
2868    brw_fs_workaround_emit_dummy_mov_instruction(*this);
2869 
2870    allocate_registers(true /* allow_spilling */);
2871 
2872    return !failed;
2873 }
2874 
2875 /* From the SKL PRM, Volume 16, Workarounds:
2876  *
2877  *   0877  3D   Pixel Shader Hang possible when pixel shader dispatched with
2878  *              only header phases (R0-R2)
2879  *
2880  *   WA: Enable a non-header phase (e.g. push constant) when dispatch would
2881  *       have been header only.
2882  *
2883  * Instead of enabling push constants, one can alternatively enable one of
2884  * the inputs. Here one simply chooses "layer", which shouldn't impose much
2885  * overhead.
2886  */
2887 static void
2888 gfx9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data)
2889 {
2890    if (wm_prog_data->num_varying_inputs)
2891       return;
2892 
2893    if (wm_prog_data->base.curb_read_length)
2894       return;
2895 
2896    wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
2897    wm_prog_data->num_varying_inputs = 1;
2898 
2899    brw_compute_urb_setup_index(wm_prog_data);
2900 }
2901 
2902 bool
2903 fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
2904 {
2905    struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
2906    brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
2907    const fs_builder bld = fs_builder(this).at_end();
2908 
2909    assert(stage == MESA_SHADER_FRAGMENT);
2910 
2911    payload_ = new fs_thread_payload(*this, source_depth_to_render_target);
2912 
2913    if (do_rep_send) {
2914       assert(dispatch_width == 16);
2915       emit_repclear_shader();
2916    } else {
2917       if (nir->info.inputs_read > 0 ||
2918           BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
2919           (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
2920          emit_interpolation_setup();
2921       }
2922 
2923       /* We handle discards by keeping track of the still-live pixels in f0.1.
2924        * Initialize it with the dispatched pixels.
2925        */
2926       if (wm_prog_data->uses_kill) {
2927          const unsigned lower_width = MIN2(dispatch_width, 16);
2928          for (unsigned i = 0; i < dispatch_width / lower_width; i++) {
2929             /* According to the "PS Thread Payload for Normal
2930              * Dispatch" pages on the BSpec, the dispatch mask is
2931              * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on
2932              * gfx6+.
2933              */
2934             const fs_reg dispatch_mask =
2935                devinfo->ver >= 20 ? xe2_vec1_grf(i, 15) :
2936                                     brw_vec1_grf(i + 1, 7);
2937             bld.exec_all().group(1, 0)
2938                .MOV(brw_sample_mask_reg(bld.group(lower_width, i)),
2939                     retype(dispatch_mask, BRW_REGISTER_TYPE_UW));
2940          }
2941       }
2942 
2943       if (nir->info.writes_memory)
2944          wm_prog_data->has_side_effects = true;
2945 
2946       nir_to_brw(this);
2947 
2948       if (failed)
2949          return false;
2950 
2951       emit_fb_writes();
2952 
2953       calculate_cfg();
2954 
2955       brw_fs_optimize(*this);
2956 
2957       assign_curb_setup();
2958 
2959       if (devinfo->ver == 9)
2960          gfx9_ps_header_only_workaround(wm_prog_data);
2961 
2962       assign_urb_setup();
2963 
2964       brw_fs_lower_3src_null_dest(*this);
2965       brw_fs_workaround_memory_fence_before_eot(*this);
2966       brw_fs_workaround_emit_dummy_mov_instruction(*this);
2967 
2968       allocate_registers(allow_spilling);
2969    }
2970 
2971    return !failed;
2972 }
2973 
2974 bool
2975 fs_visitor::run_cs(bool allow_spilling)
2976 {
2977    assert(gl_shader_stage_is_compute(stage));
2978    const fs_builder bld = fs_builder(this).at_end();
2979 
2980    payload_ = new cs_thread_payload(*this);
2981 
2982    if (devinfo->platform == INTEL_PLATFORM_HSW && prog_data->total_shared > 0) {
2983       /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
2984       const fs_builder abld = bld.exec_all().group(1, 0);
2985       abld.MOV(retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW),
2986                suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1));
2987    }
2988 
2989    nir_to_brw(this);
2990 
2991    if (failed)
2992       return false;
2993 
2994    emit_cs_terminate();
2995 
2996    calculate_cfg();
2997 
2998    brw_fs_optimize(*this);
2999 
3000    assign_curb_setup();
3001 
3002    brw_fs_lower_3src_null_dest(*this);
3003    brw_fs_workaround_memory_fence_before_eot(*this);
3004    brw_fs_workaround_emit_dummy_mov_instruction(*this);
3005 
3006    allocate_registers(allow_spilling);
3007 
3008    return !failed;
3009 }
3010 
3011 bool
3012 fs_visitor::run_bs(bool allow_spilling)
3013 {
3014    assert(stage >= MESA_SHADER_RAYGEN && stage <= MESA_SHADER_CALLABLE);
3015 
3016    payload_ = new bs_thread_payload(*this);
3017 
3018    nir_to_brw(this);
3019 
3020    if (failed)
3021       return false;
3022 
3023    /* TODO(RT): Perhaps rename this? */
3024    emit_cs_terminate();
3025 
3026    calculate_cfg();
3027 
3028    brw_fs_optimize(*this);
3029 
3030    assign_curb_setup();
3031 
3032    brw_fs_lower_3src_null_dest(*this);
3033    brw_fs_workaround_memory_fence_before_eot(*this);
3034    brw_fs_workaround_emit_dummy_mov_instruction(*this);
3035 
3036    allocate_registers(allow_spilling);
3037 
3038    return !failed;
3039 }
3040 
3041 bool
3042 fs_visitor::run_task(bool allow_spilling)
3043 {
3044    assert(stage == MESA_SHADER_TASK);
3045 
3046    payload_ = new task_mesh_thread_payload(*this);
3047 
3048    nir_to_brw(this);
3049 
3050    if (failed)
3051       return false;
3052 
3053    emit_urb_fence();
3054 
3055    emit_cs_terminate();
3056 
3057    calculate_cfg();
3058 
3059    brw_fs_optimize(*this);
3060 
3061    assign_curb_setup();
3062 
3063    brw_fs_lower_3src_null_dest(*this);
3064    brw_fs_workaround_memory_fence_before_eot(*this);
3065    brw_fs_workaround_emit_dummy_mov_instruction(*this);
3066 
3067    allocate_registers(allow_spilling);
3068 
3069    return !failed;
3070 }
3071 
3072 bool
3073 fs_visitor::run_mesh(bool allow_spilling)
3074 {
3075    assert(stage == MESA_SHADER_MESH);
3076 
3077    payload_ = new task_mesh_thread_payload(*this);
3078 
3079    nir_to_brw(this);
3080 
3081    if (failed)
3082       return false;
3083 
3084    emit_urb_fence();
3085 
3086    emit_cs_terminate();
3087 
3088    calculate_cfg();
3089 
3090    brw_fs_optimize(*this);
3091 
3092    assign_curb_setup();
3093 
3094    brw_fs_lower_3src_null_dest(*this);
3095    brw_fs_workaround_memory_fence_before_eot(*this);
3096    brw_fs_workaround_emit_dummy_mov_instruction(*this);
3097 
3098    allocate_registers(allow_spilling);
3099 
3100    return !failed;
3101 }
3102 
3103 static bool
3104 is_used_in_not_interp_frag_coord(nir_def *def)
3105 {
3106    nir_foreach_use_including_if(src, def) {
3107       if (nir_src_is_if(src))
3108          return true;
3109 
3110       if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic)
3111          return true;
3112 
3113       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(nir_src_parent_instr(src));
3114       if (intrin->intrinsic != nir_intrinsic_load_frag_coord)
3115          return true;
3116    }
3117 
3118    return false;
3119 }
3120 
3121 /**
3122  * Return a bitfield where bit n is set if barycentric interpolation mode n
3123  * (see enum brw_barycentric_mode) is needed by the fragment shader.
3124  *
3125  * We examine the load_barycentric intrinsics rather than looking at input
3126  * variables so that we catch interpolateAtCentroid() messages too, which
3127  * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
3128  */
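/* Illustrative example: a shader that reads a smooth input normally and also
 * calls interpolateAtCentroid() on it sets both the
 * BRW_BARYCENTRIC_PERSPECTIVE_PIXEL and BRW_BARYCENTRIC_PERSPECTIVE_CENTROID
 * bits in the returned mask.
 */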
3129 static unsigned
3130 brw_compute_barycentric_interp_modes(const struct intel_device_info *devinfo,
3131                                      const nir_shader *shader)
3132 {
3133    unsigned barycentric_interp_modes = 0;
3134 
3135    nir_foreach_function_impl(impl, shader) {
3136       nir_foreach_block(block, impl) {
3137          nir_foreach_instr(instr, block) {
3138             if (instr->type != nir_instr_type_intrinsic)
3139                continue;
3140 
3141             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
3142             switch (intrin->intrinsic) {
3143             case nir_intrinsic_load_barycentric_pixel:
3144             case nir_intrinsic_load_barycentric_centroid:
3145             case nir_intrinsic_load_barycentric_sample:
3146             case nir_intrinsic_load_barycentric_at_sample:
3147             case nir_intrinsic_load_barycentric_at_offset:
3148                break;
3149             default:
3150                continue;
3151             }
3152 
3153             /* Ignore WPOS; it doesn't require interpolation. */
3154             if (!is_used_in_not_interp_frag_coord(&intrin->def))
3155                continue;
3156 
3157             nir_intrinsic_op bary_op = intrin->intrinsic;
3158             enum brw_barycentric_mode bary =
3159                brw_barycentric_mode(intrin);
3160 
3161             barycentric_interp_modes |= 1 << bary;
3162 
3163             if (devinfo->needs_unlit_centroid_workaround &&
3164                 bary_op == nir_intrinsic_load_barycentric_centroid)
3165                barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
3166          }
3167       }
3168    }
3169 
3170    return barycentric_interp_modes;
3171 }
3172 
3173 static void
3174 brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
3175                         const nir_shader *shader)
3176 {
3177    prog_data->flat_inputs = 0;
3178 
3179    nir_foreach_shader_in_variable(var, shader) {
3180       /* flat shading */
3181       if (var->data.interpolation != INTERP_MODE_FLAT)
3182          continue;
3183 
3184       if (var->data.per_primitive)
3185          continue;
3186 
3187       unsigned slots = glsl_count_attribute_slots(var->type, false);
3188       for (unsigned s = 0; s < slots; s++) {
3189          int input_index = prog_data->urb_setup[var->data.location + s];
3190 
3191          if (input_index >= 0)
3192             prog_data->flat_inputs |= 1 << input_index;
3193       }
3194    }
3195 }
3196 
3197 static uint8_t
3198 computed_depth_mode(const nir_shader *shader)
3199 {
3200    if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3201       switch (shader->info.fs.depth_layout) {
3202       case FRAG_DEPTH_LAYOUT_NONE:
3203       case FRAG_DEPTH_LAYOUT_ANY:
3204          return BRW_PSCDEPTH_ON;
3205       case FRAG_DEPTH_LAYOUT_GREATER:
3206          return BRW_PSCDEPTH_ON_GE;
3207       case FRAG_DEPTH_LAYOUT_LESS:
3208          return BRW_PSCDEPTH_ON_LE;
3209       case FRAG_DEPTH_LAYOUT_UNCHANGED:
3210          /* We initially set this to OFF, but having the shader write the
3211           * depth means we allocate register space in the SEND message. The
3212           * difference between the SEND register count and the OFF state
3213           * programming makes the HW hang.
3214           *
3215           * Removing the depth writes also leads to test failures. So use
3216           * LesserThanOrEqual, which matches writing the same value
3217           * (unchanged/equal).
3218           *
3219           */
3220          return BRW_PSCDEPTH_ON_LE;
3221       }
3222    }
3223    return BRW_PSCDEPTH_OFF;
3224 }
3225 
3226 /**
3227  * Move load_interpolated_input with simple (payload-based) barycentric modes
3228  * to the top of the program so we don't emit multiple PLNs for the same input.
3229  *
3230  * This works around CSE not being able to handle non-dominating cases
3231  * such as:
3232  *
3233  *    if (...) {
3234  *       interpolate input
3235  *    } else {
3236  *       interpolate the same exact input
3237  *    }
3238  *
3239  * This should be replaced by global value numbering someday.
3240  */
3241 bool
3242 brw_nir_move_interpolation_to_top(nir_shader *nir)
3243 {
3244    bool progress = false;
3245 
3246    nir_foreach_function_impl(impl, nir) {
3247       nir_block *top = nir_start_block(impl);
3248       nir_cursor cursor = nir_before_instr(nir_block_first_instr(top));
3249       bool impl_progress = false;
3250 
3251       for (nir_block *block = nir_block_cf_tree_next(top);
3252            block != NULL;
3253            block = nir_block_cf_tree_next(block)) {
3254 
3255          nir_foreach_instr_safe(instr, block) {
3256             if (instr->type != nir_instr_type_intrinsic)
3257                continue;
3258 
3259             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
3260             if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
3261                continue;
3262             nir_intrinsic_instr *bary_intrinsic =
3263                nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
3264             nir_intrinsic_op op = bary_intrinsic->intrinsic;
3265 
3266             /* Leave interpolateAtSample/Offset() where they are. */
3267             if (op == nir_intrinsic_load_barycentric_at_sample ||
3268                 op == nir_intrinsic_load_barycentric_at_offset)
3269                continue;
3270 
3271             nir_instr *move[3] = {
3272                &bary_intrinsic->instr,
3273                intrin->src[1].ssa->parent_instr,
3274                instr
3275             };
3276 
3277             for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
3278                if (move[i]->block != top) {
3279                   nir_instr_move(cursor, move[i]);
3280                   impl_progress = true;
3281                }
3282             }
3283          }
3284       }
3285 
3286       progress = progress || impl_progress;
3287 
3288       nir_metadata_preserve(impl, impl_progress ? (nir_metadata_block_index |
3289                                                       nir_metadata_dominance)
3290                                                    : nir_metadata_all);
3291    }
3292 
3293    return progress;
3294 }
3295 
3296 static void
3297 brw_nir_populate_wm_prog_data(nir_shader *shader,
3298                               const struct intel_device_info *devinfo,
3299                               const struct brw_wm_prog_key *key,
3300                               struct brw_wm_prog_data *prog_data,
3301                               const struct brw_mue_map *mue_map)
3302 {
3303    prog_data->uses_kill = shader->info.fs.uses_discard ||
3304                           shader->info.fs.uses_demote;
3305    prog_data->uses_omask = !key->ignore_sample_mask_out &&
3306       (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
3307    prog_data->color_outputs_written = key->color_outputs_valid;
3308    prog_data->max_polygons = 1;
3309    prog_data->computed_depth_mode = computed_depth_mode(shader);
3310    prog_data->computed_stencil =
3311       shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
3312 
3313    prog_data->sample_shading =
3314       shader->info.fs.uses_sample_shading ||
3315       shader->info.outputs_read;
3316 
3317    assert(key->multisample_fbo != BRW_NEVER ||
3318           key->persample_interp == BRW_NEVER);
3319 
3320    prog_data->persample_dispatch = key->persample_interp;
3321    if (prog_data->sample_shading)
3322       prog_data->persample_dispatch = BRW_ALWAYS;
3323 
3324    /* We can only do per-sample dispatch if we have a multisample FBO. */
3325    prog_data->persample_dispatch = MIN2(prog_data->persample_dispatch,
3326                                         key->multisample_fbo);
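   /* Illustrative note: this relies on the brw_sometimes values being ordered
    * BRW_NEVER < BRW_SOMETIMES < BRW_ALWAYS, so e.g. an ALWAYS request clamped
    * by a SOMETIMES multisample_fbo becomes SOMETIMES, deferring the final
    * decision to a dynamic check.
    */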
3327 
3328    /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If
3329     * persample_dispatch & multisample_fbo are not dynamic, Anv should be able
3330     * to definitively tell whether alpha_to_coverage is on or off.
3331     */
3332    prog_data->alpha_to_coverage = key->alpha_to_coverage;
3333    assert(prog_data->alpha_to_coverage != BRW_SOMETIMES ||
3334           prog_data->persample_dispatch == BRW_SOMETIMES);
3335 
3336    prog_data->uses_sample_mask =
3337       BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);
3338 
3339    /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
3340     *
3341     *    "MSDISPMODE_PERSAMPLE is required in order to select
3342     *    POSOFFSET_SAMPLE"
3343     *
3344     * So we can only really get sample positions if we are doing real
3345     * per-sample dispatch.  If we need gl_SamplePosition and we don't have
3346     * persample dispatch, we hard-code it to 0.5.
3347     */
3348    prog_data->uses_pos_offset =
3349       prog_data->persample_dispatch != BRW_NEVER &&
3350       (BITSET_TEST(shader->info.system_values_read,
3351                    SYSTEM_VALUE_SAMPLE_POS) ||
3352        BITSET_TEST(shader->info.system_values_read,
3353                    SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));
3354 
3355    prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
3356    prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
3357    prog_data->inner_coverage = shader->info.fs.inner_coverage;
3358 
3359    prog_data->barycentric_interp_modes =
3360       brw_compute_barycentric_interp_modes(devinfo, shader);
3361 
3362    /* From the BDW PRM documentation for 3DSTATE_WM:
3363     *
3364     *    "MSDISPMODE_PERSAMPLE is required in order to select Perspective
3365     *     Sample or Non-perspective Sample barycentric coordinates."
3366     *
3367     * So clean up any potentially set sample barycentric mode when not in
3368     * per-sample dispatch.
3369     */
3370    if (prog_data->persample_dispatch == BRW_NEVER) {
3371       prog_data->barycentric_interp_modes &=
3372          ~BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE);
3373    }
3374 
3375    prog_data->uses_nonperspective_interp_modes |=
3376       (prog_data->barycentric_interp_modes &
3377       BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0;
3378 
3379    /* The current VK_EXT_graphics_pipeline_library specification requires
3380     * coarse to be specified at compile time. But per-sample interpolation can be
3381     * dynamic. So we should never be in a situation where coarse &
3382     * persample_interp are both respectively true & BRW_ALWAYS.
3383     *
3384     * Coarse will be dynamically turned off when persample_interp is active.
3385     */
3386    assert(!key->coarse_pixel || key->persample_interp != BRW_ALWAYS);
3387 
3388    prog_data->coarse_pixel_dispatch =
3389       brw_sometimes_invert(prog_data->persample_dispatch);
3390    if (!key->coarse_pixel ||
3391        prog_data->uses_omask ||
3392        prog_data->sample_shading ||
3393        prog_data->uses_sample_mask ||
3394        (prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) ||
3395        prog_data->computed_stencil) {
3396       prog_data->coarse_pixel_dispatch = BRW_NEVER;
3397    }
3398 
3399    /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater,
3400     * Message Descriptor :
3401     *
3402     *    "Message Type. Specifies the type of message being sent when
3403     *     pixel-rate evaluation is requested :
3404     *
3405     *     Format = U2
3406     *       0: Per Message Offset (eval_snapped with immediate offset)
3407     *       1: Sample Position Offset (eval_sindex)
3408     *       2: Centroid Position Offset (eval_centroid)
3409     *       3: Per Slot Offset (eval_snapped with register offset)
3410     *
3411     *     Message Type. Specifies the type of message being sent when
3412     *     coarse-rate evaluation is requested :
3413     *
3414     *     Format = U2
3415     *       0: Coarse to Pixel Mapping Message (internal message)
3416     *       1: Reserved
3417     *       2: Coarse Centroid Position (eval_centroid)
3418     *       3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)"
3419     *
3420     * The Sample Position Offset is marked as reserved for coarse rate
3421     * evaluation and leads to hangs if we try to use it. So disable coarse
3422     * pixel shading if we have any intrinsic that will result in a pixel
3423     * interpolater message at sample.
3424     */
3425    if (intel_nir_pulls_at_sample(shader))
3426       prog_data->coarse_pixel_dispatch = BRW_NEVER;
3427 
3428    /* We choose to always enable VMask prior to XeHP, as disabling it would
3429     * cause us to lose out on the eliminate_find_live_channel() optimization.
3430     */
3431    prog_data->uses_vmask = devinfo->verx10 < 125 ||
3432                            shader->info.fs.needs_quad_helper_invocations ||
3433                            shader->info.uses_wide_subgroup_intrinsics ||
3434                            prog_data->coarse_pixel_dispatch != BRW_NEVER;
3435 
3436    prog_data->uses_src_w =
3437       BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
3438    prog_data->uses_src_depth =
3439       BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
3440       prog_data->coarse_pixel_dispatch != BRW_ALWAYS;
3441    prog_data->uses_depth_w_coefficients =
3442       BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
3443       prog_data->coarse_pixel_dispatch != BRW_NEVER;
3444 
3445    calculate_urb_setup(devinfo, key, prog_data, shader, mue_map);
3446    brw_compute_flat_inputs(prog_data, shader);
3447 }
3448 
3449 /**
3450  * Pre-gfx6, the register file of the EUs was shared between threads,
3451  * and each thread used some subset allocated on a 16-register block
3452  * granularity.  The unit states wanted these block counts.
3453  */
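/* Illustrative example: a thread using 24 registers rounds up to 32, i.e. two
 * 16-register blocks, which this helper encodes as 1.
 */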
3454 static inline int
3455 brw_register_blocks(int reg_count)
3456 {
3457    return ALIGN(reg_count, 16) / 16 - 1;
3458 }
3459 
3460 const unsigned *
3461 brw_compile_fs(const struct brw_compiler *compiler,
3462                struct brw_compile_fs_params *params)
3463 {
3464    struct nir_shader *nir = params->base.nir;
3465    const struct brw_wm_prog_key *key = params->key;
3466    struct brw_wm_prog_data *prog_data = params->prog_data;
3467    bool allow_spilling = params->allow_spilling;
3468    const bool debug_enabled =
3469       brw_should_print_shader(nir, params->base.debug_flag ?
3470                                    params->base.debug_flag : DEBUG_WM);
3471 
3472    prog_data->base.stage = MESA_SHADER_FRAGMENT;
3473    prog_data->base.ray_queries = nir->info.ray_queries;
3474    prog_data->base.total_scratch = 0;
3475 
3476    const struct intel_device_info *devinfo = compiler->devinfo;
3477    const unsigned max_subgroup_size = 32;
3478 
3479    brw_nir_apply_key(nir, compiler, &key->base, max_subgroup_size);
3480    brw_nir_lower_fs_inputs(nir, devinfo, key);
3481    brw_nir_lower_fs_outputs(nir);
3482 
3483    /* From the SKL PRM, Volume 7, "Alpha Coverage":
3484     *  "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in
3485     *   hardware, regardless of the state setting for this feature."
3486     */
3487    if (key->alpha_to_coverage != BRW_NEVER) {
3488       /* Run constant fold optimization in order to get the correct source
3489        * offset to determine render target 0 store instruction in
3490        * emit_alpha_to_coverage pass.
3491        */
3492       NIR_PASS(_, nir, nir_opt_constant_folding);
3493       NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage, key, prog_data);
3494    }
3495 
3496    NIR_PASS(_, nir, brw_nir_move_interpolation_to_top);
3497    brw_postprocess_nir(nir, compiler, debug_enabled,
3498                        key->base.robust_flags);
3499 
3500    brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data,
3501                                  params->mue_map);
3502 
3503    std::unique_ptr<fs_visitor> v8, v16, v32, vmulti;
3504    cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL,
3505       *multi_cfg = NULL;
3506    float throughput = 0;
3507    bool has_spilled = false;
3508 
3509    if (devinfo->ver < 20) {
3510       v8 = std::make_unique<fs_visitor>(compiler, &params->base, key,
3511                                         prog_data, nir, 8, 1,
3512                                         params->base.stats != NULL,
3513                                         debug_enabled);
3514       if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) {
3515          params->base.error_str = ralloc_strdup(params->base.mem_ctx,
3516                                                 v8->fail_msg);
3517          return NULL;
3518       } else if (INTEL_SIMD(FS, 8)) {
3519          simd8_cfg = v8->cfg;
3520 
3521          assert(v8->payload().num_regs % reg_unit(devinfo) == 0);
3522          prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs / reg_unit(devinfo);
3523 
3524          prog_data->reg_blocks_8 = brw_register_blocks(v8->grf_used);
3525          const performance &perf = v8->performance_analysis.require();
3526          throughput = MAX2(throughput, perf.throughput);
3527          has_spilled = v8->spilled_any_registers;
3528          allow_spilling = false;
3529       }
3530    }
3531 
3532    if (key->coarse_pixel && devinfo->ver < 20) {
3533       if (prog_data->dual_src_blend) {
3534          v8->limit_dispatch_width(8, "SIMD16 coarse pixel shading cannot"
3535                                   " use SIMD8 messages.\n");
3536       }
3537       v8->limit_dispatch_width(16, "SIMD32 not supported with coarse"
3538                                " pixel shading.\n");
3539    }
3540 
3541    if (nir->info.ray_queries > 0 && v8)
3542       v8->limit_dispatch_width(16, "SIMD32 with ray queries.\n");
3543 
3544    if (!has_spilled &&
3545        (!v8 || v8->max_dispatch_width >= 16) &&
3546        (INTEL_SIMD(FS, 16) || params->use_rep_send)) {
3547       /* Try a SIMD16 compile */
3548       v16 = std::make_unique<fs_visitor>(compiler, &params->base, key,
3549                                          prog_data, nir, 16, 1,
3550                                          params->base.stats != NULL,
3551                                          debug_enabled);
3552       if (v8)
3553          v16->import_uniforms(v8.get());
3554       if (!v16->run_fs(allow_spilling, params->use_rep_send)) {
3555          brw_shader_perf_log(compiler, params->base.log_data,
3556                              "SIMD16 shader failed to compile: %s\n",
3557                              v16->fail_msg);
3558       } else {
3559          simd16_cfg = v16->cfg;
3560 
3561          assert(v16->payload().num_regs % reg_unit(devinfo) == 0);
3562          prog_data->dispatch_grf_start_reg_16 = v16->payload().num_regs / reg_unit(devinfo);
3563 
3564          prog_data->reg_blocks_16 = brw_register_blocks(v16->grf_used);
3565          const performance &perf = v16->performance_analysis.require();
3566          throughput = MAX2(throughput, perf.throughput);
3567          has_spilled = v16->spilled_any_registers;
3568          allow_spilling = false;
3569       }
3570    }
3571 
3572    const bool simd16_failed = v16 && !simd16_cfg;
3573 
3574    /* Currently, the compiler only supports SIMD32 on SNB+ */
3575    if (!has_spilled &&
3576        (!v8 || v8->max_dispatch_width >= 32) &&
3577        (!v16 || v16->max_dispatch_width >= 32) && !params->use_rep_send &&
3578        !simd16_failed &&
3579        INTEL_SIMD(FS, 32)) {
3580       /* Try a SIMD32 compile */
3581       v32 = std::make_unique<fs_visitor>(compiler, &params->base, key,
3582                                          prog_data, nir, 32, 1,
3583                                          params->base.stats != NULL,
3584                                          debug_enabled);
3585       if (v8)
3586          v32->import_uniforms(v8.get());
3587       else if (v16)
3588          v32->import_uniforms(v16.get());
3589 
3590       if (!v32->run_fs(allow_spilling, false)) {
3591          brw_shader_perf_log(compiler, params->base.log_data,
3592                              "SIMD32 shader failed to compile: %s\n",
3593                              v32->fail_msg);
3594       } else {
3595          const performance &perf = v32->performance_analysis.require();
3596 
3597          if (!INTEL_DEBUG(DEBUG_DO32) && throughput >= perf.throughput) {
3598             brw_shader_perf_log(compiler, params->base.log_data,
3599                                 "SIMD32 shader inefficient\n");
3600          } else {
3601             simd32_cfg = v32->cfg;
3602 
3603             assert(v32->payload().num_regs % reg_unit(devinfo) == 0);
3604             prog_data->dispatch_grf_start_reg_32 = v32->payload().num_regs / reg_unit(devinfo);
3605 
3606             prog_data->reg_blocks_32 = brw_register_blocks(v32->grf_used);
3607             throughput = MAX2(throughput, perf.throughput);
3608          }
3609       }
3610    }
3611 
3612    if (devinfo->ver >= 12 && !has_spilled &&
3613        params->max_polygons >= 2 && !key->coarse_pixel) {
3614       fs_visitor *vbase = v8 ? v8.get() : v16 ? v16.get() : v32.get();
3615       assert(vbase);
3616 
3617       if (devinfo->ver >= 20 &&
3618           params->max_polygons >= 4 &&
3619           vbase->max_dispatch_width >= 32 &&
3620           4 * prog_data->num_varying_inputs <= MAX_VARYING &&
3621           INTEL_SIMD(FS, 4X8)) {
3622          /* Try a quad-SIMD8 compile */
3623          vmulti = std::make_unique<fs_visitor>(compiler, &params->base, key,
3624                                                prog_data, nir, 32, 4,
3625                                                params->base.stats != NULL,
3626                                                debug_enabled);
3627          vmulti->import_uniforms(vbase);
3628          if (!vmulti->run_fs(false, params->use_rep_send)) {
3629             brw_shader_perf_log(compiler, params->base.log_data,
3630                                 "Quad-SIMD8 shader failed to compile: %s\n",
3631                                 vmulti->fail_msg);
3632          } else {
3633             multi_cfg = vmulti->cfg;
3634             assert(!vmulti->spilled_any_registers);
3635          }
3636       }
3637 
3638       if (!multi_cfg && devinfo->ver >= 20 &&
3639           vbase->max_dispatch_width >= 32 &&
3640           2 * prog_data->num_varying_inputs <= MAX_VARYING &&
3641           INTEL_SIMD(FS, 2X16)) {
3642          /* Try a dual-SIMD16 compile */
3643          vmulti = std::make_unique<fs_visitor>(compiler, &params->base, key,
3644                                                prog_data, nir, 32, 2,
3645                                                params->base.stats != NULL,
3646                                                debug_enabled);
3647          vmulti->import_uniforms(vbase);
3648          if (!vmulti->run_fs(false, params->use_rep_send)) {
3649             brw_shader_perf_log(compiler, params->base.log_data,
3650                                 "Dual-SIMD16 shader failed to compile: %s\n",
3651                                 vmulti->fail_msg);
3652          } else {
3653             multi_cfg = vmulti->cfg;
3654             assert(!vmulti->spilled_any_registers);
3655          }
3656       }
3657 
3658       if (!multi_cfg && vbase->max_dispatch_width >= 16 &&
3659           2 * prog_data->num_varying_inputs <= MAX_VARYING &&
3660           INTEL_SIMD(FS, 2X8)) {
3661          /* Try a dual-SIMD8 compile */
3662          vmulti = std::make_unique<fs_visitor>(compiler, &params->base, key,
3663                                                prog_data, nir, 16, 2,
3664                                                params->base.stats != NULL,
3665                                                debug_enabled);
3666          vmulti->import_uniforms(vbase);
3667          if (!vmulti->run_fs(allow_spilling, params->use_rep_send)) {
3668             brw_shader_perf_log(compiler, params->base.log_data,
3669                                 "Dual-SIMD8 shader failed to compile: %s\n",
3670                                 vmulti->fail_msg);
3671          } else {
3672             multi_cfg = vmulti->cfg;
3673          }
3674       }
3675 
3676       if (multi_cfg) {
3677          assert(vmulti->payload().num_regs % reg_unit(devinfo) == 0);
3678          prog_data->base.dispatch_grf_start_reg = vmulti->payload().num_regs / reg_unit(devinfo);
3679 
3680          prog_data->reg_blocks_8 = brw_register_blocks(vmulti->grf_used);
3681       }
3682    }
3683 
3684    /* When the caller requests a repclear shader, they want SIMD16-only */
3685    if (params->use_rep_send)
3686       simd8_cfg = NULL;
3687 
3688    fs_generator g(compiler, &params->base, &prog_data->base,
3689                   MESA_SHADER_FRAGMENT);
3690 
3691    if (unlikely(debug_enabled)) {
3692       g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
3693                                      "%s fragment shader %s",
3694                                      nir->info.label ?
3695                                         nir->info.label : "unnamed",
3696                                      nir->info.name));
3697    }
3698 
3699    struct brw_compile_stats *stats = params->base.stats;
3700    uint32_t max_dispatch_width = 0;
3701 
3702    if (multi_cfg) {
3703       prog_data->dispatch_multi = vmulti->dispatch_width;
3704       prog_data->max_polygons = vmulti->max_polygons;
3705       g.generate_code(multi_cfg, vmulti->dispatch_width, vmulti->shader_stats,
3706                       vmulti->performance_analysis.require(),
3707                       stats, vmulti->max_polygons);
3708       stats = stats ? stats + 1 : NULL;
3709       max_dispatch_width = vmulti->dispatch_width;
3710 
3711    } else if (simd8_cfg) {
3712       prog_data->dispatch_8 = true;
3713       g.generate_code(simd8_cfg, 8, v8->shader_stats,
3714                       v8->performance_analysis.require(), stats, 1);
3715       stats = stats ? stats + 1 : NULL;
3716       max_dispatch_width = 8;
3717    }
3718 
3719    if (simd16_cfg) {
3720       prog_data->dispatch_16 = true;
3721       prog_data->prog_offset_16 = g.generate_code(
3722          simd16_cfg, 16, v16->shader_stats,
3723          v16->performance_analysis.require(), stats, 1);
3724       stats = stats ? stats + 1 : NULL;
3725       max_dispatch_width = 16;
3726    }
3727 
3728    if (simd32_cfg) {
3729       prog_data->dispatch_32 = true;
3730       prog_data->prog_offset_32 = g.generate_code(
3731          simd32_cfg, 32, v32->shader_stats,
3732          v32->performance_analysis.require(), stats, 1);
3733       stats = stats ? stats + 1 : NULL;
3734       max_dispatch_width = 32;
3735    }
3736 
3737    for (struct brw_compile_stats *s = params->base.stats; s != NULL && s != stats; s++)
3738       s->max_dispatch_width = max_dispatch_width;
3739 
3740    g.add_const_data(nir->constant_data, nir->constant_data_size);
3741    return g.get_assembly();
3742 }
3743 
3744 unsigned
3745 brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
3746                              unsigned threads)
3747 {
3748    assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
3749    assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
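   /* Illustrative example: 192 bytes of cross-thread constants plus 64 bytes
    * of per-thread constants dispatched across 8 threads comes to
    * 192 + 64 * 8 = 704 bytes.
    */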
3750    return cs_prog_data->push.per_thread.size * threads +
3751           cs_prog_data->push.cross_thread.size;
3752 }
3753 
3754 static void
3755 fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords)
3756 {
3757    block->dwords = dwords;
3758    block->regs = DIV_ROUND_UP(dwords, 8);
3759    block->size = block->regs * 32;
3760 }
3761 
3762 static void
3763 cs_fill_push_const_info(const struct intel_device_info *devinfo,
3764                         struct brw_cs_prog_data *cs_prog_data)
3765 {
3766    const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
3767    int subgroup_id_index = brw_get_subgroup_id_param_index(devinfo, prog_data);
3768 
3769    /* The thread ID should be stored in the last param dword */
3770    assert(subgroup_id_index == -1 ||
3771           subgroup_id_index == (int)prog_data->nr_params - 1);
3772 
3773    unsigned cross_thread_dwords, per_thread_dwords;
3774    if (subgroup_id_index >= 0) {
3775       /* Fill all but the last register with cross-thread payload */
3776       cross_thread_dwords = 8 * (subgroup_id_index / 8);
3777       per_thread_dwords = prog_data->nr_params - cross_thread_dwords;
3778       assert(per_thread_dwords > 0 && per_thread_dwords <= 8);
3779    } else {
3780       /* Fill all data using cross-thread payload */
3781       cross_thread_dwords = prog_data->nr_params;
3782       per_thread_dwords = 0u;
3783    }
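   /* Illustrative example: with nr_params == 18 and the subgroup ID stored in
    * param 17, the first 16 dwords become cross-thread payload and the
    * remaining 2 dwords are repeated per thread.
    */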
3784 
3785    fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);
3786    fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);
3787 
3788    assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
3789           cs_prog_data->push.per_thread.size == 0);
3790    assert(cs_prog_data->push.cross_thread.dwords +
3791           cs_prog_data->push.per_thread.dwords ==
3792              prog_data->nr_params);
3793 }
3794 
3795 static bool
3796 filter_simd(const nir_instr *instr, const void * /* options */)
3797 {
3798    if (instr->type != nir_instr_type_intrinsic)
3799       return false;
3800 
3801    switch (nir_instr_as_intrinsic(instr)->intrinsic) {
3802    case nir_intrinsic_load_simd_width_intel:
3803    case nir_intrinsic_load_subgroup_id:
3804       return true;
3805 
3806    default:
3807       return false;
3808    }
3809 }
3810 
3811 static nir_def *
3812 lower_simd(nir_builder *b, nir_instr *instr, void *options)
3813 {
3814    uintptr_t simd_width = (uintptr_t)options;
3815 
3816    switch (nir_instr_as_intrinsic(instr)->intrinsic) {
3817    case nir_intrinsic_load_simd_width_intel:
3818       return nir_imm_int(b, simd_width);
3819 
3820    case nir_intrinsic_load_subgroup_id:
3821       /* If the whole workgroup fits in one thread, we can lower subgroup_id
3822        * to a constant zero.
3823        */
3824       if (!b->shader->info.workgroup_size_variable) {
3825          unsigned local_workgroup_size = b->shader->info.workgroup_size[0] *
3826                                          b->shader->info.workgroup_size[1] *
3827                                          b->shader->info.workgroup_size[2];
3828          if (local_workgroup_size <= simd_width)
3829             return nir_imm_int(b, 0);
3830       }
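      /* Illustrative example: an 8x2x1 workgroup compiled at SIMD16 fits in a
       * single subgroup, so load_subgroup_id folds to 0 above, while a 16x2x1
       * workgroup at SIMD16 needs two subgroups and is left for the backend.
       */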
3831       return NULL;
3832 
3833    default:
3834       return NULL;
3835    }
3836 }
3837 
3838 bool
3839 brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)
3840 {
3841    return nir_shader_lower_instructions(nir, filter_simd, lower_simd,
3842                                  (void *)(uintptr_t)dispatch_width);
3843 }
3844 
3845 const unsigned *
3846 brw_compile_cs(const struct brw_compiler *compiler,
3847                struct brw_compile_cs_params *params)
3848 {
3849    const nir_shader *nir = params->base.nir;
3850    const struct brw_cs_prog_key *key = params->key;
3851    struct brw_cs_prog_data *prog_data = params->prog_data;
3852 
3853    const bool debug_enabled =
3854       brw_should_print_shader(nir, params->base.debug_flag ?
3855                                    params->base.debug_flag : DEBUG_CS);
3856 
3857    prog_data->base.stage = MESA_SHADER_COMPUTE;
3858    prog_data->base.total_shared = nir->info.shared_size;
3859    prog_data->base.ray_queries = nir->info.ray_queries;
3860    prog_data->base.total_scratch = 0;
3861 
3862    if (!nir->info.workgroup_size_variable) {
3863       prog_data->local_size[0] = nir->info.workgroup_size[0];
3864       prog_data->local_size[1] = nir->info.workgroup_size[1];
3865       prog_data->local_size[2] = nir->info.workgroup_size[2];
3866    }
3867 
3868    brw_simd_selection_state simd_state{
3869       .devinfo = compiler->devinfo,
3870       .prog_data = prog_data,
3871       .required_width = brw_required_dispatch_width(&nir->info),
3872    };
3873 
3874    std::unique_ptr<fs_visitor> v[3];
3875 
3876    for (unsigned simd = 0; simd < 3; simd++) {
3877       if (!brw_simd_should_compile(simd_state, simd))
3878          continue;
3879 
3880       const unsigned dispatch_width = 8u << simd;
3881 
3882       nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
3883       brw_nir_apply_key(shader, compiler, &key->base,
3884                         dispatch_width);
3885 
3886       NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);
3887 
3888       /* Clean up after the local index and ID calculations. */
3889       NIR_PASS(_, shader, nir_opt_constant_folding);
3890       NIR_PASS(_, shader, nir_opt_dce);
3891 
3892       brw_postprocess_nir(shader, compiler, debug_enabled,
3893                           key->base.robust_flags);
3894 
3895       v[simd] = std::make_unique<fs_visitor>(compiler, &params->base,
3896                                              &key->base,
3897                                              &prog_data->base,
3898                                              shader, dispatch_width,
3899                                              params->base.stats != NULL,
3900                                              debug_enabled);
3901 
3902       const int first = brw_simd_first_compiled(simd_state);
3903       if (first >= 0)
3904          v[simd]->import_uniforms(v[first].get());
3905 
3906       const bool allow_spilling = first < 0 || nir->info.workgroup_size_variable;
3907 
3908       if (v[simd]->run_cs(allow_spilling)) {
3909          cs_fill_push_const_info(compiler->devinfo, prog_data);
3910 
3911          brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
3912       } else {
3913          simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
3914          if (simd > 0) {
3915             brw_shader_perf_log(compiler, params->base.log_data,
3916                                 "SIMD%u shader failed to compile: %s\n",
3917                                 dispatch_width, v[simd]->fail_msg);
3918          }
3919       }
3920    }
3921 
3922    const int selected_simd = brw_simd_select(simd_state);
3923    if (selected_simd < 0) {
3924       params->base.error_str =
3925          ralloc_asprintf(params->base.mem_ctx,
3926                          "Can't compile shader: "
3927                          "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
3928                          simd_state.error[0], simd_state.error[1],
3929                          simd_state.error[2]);
3930       return NULL;
3931    }
3932 
3933    assert(selected_simd < 3);
3934 
3935    if (!nir->info.workgroup_size_variable)
3936       prog_data->prog_mask = 1 << selected_simd;
3937 
3938    fs_generator g(compiler, &params->base, &prog_data->base,
3939                   MESA_SHADER_COMPUTE);
3940    if (unlikely(debug_enabled)) {
3941       char *name = ralloc_asprintf(params->base.mem_ctx,
3942                                    "%s compute shader %s",
3943                                    nir->info.label ?
3944                                    nir->info.label : "unnamed",
3945                                    nir->info.name);
3946       g.enable_debug(name);
3947    }
3948 
3949    uint32_t max_dispatch_width = 8u << (util_last_bit(prog_data->prog_mask) - 1);
3950 
3951    struct brw_compile_stats *stats = params->base.stats;
3952    for (unsigned simd = 0; simd < 3; simd++) {
3953       if (prog_data->prog_mask & (1u << simd)) {
3954          assert(v[simd]);
3955          prog_data->prog_offset[simd] =
3956             g.generate_code(v[simd]->cfg, 8u << simd, v[simd]->shader_stats,
3957                             v[simd]->performance_analysis.require(), stats);
3958          if (stats)
3959             stats->max_dispatch_width = max_dispatch_width;
3960          stats = stats ? stats + 1 : NULL;
3961          max_dispatch_width = 8u << simd;
3962       }
3963    }
3964 
3965    g.add_const_data(nir->constant_data, nir->constant_data_size);
3966 
3967    return g.get_assembly();
3968 }
3969 
3970 struct intel_cs_dispatch_info
3971 brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
3972                          const struct brw_cs_prog_data *prog_data,
3973                          const unsigned *override_local_size)
3974 {
3975    struct intel_cs_dispatch_info info = {};
3976 
3977    const unsigned *sizes =
3978       override_local_size ? override_local_size :
3979                             prog_data->local_size;
3980 
3981    const int simd = brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
3982    assert(simd >= 0 && simd < 3);
3983 
3984    info.group_size = sizes[0] * sizes[1] * sizes[2];
3985    info.simd_size = 8u << simd;
3986    info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);
3987 
3988    const uint32_t remainder = info.group_size & (info.simd_size - 1);
3989    if (remainder > 0)
3990       info.right_mask = ~0u >> (32 - remainder);
3991    else
3992       info.right_mask = ~0u >> (32 - info.simd_size);
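   /* Illustrative example: a 7x3x1 workgroup (21 invocations) at SIMD8 needs
    * DIV_ROUND_UP(21, 8) = 3 threads; the last thread covers the remaining
    * 21 & 7 = 5 invocations, so right_mask is 0x1f.
    */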
3993 
3994    return info;
3995 }
3996 
3997 static uint8_t
3998 compile_single_bs(const struct brw_compiler *compiler,
3999                   struct brw_compile_bs_params *params,
4000                   const struct brw_bs_prog_key *key,
4001                   struct brw_bs_prog_data *prog_data,
4002                   nir_shader *shader,
4003                   fs_generator *g,
4004                   struct brw_compile_stats *stats,
4005                   int *prog_offset)
4006 {
4007    const bool debug_enabled = brw_should_print_shader(shader, DEBUG_RT);
4008 
4009    prog_data->base.stage = shader->info.stage;
4010    prog_data->max_stack_size = MAX2(prog_data->max_stack_size,
4011                                     shader->scratch_size);
4012 
4013    const unsigned max_dispatch_width = 16;
4014    brw_nir_apply_key(shader, compiler, &key->base, max_dispatch_width);
4015    brw_postprocess_nir(shader, compiler, debug_enabled,
4016                        key->base.robust_flags);
4017 
4018    brw_simd_selection_state simd_state{
4019       .devinfo = compiler->devinfo,
4020       .prog_data = prog_data,
4021 
4022       /* Since divergence is a lot more likely in RT than compute, it makes
4023        * sense to limit ourselves to the smallest available SIMD for now.
4024        */
4025       .required_width = compiler->devinfo->ver >= 20 ? 16u : 8u,
4026    };
4027 
4028    std::unique_ptr<fs_visitor> v[2];
4029 
4030    for (unsigned simd = 0; simd < ARRAY_SIZE(v); simd++) {
4031       if (!brw_simd_should_compile(simd_state, simd))
4032          continue;
4033 
4034       const unsigned dispatch_width = 8u << simd;
4035 
4036       if (dispatch_width == 8 && compiler->devinfo->ver >= 20)
4037          continue;
4038 
4039       v[simd] = std::make_unique<fs_visitor>(compiler, &params->base,
4040                                              &key->base,
4041                                              &prog_data->base, shader,
4042                                              dispatch_width,
4043                                              stats != NULL,
4044                                              debug_enabled);
4045 
4046       const bool allow_spilling = !brw_simd_any_compiled(simd_state);
4047       if (v[simd]->run_bs(allow_spilling)) {
4048          brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
4049       } else {
4050          simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx,
4051                                                 v[simd]->fail_msg);
4052          if (simd > 0) {
4053             brw_shader_perf_log(compiler, params->base.log_data,
4054                                 "SIMD%u shader failed to compile: %s",
4055                                 dispatch_width, v[simd]->fail_msg);
4056          }
4057       }
4058    }
4059 
4060    const int selected_simd = brw_simd_select(simd_state);
4061    if (selected_simd < 0) {
4062       params->base.error_str =
4063          ralloc_asprintf(params->base.mem_ctx,
4064                          "Can't compile shader: "
4065                          "SIMD8 '%s' and SIMD16 '%s'.\n",
4066                          simd_state.error[0], simd_state.error[1]);
4067       return 0;
4068    }
4069 
4070    assert(selected_simd < int(ARRAY_SIZE(v)));
4071    fs_visitor *selected = v[selected_simd].get();
4072    assert(selected);
4073 
4074    const unsigned dispatch_width = selected->dispatch_width;
4075 
4076    int offset = g->generate_code(selected->cfg, dispatch_width, selected->shader_stats,
4077                                  selected->performance_analysis.require(), stats);
4078    if (prog_offset)
4079       *prog_offset = offset;
4080    else
4081       assert(offset == 0);
4082 
4083    return dispatch_width;
4084 }
4085 
4086 uint64_t
4087 brw_bsr(const struct intel_device_info *devinfo,
4088         uint32_t offset, uint8_t simd_size, uint8_t local_arg_offset)
4089 {
4090    assert(offset % 64 == 0);
4091    assert(simd_size == 8 || simd_size == 16);
4092    assert(local_arg_offset % 8 == 0);
4093 
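   /* Illustrative example: offset 0x80 with SIMD8 and an 8-byte local
    * argument offset packs to 0x80 | (1 << 4) | 1 == 0x91.
    */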
4094    return offset |
4095           SET_BITS(simd_size == 8, 4, 4) |
4096           SET_BITS(local_arg_offset / 8, 2, 0);
4097 }
4098 
4099 const unsigned *
4100 brw_compile_bs(const struct brw_compiler *compiler,
4101                struct brw_compile_bs_params *params)
4102 {
4103    nir_shader *shader = params->base.nir;
4104    struct brw_bs_prog_data *prog_data = params->prog_data;
4105    unsigned num_resume_shaders = params->num_resume_shaders;
4106    nir_shader **resume_shaders = params->resume_shaders;
4107    const bool debug_enabled = brw_should_print_shader(shader, DEBUG_RT);
4108 
4109    prog_data->base.stage = shader->info.stage;
4110    prog_data->base.ray_queries = shader->info.ray_queries;
4111    prog_data->base.total_scratch = 0;
4112 
4113    prog_data->max_stack_size = 0;
4114    prog_data->num_resume_shaders = num_resume_shaders;
4115 
4116    fs_generator g(compiler, &params->base, &prog_data->base,
4117                   shader->info.stage);
4118    if (unlikely(debug_enabled)) {
4119       char *name = ralloc_asprintf(params->base.mem_ctx,
4120                                    "%s %s shader %s",
4121                                    shader->info.label ?
4122                                       shader->info.label : "unnamed",
4123                                    gl_shader_stage_name(shader->info.stage),
4124                                    shader->info.name);
4125       g.enable_debug(name);
4126    }
4127 
4128    prog_data->simd_size =
4129       compile_single_bs(compiler, params, params->key, prog_data,
4130                         shader, &g, params->base.stats, NULL);
4131    if (prog_data->simd_size == 0)
4132       return NULL;
4133 
4134    uint64_t *resume_sbt = ralloc_array(params->base.mem_ctx,
4135                                        uint64_t, num_resume_shaders);
4136    for (unsigned i = 0; i < num_resume_shaders; i++) {
4137       if (INTEL_DEBUG(DEBUG_RT)) {
4138          char *name = ralloc_asprintf(params->base.mem_ctx,
4139                                       "%s %s resume(%u) shader %s",
4140                                       shader->info.label ?
4141                                          shader->info.label : "unnamed",
4142                                       gl_shader_stage_name(shader->info.stage),
4143                                       i, shader->info.name);
4144          g.enable_debug(name);
4145       }
4146 
4147       /* TODO: Figure out shader stats etc. for resume shaders */
4148       int offset = 0;
4149       uint8_t simd_size =
4150          compile_single_bs(compiler, params, params->key,
4151                            prog_data, resume_shaders[i], &g, NULL, &offset);
4152       if (simd_size == 0)
4153          return NULL;
4154 
4155       assert(offset > 0);
4156       resume_sbt[i] = brw_bsr(compiler->devinfo, offset, simd_size, 0);
4157    }
4158 
4159    /* We only have one constant data section, so we want to make sure it is
4160     * the same across the main shader and all the resume shaders.
4161     */
4162    for (unsigned i = 0; i < num_resume_shaders; i++) {
4163       assert(resume_shaders[i]->constant_data_size ==
4164              shader->constant_data_size);
4165       assert(memcmp(resume_shaders[i]->constant_data,
4166                     shader->constant_data,
4167                     shader->constant_data_size) == 0);
4168    }
4169 
4170    g.add_const_data(shader->constant_data, shader->constant_data_size);
4171    g.add_resume_sbt(num_resume_shaders, resume_sbt);
4172 
4173    return g.get_assembly();
4174 }
4175 
4176 /**
4177  * Test the dispatch mask packing assumptions of
4178  * brw_stage_has_packed_dispatch().  Call this from e.g. the top of
4179  * fs_visitor::emit_nir_code() to cause a GPU hang if any shader invocation is
4180  * executed with an unexpected dispatch mask.
4181  */
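/* Illustrative note: a packed dispatch mask such as 0x00ff satisfies
 * (mask + 1) & mask == 0, so the loop below terminates immediately, while a
 * sparse mask such as 0x00f5 leaves tmp non-zero and spins forever.
 */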
4182 static UNUSED void
4183 brw_fs_test_dispatch_packing(const fs_builder &bld)
4184 {
4185    const fs_visitor *shader = static_cast<const fs_visitor *>(bld.shader);
4186    const gl_shader_stage stage = shader->stage;
4187    const bool uses_vmask =
4188       stage == MESA_SHADER_FRAGMENT &&
4189       brw_wm_prog_data(shader->stage_prog_data)->uses_vmask;
4190 
4191    if (brw_stage_has_packed_dispatch(shader->devinfo, stage,
4192                                      shader->max_polygons,
4193                                      shader->stage_prog_data)) {
4194       const fs_builder ubld = bld.exec_all().group(1, 0);
4195       const fs_reg tmp = component(bld.vgrf(BRW_REGISTER_TYPE_UD), 0);
4196       const fs_reg mask = uses_vmask ? brw_vmask_reg() : brw_dmask_reg();
4197 
4198       ubld.ADD(tmp, mask, brw_imm_ud(1));
4199       ubld.AND(tmp, mask, tmp);
4200 
4201       /* This will loop forever if the dispatch mask doesn't have the expected
4202        * form '2^n-1', in which case tmp will be non-zero.
4203        */
4204       bld.emit(BRW_OPCODE_DO);
4205       bld.CMP(bld.null_reg_ud(), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
4206       set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE));
4207    }
4208 }
4209 
4210 unsigned
4211 fs_visitor::workgroup_size() const
4212 {
4213    assert(gl_shader_stage_uses_workgroup(stage));
4214    const struct brw_cs_prog_data *cs = brw_cs_prog_data(prog_data);
4215    return cs->local_size[0] * cs->local_size[1] * cs->local_size[2];
4216 }
4217 
4218 bool brw_should_print_shader(const nir_shader *shader, uint64_t debug_flag)
4219 {
4220    return INTEL_DEBUG(debug_flag) && (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
4221 }
4222 
4223 namespace brw {
4224    fs_reg
4225    fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2],
4226                      brw_reg_type type, unsigned n)
4227    {
4228       if (!regs[0])
4229          return fs_reg();
4230 
4231       if (bld.dispatch_width() > 16) {
4232          const fs_reg tmp = bld.vgrf(type, n);
4233          const brw::fs_builder hbld = bld.exec_all().group(16, 0);
4234          const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
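         /* Illustrative example: at SIMD32 with n == 2, m == 2, so each
          * component is gathered from its two 16-wide halves located at
          * regs[0] and regs[1] and stitched into one VGRF by LOAD_PAYLOAD.
          */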
4235          fs_reg *const components = new fs_reg[m * n];
4236 
4237          for (unsigned c = 0; c < n; c++) {
4238             for (unsigned g = 0; g < m; g++)
4239                components[c * m + g] =
4240                   offset(retype(brw_vec8_grf(regs[g], 0), type), hbld, c);
4241          }
4242 
4243          hbld.LOAD_PAYLOAD(tmp, components, m * n, 0);
4244 
4245          delete[] components;
4246          return tmp;
4247 
4248       } else {
4249          return fs_reg(retype(brw_vec8_grf(regs[0], 0), type));
4250       }
4251    }
4252 
4253    fs_reg
4254    fetch_barycentric_reg(const brw::fs_builder &bld, uint8_t regs[2])
4255    {
4256       if (!regs[0])
4257          return fs_reg();
4258       else if (bld.shader->devinfo->ver >= 20)
4259          return fetch_payload_reg(bld, regs, BRW_REGISTER_TYPE_F, 2);
4260 
4261       const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
4262       const brw::fs_builder hbld = bld.exec_all().group(8, 0);
4263       const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
4264       fs_reg *const components = new fs_reg[2 * m];
4265 
4266       for (unsigned c = 0; c < 2; c++) {
4267          for (unsigned g = 0; g < m; g++)
4268             components[c * m + g] = offset(brw_vec8_grf(regs[g / 2], 0),
4269                                            hbld, c + 2 * (g % 2));
4270       }
4271 
4272       hbld.LOAD_PAYLOAD(tmp, components, 2 * m, 0);
4273 
4274       delete[] components;
4275       return tmp;
4276    }
4277 
4278    void
4279    check_dynamic_msaa_flag(const fs_builder &bld,
4280                            const struct brw_wm_prog_data *wm_prog_data,
4281                            enum intel_msaa_flags flag)
4282    {
4283       fs_inst *inst = bld.AND(bld.null_reg_ud(),
4284                               dynamic_msaa_flags(wm_prog_data),
4285                               brw_imm_ud(flag));
4286       inst->conditional_mod = BRW_CONDITIONAL_NZ;
4287    }
4288 }
4289