1 /*
2  * Copyright © 2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 /** @file elk_fs.cpp
25  *
26  * This file drives the GLSL IR -> LIR translation, contains the
27  * optimizations on the LIR, and drives the generation of native code
28  * from the LIR.
29  */
30 
31 #include "elk_eu.h"
32 #include "elk_fs.h"
33 #include "elk_fs_builder.h"
34 #include "elk_fs_live_variables.h"
35 #include "elk_nir.h"
36 #include "elk_vec4_gs_visitor.h"
37 #include "elk_cfg.h"
38 #include "elk_dead_control_flow.h"
39 #include "elk_private.h"
40 #include "../intel_nir.h"
41 #include "shader_enums.h"
42 #include "dev/intel_debug.h"
43 #include "dev/intel_wa.h"
44 #include "compiler/glsl_types.h"
45 #include "compiler/nir/nir_builder.h"
46 #include "util/u_math.h"
47 
48 #include <memory>
49 
50 using namespace elk;
51 
52 static unsigned get_lowered_simd_width(const elk_fs_visitor *shader,
53                                        const elk_fs_inst *inst);
54 
55 void
56 elk_fs_inst::init(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
57               const elk_fs_reg *src, unsigned sources)
58 {
59    memset((void*)this, 0, sizeof(*this));
60 
61    this->src = new elk_fs_reg[MAX2(sources, 3)];
62    for (unsigned i = 0; i < sources; i++)
63       this->src[i] = src[i];
64 
65    this->opcode = opcode;
66    this->dst = dst;
67    this->sources = sources;
68    this->exec_size = exec_size;
69    this->base_mrf = -1;
70 
71    assert(dst.file != IMM && dst.file != UNIFORM);
72 
73    assert(this->exec_size != 0);
74 
75    this->conditional_mod = ELK_CONDITIONAL_NONE;
76 
77    /* This will be the case for almost all instructions. */
78    switch (dst.file) {
79    case VGRF:
80    case ARF:
81    case FIXED_GRF:
82    case MRF:
83    case ATTR:
84       this->size_written = dst.component_size(exec_size);
85       break;
86    case BAD_FILE:
87       this->size_written = 0;
88       break;
89    case IMM:
90    case UNIFORM:
91       unreachable("Invalid destination register file");
92    }
93 
94    this->writes_accumulator = false;
95 }
96 
97 elk_fs_inst::elk_fs_inst()
98 {
99    init(ELK_OPCODE_NOP, 8, dst, NULL, 0);
100 }
101 
102 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size)
103 {
104    init(opcode, exec_size, reg_undef, NULL, 0);
105 }
106 
107 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst)
108 {
109    init(opcode, exec_size, dst, NULL, 0);
110 }
111 
112 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
113                  const elk_fs_reg &src0)
114 {
115    const elk_fs_reg src[1] = { src0 };
116    init(opcode, exec_size, dst, src, 1);
117 }
118 
119 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
120                  const elk_fs_reg &src0, const elk_fs_reg &src1)
121 {
122    const elk_fs_reg src[2] = { src0, src1 };
123    init(opcode, exec_size, dst, src, 2);
124 }
125 
126 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
127                  const elk_fs_reg &src0, const elk_fs_reg &src1, const elk_fs_reg &src2)
128 {
129    const elk_fs_reg src[3] = { src0, src1, src2 };
130    init(opcode, exec_size, dst, src, 3);
131 }
132 
133 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_width, const elk_fs_reg &dst,
134                  const elk_fs_reg src[], unsigned sources)
135 {
136    init(opcode, exec_width, dst, src, sources);
137 }
138 
139 elk_fs_inst::elk_fs_inst(const elk_fs_inst &that)
140 {
141    memcpy((void*)this, &that, sizeof(that));
142 
143    this->src = new elk_fs_reg[MAX2(that.sources, 3)];
144 
145    for (unsigned i = 0; i < that.sources; i++)
146       this->src[i] = that.src[i];
147 }
148 
149 elk_fs_inst::~elk_fs_inst()
150 {
151    delete[] this->src;
152 }
153 
154 void
155 elk_fs_inst::resize_sources(uint8_t num_sources)
156 {
157    if (this->sources != num_sources) {
158       elk_fs_reg *src = new elk_fs_reg[MAX2(num_sources, 3)];
159 
160       for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
161          src[i] = this->src[i];
162 
163       delete[] this->src;
164       this->src = src;
165       this->sources = num_sources;
166    }
167 }
168 
169 void
170 elk_fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
171                                        const elk_fs_reg &dst,
172                                        const elk_fs_reg &surface,
173                                        const elk_fs_reg &surface_handle,
174                                        const elk_fs_reg &varying_offset,
175                                        uint32_t const_offset,
176                                        uint8_t alignment,
177                                        unsigned components)
178 {
179    assert(components <= 4);
180 
181    /* We have our constant surface use a pitch of 4 bytes, so our index can
182     * be any component of a vector, and then we load 4 contiguous
183     * components starting from that.  TODO: Support loading fewer than 4.
184     */
185    elk_fs_reg total_offset = vgrf(glsl_uint_type());
186    bld.ADD(total_offset, varying_offset, elk_imm_ud(const_offset));
187 
188    /* The pull load message will load a vec4 (16 bytes). If we are loading
189     * a double this means we are only loading 2 elements worth of data.
190     * We also want to use a 32-bit data type for the dst of the load operation
191     * so other parts of the driver don't get confused about the size of the
192     * result.
193     */
194    elk_fs_reg vec4_result = bld.vgrf(ELK_REGISTER_TYPE_F, 4);
195 
196    elk_fs_reg srcs[PULL_VARYING_CONSTANT_SRCS];
197    srcs[PULL_VARYING_CONSTANT_SRC_SURFACE]        = surface;
198    srcs[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
199    srcs[PULL_VARYING_CONSTANT_SRC_OFFSET]         = total_offset;
200    srcs[PULL_VARYING_CONSTANT_SRC_ALIGNMENT]      = elk_imm_ud(alignment);
201 
202    elk_fs_inst *inst = bld.emit(ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
203                             vec4_result, srcs, PULL_VARYING_CONSTANT_SRCS);
204    inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
205 
206    elk_shuffle_from_32bit_read(bld, dst, vec4_result, 0, components);
207 }
208 
209 /**
210  * A helper for MOV generation for fixing up broken hardware SEND dependency
211  * handling.
212  */
213 void
214 elk_fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
215 {
216    /* The caller always wants an uncompressed instruction, to emit the minimal
217     * extra dependencies and to avoid having to deal with aligning its regs to 2.
218     */
219    const fs_builder ubld = bld.annotate("send dependency resolve")
220                               .quarter(0);
221 
222    ubld.MOV(ubld.null_reg_f(), elk_fs_reg(VGRF, grf, ELK_REGISTER_TYPE_F));
223 }
224 
225 bool
226 elk_fs_inst::is_send_from_grf() const
227 {
228    switch (opcode) {
229    case ELK_SHADER_OPCODE_SEND:
230    case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
231    case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
232    case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
233    case ELK_SHADER_OPCODE_INTERLOCK:
234    case ELK_SHADER_OPCODE_MEMORY_FENCE:
235    case ELK_SHADER_OPCODE_BARRIER:
236       return true;
237    case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
238       return src[1].file == VGRF;
239    case ELK_FS_OPCODE_FB_WRITE:
240    case ELK_FS_OPCODE_FB_READ:
241       return src[0].file == VGRF;
242    default:
243       return false;
244    }
245 }
246 
247 bool
248 elk_fs_inst::is_control_source(unsigned arg) const
249 {
250    switch (opcode) {
251    case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
252    case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
253       return arg == 0;
254 
255    case ELK_SHADER_OPCODE_BROADCAST:
256    case ELK_SHADER_OPCODE_SHUFFLE:
257    case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
258    case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
259    case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
260    case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
261       return arg == 1;
262 
263    case ELK_SHADER_OPCODE_MOV_INDIRECT:
264    case ELK_SHADER_OPCODE_CLUSTER_BROADCAST:
265    case ELK_SHADER_OPCODE_TEX:
266    case ELK_FS_OPCODE_TXB:
267    case ELK_SHADER_OPCODE_TXD:
268    case ELK_SHADER_OPCODE_TXF:
269    case ELK_SHADER_OPCODE_TXF_LZ:
270    case ELK_SHADER_OPCODE_TXF_CMS:
271    case ELK_SHADER_OPCODE_TXF_CMS_W:
272    case ELK_SHADER_OPCODE_TXF_UMS:
273    case ELK_SHADER_OPCODE_TXF_MCS:
274    case ELK_SHADER_OPCODE_TXL:
275    case ELK_SHADER_OPCODE_TXL_LZ:
276    case ELK_SHADER_OPCODE_TXS:
277    case ELK_SHADER_OPCODE_LOD:
278    case ELK_SHADER_OPCODE_TG4:
279    case ELK_SHADER_OPCODE_TG4_OFFSET:
280    case ELK_SHADER_OPCODE_SAMPLEINFO:
281       return arg == 1 || arg == 2;
282 
283    case ELK_SHADER_OPCODE_SEND:
284       return arg == 0 || arg == 1;
285 
286    default:
287       return false;
288    }
289 }
290 
291 bool
292 elk_fs_inst::is_payload(unsigned arg) const
293 {
294    switch (opcode) {
295    case ELK_FS_OPCODE_FB_WRITE:
296    case ELK_FS_OPCODE_FB_READ:
297    case ELK_VEC4_OPCODE_UNTYPED_ATOMIC:
298    case ELK_VEC4_OPCODE_UNTYPED_SURFACE_READ:
299    case ELK_VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
300    case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
301    case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
302    case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
303    case ELK_SHADER_OPCODE_INTERLOCK:
304    case ELK_SHADER_OPCODE_MEMORY_FENCE:
305    case ELK_SHADER_OPCODE_BARRIER:
306    case ELK_SHADER_OPCODE_TEX:
307    case ELK_FS_OPCODE_TXB:
308    case ELK_SHADER_OPCODE_TXD:
309    case ELK_SHADER_OPCODE_TXF:
310    case ELK_SHADER_OPCODE_TXF_LZ:
311    case ELK_SHADER_OPCODE_TXF_CMS:
312    case ELK_SHADER_OPCODE_TXF_CMS_W:
313    case ELK_SHADER_OPCODE_TXF_UMS:
314    case ELK_SHADER_OPCODE_TXF_MCS:
315    case ELK_SHADER_OPCODE_TXL:
316    case ELK_SHADER_OPCODE_TXL_LZ:
317    case ELK_SHADER_OPCODE_TXS:
318    case ELK_SHADER_OPCODE_LOD:
319    case ELK_SHADER_OPCODE_TG4:
320    case ELK_SHADER_OPCODE_TG4_OFFSET:
321    case ELK_SHADER_OPCODE_SAMPLEINFO:
322       return arg == 0;
323 
324    case ELK_SHADER_OPCODE_SEND:
325       return arg == 2 || arg == 3;
326 
327    default:
328       return false;
329    }
330 }
331 
332 /**
333  * Returns true if this instruction's sources and destinations cannot
334  * safely be the same register.
335  *
336  * In most cases, a register can be written over safely by the same
337  * instruction that is its last use.  For a single instruction, the
338  * sources are dereferenced before writing of the destination starts
339  * (naturally).
340  *
341  * However, there are a few cases where this can be problematic:
342  *
343  * - Virtual opcodes that translate to multiple instructions in the
344  *   code generator: if src == dst and one instruction writes the
345  *   destination before a later instruction reads the source, then
346  *   src will have been clobbered.
347  *
348  * - SIMD16 compressed instructions with certain regioning (see below).
349  *
350  * The register allocator uses this information to set up conflicts between
351  * GRF sources and the destination.
352  */
353 bool
354 elk_fs_inst::has_source_and_destination_hazard() const
355 {
356    switch (opcode) {
357    case ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT:
358       /* Multiple partial writes to the destination */
359       return true;
360    case ELK_SHADER_OPCODE_SHUFFLE:
361       /* This instruction returns an arbitrary channel from the source and
362        * gets split into smaller instructions in the generator.  It's possible
363        * that one of the instructions will read from a channel corresponding
364        * to an earlier instruction.
365        */
366    case ELK_SHADER_OPCODE_SEL_EXEC:
367       /* This is implemented as
368        *
369        * mov(16)      g4<1>D      0D            { align1 WE_all 1H };
370        * mov(16)      g4<1>D      g5<8,8,1>D    { align1 1H }
371        *
372        * Because the source is only read in the second instruction, the first
373        * may stomp all over it.
374        */
375       return true;
376    case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
377       switch (src[1].ud) {
378       case ELK_SWIZZLE_XXXX:
379       case ELK_SWIZZLE_YYYY:
380       case ELK_SWIZZLE_ZZZZ:
381       case ELK_SWIZZLE_WWWW:
382       case ELK_SWIZZLE_XXZZ:
383       case ELK_SWIZZLE_YYWW:
384       case ELK_SWIZZLE_XYXY:
385       case ELK_SWIZZLE_ZWZW:
386          /* These can be implemented as a single Align1 region on all
387           * platforms, so there's never a hazard between source and
388           * destination.  C.f. elk_fs_generator::generate_quad_swizzle().
389           */
390          return false;
391       default:
392          return !is_uniform(src[0]);
393       }
394    case ELK_OPCODE_DPAS:
395       /* This is overly conservative. The actual hazard is more complicated to
396        * describe. When the repeat count is N, the single instruction behaves
397        * like N instructions with a repeat count of one, but the destination
398        * and source registers are incremented (in somewhat complex ways) for
399        * each instruction.
400        *
401        * This means the source and destination registers are actually ranges of
402        * registers. The hazard exists if an earlier iteration would write a
403        * register that should be read by a later iteration.
404        *
405        * There may be some advantage to properly modeling this, but for now,
406        * be overly conservative.
407        */
408       return rcount > 1;
409    default:
410       /* The SIMD16 compressed instruction
411        *
412        * add(16)      g4<1>F      g4<8,8,1>F   g6<8,8,1>F
413        *
414        * is actually decoded in hardware as:
415        *
416        * add(8)       g4<1>F      g4<8,8,1>F   g6<8,8,1>F
417        * add(8)       g5<1>F      g5<8,8,1>F   g7<8,8,1>F
418        *
419        * Which is safe.  However, if we have uniform accesses
420        * happening, we get into trouble:
421        *
422        * add(8)       g4<1>F      g4<0,1,0>F   g6<8,8,1>F
423        * add(8)       g5<1>F      g4<0,1,0>F   g7<8,8,1>F
424        *
425        * Now our destination for the first instruction overwrote the
426        * second instruction's src0, and we get garbage for those 8
427        * pixels.  There's a similar issue for the pre-gfx6
428        * pixel_x/pixel_y, which are registers of 16-bit values and thus
429        * would get stomped by the first decode as well.
430        */
431       if (exec_size == 16) {
432          for (int i = 0; i < sources; i++) {
433             if (src[i].file == VGRF && (src[i].stride == 0 ||
434                                         src[i].type == ELK_REGISTER_TYPE_UW ||
435                                         src[i].type == ELK_REGISTER_TYPE_W ||
436                                         src[i].type == ELK_REGISTER_TYPE_UB ||
437                                         src[i].type == ELK_REGISTER_TYPE_B)) {
438                return true;
439             }
440          }
441       }
442       return false;
443    }
444 }
445 
446 bool
447 elk_fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
448 {
449    if (devinfo->ver == 6 && is_math())
450       return false;
451 
452    if (is_send_from_grf())
453       return false;
454 
455    /* From Wa_1604601757:
456     *
457     * "When multiplying a DW and any lower precision integer, source modifier
458     *  is not supported."
459     */
460    if (devinfo->ver >= 12 && (opcode == ELK_OPCODE_MUL ||
461                               opcode == ELK_OPCODE_MAD)) {
462       const elk_reg_type exec_type = get_exec_type(this);
463       const unsigned min_type_sz = opcode == ELK_OPCODE_MAD ?
464          MIN2(type_sz(src[1].type), type_sz(src[2].type)) :
465          MIN2(type_sz(src[0].type), type_sz(src[1].type));
466 
467       if (elk_reg_type_is_integer(exec_type) &&
468           type_sz(exec_type) >= 4 &&
469           type_sz(exec_type) != min_type_sz)
470          return false;
471    }
472 
473    if (!elk_backend_instruction::can_do_source_mods())
474       return false;
475 
476    return true;
477 }
478 
479 bool
480 elk_fs_inst::can_do_cmod()
481 {
482    if (!elk_backend_instruction::can_do_cmod())
483       return false;
484 
485    /* The accumulator result appears to get used for the conditional modifier
486     * generation.  When negating a UD value, there is a 33rd bit generated for
487     * the sign in the accumulator value, so now you can't check, for example,
488     * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
489     */
490    for (unsigned i = 0; i < sources; i++) {
491       if (elk_reg_type_is_unsigned_integer(src[i].type) && src[i].negate)
492          return false;
493    }
494 
495    return true;
496 }
497 
498 bool
499 elk_fs_inst::can_change_types() const
500 {
501    return dst.type == src[0].type &&
502           !src[0].abs && !src[0].negate && !saturate && src[0].file != ATTR &&
503           (opcode == ELK_OPCODE_MOV ||
504            (opcode == ELK_OPCODE_SEL &&
505             dst.type == src[1].type &&
506             predicate != ELK_PREDICATE_NONE &&
507             !src[1].abs && !src[1].negate && src[1].file != ATTR));
508 }
509 
510 void
511 elk_fs_reg::init()
512 {
513    memset((void*)this, 0, sizeof(*this));
514    type = ELK_REGISTER_TYPE_UD;
515    stride = 1;
516 }
517 
518 /** Generic unset register constructor. */
519 elk_fs_reg::elk_fs_reg()
520 {
521    init();
522    this->file = BAD_FILE;
523 }
524 
525 elk_fs_reg::elk_fs_reg(struct ::elk_reg reg) :
526    elk_backend_reg(reg)
527 {
528    this->offset = 0;
529    this->stride = 1;
530    if (this->file == IMM &&
531        (this->type != ELK_REGISTER_TYPE_V &&
532         this->type != ELK_REGISTER_TYPE_UV &&
533         this->type != ELK_REGISTER_TYPE_VF)) {
534       this->stride = 0;
535    }
536 }
537 
538 bool
539 elk_fs_reg::equals(const elk_fs_reg &r) const
540 {
541    return (this->elk_backend_reg::equals(r) &&
542            stride == r.stride);
543 }
544 
545 bool
546 elk_fs_reg::negative_equals(const elk_fs_reg &r) const
547 {
548    return (this->elk_backend_reg::negative_equals(r) &&
549            stride == r.stride);
550 }
551 
552 bool
553 elk_fs_reg::is_contiguous() const
554 {
555    switch (file) {
556    case ARF:
557    case FIXED_GRF:
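      /* With the log2-encoded region fields, an element-by-element layout
       * requires a horizontal stride of one element and a vertical stride
       * equal to width * hstride, which reduces to vstride == width + hstride.
       */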
558       return hstride == ELK_HORIZONTAL_STRIDE_1 &&
559              vstride == width + hstride;
560    case MRF:
561    case VGRF:
562    case ATTR:
563       return stride == 1;
564    case UNIFORM:
565    case IMM:
566    case BAD_FILE:
567       return true;
568    }
569 
570    unreachable("Invalid register file");
571 }
572 
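/**
 * Size in bytes of the data accessed by \p width consecutive channels of
 * this register.  For ARF/FIXED_GRF the hardware region fields are
 * log2-encoded (0 meaning a stride of zero), so the span works out to
 * ((rows - 1) * vstride + (w - 1) * hstride + 1) elements of type_sz(type).
 */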
573 unsigned
574 elk_fs_reg::component_size(unsigned width) const
575 {
576    if (file == ARF || file == FIXED_GRF) {
577       const unsigned w = MIN2(width, 1u << this->width);
578       const unsigned h = width >> this->width;
579       const unsigned vs = vstride ? 1 << (vstride - 1) : 0;
580       const unsigned hs = hstride ? 1 << (hstride - 1) : 0;
581       assert(w > 0);
582       return ((MAX2(1, h) - 1) * vs + (w - 1) * hs + 1) * type_sz(type);
583    } else {
584       return MAX2(width * stride, 1) * type_sz(type);
585    }
586 }
587 
588 void
589 elk_fs_visitor::vfail(const char *format, va_list va)
590 {
591    char *msg;
592 
593    if (failed)
594       return;
595 
596    failed = true;
597 
598    msg = ralloc_vasprintf(mem_ctx, format, va);
599    msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
600          dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg);
601 
602    this->fail_msg = msg;
603 
604    if (unlikely(debug_enabled)) {
605       fprintf(stderr, "%s",  msg);
606    }
607 }
608 
609 void
610 elk_fs_visitor::fail(const char *format, ...)
611 {
612    va_list va;
613 
614    va_start(va, format);
615    vfail(format, va);
616    va_end(va);
617 }
618 
619 /**
620  * Mark this program as impossible to compile with dispatch width greater
621  * than n.
622  *
623  * During the SIMD8 compile (which happens first), we can detect and flag
624  * things that are unsupported in SIMD16+ mode, so the compiler can skip the
625  * SIMD16+ compile altogether.
626  *
627  * During a compile of dispatch width greater than n (if one happens anyway),
628  * this just calls fail().
629  */
630 void
631 elk_fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
632 {
633    if (dispatch_width > n) {
634       fail("%s", msg);
635    } else {
636       max_dispatch_width = MIN2(max_dispatch_width, n);
637       elk_shader_perf_log(compiler, log_data,
638                           "Shader dispatch width limited to SIMD%d: %s\n",
639                           n, msg);
640    }
641 }
642 
643 /**
644  * Returns true if the instruction has a flag that means it won't
645  * update an entire destination register.
646  *
647  * For example, dead code elimination and live variable analysis want to know
648  * when a write to a variable screens off any preceding values that were in
649  * it.
650  */
651 bool
652 elk_fs_inst::is_partial_write() const
653 {
654    if (this->predicate && !this->predicate_trivial &&
655        this->opcode != ELK_OPCODE_SEL)
656       return true;
657 
658    if (this->dst.offset % REG_SIZE != 0)
659       return true;
660 
661    /* SEND instructions always write whole registers */
662    if (this->opcode == ELK_SHADER_OPCODE_SEND)
663       return false;
664 
665    /* Special case UNDEF since a lot of places in the backend do things like this:
666     *
667     *  fs_builder ubld = bld.exec_all().group(1, 0);
668     *  elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD);
669     *  ubld.UNDEF(tmp); <- partial write, even though it covers the whole register
670     */
671    if (this->opcode == ELK_SHADER_OPCODE_UNDEF) {
672       assert(this->dst.is_contiguous());
673       return this->size_written < 32;
674    }
675 
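   /* Otherwise, anything covering less than a full 32-byte GRF, or written
    * non-contiguously, is a partial write.
    */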
676    return this->exec_size * type_sz(this->dst.type) < 32 ||
677           !this->dst.is_contiguous();
678 }
679 
680 unsigned
681 elk_fs_inst::components_read(unsigned i) const
682 {
683    /* Return zero if the source is not present. */
684    if (src[i].file == BAD_FILE)
685       return 0;
686 
687    switch (opcode) {
688    case ELK_FS_OPCODE_LINTERP:
689       if (i == 0)
690          return 2;
691       else
692          return 1;
693 
694    case ELK_FS_OPCODE_PIXEL_X:
695    case ELK_FS_OPCODE_PIXEL_Y:
696       assert(i < 2);
697       if (i == 0)
698          return 2;
699       else
700          return 1;
701 
702    case ELK_FS_OPCODE_FB_WRITE_LOGICAL:
703       assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
704       /* First/second FB write color. */
705       if (i < 2)
706          return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
707       else
708          return 1;
709 
710    case ELK_SHADER_OPCODE_TEX_LOGICAL:
711    case ELK_SHADER_OPCODE_TXD_LOGICAL:
712    case ELK_SHADER_OPCODE_TXF_LOGICAL:
713    case ELK_SHADER_OPCODE_TXL_LOGICAL:
714    case ELK_SHADER_OPCODE_TXS_LOGICAL:
715    case ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
716    case ELK_FS_OPCODE_TXB_LOGICAL:
717    case ELK_SHADER_OPCODE_TXF_CMS_LOGICAL:
718    case ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL:
719    case ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
720    case ELK_SHADER_OPCODE_TXF_UMS_LOGICAL:
721    case ELK_SHADER_OPCODE_TXF_MCS_LOGICAL:
722    case ELK_SHADER_OPCODE_LOD_LOGICAL:
723    case ELK_SHADER_OPCODE_TG4_LOGICAL:
724    case ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL:
725    case ELK_SHADER_OPCODE_SAMPLEINFO_LOGICAL:
726       assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
727              src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
728              src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
729       /* Texture coordinates. */
730       if (i == TEX_LOGICAL_SRC_COORDINATE)
731          return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
732       /* Texture derivatives. */
733       else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
734                opcode == ELK_SHADER_OPCODE_TXD_LOGICAL)
735          return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
736       /* Texture offset. */
737       else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
738          return 2;
739       /* MCS */
740       else if (i == TEX_LOGICAL_SRC_MCS) {
741          if (opcode == ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL)
742             return 2;
743          else if (opcode == ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL)
744             return 4;
745          else
746             return 1;
747       } else
748          return 1;
749 
750    case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
751    case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
752       assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM);
753       /* Surface coordinates. */
754       if (i == SURFACE_LOGICAL_SRC_ADDRESS)
755          return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
756       /* Surface operation source (ignored for reads). */
757       else if (i == SURFACE_LOGICAL_SRC_DATA)
758          return 0;
759       else
760          return 1;
761 
762    case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
763    case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
764       assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
765              src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
766       /* Surface coordinates. */
767       if (i == SURFACE_LOGICAL_SRC_ADDRESS)
768          return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
769       /* Surface operation source. */
770       else if (i == SURFACE_LOGICAL_SRC_DATA)
771          return src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
772       else
773          return 1;
774 
775    case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
776    case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
777    case ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
778       assert(src[A64_LOGICAL_ARG].file == IMM);
779       return 1;
780 
781    case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
782       assert(src[A64_LOGICAL_ARG].file == IMM);
783       if (i == A64_LOGICAL_SRC) { /* data to write */
784          const unsigned comps = src[A64_LOGICAL_ARG].ud / exec_size;
785          assert(comps > 0);
786          return comps;
787       } else {
788          return 1;
789       }
790 
791    case ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
792       assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
793       return 1;
794 
795    case ELK_SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
796       assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
797       if (i == SURFACE_LOGICAL_SRC_DATA) {
798          const unsigned comps = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud / exec_size;
799          assert(comps > 0);
800          return comps;
801       } else {
802          return 1;
803       }
804 
805    case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
806       assert(src[A64_LOGICAL_ARG].file == IMM);
807       return i == A64_LOGICAL_SRC ? src[A64_LOGICAL_ARG].ud : 1;
808 
809    case ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
810       assert(src[A64_LOGICAL_ARG].file == IMM);
811       return i == A64_LOGICAL_SRC ?
812              lsc_op_num_data_values(src[A64_LOGICAL_ARG].ud) : 1;
813 
814    case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
815    case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
816       /* Scattered logical opcodes use the following params:
817        * src[0] Surface coordinates
818        * src[1] Surface operation source (ignored for reads)
819        * src[2] Surface
820        * src[3] IMM, always 1 dimension.
821        * src[4] IMM, arg bitsize for the scattered read/write: 8, 16 or 32
822        */
823       assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
824              src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
825       return i == SURFACE_LOGICAL_SRC_DATA ? 0 : 1;
826 
827    case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
828    case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
829       assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
830              src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
831       return 1;
832 
833    case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
834    case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
835       assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
836              src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
837       const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
838       /* Surface coordinates. */
839       if (i == SURFACE_LOGICAL_SRC_ADDRESS)
840          return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
841       /* Surface operation source. */
842       else if (i == SURFACE_LOGICAL_SRC_DATA)
843          return lsc_op_num_data_values(op);
844       else
845          return 1;
846    }
847    case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
848       return (i == 0 ? 2 : 1);
849 
850    case ELK_SHADER_OPCODE_URB_WRITE_LOGICAL:
851       assert(src[URB_LOGICAL_SRC_COMPONENTS].file == IMM);
852 
853       if (i == URB_LOGICAL_SRC_DATA)
854          return src[URB_LOGICAL_SRC_COMPONENTS].ud;
855       else
856          return 1;
857 
858    case ELK_OPCODE_DPAS:
859       unreachable("Do not use components_read() for DPAS.");
860 
861    default:
862       return 1;
863    }
864 }
865 
866 unsigned
867 elk_fs_inst::size_read(int arg) const
868 {
869    switch (opcode) {
870    case ELK_SHADER_OPCODE_SEND:
871       if (arg == 2) {
872          return mlen * REG_SIZE;
873       } else if (arg == 3) {
874          return ex_mlen * REG_SIZE;
875       }
876       break;
877 
878    case ELK_FS_OPCODE_FB_WRITE:
879    case ELK_FS_OPCODE_REP_FB_WRITE:
880       if (arg == 0) {
881          if (base_mrf >= 0)
882             return src[0].file == BAD_FILE ? 0 : 2 * REG_SIZE;
883          else
884             return mlen * REG_SIZE;
885       }
886       break;
887 
888    case ELK_FS_OPCODE_FB_READ:
889    case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
890    case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
891       if (arg == 0)
892          return mlen * REG_SIZE;
893       break;
894 
895    case ELK_FS_OPCODE_SET_SAMPLE_ID:
896       if (arg == 1)
897          return 1;
898       break;
899 
900    case ELK_FS_OPCODE_LINTERP:
901       if (arg == 1)
902          return 16;
903       break;
904 
905    case ELK_SHADER_OPCODE_LOAD_PAYLOAD:
906       if (arg < this->header_size)
907          return retype(src[arg], ELK_REGISTER_TYPE_UD).component_size(8);
908       break;
909 
910    case ELK_CS_OPCODE_CS_TERMINATE:
911    case ELK_SHADER_OPCODE_BARRIER:
912       return REG_SIZE;
913 
914    case ELK_SHADER_OPCODE_MOV_INDIRECT:
915       if (arg == 0) {
916          assert(src[2].file == IMM);
917          return src[2].ud;
918       }
919       break;
920 
921    case ELK_OPCODE_DPAS:
922       switch (arg) {
923       case 0:
924          if (src[0].type == ELK_REGISTER_TYPE_HF) {
925             return rcount * REG_SIZE / 2;
926          } else {
927             return rcount * REG_SIZE;
928          }
929       case 1:
930          return sdepth * REG_SIZE;
931       case 2:
932          /* This is simpler than the formula described in the Bspec, but it
933           * covers all of the cases that we support on DG2.
934           */
935          return rcount * REG_SIZE;
936       default:
937          unreachable("Invalid source number.");
938       }
939       break;
940 
941    case ELK_SHADER_OPCODE_TEX:
942    case ELK_FS_OPCODE_TXB:
943    case ELK_SHADER_OPCODE_TXD:
944    case ELK_SHADER_OPCODE_TXF:
945    case ELK_SHADER_OPCODE_TXF_LZ:
946    case ELK_SHADER_OPCODE_TXF_CMS:
947    case ELK_SHADER_OPCODE_TXF_CMS_W:
948    case ELK_SHADER_OPCODE_TXF_UMS:
949    case ELK_SHADER_OPCODE_TXF_MCS:
950    case ELK_SHADER_OPCODE_TXL:
951    case ELK_SHADER_OPCODE_TXL_LZ:
952    case ELK_SHADER_OPCODE_TXS:
953    case ELK_SHADER_OPCODE_LOD:
954    case ELK_SHADER_OPCODE_TG4:
955    case ELK_SHADER_OPCODE_TG4_OFFSET:
956    case ELK_SHADER_OPCODE_SAMPLEINFO:
957       if (arg == 0 && src[0].file == VGRF)
958          return mlen * REG_SIZE;
959       break;
960 
961    default:
962       break;
963    }
964 
965    switch (src[arg].file) {
966    case UNIFORM:
967    case IMM:
968       return components_read(arg) * type_sz(src[arg].type);
969    case BAD_FILE:
970    case ARF:
971    case FIXED_GRF:
972    case VGRF:
973    case ATTR:
974       return components_read(arg) * src[arg].component_size(exec_size);
975    case MRF:
976       unreachable("MRF registers are not allowed as sources");
977    }
978    return 0;
979 }
980 
981 namespace {
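   /* Width of the channel group combined by the given predication mode
    * (e.g. 2 for ANY2H/ALL2H), used to compute how many flag bits a
    * predicated instruction actually reads.
    */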
982    unsigned
983    predicate_width(const intel_device_info *devinfo, elk_predicate predicate)
984    {
985       if (devinfo->ver >= 20) {
986          return 1;
987       } else {
988          switch (predicate) {
989          case ELK_PREDICATE_NONE:            return 1;
990          case ELK_PREDICATE_NORMAL:          return 1;
991          case ELK_PREDICATE_ALIGN1_ANY2H:    return 2;
992          case ELK_PREDICATE_ALIGN1_ALL2H:    return 2;
993          case ELK_PREDICATE_ALIGN1_ANY4H:    return 4;
994          case ELK_PREDICATE_ALIGN1_ALL4H:    return 4;
995          case ELK_PREDICATE_ALIGN1_ANY8H:    return 8;
996          case ELK_PREDICATE_ALIGN1_ALL8H:    return 8;
997          case ELK_PREDICATE_ALIGN1_ANY16H:   return 16;
998          case ELK_PREDICATE_ALIGN1_ALL16H:   return 16;
999          case ELK_PREDICATE_ALIGN1_ANY32H:   return 32;
1000          case ELK_PREDICATE_ALIGN1_ALL32H:   return 32;
1001          default: unreachable("Unsupported predicate");
1002          }
1003       }
1004    }
1005 
1006    /* Return the subset of flag registers that an instruction could
1007     * potentially read or write based on the execution controls and flag
1008     * subregister number of the instruction.
1009     */
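   /* For example, a SIMD8 instruction in the second quarter (group == 8)
    * predicated on f0.1 (flag_subreg == 1) covers flag bits 24..31, so with
    * width == 1 this returns 0x8, i.e. the fourth byte of the flag file.
    */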
1010    unsigned
1011    flag_mask(const elk_fs_inst *inst, unsigned width)
1012    {
1013       assert(util_is_power_of_two_nonzero(width));
1014       const unsigned start = (inst->flag_subreg * 16 + inst->group) &
1015                              ~(width - 1);
1016       const unsigned end = start + ALIGN(inst->exec_size, width);
1017       return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1);
1018    }
1019 
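   /* All-ones mask of the n lowest bits, saturating to ~0u to avoid an
    * undefined shift when n is the full word width or more.
    */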
1020    unsigned
1021    bit_mask(unsigned n)
1022    {
1023       return (n >= CHAR_BIT * sizeof(bit_mask(n)) ? ~0u : (1u << n) - 1);
1024    }
1025 
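   /* Same calculation for an ARF flag source/destination of the given size
    * in bytes: each flag register occupies 4 bytes of the flag file, and
    * r.subnr is treated as a byte offset within it.
    */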
1026    unsigned
1027    flag_mask(const elk_fs_reg &r, unsigned sz)
1028    {
1029       if (r.file == ARF) {
1030          const unsigned start = (r.nr - ELK_ARF_FLAG) * 4 + r.subnr;
1031          const unsigned end = start + sz;
1032          return bit_mask(end) & ~bit_mask(start);
1033       } else {
1034          return 0;
1035       }
1036    }
1037 }
1038 
1039 unsigned
1040 elk_fs_inst::flags_read(const intel_device_info *devinfo) const
1041 {
1042    if (devinfo->ver < 20 && (predicate == ELK_PREDICATE_ALIGN1_ANYV ||
1043                              predicate == ELK_PREDICATE_ALIGN1_ALLV)) {
1044       /* The vertical predication modes combine corresponding bits from
1045        * f0.0 and f1.0 on Gfx7+, and f0.0 and f0.1 on older hardware.
1046        */
1047       const unsigned shift = devinfo->ver >= 7 ? 4 : 2;
1048       return flag_mask(this, 1) << shift | flag_mask(this, 1);
1049    } else if (predicate) {
1050       return flag_mask(this, predicate_width(devinfo, predicate));
1051    } else {
1052       unsigned mask = 0;
1053       for (int i = 0; i < sources; i++) {
1054          mask |= flag_mask(src[i], size_read(i));
1055       }
1056       return mask;
1057    }
1058 }
1059 
1060 unsigned
1061 elk_fs_inst::flags_written(const intel_device_info *devinfo) const
1062 {
1063    /* On Gfx4 and Gfx5, sel.l (for min) and sel.ge (for max) are implemented
1064     * using a separate cmpn and sel instruction.  This lowering occurs in
1065     * fs_visitor::lower_minmax, which is called very, very late.
1066     */
1067    if ((conditional_mod && ((opcode != ELK_OPCODE_SEL || devinfo->ver <= 5) &&
1068                             opcode != ELK_OPCODE_CSEL &&
1069                             opcode != ELK_OPCODE_IF &&
1070                             opcode != ELK_OPCODE_WHILE)) ||
1071        opcode == ELK_FS_OPCODE_FB_WRITE) {
1072       return flag_mask(this, 1);
1073    } else if (opcode == ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL ||
1074               opcode == ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL ||
1075               opcode == ELK_FS_OPCODE_LOAD_LIVE_CHANNELS) {
1076       return flag_mask(this, 32);
1077    } else {
1078       return flag_mask(dst, size_written);
1079    }
1080 }
1081 
1082 /**
1083  * Returns how many MRFs an FS opcode will write over.
1084  *
1085  * Note that this is not the 0 or 1 implied writes in an actual gen
1086  * instruction -- the FS opcodes often generate MOVs in addition.
1087  */
1088 unsigned
1089 elk_fs_inst::implied_mrf_writes() const
1090 {
1091    if (mlen == 0)
1092       return 0;
1093 
1094    if (base_mrf == -1)
1095       return 0;
1096 
1097    switch (opcode) {
1098    case ELK_SHADER_OPCODE_RCP:
1099    case ELK_SHADER_OPCODE_RSQ:
1100    case ELK_SHADER_OPCODE_SQRT:
1101    case ELK_SHADER_OPCODE_EXP2:
1102    case ELK_SHADER_OPCODE_LOG2:
1103    case ELK_SHADER_OPCODE_SIN:
1104    case ELK_SHADER_OPCODE_COS:
1105       return 1 * exec_size / 8;
1106    case ELK_SHADER_OPCODE_POW:
1107    case ELK_SHADER_OPCODE_INT_QUOTIENT:
1108    case ELK_SHADER_OPCODE_INT_REMAINDER:
1109       return 2 * exec_size / 8;
1110    case ELK_SHADER_OPCODE_TEX:
1111    case ELK_FS_OPCODE_TXB:
1112    case ELK_SHADER_OPCODE_TXD:
1113    case ELK_SHADER_OPCODE_TXF:
1114    case ELK_SHADER_OPCODE_TXF_CMS:
1115    case ELK_SHADER_OPCODE_TXF_MCS:
1116    case ELK_SHADER_OPCODE_TG4:
1117    case ELK_SHADER_OPCODE_TG4_OFFSET:
1118    case ELK_SHADER_OPCODE_TXL:
1119    case ELK_SHADER_OPCODE_TXS:
1120    case ELK_SHADER_OPCODE_LOD:
1121    case ELK_SHADER_OPCODE_SAMPLEINFO:
1122       return 1;
1123    case ELK_FS_OPCODE_FB_WRITE:
1124    case ELK_FS_OPCODE_REP_FB_WRITE:
1125       return src[0].file == BAD_FILE ? 0 : 2;
1126    case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1127    case ELK_SHADER_OPCODE_GFX4_SCRATCH_READ:
1128       return 1;
1129    case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
1130       return mlen;
1131    case ELK_SHADER_OPCODE_GFX4_SCRATCH_WRITE:
1132       return mlen;
1133    default:
1134       unreachable("not reached");
1135    }
1136 }
1137 
1138 bool
1139 elk_fs_inst::has_sampler_residency() const
1140 {
1141    switch (opcode) {
1142    case ELK_SHADER_OPCODE_TEX_LOGICAL:
1143    case ELK_FS_OPCODE_TXB_LOGICAL:
1144    case ELK_SHADER_OPCODE_TXL_LOGICAL:
1145    case ELK_SHADER_OPCODE_TXD_LOGICAL:
1146    case ELK_SHADER_OPCODE_TXF_LOGICAL:
1147    case ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
1148    case ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL:
1149    case ELK_SHADER_OPCODE_TXF_CMS_LOGICAL:
1150    case ELK_SHADER_OPCODE_TXS_LOGICAL:
1151    case ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL:
1152    case ELK_SHADER_OPCODE_TG4_LOGICAL:
1153       assert(src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
1154       return src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
1155    default:
1156       return false;
1157    }
1158 }
1159 
1160 elk_fs_reg
1161 elk_fs_visitor::vgrf(const glsl_type *const type)
1162 {
1163    int reg_width = dispatch_width / 8;
1164    return elk_fs_reg(VGRF,
1165                  alloc.allocate(glsl_count_dword_slots(type, false) * reg_width),
1166                  elk_type_for_base_type(type));
1167 }
1168 
1169 elk_fs_reg::elk_fs_reg(enum elk_reg_file file, unsigned nr)
1170 {
1171    init();
1172    this->file = file;
1173    this->nr = nr;
1174    this->type = ELK_REGISTER_TYPE_F;
1175    this->stride = (file == UNIFORM ? 0 : 1);
1176 }
1177 
1178 elk_fs_reg::elk_fs_reg(enum elk_reg_file file, unsigned nr, enum elk_reg_type type)
1179 {
1180    init();
1181    this->file = file;
1182    this->nr = nr;
1183    this->type = type;
1184    this->stride = (file == UNIFORM ? 0 : 1);
1185 }
1186 
1187 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1188  * This brings in those uniform definitions.
1189  */
1190 void
1191 elk_fs_visitor::import_uniforms(elk_fs_visitor *v)
1192 {
1193    this->push_constant_loc = v->push_constant_loc;
1194    this->uniforms = v->uniforms;
1195 }
1196 
1197 enum elk_barycentric_mode
1198 elk_barycentric_mode(nir_intrinsic_instr *intr)
1199 {
1200    const glsl_interp_mode mode =
1201       (enum glsl_interp_mode) nir_intrinsic_interp_mode(intr);
1202 
1203    /* Barycentric modes don't make sense for flat inputs. */
1204    assert(mode != INTERP_MODE_FLAT);
1205 
1206    unsigned bary;
1207    switch (intr->intrinsic) {
1208    case nir_intrinsic_load_barycentric_pixel:
1209    case nir_intrinsic_load_barycentric_at_offset:
1210       bary = ELK_BARYCENTRIC_PERSPECTIVE_PIXEL;
1211       break;
1212    case nir_intrinsic_load_barycentric_centroid:
1213       bary = ELK_BARYCENTRIC_PERSPECTIVE_CENTROID;
1214       break;
1215    case nir_intrinsic_load_barycentric_sample:
1216    case nir_intrinsic_load_barycentric_at_sample:
1217       bary = ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE;
1218       break;
1219    default:
1220       unreachable("invalid intrinsic");
1221    }
1222 
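   /* The nonperspective modes are assumed to follow their perspective
    * counterparts by 3 in enum elk_barycentric_mode (pixel, centroid,
    * sample), which is what makes the += 3 below work.
    */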
1223    if (mode == INTERP_MODE_NOPERSPECTIVE)
1224       bary += 3;
1225 
1226    return (enum elk_barycentric_mode) bary;
1227 }
1228 
1229 /**
1230  * Turn one of the two CENTROID barycentric modes into PIXEL mode.
1231  */
1232 static enum elk_barycentric_mode
1233 centroid_to_pixel(enum elk_barycentric_mode bary)
1234 {
1235    assert(bary == ELK_BARYCENTRIC_PERSPECTIVE_CENTROID ||
1236           bary == ELK_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
1237    return (enum elk_barycentric_mode) ((unsigned) bary - 1);
1238 }
1239 
1240 /**
1241  * Walk backwards from the end of the program looking for a URB write that
1242  * isn't in control flow, and mark it with EOT.
1243  *
1244  * Return true if successful or false if a separate EOT write is needed.
1245  */
1246 bool
1247 elk_fs_visitor::mark_last_urb_write_with_eot()
1248 {
1249    foreach_in_list_reverse(elk_fs_inst, prev, &this->instructions) {
1250       if (prev->opcode == ELK_SHADER_OPCODE_URB_WRITE_LOGICAL) {
1251          prev->eot = true;
1252 
1253          /* Delete now dead instructions. */
1254          foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
1255             if (dead == prev)
1256                break;
1257             dead->remove();
1258          }
1259          return true;
1260       } else if (prev->is_control_flow() || prev->has_side_effects()) {
1261          break;
1262       }
1263    }
1264 
1265    return false;
1266 }
1267 
1268 void
1269 elk_fs_visitor::emit_gs_thread_end()
1270 {
1271    assert(stage == MESA_SHADER_GEOMETRY);
1272 
1273    struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(prog_data);
1274 
1275    if (gs_compile->control_data_header_size_bits > 0) {
1276       emit_gs_control_data_bits(this->final_gs_vertex_count);
1277    }
1278 
1279    const fs_builder abld = fs_builder(this).at_end().annotate("thread end");
1280    elk_fs_inst *inst;
1281 
1282    if (gs_prog_data->static_vertex_count != -1) {
1283       /* Try to tag the last URB write with EOT instead of emitting a whole
1284        * separate write just to finish the thread.
1285        */
1286       if (mark_last_urb_write_with_eot())
1287          return;
1288 
1289       elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1290       srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
1291       srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(0);
1292       inst = abld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
1293                        srcs, ARRAY_SIZE(srcs));
1294    } else {
1295       elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1296       srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
1297       srcs[URB_LOGICAL_SRC_DATA] = this->final_gs_vertex_count;
1298       srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(1);
1299       inst = abld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
1300                        srcs, ARRAY_SIZE(srcs));
1301    }
1302    inst->eot = true;
1303    inst->offset = 0;
1304 }
1305 
1306 void
1307 elk_fs_visitor::assign_curb_setup()
1308 {
1309    unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
1310 
1311    unsigned ubo_push_length = 0;
1312    unsigned ubo_push_start[4];
1313    for (int i = 0; i < 4; i++) {
1314       ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
1315       ubo_push_length += stage_prog_data->ubo_ranges[i].length;
1316    }
1317 
1318    prog_data->curb_read_length = uniform_push_length + ubo_push_length;
1319 
1320    uint64_t used = 0;
1321    bool is_compute = gl_shader_stage_is_compute(stage);
1322 
1323    if (is_compute && elk_cs_prog_data(prog_data)->uses_inline_data) {
1324       /* With COMPUTE_WALKER, we can push up to one register worth of data via
1325        * the inline data parameter in the COMPUTE_WALKER command itself.
1326        *
1327        * TODO: Support inline data and push at the same time.
1328        */
1329       assert(devinfo->verx10 >= 125);
1330       assert(uniform_push_length <= reg_unit(devinfo));
1331    } else if (is_compute && devinfo->verx10 >= 125) {
1332       assert(devinfo->has_lsc);
1333       fs_builder ubld = fs_builder(this, 1).exec_all().at(
1334          cfg->first_block(), cfg->first_block()->start());
1335 
1336       /* The base offset for our push data is passed in as R0.0[31:6]. We have
1337        * to mask off the bottom 6 bits.
1338        */
1339       elk_fs_reg base_addr = ubld.vgrf(ELK_REGISTER_TYPE_UD);
1340       ubld.AND(base_addr,
1341                retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UD),
1342                elk_imm_ud(INTEL_MASK(31, 6)));
1343 
1344       /* On Gfx12-HP we load constants at the start of the program using A32
1345        * stateless messages.
1346        */
1347       for (unsigned i = 0; i < uniform_push_length;) {
1348          /* Limit ourselves to LSC HW limit of 8 GRFs (256bytes D32V64). */
1349          unsigned num_regs = MIN2(uniform_push_length - i, 8);
1350          assert(num_regs > 0);
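         /* Round the block size down to a power of two, presumably because
          * the transposed D32 load only supports power-of-two GRF counts.
          */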
1351          num_regs = 1 << util_logbase2(num_regs);
1352 
1353          elk_fs_reg addr = ubld.vgrf(ELK_REGISTER_TYPE_UD);
1354          ubld.ADD(addr, base_addr, elk_imm_ud(i * REG_SIZE));
1355 
1356          elk_fs_reg srcs[4] = {
1357             elk_imm_ud(0), /* desc */
1358             elk_imm_ud(0), /* ex_desc */
1359             addr,          /* payload */
1360             elk_fs_reg(),      /* payload2 */
1361          };
1362 
1363          elk_fs_reg dest = retype(elk_vec8_grf(payload().num_regs + i, 0),
1364                               ELK_REGISTER_TYPE_UD);
1365          elk_fs_inst *send = ubld.emit(ELK_SHADER_OPCODE_SEND, dest, srcs, 4);
1366 
1367          send->sfid = GFX12_SFID_UGM;
1368          send->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
1369                                    1 /* exec_size */,
1370                                    LSC_ADDR_SURFTYPE_FLAT,
1371                                    LSC_ADDR_SIZE_A32,
1372                                    1 /* num_coordinates */,
1373                                    LSC_DATA_SIZE_D32,
1374                                    num_regs * 8 /* num_channels */,
1375                                    true /* transpose */,
1376                                    LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
1377                                    true /* has_dest */);
1378          send->header_size = 0;
1379          send->mlen = lsc_msg_desc_src0_len(devinfo, send->desc);
1380          send->size_written =
1381             lsc_msg_desc_dest_len(devinfo, send->desc) * REG_SIZE;
1382          send->send_is_volatile = true;
1383 
1384          i += num_regs;
1385       }
1386 
1387       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1388    }
1389 
1390    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1391    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1392       for (unsigned int i = 0; i < inst->sources; i++) {
1393 	 if (inst->src[i].file == UNIFORM) {
1394             int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
1395             int constant_nr;
1396             if (inst->src[i].nr >= UBO_START) {
1397                /* constant_nr is in 32-bit units, the rest are in bytes */
1398                constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
1399                              inst->src[i].offset / 4;
1400             } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1401                constant_nr = push_constant_loc[uniform_nr];
1402             } else {
1403                /* Section 5.11 of the OpenGL 4.1 spec says:
1404                 * "Out-of-bounds reads return undefined values, which include
1405                 *  values from other variables of the active program or zero."
1406                 * Just return the first push constant.
1407                 */
1408                constant_nr = 0;
1409             }
1410 
1411             assert(constant_nr / 8 < 64);
1412             used |= BITFIELD64_BIT(constant_nr / 8);
1413 
1414 	    struct elk_reg elk_reg = elk_vec1_grf(payload().num_regs +
1415 						  constant_nr / 8,
1416 						  constant_nr % 8);
1417             elk_reg.abs = inst->src[i].abs;
1418             elk_reg.negate = inst->src[i].negate;
1419 
1420             assert(inst->src[i].stride == 0);
1421             inst->src[i] = byte_offset(
1422                retype(elk_reg, inst->src[i].type),
1423                inst->src[i].offset % 4);
1424 	 }
1425       }
1426    }
1427 
1428    uint64_t want_zero = used & stage_prog_data->zero_push_reg;
1429    if (want_zero) {
1430       fs_builder ubld = fs_builder(this, 8).exec_all().at(
1431          cfg->first_block(), cfg->first_block()->start());
1432 
1433       /* push_reg_mask_param is in 32-bit units */
1434       unsigned mask_param = stage_prog_data->push_reg_mask_param;
1435       struct elk_reg mask = elk_vec1_grf(payload().num_regs + mask_param / 8,
1436                                                               mask_param % 8);
1437 
1438       elk_fs_reg b32;
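      /* Expand each bit of the 16-bit mask chunk into a full dword: the two
       * SHLs (the vector immediate 0x01234567 supplies per-channel shift
       * counts) move mask bit j into the sign bit of 16-bit channel j, and
       * the ASR by 15 then replicates that bit across the whole dword,
       * yielding ~0u or 0 per channel.
       */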
1439       for (unsigned i = 0; i < 64; i++) {
1440          if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
1441             elk_fs_reg shifted = ubld.vgrf(ELK_REGISTER_TYPE_W, 2);
1442             ubld.SHL(horiz_offset(shifted, 8),
1443                      byte_offset(retype(mask, ELK_REGISTER_TYPE_W), i / 8),
1444                      elk_imm_v(0x01234567));
1445             ubld.SHL(shifted, horiz_offset(shifted, 8), elk_imm_w(8));
1446 
1447             fs_builder ubld16 = ubld.group(16, 0);
1448             b32 = ubld16.vgrf(ELK_REGISTER_TYPE_D);
1449             ubld16.group(16, 0).ASR(b32, shifted, elk_imm_w(15));
1450          }
1451 
1452          if (want_zero & BITFIELD64_BIT(i)) {
1453             assert(i < prog_data->curb_read_length);
1454             struct elk_reg push_reg =
1455                retype(elk_vec8_grf(payload().num_regs + i, 0),
1456                       ELK_REGISTER_TYPE_D);
1457 
1458             ubld.AND(push_reg, push_reg, component(b32, i % 16));
1459          }
1460       }
1461 
1462       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1463    }
1464 
1465    /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
1466    this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length;
1467 }
1468 
1469 /*
1470  * Build up an array of indices into the urb_setup array that
1471  * references the active entries of the urb_setup array.
1472  * Used to accelerate walking the active entries of the urb_setup array
1473  * on each upload.
1474  */
1475 void
1476 elk_compute_urb_setup_index(struct elk_wm_prog_data *wm_prog_data)
1477 {
1478    /* Make sure uint8_t is sufficient */
1479    STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
1480    uint8_t index = 0;
1481    for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
1482       if (wm_prog_data->urb_setup[attr] >= 0) {
1483          wm_prog_data->urb_setup_attribs[index++] = attr;
1484       }
1485    }
1486    wm_prog_data->urb_setup_attribs_count = index;
1487 }
1488 
1489 static void
1490 calculate_urb_setup(const struct intel_device_info *devinfo,
1491                     const struct elk_wm_prog_key *key,
1492                     struct elk_wm_prog_data *prog_data,
1493                     const nir_shader *nir)
1494 {
1495    memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
1496    memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel));
1497 
1498    int urb_next = 0; /* in vec4s */
1499 
1500    const uint64_t inputs_read =
1501       nir->info.inputs_read & ~nir->info.per_primitive_inputs;
1502 
1503    /* Figure out where each of the incoming setup attributes lands. */
1504    if (devinfo->ver >= 6) {
1505       assert(!nir->info.per_primitive_inputs);
1506 
1507       uint64_t vue_header_bits =
1508          VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT;
1509 
1510       uint64_t unique_fs_attrs = inputs_read & ELK_FS_VARYING_INPUT_MASK;
1511 
1512       /* VUE header fields all live in the same URB slot, so we pass them
1513        * as a single FS input attribute.  We want to only count them once.
1514        */
1515       if (inputs_read & vue_header_bits) {
1516          unique_fs_attrs &= ~vue_header_bits;
1517          unique_fs_attrs |= VARYING_BIT_PSIZ;
1518       }
1519 
1520       if (util_bitcount64(unique_fs_attrs) <= 16) {
1521          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1522           * first 16 varying inputs, so we can put them wherever we want.
1523           * Just put them in order.
1524           *
1525           * This is useful because it means that (a) inputs not used by the
1526           * fragment shader won't take up valuable register space, and (b) we
1527           * won't have to recompile the fragment shader if it gets paired with
1528           * a different vertex (or geometry) shader.
1529           *
1530           * VUE header fields share the same FS input attribute.
1531           */
1532          if (inputs_read & vue_header_bits) {
1533             if (inputs_read & VARYING_BIT_PSIZ)
1534                prog_data->urb_setup[VARYING_SLOT_PSIZ] = urb_next;
1535             if (inputs_read & VARYING_BIT_LAYER)
1536                prog_data->urb_setup[VARYING_SLOT_LAYER] = urb_next;
1537             if (inputs_read & VARYING_BIT_VIEWPORT)
1538                prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = urb_next;
1539 
1540             urb_next++;
1541          }
1542 
1543          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1544             if (inputs_read & ELK_FS_VARYING_INPUT_MASK & ~vue_header_bits &
1545                 BITFIELD64_BIT(i)) {
1546                prog_data->urb_setup[i] = urb_next++;
1547             }
1548          }
1549       } else {
1550          /* We have enough input varyings that the SF/SBE pipeline stage can't
1551           * arbitrarily rearrange them to suit our whim; we have to put them
1552           * in an order that matches the output of the previous pipeline stage
1553           * (geometry or vertex shader).
1554           */
1555 
1556          /* Re-compute the VUE map here in the case that the one coming from
1557           * geometry has more than one position slot (used for Primitive
1558           * Replication).
1559           */
1560          struct intel_vue_map prev_stage_vue_map;
1561          elk_compute_vue_map(devinfo, &prev_stage_vue_map,
1562                              key->input_slots_valid,
1563                              nir->info.separate_shader, 1);
1564 
1565          int first_slot =
1566             elk_compute_first_urb_slot_required(inputs_read,
1567                                                 &prev_stage_vue_map);
1568 
1569          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1570          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1571               slot++) {
1572             int varying = prev_stage_vue_map.slot_to_varying[slot];
1573             if (varying != ELK_VARYING_SLOT_PAD &&
1574                 (inputs_read & ELK_FS_VARYING_INPUT_MASK &
1575                  BITFIELD64_BIT(varying))) {
1576                prog_data->urb_setup[varying] = slot - first_slot;
1577             }
1578          }
1579          urb_next = prev_stage_vue_map.num_slots - first_slot;
1580       }
1581    } else {
1582       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1583       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1584          /* Point size is packed into the header, not as a general attribute */
1585          if (i == VARYING_SLOT_PSIZ)
1586             continue;
1587 
1588 	 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1589 	    /* The back color slot is skipped when the front color is
1590 	     * also written to.  In addition, some slots can be
1591 	     * written in the vertex shader and not read in the
1592 	     * fragment shader.  So the register number must always be
1593 	     * incremented, mapped or not.
1594 	     */
1595 	    if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1596 	       prog_data->urb_setup[i] = urb_next;
1597             urb_next++;
1598 	 }
1599       }
1600 
1601       /*
1602        * It's an FS-only attribute, and we did the interpolation for this
1603        * attribute in the SF thread.  So count it here, too.
1604        *
1605        * See compile_sf_prog() for more info.
1606        */
1607       if (inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1608          prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1609    }
1610 
1611    prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs;
1612    prog_data->inputs = inputs_read;
1613 
1614    elk_compute_urb_setup_index(prog_data);
1615 }
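/* For illustration (hypothetical shader, Gfx6+, no per-primitive inputs):
 * a fragment shader reading only VARYING_SLOT_COL0 and VARYING_SLOT_TEX0
 * has two unique varyings, well under 16, so the first branch above assigns
 * urb_setup[VARYING_SLOT_COL0] = 0 and urb_setup[VARYING_SLOT_TEX0] = 1,
 * leaves every other slot at -1, and ends up with num_varying_inputs == 2.
 */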
1616 
1617 void
1618 elk_fs_visitor::assign_urb_setup()
1619 {
1620    assert(stage == MESA_SHADER_FRAGMENT);
1621    struct elk_wm_prog_data *prog_data = elk_wm_prog_data(this->prog_data);
1622 
1623    int urb_start = payload().num_regs + prog_data->base.curb_read_length;
1624 
1625    /* Offset all the urb_setup[] index by the actual position of the
1626     * setup regs, now that the location of the constants has been chosen.
1627     */
1628    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1629       for (int i = 0; i < inst->sources; i++) {
1630          if (inst->src[i].file == ATTR) {
1631             /* ATTR elk_fs_reg::nr in the FS is in units of logical scalar
1632              * inputs each of which consumes 16B on Gfx4-Gfx12.  In
1633              * single polygon mode this leads to the following layout
1634              * of the vertex setup plane parameters in the ATTR
1635              * register file:
1636              *
1637              *  elk_fs_reg::nr   Input   Comp0  Comp1  Comp2  Comp3
1638              *      0       Attr0.x  a1-a0  a2-a0   N/A    a0
1639              *      1       Attr0.y  a1-a0  a2-a0   N/A    a0
1640              *      2       Attr0.z  a1-a0  a2-a0   N/A    a0
1641              *      3       Attr0.w  a1-a0  a2-a0   N/A    a0
1642              *      4       Attr1.x  a1-a0  a2-a0   N/A    a0
1643              *     ...
1644              *
1645              * In multipolygon mode that no longer works since
1646              * different channels may be processing polygons with
1647              * different plane parameters, so each parameter above is
1648              * represented as a dispatch_width-wide vector:
1649              *
1650              *  elk_fs_reg::nr     elk_fs_reg::offset    Input      Comp0     ...    CompN
1651              *      0                 0          Attr0.x  a1[0]-a0[0] ... a1[N]-a0[N]
1652              *      0        4 * dispatch_width  Attr0.x  a2[0]-a0[0] ... a2[N]-a0[N]
1653              *      0        8 * dispatch_width  Attr0.x     N/A      ...     N/A
1654              *      0       12 * dispatch_width  Attr0.x    a0[0]     ...    a0[N]
1655              *      1                 0          Attr0.y  a1[0]-a0[0] ... a1[N]-a0[N]
1656              *     ...
1657              *
1658              * Note that many of the components on a single row above
1659              * are likely to be replicated multiple times (if, say, a
1660              * single SIMD thread is only processing 2 different
1661              * polygons), so plane parameters aren't actually stored
1662              * in GRF memory with that layout to avoid wasting space.
1663              * Instead we compose ATTR register regions with a 2D
1664              * region that walks through the parameters of each
1665              * polygon with the correct stride, reading the parameter
1666              * corresponding to each channel directly from the PS
1667              * thread payload.
1668              *
1669              * The latter layout corresponds to a param_width equal to
1670              * dispatch_width, while the former (scalar parameter)
1671              * layout has a param_width of 1.
1672              *
1673              * Gfx20+ represent plane parameters in a format similar
1674              * to the above, except the parameters are packed in 12B
1675              * and ordered like "a0, a1-a0, a2-a0" instead of the
1676              * above vec4 representation with a missing component.
1677              */
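            /* Worked example (hypothetical source, pre-Gfx20, single
             * polygon, no per-primitive inputs): an ATTR source with
             * nr == 5 and offset == 0 is Attr1.y in the table above; it
             * lands in GRF urb_start + 5 / 2 == urb_start + 2 at byte
             * offset (5 % 2) * (REG_SIZE / 2) == 16, i.e. the upper half
             * of that GRF.
             */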
1678             const unsigned param_width = (max_polygons > 1 ? dispatch_width : 1);
1679 
1680             /* Size of a single scalar component of a plane parameter
1681              * in bytes.
1682              */
1683             const unsigned chan_sz = 4;
1684             struct elk_reg reg;
1685             assert(max_polygons > 0);
1686 
1687             /* Calculate the base register on the thread payload of
1688              * either the block of vertex setup data or the block of
1689              * per-primitive constant data depending on whether we're
1690              * accessing a primitive or vertex input.  Also calculate
1691              * the index of the input within that block.
1692              */
1693             const bool per_prim = inst->src[i].nr < prog_data->num_per_primitive_inputs;
1694             const unsigned base = urb_start +
1695                (per_prim ? 0 :
1696                 ALIGN(prog_data->num_per_primitive_inputs / 2,
1697                       reg_unit(devinfo)) * max_polygons);
1698             const unsigned idx = per_prim ? inst->src[i].nr :
1699                inst->src[i].nr - prog_data->num_per_primitive_inputs;
1700 
1701             /* Translate the offset within the param_width-wide
1702              * representation described above into an offset and a
1703              * grf, which contains the plane parameters for the first
1704              * polygon processed by the thread.
1705              */
1706             if (devinfo->ver >= 20 && !per_prim) {
1707                /* Gfx20+ is able to pack 5 logical input components
1708                 * per 64B register for vertex setup data.
1709                 */
1710                const unsigned grf = base + idx / 5 * 2 * max_polygons;
1711                assert(inst->src[i].offset / param_width < 12);
1712                const unsigned delta = idx % 5 * 12 +
1713                   inst->src[i].offset / (param_width * chan_sz) * chan_sz +
1714                   inst->src[i].offset % chan_sz;
1715                reg = byte_offset(retype(elk_vec8_grf(grf, 0), inst->src[i].type),
1716                                  delta);
1717             } else {
1718                /* Earlier platforms and per-primitive block pack 2 logical
1719                 * input components per 32B register.
1720                 */
1721                const unsigned grf = base + idx / 2 * max_polygons;
1722                assert(inst->src[i].offset / param_width < REG_SIZE / 2);
1723                const unsigned delta = (idx % 2) * (REG_SIZE / 2) +
1724                   inst->src[i].offset / (param_width * chan_sz) * chan_sz +
1725                   inst->src[i].offset % chan_sz;
1726                reg = byte_offset(retype(elk_vec8_grf(grf, 0), inst->src[i].type),
1727                                  delta);
1728             }
1729 
1730             if (max_polygons > 1) {
1731                assert(devinfo->ver >= 12);
1732                /* Misaligned channel strides that would lead to
1733                 * cross-channel access in the representation above are
1734                 * disallowed.
1735                 */
1736                assert(inst->src[i].stride * type_sz(inst->src[i].type) == chan_sz);
1737 
1738                /* Number of channels processing the same polygon. */
1739                const unsigned poly_width = dispatch_width / max_polygons;
1740                assert(dispatch_width % max_polygons == 0);
1741 
1742                /* Accessing a subset of channels of a parameter vector
1743                 * starting from "chan" is necessary to handle
1744                 * SIMD-lowered instructions though.
1745                 */
1746                const unsigned chan = inst->src[i].offset %
1747                   (param_width * chan_sz) / chan_sz;
1748                assert(chan < dispatch_width);
1749                assert(chan % poly_width == 0);
1750                const unsigned reg_size = reg_unit(devinfo) * REG_SIZE;
1751                reg = byte_offset(reg, chan / poly_width * reg_size);
1752 
1753                if (inst->exec_size > poly_width) {
1754                   /* Accessing the parameters for multiple polygons.
1755                    * Corresponding parameters for different polygons
1756                    * are stored a GRF apart on the thread payload, so
1757                    * use that as vertical stride.
1758                    */
1759                   const unsigned vstride = reg_size / type_sz(inst->src[i].type);
1760                   assert(vstride <= 32);
1761                   assert(chan % poly_width == 0);
1762                   reg = stride(reg, vstride, poly_width, 0);
1763                } else {
1764                   /* Accessing one parameter for a single polygon --
1765                    * Translate to a scalar region.
1766                    */
1767                   assert(chan % poly_width + inst->exec_size <= poly_width);
1768                   reg = stride(reg, 0, 1, 0);
1769                }
1770 
1771             } else {
1772                const unsigned width = inst->src[i].stride == 0 ?
1773                   1 : MIN2(inst->exec_size, 8);
1774                reg = stride(reg, width * inst->src[i].stride,
1775                             width, inst->src[i].stride);
1776             }
1777 
1778             reg.abs = inst->src[i].abs;
1779             reg.negate = inst->src[i].negate;
1780             inst->src[i] = reg;
1781          }
1782       }
1783    }
1784 
1785    /* Each attribute is 4 setup channels, each of which is half a reg,
1786     * but they may be replicated multiple times for multipolygon
1787     * dispatch.
1788     */
1789    this->first_non_payload_grf += prog_data->num_varying_inputs * 2 * max_polygons;
1790 
1791    /* Unlike regular attributes, per-primitive attributes have all 4 channels
1792     * in the same slot, so each GRF can store two slots.
1793     */
1794    assert(prog_data->num_per_primitive_inputs % 2 == 0);
1795    this->first_non_payload_grf += prog_data->num_per_primitive_inputs / 2 * max_polygons;
1796 }
1797 
1798 void
1799 elk_fs_visitor::convert_attr_sources_to_hw_regs(elk_fs_inst *inst)
1800 {
1801    for (int i = 0; i < inst->sources; i++) {
1802       if (inst->src[i].file == ATTR) {
1803          assert(inst->src[i].nr == 0);
1804          int grf = payload().num_regs +
1805                    prog_data->curb_read_length +
1806                    inst->src[i].offset / REG_SIZE;
1807 
1808          /* As explained at elk_reg_from_fs_reg, From the Haswell PRM:
1809           *
1810           * VertStride must be used to cross GRF register boundaries. This
1811           * rule implies that elements within a 'Width' cannot cross GRF
1812           * boundaries.
1813           *
1814           * So, for registers that are large enough, we have to split the exec
1815           * size in two and trust the compression state to sort it out.
1816           */
1817          unsigned total_size = inst->exec_size *
1818                                inst->src[i].stride *
1819                                type_sz(inst->src[i].type);
1820 
1821          assert(total_size <= 2 * REG_SIZE);
1822          const unsigned exec_size =
1823             (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
1824 
1825          unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
1826          struct elk_reg reg =
1827             stride(byte_offset(retype(elk_vec8_grf(grf, 0), inst->src[i].type),
1828                                inst->src[i].offset % REG_SIZE),
1829                    exec_size * inst->src[i].stride,
1830                    width, inst->src[i].stride);
1831          reg.abs = inst->src[i].abs;
1832          reg.negate = inst->src[i].negate;
1833 
1834          inst->src[i] = reg;
1835       }
1836    }
1837 }
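/* For example (hypothetical instruction): a SIMD16 MOV from an ATTR source
 * of type F with stride 1 reads 16 * 1 * 4 == 64 bytes, which spans two
 * GRFs, so exec_size above is halved to 8 and the resulting region is
 * <8;8,1>, leaving the second GRF to the compression state as described in
 * the PRM quote.
 */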
1838 
1839 void
1840 elk_fs_visitor::assign_vs_urb_setup()
1841 {
1842    struct elk_vs_prog_data *vs_prog_data = elk_vs_prog_data(prog_data);
1843 
1844    assert(stage == MESA_SHADER_VERTEX);
1845 
1846    /* Each attribute is 4 regs. */
1847    this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;
1848 
1849    assert(vs_prog_data->base.urb_read_length <= 15);
1850 
1851    /* Rewrite all ATTR file references to the hw grf that they land in. */
1852    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1853       convert_attr_sources_to_hw_regs(inst);
1854    }
1855 }
1856 
1857 void
1858 elk_fs_visitor::assign_tcs_urb_setup()
1859 {
1860    assert(stage == MESA_SHADER_TESS_CTRL);
1861 
1862    /* Rewrite all ATTR file references to HW_REGs. */
1863    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1864       convert_attr_sources_to_hw_regs(inst);
1865    }
1866 }
1867 
1868 void
1869 elk_fs_visitor::assign_tes_urb_setup()
1870 {
1871    assert(stage == MESA_SHADER_TESS_EVAL);
1872 
1873    struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(prog_data);
1874 
1875    first_non_payload_grf += 8 * vue_prog_data->urb_read_length;
1876 
1877    /* Rewrite all ATTR file references to HW_REGs. */
1878    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1879       convert_attr_sources_to_hw_regs(inst);
1880    }
1881 }
1882 
1883 void
1884 elk_fs_visitor::assign_gs_urb_setup()
1885 {
1886    assert(stage == MESA_SHADER_GEOMETRY);
1887 
1888    struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(prog_data);
1889 
1890    first_non_payload_grf +=
1891       8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
1892 
1893    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1894       /* Rewrite all ATTR file references to GRFs. */
1895       convert_attr_sources_to_hw_regs(inst);
1896    }
1897 }
1898 
1899 
1900 /**
1901  * Split large virtual GRFs into separate components if we can.
1902  *
1903  * This pass aggressively splits VGRFs into chunks as small as possible,
1904  * down to single registers if it can.  If no VGRFs can be split, we return
1905  * false so this pass can safely be used inside an optimization loop.  We
1906  * want to split, because virtual GRFs are what we register allocate and
1907  * spill (due to contiguousness requirements for some instructions), and
1908  * they're what we naturally generate in the codegen process, but most
1909  * virtual GRFs don't actually need to be contiguous sets of GRFs.  If we
1910  * split, we'll end up with reduced live intervals and better dead code
1911  * elimination and coalescing.
1912  */
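/* As a concrete (hypothetical) example: a 4-GRF VGRF whose only uses are a
 * 2-GRF write of GRFs 0-1 and a 2-GRF read of GRFs 2-3 initially gets split
 * points at offsets 1, 2 and 3; the write clears the split point at offset 1
 * and the read clears the one at offset 3, leaving only offset 2, so the
 * VGRF is split into two independent 2-GRF VGRFs.
 */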
1913 bool
1914 elk_fs_visitor::split_virtual_grfs()
1915 {
1916    /* Compact the register file so we eliminate dead vgrfs.  This
1917     * only defines split points for live registers, so if we have
1918     * overly large dead registers, they will hit assertions later.
1919     */
1920    compact_virtual_grfs();
1921 
1922    unsigned num_vars = this->alloc.count;
1923 
1924    /* Count the total number of registers */
1925    unsigned reg_count = 0;
1926    unsigned vgrf_to_reg[num_vars];
1927    for (unsigned i = 0; i < num_vars; i++) {
1928       vgrf_to_reg[i] = reg_count;
1929       reg_count += alloc.sizes[i];
1930    }
1931 
1932    /* An array of "split points".  For each register slot, this indicates
1933     * if this slot can be separated from the previous slot.  Every time an
1934     * instruction uses multiple elements of a register (as a source or
1935     * destination), we mark the used slots as inseparable.  Then we go
1936     * through and split the registers into the smallest pieces we can.
1937     */
1938    bool *split_points = new bool[reg_count];
1939    memset(split_points, 0, reg_count * sizeof(*split_points));
1940 
1941    /* Mark all used registers as fully splittable */
1942    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1943       if (inst->dst.file == VGRF) {
1944          unsigned reg = vgrf_to_reg[inst->dst.nr];
1945          for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
1946             split_points[reg + j] = true;
1947       }
1948 
1949       for (unsigned i = 0; i < inst->sources; i++) {
1950          if (inst->src[i].file == VGRF) {
1951             unsigned reg = vgrf_to_reg[inst->src[i].nr];
1952             for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
1953                split_points[reg + j] = true;
1954          }
1955       }
1956    }
1957 
1958    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1959       /* We fix up undef instructions later */
1960       if (inst->opcode == ELK_SHADER_OPCODE_UNDEF) {
1961          assert(inst->dst.file == VGRF);
1962          continue;
1963       }
1964 
1965       if (inst->dst.file == VGRF) {
1966          unsigned reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
1967          for (unsigned j = 1; j < regs_written(inst); j++)
1968             split_points[reg + j] = false;
1969       }
1970       for (unsigned i = 0; i < inst->sources; i++) {
1971          if (inst->src[i].file == VGRF) {
1972             unsigned reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
1973             for (unsigned j = 1; j < regs_read(inst, i); j++)
1974                split_points[reg + j] = false;
1975          }
1976       }
1977    }
1978 
1979    /* Bitset of which registers have been split */
1980    bool *vgrf_has_split = new bool[num_vars];
1981    memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
1982 
1983    unsigned *new_virtual_grf = new unsigned[reg_count];
1984    unsigned *new_reg_offset = new unsigned[reg_count];
1985 
1986    unsigned reg = 0;
1987    bool has_splits = false;
1988    for (unsigned i = 0; i < num_vars; i++) {
1989       /* The first one should always be 0 as a quick sanity check. */
1990       assert(split_points[reg] == false);
1991 
1992       /* j = 0 case */
1993       new_reg_offset[reg] = 0;
1994       reg++;
1995       unsigned offset = 1;
1996 
1997       /* j > 0 case */
1998       for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1999          /* If this is a split point, reset the offset to 0 and allocate a
2000           * new virtual GRF covering the previous "offset" registers
2001           */
2002          if (split_points[reg]) {
2003             has_splits = true;
2004             vgrf_has_split[i] = true;
2005             assert(offset <= MAX_VGRF_SIZE(devinfo));
2006             unsigned grf = alloc.allocate(offset);
2007             for (unsigned k = reg - offset; k < reg; k++)
2008                new_virtual_grf[k] = grf;
2009             offset = 0;
2010          }
2011          new_reg_offset[reg] = offset;
2012          offset++;
2013          reg++;
2014       }
2015 
2016       /* The last one gets the original register number */
2017       assert(offset <= MAX_VGRF_SIZE(devinfo));
2018       alloc.sizes[i] = offset;
2019       for (unsigned k = reg - offset; k < reg; k++)
2020          new_virtual_grf[k] = i;
2021    }
2022    assert(reg == reg_count);
2023 
2024    bool progress;
2025    if (!has_splits) {
2026       progress = false;
2027       goto cleanup;
2028    }
2029 
2030    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
2031       if (inst->opcode == ELK_SHADER_OPCODE_UNDEF) {
2032          assert(inst->dst.file == VGRF);
2033          if (vgrf_has_split[inst->dst.nr]) {
2034             const fs_builder ibld(this, block, inst);
2035             assert(inst->size_written % REG_SIZE == 0);
2036             unsigned reg_offset = inst->dst.offset / REG_SIZE;
2037             unsigned size_written = 0;
2038             while (size_written < inst->size_written) {
2039                reg = vgrf_to_reg[inst->dst.nr] + reg_offset + size_written / REG_SIZE;
2040                elk_fs_inst *undef =
2041                   ibld.UNDEF(
2042                      byte_offset(elk_fs_reg(VGRF, new_virtual_grf[reg], inst->dst.type),
2043                                  new_reg_offset[reg] * REG_SIZE));
2044                undef->size_written =
2045                   MIN2(inst->size_written - size_written, undef->size_written);
2046                assert(undef->size_written % REG_SIZE == 0);
2047                size_written += undef->size_written;
2048             }
2049             inst->remove(block);
2050          } else {
2051             reg = vgrf_to_reg[inst->dst.nr];
2052             assert(new_reg_offset[reg] == 0);
2053             assert(new_virtual_grf[reg] == inst->dst.nr);
2054          }
2055          continue;
2056       }
2057 
2058       if (inst->dst.file == VGRF) {
2059          reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
2060          if (vgrf_has_split[inst->dst.nr]) {
2061             inst->dst.nr = new_virtual_grf[reg];
2062             inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
2063                                inst->dst.offset % REG_SIZE;
2064             assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2065          } else {
2066             assert(new_reg_offset[reg] == inst->dst.offset / REG_SIZE);
2067             assert(new_virtual_grf[reg] == inst->dst.nr);
2068          }
2069       }
2070       for (unsigned i = 0; i < inst->sources; i++) {
2071 	 if (inst->src[i].file != VGRF)
2072             continue;
2073 
2074          reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
2075          if (vgrf_has_split[inst->src[i].nr]) {
2076             inst->src[i].nr = new_virtual_grf[reg];
2077             inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
2078                                   inst->src[i].offset % REG_SIZE;
2079             assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2080          } else {
2081             assert(new_reg_offset[reg] == inst->src[i].offset / REG_SIZE);
2082             assert(new_virtual_grf[reg] == inst->src[i].nr);
2083          }
2084       }
2085    }
2086    invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
2087 
2088    progress = true;
2089 
2090 cleanup:
2091    delete[] split_points;
2092    delete[] vgrf_has_split;
2093    delete[] new_virtual_grf;
2094    delete[] new_reg_offset;
2095 
2096    return progress;
2097 }
2098 
2099 /**
2100  * Remove unused virtual GRFs and compact the vgrf_* arrays.
2101  *
2102  * During code generation, we create tons of temporary variables, many of
2103  * which get immediately killed and are never used again.  Yet, in later
2104  * optimization and analysis passes, such as compute_live_intervals, we need
2105  * to loop over all the virtual GRFs.  Compacting them can save a lot of
2106  * overhead.
2107  */
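/* For illustration, assume alloc.count == 4 and only VGRFs 0 and 2 are
 * referenced by any instruction: the pass below builds
 * remap_table == { 0, -1, 1, -1 }, moves alloc.sizes[2] down to
 * alloc.sizes[1], shrinks alloc.count to 2, and rewrites every reference to
 * VGRF 2 (including delta_xy) to VGRF 1.
 */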
2108 bool
2109 elk_fs_visitor::compact_virtual_grfs()
2110 {
2111    bool progress = false;
2112    int *remap_table = new int[this->alloc.count];
2113    memset(remap_table, -1, this->alloc.count * sizeof(int));
2114 
2115    /* Mark which virtual GRFs are used. */
2116    foreach_block_and_inst(block, const elk_fs_inst, inst, cfg) {
2117       if (inst->dst.file == VGRF)
2118          remap_table[inst->dst.nr] = 0;
2119 
2120       for (int i = 0; i < inst->sources; i++) {
2121          if (inst->src[i].file == VGRF)
2122             remap_table[inst->src[i].nr] = 0;
2123       }
2124    }
2125 
2126    /* Compact the GRF arrays. */
2127    int new_index = 0;
2128    for (unsigned i = 0; i < this->alloc.count; i++) {
2129       if (remap_table[i] == -1) {
2130          /* We just found an unused register.  This means that we are
2131           * actually going to compact something.
2132           */
2133          progress = true;
2134       } else {
2135          remap_table[i] = new_index;
2136          alloc.sizes[new_index] = alloc.sizes[i];
2137          invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
2138          ++new_index;
2139       }
2140    }
2141 
2142    this->alloc.count = new_index;
2143 
2144    /* Patch all the instructions to use the newly renumbered registers */
2145    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
2146       if (inst->dst.file == VGRF)
2147          inst->dst.nr = remap_table[inst->dst.nr];
2148 
2149       for (int i = 0; i < inst->sources; i++) {
2150          if (inst->src[i].file == VGRF)
2151             inst->src[i].nr = remap_table[inst->src[i].nr];
2152       }
2153    }
2154 
2155    /* Patch all the references to delta_xy, since they're used in register
2156     * allocation.  If they're unused, switch them to BAD_FILE so we don't
2157     * think some random VGRF is delta_xy.
2158     */
2159    for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2160       if (delta_xy[i].file == VGRF) {
2161          if (remap_table[delta_xy[i].nr] != -1) {
2162             delta_xy[i].nr = remap_table[delta_xy[i].nr];
2163          } else {
2164             delta_xy[i].file = BAD_FILE;
2165          }
2166       }
2167    }
2168 
2169    delete[] remap_table;
2170 
2171    return progress;
2172 }
2173 
2174 int
2175 elk_get_subgroup_id_param_index(const intel_device_info *devinfo,
2176                                 const elk_stage_prog_data *prog_data)
2177 {
2178    if (prog_data->nr_params == 0)
2179       return -1;
2180 
2181    if (devinfo->verx10 >= 125)
2182       return -1;
2183 
2184    /* The local thread id is always the last parameter in the list */
2185    uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
2186    if (last_param == ELK_PARAM_BUILTIN_SUBGROUP_ID)
2187       return prog_data->nr_params - 1;
2188 
2189    return -1;
2190 }
2191 
2192 /**
2193  * Assign UNIFORM file registers to either push constants or pull constants.
2194  *
2195  * We allow a fragment shader to have more than the specified minimum
2196  * maximum number of fragment shader uniform components (64).  If
2197  * there are too many of these, they'd fill up all of the register space.
2198  * So, this will push some of them out to the pull constant buffer and
2199  * update the program to load them.
2200  */
2201 void
2202 elk_fs_visitor::assign_constant_locations()
2203 {
2204    /* Only the first compile gets to decide on locations. */
2205    if (push_constant_loc)
2206       return;
2207 
2208    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2209    for (unsigned u = 0; u < uniforms; u++)
2210       push_constant_loc[u] = u;
2211 
2212    /* Now that we know how many regular uniforms we'll push, reduce the
2213     * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
2214     */
2215    /* For gen4/5:
2216     * Only allow 16 registers (128 uniform components) as push constants.
2217     *
2218     * If changing this value, note the limitation about total_regs in
2219     * elk_curbe.c/crocus_state.c
2220     */
2221    const unsigned max_push_length = compiler->devinfo->ver < 6 ? 16 : 64;
2222    unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
2223    for (int i = 0; i < 4; i++) {
2224       struct elk_ubo_range *range = &prog_data->ubo_ranges[i];
2225 
2226       if (push_length + range->length > max_push_length)
2227          range->length = max_push_length - push_length;
2228 
2229       push_length += range->length;
2230    }
2231    assert(push_length <= max_push_length);
2232 }
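/* A small worked example of the clamping above (all numbers hypothetical):
 * on Gfx6+ (max_push_length == 64) with nr_params == 40 regular uniforms,
 * push_length starts at DIV_ROUND_UP(40, 8) == 5 registers.  UBO ranges of
 * length { 32, 40, 0, 0 } are then trimmed to { 32, 27, 0, 0 } so that
 * 5 + 32 + 27 == 64 stays within the 3DSTATE_CONSTANT limit.
 */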
2233 
2234 bool
2235 elk_fs_visitor::get_pull_locs(const elk_fs_reg &src,
2236                           unsigned *out_surf_index,
2237                           unsigned *out_pull_index)
2238 {
2239    assert(src.file == UNIFORM);
2240 
2241    if (src.nr < UBO_START)
2242       return false;
2243 
2244    const struct elk_ubo_range *range =
2245       &prog_data->ubo_ranges[src.nr - UBO_START];
2246 
2247    /* If this access is in our (reduced) range, use the push data. */
2248    if (src.offset / 32 < range->length)
2249       return false;
2250 
2251    *out_surf_index = range->block;
2252    *out_pull_index = (32 * range->start + src.offset) / 4;
2253 
2254    prog_data->has_ubo_pull = true;
2255 
2256    return true;
2257 }
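/* Worked example (hypothetical values): for a UNIFORM source with
 * nr == UBO_START + 1 and offset == 256, where ubo_ranges[1] is
 * { block = 3, start = 2, length = 4 }, offset / 32 == 8 falls outside the
 * 4-register push range, so the access is pulled with
 * *out_surf_index == 3 and *out_pull_index == (32 * 2 + 256) / 4 == 80.
 */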
2258 
2259 /**
2260  * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2261  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2262  */
2263 bool
2264 elk_fs_visitor::lower_constant_loads()
2265 {
2266    unsigned index, pull_index;
2267    bool progress = false;
2268 
2269    foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
2270       /* Set up the annotation tracking for newly generated instructions. */
2271       const fs_builder ibld(this, block, inst);
2272 
2273       for (int i = 0; i < inst->sources; i++) {
2274 	 if (inst->src[i].file != UNIFORM)
2275 	    continue;
2276 
2277          /* We'll handle this case later */
2278          if (inst->opcode == ELK_SHADER_OPCODE_MOV_INDIRECT && i == 0)
2279             continue;
2280 
2281          if (!get_pull_locs(inst->src[i], &index, &pull_index))
2282 	    continue;
2283 
2284          assert(inst->src[i].stride == 0);
2285 
2286          const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
2287          const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
2288          const elk_fs_reg dst = ubld.vgrf(ELK_REGISTER_TYPE_UD);
2289          const unsigned base = pull_index * 4;
2290 
2291          elk_fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
2292          srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = elk_imm_ud(index);
2293          srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET]  = elk_imm_ud(base & ~(block_sz - 1));
2294          srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE]    = elk_imm_ud(block_sz);
2295 
2296 
2297          ubld.emit(ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst,
2298                    srcs, PULL_UNIFORM_CONSTANT_SRCS);
2299 
2300          /* Rewrite the instruction to use the temporary VGRF. */
2301          inst->src[i].file = VGRF;
2302          inst->src[i].nr = dst.nr;
2303          inst->src[i].offset = (base & (block_sz - 1)) +
2304                                inst->src[i].offset % 4;
2305 
2306          progress = true;
2307       }
2308 
2309       if (inst->opcode == ELK_SHADER_OPCODE_MOV_INDIRECT &&
2310           inst->src[0].file == UNIFORM) {
2311 
2312          if (!get_pull_locs(inst->src[0], &index, &pull_index))
2313             continue;
2314 
2315          VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
2316                                     elk_imm_ud(index),
2317                                     elk_fs_reg() /* surface_handle */,
2318                                     inst->src[1],
2319                                     pull_index * 4, 4, 1);
2320          inst->remove(block);
2321 
2322          progress = true;
2323       }
2324    }
2325    invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
2326 
2327    return progress;
2328 }
2329 
2330 static uint64_t
2331 src_as_uint(const elk_fs_reg &src)
2332 {
2333    assert(src.file == IMM);
2334 
2335    switch (src.type) {
2336    case ELK_REGISTER_TYPE_W:
2337       return (uint64_t)(int16_t)(src.ud & 0xffff);
2338 
2339    case ELK_REGISTER_TYPE_UW:
2340       return (uint64_t)(uint16_t)(src.ud & 0xffff);
2341 
2342    case ELK_REGISTER_TYPE_D:
2343       return (uint64_t)src.d;
2344 
2345    case ELK_REGISTER_TYPE_UD:
2346       return (uint64_t)src.ud;
2347 
2348    case ELK_REGISTER_TYPE_Q:
2349       return src.d64;
2350 
2351    case ELK_REGISTER_TYPE_UQ:
2352       return src.u64;
2353 
2354    default:
2355       unreachable("Invalid integer type.");
2356    }
2357 }
2358 
2359 static elk_fs_reg
2360 elk_imm_for_type(uint64_t value, enum elk_reg_type type)
2361 {
2362    switch (type) {
2363    case ELK_REGISTER_TYPE_W:
2364       return elk_imm_w(value);
2365 
2366    case ELK_REGISTER_TYPE_UW:
2367       return elk_imm_uw(value);
2368 
2369    case ELK_REGISTER_TYPE_D:
2370       return elk_imm_d(value);
2371 
2372    case ELK_REGISTER_TYPE_UD:
2373       return elk_imm_ud(value);
2374 
2375    case ELK_REGISTER_TYPE_Q:
2376       return elk_imm_d(value);
2377 
2378    case ELK_REGISTER_TYPE_UQ:
2379       return elk_imm_uq(value);
2380 
2381    default:
2382       unreachable("Invalid integer type.");
2383    }
2384 }
2385 
2386 bool
2387 elk_fs_visitor::opt_algebraic()
2388 {
2389    bool progress = false;
2390 
2391    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
2392       switch (inst->opcode) {
2393       case ELK_OPCODE_MOV:
2394          if (!devinfo->has_64bit_float &&
2395              inst->dst.type == ELK_REGISTER_TYPE_DF) {
2396             assert(inst->dst.type == inst->src[0].type);
2397             assert(!inst->saturate);
2398             assert(!inst->src[0].abs);
2399             assert(!inst->src[0].negate);
2400             const elk::fs_builder ibld(this, block, inst);
2401 
2402             if (!inst->is_partial_write())
2403                ibld.emit_undef_for_dst(inst);
2404 
2405             ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_F, 1),
2406                      subscript(inst->src[0], ELK_REGISTER_TYPE_F, 1));
2407             ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_F, 0),
2408                      subscript(inst->src[0], ELK_REGISTER_TYPE_F, 0));
2409 
2410             inst->remove(block);
2411             progress = true;
2412          }
2413 
2414          if (!devinfo->has_64bit_int &&
2415              (inst->dst.type == ELK_REGISTER_TYPE_UQ ||
2416               inst->dst.type == ELK_REGISTER_TYPE_Q)) {
2417             assert(inst->dst.type == inst->src[0].type);
2418             assert(!inst->saturate);
2419             assert(!inst->src[0].abs);
2420             assert(!inst->src[0].negate);
2421             const elk::fs_builder ibld(this, block, inst);
2422 
2423             if (!inst->is_partial_write())
2424                ibld.emit_undef_for_dst(inst);
2425 
2426             ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 1),
2427                      subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 1));
2428             ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 0),
2429                      subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0));
2430 
2431             inst->remove(block);
2432             progress = true;
2433          }
2434 
2435          if ((inst->conditional_mod == ELK_CONDITIONAL_Z ||
2436               inst->conditional_mod == ELK_CONDITIONAL_NZ) &&
2437              inst->dst.is_null() &&
2438              (inst->src[0].abs || inst->src[0].negate)) {
2439             inst->src[0].abs = false;
2440             inst->src[0].negate = false;
2441             progress = true;
2442             break;
2443          }
2444 
2445          if (inst->src[0].file != IMM)
2446             break;
2447 
2448          if (inst->saturate) {
2449             /* Full mixed-type saturates don't happen.  However, we can end up
2450              * with things like:
2451              *
2452              *    mov.sat(8) g21<1>DF       -1F
2453              *
2454              * Other mixed-size-but-same-base-type cases may also be possible.
2455              */
2456             if (inst->dst.type != inst->src[0].type &&
2457                 inst->dst.type != ELK_REGISTER_TYPE_DF &&
2458                 inst->src[0].type != ELK_REGISTER_TYPE_F)
2459                assert(!"unimplemented: saturate mixed types");
2460 
2461             if (elk_saturate_immediate(inst->src[0].type,
2462                                        &inst->src[0].as_elk_reg())) {
2463                inst->saturate = false;
2464                progress = true;
2465             }
2466          }
2467          break;
2468 
2469       case ELK_OPCODE_MUL:
2470          if (inst->src[1].file != IMM)
2471             continue;
2472 
2473          if (elk_reg_type_is_floating_point(inst->src[1].type))
2474             break;
2475 
2476          /* From the BDW PRM, Vol 2a, "mul - Multiply":
2477           *
2478           *    "When multiplying integer datatypes, if src0 is DW and src1
2479           *    is W, irrespective of the destination datatype, the
2480           *    accumulator maintains full 48-bit precision."
2481           *    ...
2482           *    "When multiplying integer data types, if one of the sources
2483           *    is a DW, the resulting full precision data is stored in
2484           *    the accumulator."
2485           *
2486           * There are also similar notes in earlier PRMs.
2487           *
2488           * The MOV instruction can copy the bits of the source, but it
2489           * does not clear the higher bits of the accumulator. So, because
2490           * we might use the full accumulator in the MUL/MACH macro, we
2491           * shouldn't replace such MULs with MOVs.
2492           */
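         /* Concretely (hypothetical IR): a "mul acc0:D g4:D 1W" that is
          * part of a MUL/MACH macro must stay a MUL even though src1 is 1,
          * because the following MACH reads the full-precision product from
          * the accumulator, which a plain MOV would not have written.
          */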
2493          if ((elk_reg_type_to_size(inst->src[0].type) == 4 ||
2494               elk_reg_type_to_size(inst->src[1].type) == 4) &&
2495              (inst->dst.is_accumulator() ||
2496               inst->writes_accumulator_implicitly(devinfo)))
2497             break;
2498 
2499          /* a * 1.0 = a */
2500          if (inst->src[1].is_one()) {
2501             inst->opcode = ELK_OPCODE_MOV;
2502             inst->sources = 1;
2503             inst->src[1] = reg_undef;
2504             progress = true;
2505             break;
2506          }
2507 
2508          /* a * -1.0 = -a */
2509          if (inst->src[1].is_negative_one()) {
2510             inst->opcode = ELK_OPCODE_MOV;
2511             inst->sources = 1;
2512             inst->src[0].negate = !inst->src[0].negate;
2513             inst->src[1] = reg_undef;
2514             progress = true;
2515             break;
2516          }
2517 
2518          break;
2519       case ELK_OPCODE_ADD:
2520          if (inst->src[1].file != IMM)
2521             continue;
2522 
2523          if (elk_reg_type_is_integer(inst->src[1].type) &&
2524              inst->src[1].is_zero()) {
2525             inst->opcode = ELK_OPCODE_MOV;
2526             inst->sources = 1;
2527             inst->src[1] = reg_undef;
2528             progress = true;
2529             break;
2530          }
2531 
2532          if (inst->src[0].file == IMM) {
2533             assert(inst->src[0].type == ELK_REGISTER_TYPE_F);
2534             inst->opcode = ELK_OPCODE_MOV;
2535             inst->sources = 1;
2536             inst->src[0].f += inst->src[1].f;
2537             inst->src[1] = reg_undef;
2538             progress = true;
2539             break;
2540          }
2541          break;
2542 
2543       case ELK_OPCODE_AND:
2544          if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
2545             const uint64_t src0 = src_as_uint(inst->src[0]);
2546             const uint64_t src1 = src_as_uint(inst->src[1]);
2547 
2548             inst->opcode = ELK_OPCODE_MOV;
2549             inst->sources = 1;
2550             inst->src[0] = elk_imm_for_type(src0 & src1, inst->dst.type);
2551             inst->src[1] = reg_undef;
2552             progress = true;
2553             break;
2554          }
2555 
2556          break;
2557 
2558       case ELK_OPCODE_OR:
2559          if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
2560             const uint64_t src0 = src_as_uint(inst->src[0]);
2561             const uint64_t src1 = src_as_uint(inst->src[1]);
2562 
2563             inst->opcode = ELK_OPCODE_MOV;
2564             inst->sources = 1;
2565             inst->src[0] = elk_imm_for_type(src0 | src1, inst->dst.type);
2566             inst->src[1] = reg_undef;
2567             progress = true;
2568             break;
2569          }
2570 
2571          if (inst->src[0].equals(inst->src[1]) ||
2572              inst->src[1].is_zero()) {
2573             /* On Gfx8+, the OR instruction can have a source modifier that
2574              * performs logical not on the operand.  Cases of 'OR r0, ~r1, 0'
2575              * or 'OR r0, ~r1, ~r1' should become a NOT instead of a MOV.
2576              */
2577             if (inst->src[0].negate) {
2578                inst->opcode = ELK_OPCODE_NOT;
2579                inst->sources = 1;
2580                inst->src[0].negate = false;
2581             } else {
2582                inst->opcode = ELK_OPCODE_MOV;
2583                inst->sources = 1;
2584             }
2585             inst->src[1] = reg_undef;
2586             progress = true;
2587             break;
2588          }
2589          break;
2590       case ELK_OPCODE_CMP:
2591          if ((inst->conditional_mod == ELK_CONDITIONAL_Z ||
2592               inst->conditional_mod == ELK_CONDITIONAL_NZ) &&
2593              inst->src[1].is_zero() &&
2594              (inst->src[0].abs || inst->src[0].negate)) {
2595             inst->src[0].abs = false;
2596             inst->src[0].negate = false;
2597             progress = true;
2598             break;
2599          }
2600          break;
2601       case ELK_OPCODE_SEL:
2602          if (!devinfo->has_64bit_float &&
2603              !devinfo->has_64bit_int &&
2604              (inst->dst.type == ELK_REGISTER_TYPE_DF ||
2605               inst->dst.type == ELK_REGISTER_TYPE_UQ ||
2606               inst->dst.type == ELK_REGISTER_TYPE_Q)) {
2607             assert(inst->dst.type == inst->src[0].type);
2608             assert(!inst->saturate);
2609             assert(!inst->src[0].abs && !inst->src[0].negate);
2610             assert(!inst->src[1].abs && !inst->src[1].negate);
2611             const elk::fs_builder ibld(this, block, inst);
2612 
2613             if (!inst->is_partial_write())
2614                ibld.emit_undef_for_dst(inst);
2615 
2616             set_predicate(inst->predicate,
2617                           ibld.SEL(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 0),
2618                                    subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
2619                                    subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0)));
2620             set_predicate(inst->predicate,
2621                           ibld.SEL(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 1),
2622                                    subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 1),
2623                                    subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 1)));
2624 
2625             inst->remove(block);
2626             progress = true;
2627          }
2628          if (inst->src[0].equals(inst->src[1])) {
2629             inst->opcode = ELK_OPCODE_MOV;
2630             inst->sources = 1;
2631             inst->src[1] = reg_undef;
2632             inst->predicate = ELK_PREDICATE_NONE;
2633             inst->predicate_inverse = false;
2634             progress = true;
2635          } else if (inst->saturate && inst->src[1].file == IMM) {
2636             switch (inst->conditional_mod) {
2637             case ELK_CONDITIONAL_LE:
2638             case ELK_CONDITIONAL_L:
2639                switch (inst->src[1].type) {
2640                case ELK_REGISTER_TYPE_F:
2641                   if (inst->src[1].f >= 1.0f) {
2642                      inst->opcode = ELK_OPCODE_MOV;
2643                      inst->sources = 1;
2644                      inst->src[1] = reg_undef;
2645                      inst->conditional_mod = ELK_CONDITIONAL_NONE;
2646                      progress = true;
2647                   }
2648                   break;
2649                default:
2650                   break;
2651                }
2652                break;
2653             case ELK_CONDITIONAL_GE:
2654             case ELK_CONDITIONAL_G:
2655                switch (inst->src[1].type) {
2656                case ELK_REGISTER_TYPE_F:
2657                   if (inst->src[1].f <= 0.0f) {
2658                      inst->opcode = ELK_OPCODE_MOV;
2659                      inst->sources = 1;
2660                      inst->src[1] = reg_undef;
2661                      inst->conditional_mod = ELK_CONDITIONAL_NONE;
2662                      progress = true;
2663                   }
2664                   break;
2665                default:
2666                   break;
2667                }
2668             default:
2669                break;
2670             }
2671          }
2672          break;
2673       case ELK_OPCODE_MAD:
2674          if (inst->src[0].type != ELK_REGISTER_TYPE_F ||
2675              inst->src[1].type != ELK_REGISTER_TYPE_F ||
2676              inst->src[2].type != ELK_REGISTER_TYPE_F)
2677             break;
2678          if (inst->src[1].is_one()) {
2679             inst->opcode = ELK_OPCODE_ADD;
2680             inst->sources = 2;
2681             inst->src[1] = inst->src[2];
2682             inst->src[2] = reg_undef;
2683             progress = true;
2684          } else if (inst->src[2].is_one()) {
2685             inst->opcode = ELK_OPCODE_ADD;
2686             inst->sources = 2;
2687             inst->src[2] = reg_undef;
2688             progress = true;
2689          }
2690          break;
2691       case ELK_OPCODE_SHL:
2692          if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
2693             /* It's not currently possible to generate this, and this constant
2694              * folding does not handle it.
2695              */
2696             assert(!inst->saturate);
2697 
2698             elk_fs_reg result;
2699 
2700             switch (type_sz(inst->src[0].type)) {
2701             case 2:
2702                result = elk_imm_uw(0x0ffff & (inst->src[0].ud << (inst->src[1].ud & 0x1f)));
2703                break;
2704             case 4:
2705                result = elk_imm_ud(inst->src[0].ud << (inst->src[1].ud & 0x1f));
2706                break;
2707             case 8:
2708                result = elk_imm_uq(inst->src[0].u64 << (inst->src[1].ud & 0x3f));
2709                break;
2710             default:
2711                /* Just in case a future platform re-enables B or UB types. */
2712                unreachable("Invalid source size.");
2713             }
2714 
2715             inst->opcode = ELK_OPCODE_MOV;
2716             inst->src[0] = retype(result, inst->dst.type);
2717             inst->src[1] = reg_undef;
2718             inst->sources = 1;
2719 
2720             progress = true;
2721          }
2722          break;
2723 
2724       case ELK_SHADER_OPCODE_BROADCAST:
2725          if (is_uniform(inst->src[0])) {
2726             inst->opcode = ELK_OPCODE_MOV;
2727             inst->sources = 1;
2728             inst->force_writemask_all = true;
2729             progress = true;
2730          } else if (inst->src[1].file == IMM) {
2731             inst->opcode = ELK_OPCODE_MOV;
2732             /* It's possible that the selected component will be too large and
2733              * overflow the register.  This can happen if someone does a
2734              * readInvocation() from GLSL or SPIR-V and provides an OOB
2735              * invocationIndex.  If this happens and we somehow manage
2736              * to constant fold it in and get here, then component() may cause
2737              * us to start reading outside of the VGRF which will lead to an
2738              * assert later.  Instead, just let it wrap around if it goes over
2739              * exec_size.
2740              */
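            /* E.g. (hypothetically) a SIMD16 BROADCAST whose folded index
             * immediate is 35 reads component 35 & (16 - 1) == 3 rather
             * than running off the end of the VGRF.
             */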
2741             const unsigned comp = inst->src[1].ud & (inst->exec_size - 1);
2742             inst->src[0] = component(inst->src[0], comp);
2743             inst->sources = 1;
2744             inst->force_writemask_all = true;
2745             progress = true;
2746          }
2747          break;
2748 
2749       case ELK_SHADER_OPCODE_SHUFFLE:
2750          if (is_uniform(inst->src[0])) {
2751             inst->opcode = ELK_OPCODE_MOV;
2752             inst->sources = 1;
2753             progress = true;
2754          } else if (inst->src[1].file == IMM) {
2755             inst->opcode = ELK_OPCODE_MOV;
2756             inst->src[0] = component(inst->src[0],
2757                                      inst->src[1].ud);
2758             inst->sources = 1;
2759             progress = true;
2760          }
2761          break;
2762 
2763       default:
2764 	 break;
2765       }
2766 
2767       /* Ensure that the correct source has the immediate value. 2-source
2768        * instructions must have the immediate in src[1]. On Gfx12 and later,
2769        * some 3-source instructions can have the immediate in src[0] or
2770        * src[2]. It's complicated, so don't mess with 3-source instructions
2771        * here.
2772        */
2773       if (progress && inst->sources == 2 && inst->is_commutative()) {
2774          if (inst->src[0].file == IMM) {
2775             elk_fs_reg tmp = inst->src[1];
2776             inst->src[1] = inst->src[0];
2777             inst->src[0] = tmp;
2778          }
2779       }
2780    }
2781 
2782    if (progress)
2783       invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
2784                           DEPENDENCY_INSTRUCTION_DETAIL);
2785 
2786    return progress;
2787 }
2788 
2789 static unsigned
2790 load_payload_sources_read_for_size(elk_fs_inst *lp, unsigned size_read)
2791 {
2792    assert(lp->opcode == ELK_SHADER_OPCODE_LOAD_PAYLOAD);
2793    assert(size_read >= lp->header_size * REG_SIZE);
2794 
2795    unsigned i;
2796    unsigned size = lp->header_size * REG_SIZE;
2797    for (i = lp->header_size; size < size_read && i < lp->sources; i++)
2798       size += lp->exec_size * type_sz(lp->src[i].type);
2799 
2800    /* The size read must cover exactly a whole number of leading sources. */
2801    assert(size == size_read);
2802    return i;
2803 }
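/* For example (hypothetical payload): a LOAD_PAYLOAD with header_size == 1
 * and four SIMD8 float sources, queried with size_read == 96 bytes (three
 * GRFs), accumulates 32 + 32 + 32 bytes and returns 3: the header plus the
 * first two data sources.
 */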
2804 
2805 /**
2806  * Optimize sample messages that have constant zero values for the trailing
2807  * parameters. We can just reduce the message length for these
2808  * instructions instead of reserving registers for them.  Trailing parameters
2809  * that aren't sent default to zero anyway. This will cause the dead code
2810  * eliminator to remove the MOV instruction that would otherwise be emitted to
2811  * set up the zero value.
2812  */
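/* Hypothetical example: a SIMD8 sampler SEND whose LOAD_PAYLOAD is
 * { header, u, v, 0.0f, 0.0f } has its two trailing parameters recognized
 * as zero, so mlen shrinks by two GRFs and dead code elimination can later
 * drop the MOVs that would have written those zeros.
 */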
2813 bool
2814 elk_fs_visitor::opt_zero_samples()
2815 {
2816    /* Implementation supports only SENDs, so applicable to Gfx7+ only. */
2817    assert(devinfo->ver >= 7);
2818 
2819    bool progress = false;
2820 
2821    foreach_block_and_inst(block, elk_fs_inst, send, cfg) {
2822       if (send->opcode != ELK_SHADER_OPCODE_SEND ||
2823           send->sfid != ELK_SFID_SAMPLER)
2824          continue;
2825 
2826       /* Wa_14012688258:
2827        *
2828        * Don't trim zeros at the end of payload for sample operations
2829        * in cube and cube arrays.
2830        */
2831       if (send->keep_payload_trailing_zeros)
2832          continue;
2833 
2834       /* This pass works on SENDs before splitting. */
2835       if (send->ex_mlen > 0)
2836          continue;
2837 
2838       elk_fs_inst *lp = (elk_fs_inst *) send->prev;
2839 
2840       if (lp->is_head_sentinel() || lp->opcode != ELK_SHADER_OPCODE_LOAD_PAYLOAD)
2841          continue;
2842 
2843       /* How much of the payload is actually read by this SEND. */
2844       const unsigned params =
2845          load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);
2846 
2847       /* We don't want to remove the message header or the first parameter.
2848        * Removing the first parameter is not allowed, see the Haswell PRM
2849        * volume 7, page 149:
2850        *
2851        *     "Parameter 0 is required except for the sampleinfo message, which
2852        *      has no parameter 0"
2853        */
2854       const unsigned first_param_idx = lp->header_size;
2855       unsigned zero_size = 0;
2856       for (unsigned i = params - 1; i > first_param_idx; i--) {
2857          if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
2858             break;
2859          zero_size += lp->exec_size * type_sz(lp->src[i].type) * lp->dst.stride;
2860       }
2861 
2862       const unsigned zero_len = zero_size / (reg_unit(devinfo) * REG_SIZE);
2863       if (zero_len > 0) {
2864          send->mlen -= zero_len;
2865          progress = true;
2866       }
2867    }
2868 
2869    if (progress)
2870       invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
2871 
2872    return progress;
2873 }
2874 
2875 /**
2876  * Opportunistically split SEND message payloads.
2877  *
2878  * Gfx9+ supports "split" SEND messages, which take two payloads that are
2879  * implicitly concatenated.  If we find a SEND message with a single payload,
2880  * we can split that payload in two.  This results in smaller contiguous
2881  * register blocks for us to allocate.  But it can help beyond that, too.
2882  *
2883  * We try to split a LOAD_PAYLOAD between sources which change registers.
2884  * For example, a sampler message often contains an x/y/z coordinate that may
2885  * already be in a contiguous VGRF, combined with an LOD, shadow comparator,
2886  * or array index, which comes from elsewhere.  In this case, the first few
2887  * sources will be different offsets of the same VGRF, then a later source
2888  * will be a different VGRF.  So we split there, possibly eliminating the
2889  * payload concatenation altogether.
2890  */
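/*
 * Sketch of the transformation (register numbers hypothetical):
 *
 *    load_payload vgrf10, vgrf4+0, vgrf4+1, vgrf4+2, vgrf7
 *    send ..., vgrf10            (mlen 4)
 *
 * becomes
 *
 *    load_payload vgrf11, vgrf4+0, vgrf4+1, vgrf4+2
 *    load_payload vgrf12, vgrf7
 *    send ..., vgrf11, vgrf12    (mlen 3, ex_mlen 1)
 *
 * after which register coalescing can often eliminate the remaining copies
 * entirely.
 */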
2891 bool
2892 elk_fs_visitor::opt_split_sends()
2893 {
2894    if (devinfo->ver < 9)
2895       return false;
2896 
2897    bool progress = false;
2898 
2899    foreach_block_and_inst(block, elk_fs_inst, send, cfg) {
2900       if (send->opcode != ELK_SHADER_OPCODE_SEND ||
2901           send->mlen <= reg_unit(devinfo) || send->ex_mlen > 0)
2902          continue;
2903 
2904       assert(send->src[2].file == VGRF);
2905 
2906       /* Currently don't split sends that reuse a previously used payload. */
2907       elk_fs_inst *lp = (elk_fs_inst *) send->prev;
2908 
2909       if (lp->is_head_sentinel() || lp->opcode != ELK_SHADER_OPCODE_LOAD_PAYLOAD)
2910          continue;
2911 
2912       if (lp->dst.file != send->src[2].file || lp->dst.nr != send->src[2].nr)
2913          continue;
2914 
2915       /* Split either after the header (if present), or when consecutive
2916        * sources switch from one VGRF to a different one.
2917        */
2918       unsigned mid = lp->header_size;
2919       if (mid == 0) {
2920          for (mid = 1; mid < lp->sources; mid++) {
2921             if (lp->src[mid].file == BAD_FILE)
2922                continue;
2923 
2924             if (lp->src[0].file != lp->src[mid].file ||
2925                 lp->src[0].nr != lp->src[mid].nr)
2926                break;
2927          }
2928       }
2929 
2930       /* SEND mlen might be smaller than what LOAD_PAYLOAD provides, so
2931        * find out how many sources from the payload it really needs.
2932        */
2933       const unsigned end =
2934          load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);
2935 
2936       /* Nothing to split. */
2937       if (end <= mid)
2938          continue;
2939 
2940       const fs_builder ibld(this, block, lp);
2941       elk_fs_inst *lp1 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[0], mid, lp->header_size);
2942       elk_fs_inst *lp2 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[mid], end - mid, 0);
2943 
2944       assert(lp1->size_written % REG_SIZE == 0);
2945       assert(lp2->size_written % REG_SIZE == 0);
2946       assert((lp1->size_written + lp2->size_written) / REG_SIZE == send->mlen);
2947 
2948       lp1->dst = elk_fs_reg(VGRF, alloc.allocate(lp1->size_written / REG_SIZE), lp1->dst.type);
2949       lp2->dst = elk_fs_reg(VGRF, alloc.allocate(lp2->size_written / REG_SIZE), lp2->dst.type);
2950 
2951       send->resize_sources(4);
2952       send->src[2] = lp1->dst;
2953       send->src[3] = lp2->dst;
2954       send->ex_mlen = lp2->size_written / REG_SIZE;
2955       send->mlen -= send->ex_mlen;
2956 
2957       progress = true;
2958    }
2959 
2960    if (progress)
2961       invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
2962 
2963    return progress;
2964 }
2965 
2966 /**
2967  * Remove redundant or useless halts.
2968  *
2969  * For example, we can eliminate halts in the following sequence:
2970  *
2971  * halt        (redundant with the next halt)
2972  * halt        (useless; jumps to the next instruction)
2973  * halt-target
2974  */
2975 bool
2976 elk_fs_visitor::opt_redundant_halt()
2977 {
2978    bool progress = false;
2979 
2980    unsigned halt_count = 0;
2981    elk_fs_inst *halt_target = NULL;
2982    elk_bblock_t *halt_target_block = NULL;
2983    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
2984       if (inst->opcode == ELK_OPCODE_HALT)
2985          halt_count++;
2986 
2987       if (inst->opcode == ELK_SHADER_OPCODE_HALT_TARGET) {
2988          halt_target = inst;
2989          halt_target_block = block;
2990          break;
2991       }
2992    }
2993 
2994    if (!halt_target) {
2995       assert(halt_count == 0);
2996       return false;
2997    }
2998 
2999    /* Delete any HALTs immediately before the halt target. */
3000    for (elk_fs_inst *prev = (elk_fs_inst *) halt_target->prev;
3001         !prev->is_head_sentinel() && prev->opcode == ELK_OPCODE_HALT;
3002         prev = (elk_fs_inst *) halt_target->prev) {
3003       prev->remove(halt_target_block);
3004       halt_count--;
3005       progress = true;
3006    }
3007 
3008    if (halt_count == 0) {
3009       halt_target->remove(halt_target_block);
3010       progress = true;
3011    }
3012 
3013    if (progress)
3014       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3015 
3016    return progress;
3017 }
3018 
3019 /**
3020  * Compute a bitmask with GRF granularity with a bit set for each GRF starting
3021  * from \p r.offset which overlaps the region starting at \p s.offset and
3022  * spanning \p ds bytes.
3023  */
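/*
 * Worked example (assuming 32-byte GRFs): with reg_offset(s) - reg_offset(r)
 * == 40 and ds == 48, the region covers bytes 40..87 relative to r, i.e.
 * GRFs 1 and 2, so the function returns 0b110.
 */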
3024 static inline unsigned
3025 mask_relative_to(const elk_fs_reg &r, const elk_fs_reg &s, unsigned ds)
3026 {
3027    const int rel_offset = reg_offset(s) - reg_offset(r);
3028    const int shift = rel_offset / REG_SIZE;
3029    const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE);
3030    assert(reg_space(r) == reg_space(s) &&
3031           shift >= 0 && shift < int(8 * sizeof(unsigned)));
3032    return ((1 << n) - 1) << shift;
3033 }
3034 
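/*
 * compute_to_mrf: fold a final "mov MRF, VGRF" into the instruction(s) that
 * produced the VGRF when that MOV is the GRF's last use, e.g. (sketch,
 * registers hypothetical):
 *
 *    add vgrf7, vgrf3, vgrf4            add m4, vgrf3, vgrf4
 *    mov m4, vgrf7               ->
 *
 * Only relevant on Gen4-6, which still have MRFs.
 */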
3035 bool
3036 elk_fs_visitor::compute_to_mrf()
3037 {
3038    bool progress = false;
3039    int next_ip = 0;
3040 
3041    /* No MRFs on Gen >= 7. */
3042    if (devinfo->ver >= 7)
3043       return false;
3044 
3045    const fs_live_variables &live = live_analysis.require();
3046 
3047    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
3048       int ip = next_ip;
3049       next_ip++;
3050 
3051       if (inst->opcode != ELK_OPCODE_MOV ||
3052 	  inst->is_partial_write() ||
3053 	  inst->dst.file != MRF || inst->src[0].file != VGRF ||
3054 	  inst->dst.type != inst->src[0].type ||
3055 	  inst->src[0].abs || inst->src[0].negate ||
3056           !inst->src[0].is_contiguous() ||
3057           inst->src[0].offset % REG_SIZE != 0)
3058 	 continue;
3059 
3060       /* Can't compute-to-MRF this GRF if someone else was going to
3061        * read it later.
3062        */
3063       if (live.vgrf_end[inst->src[0].nr] > ip)
3064 	 continue;
3065 
3066       /* Found a move of a GRF to a MRF.  Let's see if we can go rewrite the
3067        * things that computed the value of all GRFs of the source region.  The
3068        * regs_left bitset keeps track of the registers we haven't yet found a
3069        * generating instruction for.
3070        */
3071       unsigned regs_left = (1 << regs_read(inst, 0)) - 1;
3072 
3073       foreach_inst_in_block_reverse_starting_from(elk_fs_inst, scan_inst, inst) {
3074          if (regions_overlap(scan_inst->dst, scan_inst->size_written,
3075                              inst->src[0], inst->size_read(0))) {
3076 	    /* Found the last thing to write our reg we want to turn
3077 	     * into a compute-to-MRF.
3078 	     */
3079 
3080 	    /* If this one instruction didn't populate all the
3081 	     * channels, bail.  We might be able to rewrite everything
3082 	     * that writes that reg, but it would require smarter
3083 	     * tracking.
3084 	     */
3085 	    if (scan_inst->is_partial_write())
3086 	       break;
3087 
3088             /* Handling things not fully contained in the source of the copy
3089              * would need us to understand coalescing out more than one MOV at
3090              * a time.
3091              */
3092             if (!region_contained_in(scan_inst->dst, scan_inst->size_written,
3093                                      inst->src[0], inst->size_read(0)))
3094                break;
3095 
3096 	    /* SEND instructions can't have MRF as a destination. */
3097 	    if (scan_inst->mlen)
3098 	       break;
3099 
3100 	    if (devinfo->ver == 6) {
3101 	       /* gfx6 math instructions must have the destination be
3102 		* GRF, so no compute-to-MRF for them.
3103 		*/
3104 	       if (scan_inst->is_math()) {
3105 		  break;
3106 	       }
3107 	    }
3108 
3109             /* Clear the bits for any registers this instruction overwrites. */
3110             regs_left &= ~mask_relative_to(
3111                inst->src[0], scan_inst->dst, scan_inst->size_written);
3112             if (!regs_left)
3113                break;
3114 	 }
3115 
3116 	 /* We don't handle control flow here.  Most computation of
3117 	  * values that end up in MRFs is shortly before the MRF
3118 	  * write anyway.
3119 	  */
3120 	 if (block->start() == scan_inst)
3121 	    break;
3122 
3123 	 /* You can't read from an MRF, so if someone else reads our
3124 	  * MRF's source GRF that we wanted to rewrite, that stops us.
3125 	  */
3126 	 bool interfered = false;
3127 	 for (int i = 0; i < scan_inst->sources; i++) {
3128             if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i),
3129                                 inst->src[0], inst->size_read(0))) {
3130 	       interfered = true;
3131 	    }
3132 	 }
3133 	 if (interfered)
3134 	    break;
3135 
3136          if (regions_overlap(scan_inst->dst, scan_inst->size_written,
3137                              inst->dst, inst->size_written)) {
3138 	    /* If somebody else writes our MRF here, we can't
3139 	     * compute-to-MRF before that.
3140 	     */
3141             break;
3142          }
3143 
3144          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 &&
3145              regions_overlap(elk_fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE,
3146                              inst->dst, inst->size_written)) {
3147 	    /* Found a SEND instruction, which means that there are
3148 	     * live values in MRFs from base_mrf to base_mrf +
3149 	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
3150 	     * above it.
3151 	     */
3152             break;
3153          }
3154       }
3155 
3156       if (regs_left)
3157          continue;
3158 
3159       /* Found all generating instructions of our MRF's source value, so it
3160        * should be safe to rewrite them to point to the MRF directly.
3161        */
3162       regs_left = (1 << regs_read(inst, 0)) - 1;
3163 
3164       foreach_inst_in_block_reverse_starting_from(elk_fs_inst, scan_inst, inst) {
3165          if (regions_overlap(scan_inst->dst, scan_inst->size_written,
3166                              inst->src[0], inst->size_read(0))) {
3167             /* Clear the bits for any registers this instruction overwrites. */
3168             regs_left &= ~mask_relative_to(
3169                inst->src[0], scan_inst->dst, scan_inst->size_written);
3170 
3171             const unsigned rel_offset = reg_offset(scan_inst->dst) -
3172                                         reg_offset(inst->src[0]);
3173 
3174             if (inst->dst.nr & ELK_MRF_COMPR4) {
3175                /* Apply the same address transformation done by the hardware
3176                 * for COMPR4 MRF writes.
3177                 */
3178                assert(rel_offset < 2 * REG_SIZE);
3179                scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4;
3180 
3181                /* Clear the COMPR4 bit if the generating instruction is not
3182                 * compressed.
3183                 */
3184                if (scan_inst->size_written < 2 * REG_SIZE)
3185                   scan_inst->dst.nr &= ~ELK_MRF_COMPR4;
3186 
3187             } else {
3188                /* Calculate the MRF number the result of this instruction is
3189                 * ultimately written to.
3190                 */
3191                scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE;
3192             }
3193 
3194             scan_inst->dst.file = MRF;
3195             scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE;
3196             scan_inst->saturate |= inst->saturate;
3197             if (!regs_left)
3198                break;
3199          }
3200       }
3201 
3202       assert(!regs_left);
3203       inst->remove(block);
3204       progress = true;
3205    }
3206 
3207    if (progress)
3208       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3209 
3210    return progress;
3211 }
3212 
3213 /**
3214  * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
3215  * flow.  We could probably do better here with some form of divergence
3216  * analysis.
3217  */
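/*
 * Outside of control flow every channel that was enabled at dispatch is still
 * enabled, so with packed dispatch channel 0 is guaranteed to be live and the
 * FIND_LIVE_CHANNEL result is simply 0 (see the MOV rewrite below).
 */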
3218 bool
3219 elk_fs_visitor::eliminate_find_live_channel()
3220 {
3221    bool progress = false;
3222    unsigned depth = 0;
3223 
3224    if (!elk_stage_has_packed_dispatch(devinfo, stage, max_polygons,
3225                                       stage_prog_data)) {
3226       /* The optimization below assumes that channel zero is live on thread
3227        * dispatch, which may not be the case if the fixed function dispatches
3228        * threads sparsely.
3229        */
3230       return false;
3231    }
3232 
3233    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
3234       switch (inst->opcode) {
3235       case ELK_OPCODE_IF:
3236       case ELK_OPCODE_DO:
3237          depth++;
3238          break;
3239 
3240       case ELK_OPCODE_ENDIF:
3241       case ELK_OPCODE_WHILE:
3242          depth--;
3243          break;
3244 
3245       case ELK_OPCODE_HALT:
3246          /* This can potentially make control flow non-uniform until the end
3247           * of the program.
3248           */
3249          goto out;
3250 
3251       case ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL:
3252          if (depth == 0) {
3253             inst->opcode = ELK_OPCODE_MOV;
3254             inst->src[0] = elk_imm_ud(0u);
3255             inst->sources = 1;
3256             inst->force_writemask_all = true;
3257             progress = true;
3258          }
3259          break;
3260 
3261       default:
3262          break;
3263       }
3264    }
3265 
3266 out:
3267    if (progress)
3268       invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
3269 
3270    return progress;
3271 }
3272 
3273 /**
3274  * Once we've generated code, try to convert normal ELK_FS_OPCODE_FB_WRITE
3275  * instructions to ELK_FS_OPCODE_REP_FB_WRITE.
3276  */
3277 void
3278 elk_fs_visitor::emit_repclear_shader()
3279 {
3280    elk_wm_prog_key *key = (elk_wm_prog_key*) this->key;
3281    elk_fs_inst *write = NULL;
3282 
3283    assert(uniforms == 0);
3284    assume(key->nr_color_regions > 0);
3285 
3286    elk_fs_reg color_output, header;
3287    if (devinfo->ver >= 7) {
3288       color_output = retype(elk_vec4_grf(127, 0), ELK_REGISTER_TYPE_UD);
3289       header = retype(elk_vec8_grf(125, 0), ELK_REGISTER_TYPE_UD);
3290    } else {
3291       color_output = retype(elk_vec4_reg(MRF, 2, 0), ELK_REGISTER_TYPE_UD);
3292       header = retype(elk_vec8_reg(MRF, 0, 0), ELK_REGISTER_TYPE_UD);
3293    }
3294 
3295    /* We pass the clear color as a flat input.  Copy it to the output. */
3296    elk_fs_reg color_input =
3297       elk_reg(ELK_GENERAL_REGISTER_FILE, 2, 3, 0, 0, ELK_REGISTER_TYPE_UD,
3298               ELK_VERTICAL_STRIDE_8, ELK_WIDTH_2, ELK_HORIZONTAL_STRIDE_4,
3299               ELK_SWIZZLE_XYZW, WRITEMASK_XYZW);
3300 
3301    const fs_builder bld = fs_builder(this).at_end();
3302    bld.exec_all().group(4, 0).MOV(color_output, color_input);
3303 
3304    if (key->nr_color_regions > 1) {
3305       /* Copy g0..g1 as the message header */
3306       bld.exec_all().group(16, 0)
3307          .MOV(header, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
3308    }
3309 
3310    for (int i = 0; i < key->nr_color_regions; ++i) {
3311       if (i > 0)
3312          bld.exec_all().group(1, 0).MOV(component(header, 2), elk_imm_ud(i));
3313 
3314       if (devinfo->ver >= 7) {
3315          write = bld.emit(ELK_SHADER_OPCODE_SEND);
3316          write->resize_sources(3);
3317          write->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
3318          write->src[0] = elk_imm_ud(0);
3319          write->src[1] = elk_imm_ud(0);
3320          write->src[2] = i == 0 ? color_output : header;
3321          write->check_tdr = true;
3322          write->send_has_side_effects = true;
3323          write->desc = elk_fb_write_desc(devinfo, i,
3324             ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED,
3325             i == key->nr_color_regions - 1, false);
3326       } else {
3327          write = bld.emit(ELK_FS_OPCODE_REP_FB_WRITE);
3328          write->target = i;
3329          write->base_mrf = i == 0 ? color_output.nr : header.nr;
3330       }
3331 
3332       /* We can use a headerless message for the first render target */
3333       write->header_size = i == 0 ? 0 : 2;
3334       write->mlen = 1 + write->header_size;
3335    }
3336    write->eot = true;
3337    write->last_rt = true;
3338 
3339    calculate_cfg();
3340 
3341    this->first_non_payload_grf = payload().num_regs;
3342 }
3343 
3344 /**
3345  * Walks through basic blocks, looking for repeated MRF writes and
3346  * removing the later ones.
3347  */
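/*
 * E.g. (hypothetical registers): given two identical "mov m3, vgrf5"
 * instructions with no intervening write to m3 or vgrf5 and matching
 * predication/saturate/exec size, the second MOV is redundant and removed.
 */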
3348 bool
3349 elk_fs_visitor::remove_duplicate_mrf_writes()
3350 {
3351    elk_fs_inst *last_mrf_move[ELK_MAX_MRF(devinfo->ver)];
3352    bool progress = false;
3353 
3354    /* Need to update the MRF tracking for compressed instructions. */
3355    if (dispatch_width >= 16)
3356       return false;
3357 
3358    memset(last_mrf_move, 0, sizeof(last_mrf_move));
3359 
3360    foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
3361       if (inst->is_control_flow()) {
3362 	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3363       }
3364 
3365       if (inst->opcode == ELK_OPCODE_MOV &&
3366 	  inst->dst.file == MRF) {
3367          elk_fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
3368 	 if (prev_inst && prev_inst->opcode == ELK_OPCODE_MOV &&
3369              inst->dst.equals(prev_inst->dst) &&
3370              inst->src[0].equals(prev_inst->src[0]) &&
3371              inst->saturate == prev_inst->saturate &&
3372              inst->predicate == prev_inst->predicate &&
3373              inst->conditional_mod == prev_inst->conditional_mod &&
3374              inst->exec_size == prev_inst->exec_size) {
3375 	    inst->remove(block);
3376 	    progress = true;
3377 	    continue;
3378 	 }
3379       }
3380 
3381       /* Clear out the last-write records for MRFs that were overwritten. */
3382       if (inst->dst.file == MRF) {
3383          last_mrf_move[inst->dst.nr] = NULL;
3384       }
3385 
3386       if (inst->mlen > 0 && inst->base_mrf != -1) {
3387 	 /* Found a SEND instruction, which will include two or fewer
3388 	  * implied MRF writes.  We could do better here.
3389 	  */
3390 	 for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
3391 	    last_mrf_move[inst->base_mrf + i] = NULL;
3392 	 }
3393       }
3394 
3395       /* Clear out any MRF move records whose sources got overwritten. */
3396       for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3397          if (last_mrf_move[i] &&
3398              regions_overlap(inst->dst, inst->size_written,
3399                              last_mrf_move[i]->src[0],
3400                              last_mrf_move[i]->size_read(0))) {
3401             last_mrf_move[i] = NULL;
3402          }
3403       }
3404 
3405       if (inst->opcode == ELK_OPCODE_MOV &&
3406 	  inst->dst.file == MRF &&
3407 	  inst->src[0].file != ARF &&
3408 	  !inst->is_partial_write()) {
3409          last_mrf_move[inst->dst.nr] = inst;
3410       }
3411    }
3412 
3413    if (progress)
3414       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3415 
3416    return progress;
3417 }
3418 
3419 /**
3420  * A rounding mode is emitted for each conversion instruction, but on the
3421  * hardware it is persistent state.  So once it has been set, we don't need
3422  * to set it again for subsequent conversions that use the same mode.
3423  *
3424  * This is useful for vector/matrix conversions, as setting the
3425  * mode once is enough for the full vector/matrix.
3426  */
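/*
 * For example (hypothetical sequence), if a basic block contains several
 * RND_MODE instructions that all select RTNE for a series of conversions,
 * only the first one that actually changes the mode is kept; the rest are
 * removed.
 */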
3427 bool
3428 elk_fs_visitor::remove_extra_rounding_modes()
3429 {
3430    bool progress = false;
3431    unsigned execution_mode = this->nir->info.float_controls_execution_mode;
3432 
3433    elk_rnd_mode base_mode = ELK_RND_MODE_UNSPECIFIED;
3434    if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
3435         FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
3436         FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
3437        execution_mode)
3438       base_mode = ELK_RND_MODE_RTNE;
3439    if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
3440         FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
3441         FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
3442        execution_mode)
3443       base_mode = ELK_RND_MODE_RTZ;
3444 
3445    foreach_block (block, cfg) {
3446       elk_rnd_mode prev_mode = base_mode;
3447 
3448       foreach_inst_in_block_safe (elk_fs_inst, inst, block) {
3449          if (inst->opcode == ELK_SHADER_OPCODE_RND_MODE) {
3450             assert(inst->src[0].file == ELK_IMMEDIATE_VALUE);
3451             const elk_rnd_mode mode = (elk_rnd_mode) inst->src[0].d;
3452             if (mode == prev_mode) {
3453                inst->remove(block);
3454                progress = true;
3455             } else {
3456                prev_mode = mode;
3457             }
3458          }
3459       }
3460    }
3461 
3462    if (progress)
3463       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3464 
3465    return progress;
3466 }
3467 
3468 static void
3469 clear_deps_for_inst_src(elk_fs_inst *inst, bool *deps, int first_grf, int grf_len)
3470 {
3471    /* Clear the flag for registers that actually got read (as expected). */
3472    for (int i = 0; i < inst->sources; i++) {
3473       int grf;
3474       if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) {
3475          grf = inst->src[i].nr;
3476       } else {
3477          continue;
3478       }
3479 
3480       if (grf >= first_grf &&
3481           grf < first_grf + grf_len) {
3482          deps[grf - first_grf] = false;
3483          if (inst->exec_size == 16)
3484             deps[grf - first_grf + 1] = false;
3485       }
3486    }
3487 }
3488 
3489 /**
3490  * Implements this workaround for the original 965:
3491  *
3492  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3493  *      check for post destination dependencies on this instruction, software
3494  *      must ensure that there is no destination hazard for the case of ‘write
3495  *      followed by a posted write’ shown in the following example.
3496  *
3497  *      1. mov r3 0
3498  *      2. send r3.xy <rest of send instruction>
3499  *      3. mov r2 r3
3500  *
3501  *      Due to no post-destination dependency check on the ‘send’, the above
3502  *      code sequence could have two instructions (1 and 2) in flight at the
3503  *      same time that both consider ‘r3’ as the target of their final writes."
3504  */
3505 void
3506 elk_fs_visitor::insert_gfx4_pre_send_dependency_workarounds(elk_bblock_t *block,
3507                                                         elk_fs_inst *inst)
3508 {
3509    int write_len = regs_written(inst);
3510    int first_write_grf = inst->dst.nr;
3511    bool needs_dep[ELK_MAX_MRF(devinfo->ver)];
3512    assert(write_len < (int)sizeof(needs_dep) - 1);
3513 
3514    memset(needs_dep, false, sizeof(needs_dep));
3515    memset(needs_dep, true, write_len);
3516 
3517    clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3518 
3519    /* Walk backwards looking for writes to registers we're writing which
3520     * aren't read since being written.  If we hit the start of the program,
3521     * we assume that there are no outstanding dependencies on entry to the
3522     * program.
3523     */
3524    foreach_inst_in_block_reverse_starting_from(elk_fs_inst, scan_inst, inst) {
3525       /* If we hit control flow, assume that there *are* outstanding
3526        * dependencies, and force their cleanup before our instruction.
3527        */
3528       if (block->start() == scan_inst && block->num != 0) {
3529          for (int i = 0; i < write_len; i++) {
3530             if (needs_dep[i])
3531                DEP_RESOLVE_MOV(fs_builder(this, block, inst),
3532                                first_write_grf + i);
3533          }
3534          return;
3535       }
3536 
3537       /* We insert our reads as late as possible on the assumption that any
3538        * instruction but a MOV that might have left us an outstanding
3539        * dependency has more latency than a MOV.
3540        */
3541       if (scan_inst->dst.file == VGRF) {
3542          for (unsigned i = 0; i < regs_written(scan_inst); i++) {
3543             int reg = scan_inst->dst.nr + i;
3544 
3545             if (reg >= first_write_grf &&
3546                 reg < first_write_grf + write_len &&
3547                 needs_dep[reg - first_write_grf]) {
3548                DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
3549                needs_dep[reg - first_write_grf] = false;
3550                if (scan_inst->exec_size == 16)
3551                   needs_dep[reg - first_write_grf + 1] = false;
3552             }
3553          }
3554       }
3555 
3556       /* Clear the flag for registers that actually got read (as expected). */
3557       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3558 
3559       /* Continue the loop only if we haven't resolved all the dependencies */
3560       int i;
3561       for (i = 0; i < write_len; i++) {
3562          if (needs_dep[i])
3563             break;
3564       }
3565       if (i == write_len)
3566          return;
3567    }
3568 }
3569 
3570 /**
3571  * Implements this workaround for the original 965:
3572  *
3573  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
3574  *      used as a destination register until after it has been sourced by an
3575  *      instruction with a different destination register."
3576  */
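/*
 * E.g. (hypothetical registers):
 *
 *    1. send r3 <rest of send instruction>
 *    2. mov  r3 r4
 *
 * Instruction 2 re-uses r3 as a destination before anything has read it, so
 * a dependency-resolving MOV that sources r3 (DEP_RESOLVE_MOV) is inserted
 * before it.
 */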
3577 void
3578 elk_fs_visitor::insert_gfx4_post_send_dependency_workarounds(elk_bblock_t *block, elk_fs_inst *inst)
3579 {
3580    int write_len = regs_written(inst);
3581    unsigned first_write_grf = inst->dst.nr;
3582    bool needs_dep[ELK_MAX_MRF(devinfo->ver)];
3583    assert(write_len < (int)sizeof(needs_dep) - 1);
3584 
3585    memset(needs_dep, false, sizeof(needs_dep));
3586    memset(needs_dep, true, write_len);
3587    /* Walk forwards looking for writes to registers we're writing which aren't
3588     * read before being written.
3589     */
3590    foreach_inst_in_block_starting_from(elk_fs_inst, scan_inst, inst) {
3591       /* If we hit control flow, force resolve all remaining dependencies. */
3592       if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) {
3593          for (int i = 0; i < write_len; i++) {
3594             if (needs_dep[i])
3595                DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
3596                                first_write_grf + i);
3597          }
3598          return;
3599       }
3600 
3601       /* Clear the flag for registers that actually got read (as expected). */
3602       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3603 
3604       /* We insert our reads as late as possible since they're reading the
3605        * result of a SEND, which has massive latency.
3606        */
3607       if (scan_inst->dst.file == VGRF &&
3608           scan_inst->dst.nr >= first_write_grf &&
3609           scan_inst->dst.nr < first_write_grf + write_len &&
3610           needs_dep[scan_inst->dst.nr - first_write_grf]) {
3611          DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
3612                          scan_inst->dst.nr);
3613          needs_dep[scan_inst->dst.nr - first_write_grf] = false;
3614       }
3615 
3616       /* Continue the loop only if we haven't resolved all the dependencies */
3617       int i;
3618       for (i = 0; i < write_len; i++) {
3619          if (needs_dep[i])
3620             break;
3621       }
3622       if (i == write_len)
3623          return;
3624    }
3625 }
3626 
3627 void
3628 elk_fs_visitor::insert_gfx4_send_dependency_workarounds()
3629 {
3630    if (devinfo->ver != 4 || devinfo->platform == INTEL_PLATFORM_G4X)
3631       return;
3632 
3633    bool progress = false;
3634 
3635    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
3636       if (inst->mlen != 0 && inst->dst.file == VGRF) {
3637          insert_gfx4_pre_send_dependency_workarounds(block, inst);
3638          insert_gfx4_post_send_dependency_workarounds(block, inst);
3639          progress = true;
3640       }
3641    }
3642 
3643    if (progress)
3644       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3645 }
3646 
3647 bool
3648 elk_fs_visitor::lower_load_payload()
3649 {
3650    bool progress = false;
3651 
3652    foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
3653       if (inst->opcode != ELK_SHADER_OPCODE_LOAD_PAYLOAD)
3654          continue;
3655 
3656       assert(inst->dst.file == MRF || inst->dst.file == VGRF);
3657       assert(inst->saturate == false);
3658       elk_fs_reg dst = inst->dst;
3659 
3660       /* Get rid of COMPR4.  We'll add it back in if we need it */
3661       if (dst.file == MRF)
3662          dst.nr = dst.nr & ~ELK_MRF_COMPR4;
3663 
3664       const fs_builder ibld(this, block, inst);
3665       const fs_builder ubld = ibld.exec_all();
3666 
3667       for (uint8_t i = 0; i < inst->header_size;) {
3668          /* Number of header GRFs to initialize at once with a single MOV
3669           * instruction.
3670           */
3671          const unsigned n =
3672             (i + 1 < inst->header_size && inst->src[i].stride == 1 &&
3673              inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ?
3674             2 : 1;
3675 
3676          if (inst->src[i].file != BAD_FILE)
3677             ubld.group(8 * n, 0).MOV(retype(dst, ELK_REGISTER_TYPE_UD),
3678                                      retype(inst->src[i], ELK_REGISTER_TYPE_UD));
3679 
3680          dst = byte_offset(dst, n * REG_SIZE);
3681          i += n;
3682       }
3683 
3684       if (inst->dst.file == MRF && (inst->dst.nr & ELK_MRF_COMPR4) &&
3685           inst->exec_size > 8) {
3686          /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3687           * a straightforward copy.  Instead, the result of the
3688           * LOAD_PAYLOAD is treated as interleaved and the first four
3689           * non-header sources are unpacked as:
3690           *
3691           * m + 0: r0
3692           * m + 1: g0
3693           * m + 2: b0
3694           * m + 3: a0
3695           * m + 4: r1
3696           * m + 5: g1
3697           * m + 6: b1
3698           * m + 7: a1
3699           *
3700           * This is used for gen <= 5 fb writes.
3701           */
3702          assert(inst->exec_size == 16);
3703          assert(inst->header_size + 4 <= inst->sources);
3704          for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3705             if (inst->src[i].file != BAD_FILE) {
3706                if (devinfo->has_compr4) {
3707                   elk_fs_reg compr4_dst = retype(dst, inst->src[i].type);
3708                   compr4_dst.nr |= ELK_MRF_COMPR4;
3709                   ibld.MOV(compr4_dst, inst->src[i]);
3710                } else {
3711                   /* Platform doesn't have COMPR4.  We have to fake it */
3712                   elk_fs_reg mov_dst = retype(dst, inst->src[i].type);
3713                   ibld.quarter(0).MOV(mov_dst, quarter(inst->src[i], 0));
3714                   mov_dst.nr += 4;
3715                   ibld.quarter(1).MOV(mov_dst, quarter(inst->src[i], 1));
3716                }
3717             }
3718 
3719             dst.nr++;
3720          }
3721 
3722          /* The loop above only ever incremented us through the first set
3723           * of 4 registers.  However, thanks to the magic of COMPR4, we
3724           * actually wrote to the first 8 registers, so we need to take
3725           * that into account now.
3726           */
3727          dst.nr += 4;
3728 
3729          /* The COMPR4 code took care of the first 4 sources.  We'll let
3730           * the regular path handle any remaining sources.  Yes, we are
3731           * modifying the instruction but we're about to delete it so
3732           * this really doesn't hurt anything.
3733           */
3734          inst->header_size += 4;
3735       }
3736 
3737       for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3738          dst.type = inst->src[i].type;
3739          if (inst->src[i].file != BAD_FILE) {
3740             ibld.MOV(dst, inst->src[i]);
3741          }
3742          dst = offset(dst, ibld, 1);
3743       }
3744 
3745       inst->remove(block);
3746       progress = true;
3747    }
3748 
3749    if (progress)
3750       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3751 
3752    return progress;
3753 }
3754 
3755 /**
3756  * Factor an unsigned 32-bit integer.
3757  *
3758  * Attempts to factor \c x into two values that are at most 0xFFFF.  If no
3759  * such factorization is possible, either because the value is too large or is
3760  * prime, both \c result_a and \c result_b will be zero.
3761  */
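/*
 * Worked example, using the value discussed further below: for
 * x = 0x063b0c83 (1627 * 1367 * 47) the largest prime in the table dividing
 * x is 1367, so p = 1367 and x/p = 76469.  Scanning d from 2 up to
 * 0xffff / 1367 = 47 finds d = 47, giving *result_a = 1627 and
 * *result_b = 1367 * 47 = 64249, both of which fit in 16 bits.
 */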
3762 static void
3763 factor_uint32(uint32_t x, unsigned *result_a, unsigned *result_b)
3764 {
3765    /* This is necessary to prevent various opportunities for division by zero
3766     * below.
3767     */
3768    assert(x > 0xffff);
3769 
3770    /* This represents the actual expected constraints on the input.  Namely,
3771     * both the upper and lower words should be > 1.
3772     */
3773    assert(x >= 0x00020002);
3774 
3775    *result_a = 0;
3776    *result_b = 0;
3777 
3778    /* The value is too large to factor with the constraints. */
3779    if (x > (0xffffu * 0xffffu))
3780       return;
3781 
3782    /* A non-prime number will have the form p*q*d where p is some prime
3783     * number, q > 1, and 1 <= d <= q.  To meet the constraints of this
3784     * function, (p*d) < 0x10000.  This implies d <= floor(0xffff / p).
3785     * Furthermore, since q < 0x10000, d >= floor(x / (0xffff * p)).  Finally,
3786     * floor(x / (0xffff * p)) <= d <= floor(0xffff / p).
3787     *
3788     * The observation is finding the largest possible value of p reduces the
3789     * possible range of d.  After selecting p, all values of d in this range
3790     * are tested until a factorization is found.  The size of the range of
3791     * possible values of d sets an upper bound on the run time of the
3792     * function.
3793     */
3794    static const uint16_t primes[256] = {
3795          2,    3,    5,    7,   11,   13,   17,   19,
3796         23,   29,   31,   37,   41,   43,   47,   53,
3797         59,   61,   67,   71,   73,   79,   83,   89,
3798         97,  101,  103,  107,  109,  113,  127,  131,  /*  32 */
3799        137,  139,  149,  151,  157,  163,  167,  173,
3800        179,  181,  191,  193,  197,  199,  211,  223,
3801        227,  229,  233,  239,  241,  251,  257,  263,
3802        269,  271,  277,  281,  283,  293,  307,  311,  /*  64 */
3803        313,  317,  331,  337,  347,  349,  353,  359,
3804        367,  373,  379,  383,  389,  397,  401,  409,
3805        419,  421,  431,  433,  439,  443,  449,  457,
3806        461,  463,  467,  479,  487,  491,  499,  503,  /*  96 */
3807        509,  521,  523,  541,  547,  557,  563,  569,
3808        571,  577,  587,  593,  599,  601,  607,  613,
3809        617,  619,  631,  641,  643,  647,  653,  659,
3810        661,  673,  677,  683,  691,  701,  709,  719,   /* 128 */
3811        727,  733,  739,  743,  751,  757,  761,  769,
3812        773,  787,  797,  809,  811,  821,  823,  827,
3813        829,  839,  853,  857,  859,  863,  877,  881,
3814        883,  887,  907,  911,  919,  929,  937,  941,  /* 160 */
3815        947,  953,  967,  971,  977,  983,  991,  997,
3816       1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049,
3817       1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097,
3818       1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163,  /* 192 */
3819       1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223,
3820       1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283,
3821       1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
3822       1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423,  /* 224 */
3823       1427, 1429, 1433, 1439, 1447, 1451, 1453, 1459,
3824       1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511,
3825       1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571,
3826       1579, 1583, 1597, 1601, 1607, 1609, 1613, 1619,  /* 256 */
3827    };
3828 
3829    unsigned p;
3830    unsigned x_div_p;
3831 
3832    for (int i = ARRAY_SIZE(primes) - 1; i >= 0; i--) {
3833       p = primes[i];
3834       x_div_p = x / p;
3835 
3836       if ((x_div_p * p) == x)
3837          break;
3838    }
3839 
3840    /* A prime factor was not found. */
3841    if (x_div_p * p != x)
3842       return;
3843 
3844    /* Terminate early if d=1 is a solution. */
3845    if (x_div_p < 0x10000) {
3846       *result_a = x_div_p;
3847       *result_b = p;
3848       return;
3849    }
3850 
3851    /* Pick the maximum possible value for 'd'.  It's important that the loop
3852     * below execute while d <= max_d because max_d is a valid value.  Having
3853     * the wrong loop bound would cause 1627*1367*47 (0x063b0c83) to be
3854     * incorrectly reported as not being factorable.  The problem would occur
3855     * with any value that is a factor of two primes in the table and one prime
3856     * not in the table.
3857     */
3858    const unsigned max_d = 0xffff / p;
3859 
3860    /* Pick an initial value of 'd' that (combined with rejecting too large
3861     * values above) guarantees that 'q' will always be small enough.
3862     * DIV_ROUND_UP is used to prevent 'd' from being zero.
3863     */
3864    for (unsigned d = DIV_ROUND_UP(x_div_p, 0xffff); d <= max_d; d++) {
3865       unsigned q = x_div_p / d;
3866 
3867       if ((q * d) == x_div_p) {
3868          assert(p * d * q == x);
3869          assert((p * d) < 0x10000);
3870 
3871          *result_a = q;
3872          *result_b = p * d;
3873          break;
3874       }
3875 
3876       /* Since every value of 'd' is tried, as soon as 'd' is larger
3877        * than 'q', we're just re-testing combinations that have
3878        * already been tested.
3879        */
3880       if (d > q)
3881          break;
3882    }
3883 }
3884 
3885 void
3886 elk_fs_visitor::lower_mul_dword_inst(elk_fs_inst *inst, elk_bblock_t *block)
3887 {
3888    const fs_builder ibld(this, block, inst);
3889 
3890    /* It is correct to use inst->src[1].d in both ends of the comparison.
3891     * Using .ud in the UINT16_MAX comparison would cause any negative value to
3892     * fail the check.
3893     */
3894    if (inst->src[1].file == IMM &&
3895        (inst->src[1].d >= INT16_MIN && inst->src[1].d <= UINT16_MAX)) {
3896       /* The MUL instruction doesn't treat its two sources the same way.
3897        * On Gen <= 6, only the low 16-bits of src0 are read, and on Gen >= 7
3898        * only the low 16-bits of src1 are used.
3899        *
3900        * If multiplying by an immediate value that fits in 16-bits, do a
3901        * single MUL instruction with that value in the proper location.
3902        */
3903       const bool ud = (inst->src[1].d >= 0);
3904       if (devinfo->ver < 7) {
3905          elk_fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type);
3906          ibld.MOV(imm, inst->src[1]);
3907          ibld.MUL(inst->dst, imm, inst->src[0]);
3908       } else {
3909          ibld.MUL(inst->dst, inst->src[0],
3910                   ud ? elk_imm_uw(inst->src[1].ud)
3911                      : elk_imm_w(inst->src[1].d));
3912       }
3913    } else {
3914       /* Gen < 8 (and some Gfx8+ low-power parts like Cherryview) cannot
3915        * do 32-bit integer multiplication in one instruction, but instead
3916        * must do a sequence (which actually calculates a 64-bit result):
3917        *
3918        *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
3919        *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
3920        *    mov(8)  g2<1>D     acc0<8,8,1>D
3921        *
3922        * But on Gen > 6, the ability to use the second accumulator register
3923        * (acc1) for non-float data types was removed, preventing a simple
3924        * implementation in SIMD16. A 16-channel result can be calculated by
3925        * executing the three instructions twice in SIMD8, once with quarter
3926        * control of 1Q for the first eight channels and again with 2Q for
3927        * the second eight channels.
3928        *
3929        * Which accumulator register is implicitly accessed (by AccWrEnable
3930        * for instance) is determined by the quarter control. Unfortunately
3931        * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3932        * implicit accumulator access by an instruction with 2Q will access
3933        * acc1 regardless of whether the data type is usable in acc1.
3934        *
3935        * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3936        * integer data types.
3937        *
3938        * Since we only want the low 32-bits of the result, we can do two
3939        * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3940        * adjust the high result and add them (like the mach is doing):
3941        *
3942        *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
3943        *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
3944        *    shl(8)  g9<1>D     g8<8,8,1>D      16D
3945        *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
3946        *
3947        * We avoid the shl instruction by realizing that we only want to add
3948        * the low 16-bits of the "high" result to the high 16-bits of the
3949        * "low" result and using proper regioning on the add:
3950        *
3951        *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
3952        *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
3953        *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
3954        *
3955        * Since it does not use the (single) accumulator register, we can
3956        * schedule multi-component multiplications much better.
3957        */
3958 
3959       bool needs_mov = false;
3960       elk_fs_reg orig_dst = inst->dst;
3961 
3962       /* Get a new VGRF for the "low" 32x16-bit multiplication result if
3963        * reusing the original destination is impossible due to hardware
3964        * restrictions, source/destination overlap, or it being the null
3965        * register.
3966        */
3967       elk_fs_reg low = inst->dst;
3968       if (orig_dst.is_null() || orig_dst.file == MRF ||
3969           regions_overlap(inst->dst, inst->size_written,
3970                           inst->src[0], inst->size_read(0)) ||
3971           regions_overlap(inst->dst, inst->size_written,
3972                           inst->src[1], inst->size_read(1)) ||
3973           inst->dst.stride >= 4) {
3974          needs_mov = true;
3975          low = elk_fs_reg(VGRF, alloc.allocate(regs_written(inst)),
3976                       inst->dst.type);
3977       }
3978 
3979       /* Get a new VGRF but keep the same stride as inst->dst */
3980       elk_fs_reg high(VGRF, alloc.allocate(regs_written(inst)), inst->dst.type);
3981       high.stride = inst->dst.stride;
3982       high.offset = inst->dst.offset % REG_SIZE;
3983 
3984       bool do_addition = true;
3985       if (devinfo->ver >= 7) {
3986          /* From Wa_1604601757:
3987           *
3988           * "When multiplying a DW and any lower precision integer, source modifier
3989           *  is not supported."
3990           *
3991           * An unsupported negate modifier on src[1] would ordinarily be
3992           * lowered by the subsequent lower_regioning pass.  In this case that
3993           * pass would spawn another dword multiply.  Instead, lower the
3994           * modifier first.
3995           */
3996          const bool source_mods_unsupported = (devinfo->ver >= 12);
3997 
3998          if (inst->src[1].abs || (inst->src[1].negate &&
3999                                   source_mods_unsupported))
4000             lower_src_modifiers(this, block, inst, 1);
4001 
4002          if (inst->src[1].file == IMM) {
4003             unsigned a;
4004             unsigned b;
4005 
4006             /* If the immediate value can be factored into two values, A and
4007              * B, that each fit in 16-bits, the multiplication result can
4008              * instead be calculated as (src0 * (A * B)) = ((src0 * A) * B).
4009              * This saves an operation (the addition) and a temporary register
4010              * (high).
4011              *
4012              * Skip the optimization if either the high word or the low word
4013              * is 0 or 1.  In these conditions, at least one of the
4014              * multiplications generated by the straightforward method will be
4015              * eliminated anyway.
4016              */
4017             if (inst->src[1].ud > 0x0001ffff &&
4018                 (inst->src[1].ud & 0xffff) > 1) {
4019                factor_uint32(inst->src[1].ud, &a, &b);
4020 
4021                if (a != 0) {
4022                   ibld.MUL(low, inst->src[0], elk_imm_uw(a));
4023                   ibld.MUL(low, low, elk_imm_uw(b));
4024                   do_addition = false;
4025                }
4026             }
4027 
4028             if (do_addition) {
4029                ibld.MUL(low, inst->src[0],
4030                         elk_imm_uw(inst->src[1].ud & 0xffff));
4031                ibld.MUL(high, inst->src[0],
4032                         elk_imm_uw(inst->src[1].ud >> 16));
4033             }
4034          } else {
4035             ibld.MUL(low, inst->src[0],
4036                      subscript(inst->src[1], ELK_REGISTER_TYPE_UW, 0));
4037             ibld.MUL(high, inst->src[0],
4038                      subscript(inst->src[1], ELK_REGISTER_TYPE_UW, 1));
4039          }
4040       } else {
4041          if (inst->src[0].abs)
4042             lower_src_modifiers(this, block, inst, 0);
4043 
4044          ibld.MUL(low, subscript(inst->src[0], ELK_REGISTER_TYPE_UW, 0),
4045                   inst->src[1]);
4046          ibld.MUL(high, subscript(inst->src[0], ELK_REGISTER_TYPE_UW, 1),
4047                   inst->src[1]);
4048       }
4049 
4050       if (do_addition) {
4051          ibld.ADD(subscript(low, ELK_REGISTER_TYPE_UW, 1),
4052                   subscript(low, ELK_REGISTER_TYPE_UW, 1),
4053                   subscript(high, ELK_REGISTER_TYPE_UW, 0));
4054       }
4055 
4056       if (needs_mov || inst->conditional_mod)
4057          set_condmod(inst->conditional_mod, ibld.MOV(orig_dst, low));
4058    }
4059 }
4060 
4061 void
4062 elk_fs_visitor::lower_mul_qword_inst(elk_fs_inst *inst, elk_bblock_t *block)
4063 {
4064    const fs_builder ibld(this, block, inst);
4065 
4066    /* Considering two 64-bit integers ab and cd where each letter        ab
4067     * corresponds to 32 bits, we get a 128-bit result WXYZ. We         * cd
4068     * only need to provide the YZ part of the result.               -------
4069     *                                                                    BD
4070     *  Only BD needs to be 64 bits. For AD and BC we only care       +  AD
4071     *  about the lower 32 bits (since they are part of the upper     +  BC
4072     *  32 bits of our result). AC is not needed since it starts      + AC
4073     *  on the 65th bit of the result.                               -------
4074     *                                                                  WXYZ
4075     */
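   /* Equivalently, with a/b/c/d the 32-bit halves and all arithmetic taken
    * mod 2^64:
    *
    *    low64(ab * cd) = b*d + ((a*d + b*c) << 32)
    *
    * which is what the BD/AD/BC temporaries below compute.
    */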
4076    unsigned int q_regs = regs_written(inst);
4077    unsigned int d_regs = (q_regs + 1) / 2;
4078 
4079    elk_fs_reg bd(VGRF, alloc.allocate(q_regs), ELK_REGISTER_TYPE_UQ);
4080    elk_fs_reg ad(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
4081    elk_fs_reg bc(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
4082 
4083    /* Here we need the full 64 bit result for 32b * 32b. */
4084    if (devinfo->has_integer_dword_mul) {
4085       ibld.MUL(bd, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
4086                subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0));
4087    } else {
4088       elk_fs_reg bd_high(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
4089       elk_fs_reg bd_low(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
4090       const unsigned acc_width = reg_unit(devinfo) * 8;
4091       elk_fs_reg acc = suboffset(retype(elk_acc_reg(inst->exec_size), ELK_REGISTER_TYPE_UD),
4092                              inst->group % acc_width);
4093 
4094       elk_fs_inst *mul = ibld.MUL(acc,
4095                             subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
4096                             subscript(inst->src[1], ELK_REGISTER_TYPE_UW, 0));
4097       mul->writes_accumulator = true;
4098 
4099       ibld.MACH(bd_high, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
4100                 subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0));
4101       ibld.MOV(bd_low, acc);
4102 
4103       ibld.UNDEF(bd);
4104       ibld.MOV(subscript(bd, ELK_REGISTER_TYPE_UD, 0), bd_low);
4105       ibld.MOV(subscript(bd, ELK_REGISTER_TYPE_UD, 1), bd_high);
4106    }
4107 
4108    ibld.MUL(ad, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 1),
4109             subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0));
4110    ibld.MUL(bc, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
4111             subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 1));
4112 
4113    ibld.ADD(ad, ad, bc);
4114    ibld.ADD(subscript(bd, ELK_REGISTER_TYPE_UD, 1),
4115             subscript(bd, ELK_REGISTER_TYPE_UD, 1), ad);
4116 
4117    if (devinfo->has_64bit_int) {
4118       ibld.MOV(inst->dst, bd);
4119    } else {
4120       if (!inst->is_partial_write())
4121          ibld.emit_undef_for_dst(inst);
4122       ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 0),
4123                subscript(bd, ELK_REGISTER_TYPE_UD, 0));
4124       ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 1),
4125                subscript(bd, ELK_REGISTER_TYPE_UD, 1));
4126    }
4127 }
4128 
4129 void
4130 elk_fs_visitor::lower_mulh_inst(elk_fs_inst *inst, elk_bblock_t *block)
4131 {
4132    const fs_builder ibld(this, block, inst);
4133 
4134    /* According to the BDW+ BSpec page for the "Multiply Accumulate
4135     * High" instruction:
4136     *
4137     *  "An added preliminary mov is required for source modification on
4138     *   src1:
4139     *      mov (8) r3.0<1>:d -r3<8;8,1>:d
4140     *      mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw
4141     *      mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d"
4142     */
4143    if (devinfo->ver >= 8 && (inst->src[1].negate || inst->src[1].abs))
4144       lower_src_modifiers(this, block, inst, 1);
4145 
4146    /* Should have been lowered to 8-wide. */
4147    assert(inst->exec_size <= get_lowered_simd_width(this, inst));
4148    const unsigned acc_width = reg_unit(devinfo) * 8;
4149    const elk_fs_reg acc = suboffset(retype(elk_acc_reg(inst->exec_size), inst->dst.type),
4150                                 inst->group % acc_width);
4151    elk_fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
4152    elk_fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
4153 
4154    if (devinfo->ver >= 8) {
4155       /* Until Gfx8, integer multiplies read 32-bits from one source,
4156        * and 16-bits from the other, and rely on the MACH instruction
4157        * to generate the high bits of the result.
4158        *
4159        * On Gfx8, the multiply instruction does a full 32x32-bit
4160        * multiply, but in order to do a 64-bit multiply we can simulate
4161        * the previous behavior and then use a MACH instruction.
4162        */
4163       assert(mul->src[1].type == ELK_REGISTER_TYPE_D ||
4164              mul->src[1].type == ELK_REGISTER_TYPE_UD);
4165       mul->src[1].type = ELK_REGISTER_TYPE_UW;
4166       mul->src[1].stride *= 2;
4167 
4168       if (mul->src[1].file == IMM) {
4169          mul->src[1] = elk_imm_uw(mul->src[1].ud);
4170       }
4171    } else if (devinfo->verx10 == 70 &&
4172               inst->group > 0) {
4173       /* Among other things the quarter control bits influence which
4174        * accumulator register is used by the hardware for instructions
4175        * that access the accumulator implicitly (e.g. MACH).  A
4176        * second-half instruction would normally map to acc1, which
4177        * doesn't exist on Gfx7 and up (the hardware does emulate it for
4178        * floating-point instructions *only* by taking advantage of the
4179        * extra precision of acc0 not normally used for floating point
4180        * arithmetic).
4181        *
4182        * HSW and up are careful enough not to try to access an
4183        * accumulator register that doesn't exist, but on earlier Gfx7
4184        * hardware we need to make sure that the quarter control bits are
4185        * zero to avoid non-deterministic behaviour and emit an extra MOV
4186        * to get the result masked correctly according to the current
4187        * channel enables.
4188        */
4189       mach->group = 0;
4190       mach->force_writemask_all = true;
4191       mach->dst = ibld.vgrf(inst->dst.type);
4192       ibld.MOV(inst->dst, mach->dst);
4193    }
4194 }
4195 
4196 bool
4197 elk_fs_visitor::lower_integer_multiplication()
4198 {
4199    bool progress = false;
4200 
4201    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
4202       if (inst->opcode == ELK_OPCODE_MUL) {
4203          /* If the instruction is already in a form that does not need lowering,
4204           * return early.
4205           */
4206          if (devinfo->ver >= 7) {
4207             if (type_sz(inst->src[1].type) < 4 && type_sz(inst->src[0].type) <= 4)
4208                continue;
4209          } else {
4210             if (type_sz(inst->src[0].type) < 4 && type_sz(inst->src[1].type) <= 4)
4211                continue;
4212          }
4213 
4214          if ((inst->dst.type == ELK_REGISTER_TYPE_Q ||
4215               inst->dst.type == ELK_REGISTER_TYPE_UQ) &&
4216              (inst->src[0].type == ELK_REGISTER_TYPE_Q ||
4217               inst->src[0].type == ELK_REGISTER_TYPE_UQ) &&
4218              (inst->src[1].type == ELK_REGISTER_TYPE_Q ||
4219               inst->src[1].type == ELK_REGISTER_TYPE_UQ)) {
4220             lower_mul_qword_inst(inst, block);
4221             inst->remove(block);
4222             progress = true;
4223          } else if (!inst->dst.is_accumulator() &&
4224                     (inst->dst.type == ELK_REGISTER_TYPE_D ||
4225                      inst->dst.type == ELK_REGISTER_TYPE_UD) &&
4226                     (!devinfo->has_integer_dword_mul ||
4227                      devinfo->verx10 >= 125)) {
4228             lower_mul_dword_inst(inst, block);
4229             inst->remove(block);
4230             progress = true;
4231          }
4232       } else if (inst->opcode == ELK_SHADER_OPCODE_MULH) {
4233          lower_mulh_inst(inst, block);
4234          inst->remove(block);
4235          progress = true;
4236       }
4237 
4238    }
4239 
4240    if (progress)
4241       invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
4242 
4243    return progress;
4244 }
4245 
4246 bool
4247 elk_fs_visitor::lower_minmax()
4248 {
4249    assert(devinfo->ver < 6);
4250 
4251    bool progress = false;
4252 
4253    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
4254       const fs_builder ibld(this, block, inst);
4255 
4256       if (inst->opcode == ELK_OPCODE_SEL &&
4257           inst->predicate == ELK_PREDICATE_NONE) {
4258          /* If src1 is an immediate value that is not NaN, then it can't be
4259           * NaN.  In that case, emit CMP because it is much better for cmod
4260           * propagation.  Likewise if src1 is not float.  Gfx4 and Gfx5 don't
4261           * support HF or DF, so it is not necessary to check for those.
4262           */
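         /* Illustrative sketch of the rewrite, e.g. for a MIN:
          *
          *    sel.l(8)        dst   a  b
          *
          * becomes
          *
          *    cmp.l.f0.0(8)   null  a  b     (CMPN instead if b may be NaN)
          *    (+f0.0) sel(8)  dst   a  b
          */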
4263          if (inst->src[1].type != ELK_REGISTER_TYPE_F ||
4264              (inst->src[1].file == IMM && !isnan(inst->src[1].f))) {
4265             ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
4266                      inst->conditional_mod);
4267          } else {
4268             ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1],
4269                       inst->conditional_mod);
4270          }
4271          inst->predicate = ELK_PREDICATE_NORMAL;
4272          inst->conditional_mod = ELK_CONDITIONAL_NONE;
4273 
4274          progress = true;
4275       }
4276    }
4277 
4278    if (progress)
4279       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
4280 
4281    return progress;
4282 }
4283 
4284 bool
4285 elk_fs_visitor::lower_sub_sat()
4286 {
4287    bool progress = false;
4288 
4289    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
4290       const fs_builder ibld(this, block, inst);
4291 
4292       if (inst->opcode == ELK_SHADER_OPCODE_USUB_SAT ||
4293           inst->opcode == ELK_SHADER_OPCODE_ISUB_SAT) {
4294          /* The fundamental problem is the hardware performs source negation
4295           * at the bit width of the source.  If the source is 0x80000000D, the
4296           * negation is 0x80000000D.  As a result, subtractSaturate(0,
4297           * 0x80000000) will produce 0x80000000 instead of 0x7fffffff.  There
4298           * are at least three ways to resolve this:
4299           *
4300           * 1. Use the accumulator for the negated source.  The accumulator is
4301           *    33 bits, so our source 0x80000000 is sign-extended to
4302           *    0x180000000, the negation of which is 0x080000000.  This
4303           *    doesn't help for 64-bit integers (which are already bigger than
4304           *    33 bits).  There are also only 8 accumulators, so SIMD16 or
4305           *    SIMD32 instructions would have to be split into multiple SIMD8
4306           *    instructions.
4307           *
4308           * 2. Use slightly different math.  For any n-bit value x, we know (x
4309           *    >> 1) != -(x >> 1).  We can use this fact to only do
4310           *    subtractions involving (x >> 1).  subtractSaturate(a, b) ==
4311           *    subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)).
4312           *
4313           * 3. For unsigned sources, it is sufficient to replace the
4314           *    subtractSaturate with (a > b) ? a - b : 0.
4315           *
4316           * It may also be possible to use the SUBB instruction.  This
4317           * implicitly writes the accumulator, so it could only be used in the
4318           * same situations as #1 above.  It is further limited by only
4319           * allowing UD sources.
4320           */
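         /* Worked example of approach #1 (the SIMD8 path below), shown as an
          * illustrative pseudo-assembly sketch for subtractSaturate(0,
          * 0x80000000):
          *
          *    mov(8)      acc0:d   0x80000000:d
          *    add.sat(8)  dst:d   -acc0:d  0:d
          *
          * The accumulator holds the sign-extended 33-bit value 0x180000000,
          * so its negation 0x080000000 is exact, and the saturating ADD
          * clamps the result to the expected 0x7fffffff.
          */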
4321          if (inst->exec_size == 8 && inst->src[0].type != ELK_REGISTER_TYPE_Q &&
4322              inst->src[0].type != ELK_REGISTER_TYPE_UQ) {
4323             elk_fs_reg acc(ARF, ELK_ARF_ACCUMULATOR, inst->src[1].type);
4324 
4325             ibld.MOV(acc, inst->src[1]);
4326             elk_fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]);
4327             add->saturate = true;
4328             add->src[0].negate = true;
4329          } else if (inst->opcode == ELK_SHADER_OPCODE_ISUB_SAT) {
4330             /* tmp = src1 >> 1;
4331              * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp));
4332              */
4333             elk_fs_reg tmp1 = ibld.vgrf(inst->src[0].type);
4334             elk_fs_reg tmp2 = ibld.vgrf(inst->src[0].type);
4335             elk_fs_reg tmp3 = ibld.vgrf(inst->src[0].type);
4336             elk_fs_inst *add;
4337 
4338             ibld.SHR(tmp1, inst->src[1], elk_imm_d(1));
4339 
4340             add = ibld.ADD(tmp2, inst->src[1], tmp1);
4341             add->src[1].negate = true;
4342 
4343             add = ibld.ADD(tmp3, inst->src[0], tmp1);
4344             add->src[1].negate = true;
4345             add->saturate = true;
4346 
4347             add = ibld.ADD(inst->dst, tmp3, tmp2);
4348             add->src[1].negate = true;
4349             add->saturate = true;
4350          } else {
4351             /* a > b ? a - b : 0 */
4352             ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
4353                      ELK_CONDITIONAL_G);
4354 
4355             elk_fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]);
4356             add->src[1].negate = !add->src[1].negate;
4357 
4358             ibld.SEL(inst->dst, inst->dst, elk_imm_ud(0))
4359                ->predicate = ELK_PREDICATE_NORMAL;
4360          }
4361 
4362          inst->remove(block);
4363          progress = true;
4364       }
4365    }
4366 
4367    if (progress)
4368       invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
4369 
4370    return progress;
4371 }
4372 
4373 /**
4374  * Get the mask of SIMD channels enabled during dispatch and not yet disabled
4375  * by discard.  Due to the layout of the sample mask in the fragment shader
4376  * thread payload, \p bld is required to have a dispatch_width() not greater
4377  * than 16 for fragment shaders.
4378  */
4379 elk_fs_reg
4380 elk_sample_mask_reg(const fs_builder &bld)
4381 {
4382    const elk_fs_visitor &s = *bld.shader;
4383 
4384    if (s.stage != MESA_SHADER_FRAGMENT) {
4385       return elk_imm_ud(0xffffffff);
4386    } else if (elk_wm_prog_data(s.stage_prog_data)->uses_kill) {
4387       assert(bld.dispatch_width() <= 16);
4388       return elk_flag_subreg(sample_mask_flag_subreg(s) + bld.group() / 16);
4389    } else {
4390       assert(s.devinfo->ver >= 6 && bld.dispatch_width() <= 16);
4391       assert(s.devinfo->ver < 20);
4392       return retype(elk_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
4393                     ELK_REGISTER_TYPE_UW);
4394    }
4395 }
4396 
4397 uint32_t
4398 elk_fb_write_msg_control(const elk_fs_inst *inst,
4399                          const struct elk_wm_prog_data *prog_data)
4400 {
4401    uint32_t mctl;
4402 
4403    if (inst->opcode == ELK_FS_OPCODE_REP_FB_WRITE) {
4404       assert(inst->group == 0 && inst->exec_size == 16);
4405       mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
4406    } else if (prog_data->dual_src_blend) {
4407       assert(inst->exec_size == 8);
4408 
4409       if (inst->group % 16 == 0)
4410          mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
4411       else if (inst->group % 16 == 8)
4412          mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
4413       else
4414          unreachable("Invalid dual-source FB write instruction group");
4415    } else {
4416       assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
4417 
4418       if (inst->exec_size == 16)
4419          mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
4420       else if (inst->exec_size == 8)
4421          mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
4422       else
4423          unreachable("Invalid FB write execution size");
4424    }
4425 
4426    return mctl;
4427 }
4428 
4429 /**
4430  * Predicate the specified instruction on the sample mask.
4431  */
4432 void
4433 elk_emit_predicate_on_sample_mask(const fs_builder &bld, elk_fs_inst *inst)
4434 {
4435    assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
4436           bld.group() == inst->group &&
4437           bld.dispatch_width() == inst->exec_size);
4438 
4439    const elk_fs_visitor &s = *bld.shader;
4440    const elk_fs_reg sample_mask = elk_sample_mask_reg(bld);
4441    const unsigned subreg = sample_mask_flag_subreg(s);
4442 
4443    if (elk_wm_prog_data(s.stage_prog_data)->uses_kill) {
4444       assert(sample_mask.file == ARF &&
4445              sample_mask.nr == elk_flag_subreg(subreg).nr &&
4446              sample_mask.subnr == elk_flag_subreg(
4447                 subreg + inst->group / 16).subnr);
4448    } else {
4449       bld.group(1, 0).exec_all()
4450          .MOV(elk_flag_subreg(subreg + inst->group / 16), sample_mask);
4451    }
4452 
4453    if (inst->predicate) {
4454       assert(inst->predicate == ELK_PREDICATE_NORMAL);
4455       assert(!inst->predicate_inverse);
4456       assert(inst->flag_subreg == 0);
4457       assert(s.devinfo->ver < 20);
4458       /* Combine the sample mask with the existing predicate by using a
4459        * vertical predication mode.
4460        */
4461       inst->predicate = ELK_PREDICATE_ALIGN1_ALLV;
4462    } else {
4463       inst->flag_subreg = subreg;
4464       inst->predicate = ELK_PREDICATE_NORMAL;
4465       inst->predicate_inverse = false;
4466    }
4467 }
4468 
4469 static bool
4470 is_mixed_float_with_fp32_dst(const elk_fs_inst *inst)
4471 {
4472    /* This opcode sometimes uses :W type on the source even if the operand is
4473     * a :HF, because in gfx7 there is no support for :HF, and thus it uses :W.
4474     */
4475    if (inst->opcode == ELK_OPCODE_F16TO32)
4476       return true;
4477 
4478    if (inst->dst.type != ELK_REGISTER_TYPE_F)
4479       return false;
4480 
4481    for (int i = 0; i < inst->sources; i++) {
4482       if (inst->src[i].type == ELK_REGISTER_TYPE_HF)
4483          return true;
4484    }
4485 
4486    return false;
4487 }
4488 
4489 static bool
4490 is_mixed_float_with_packed_fp16_dst(const elk_fs_inst *inst)
4491 {
4492    /* This opcode sometimes uses :W type on the destination even if the
4493     * destination is a :HF, because in gfx7 there is no support for :HF, and
4494     * thus it uses :W.
4495     */
4496    if (inst->opcode == ELK_OPCODE_F32TO16 &&
4497        inst->dst.stride == 1)
4498       return true;
4499 
4500    if (inst->dst.type != ELK_REGISTER_TYPE_HF ||
4501        inst->dst.stride != 1)
4502       return false;
4503 
4504    for (int i = 0; i < inst->sources; i++) {
4505       if (inst->src[i].type == ELK_REGISTER_TYPE_F)
4506          return true;
4507    }
4508 
4509    return false;
4510 }
4511 
4512 /**
4513  * Get the closest allowed SIMD width for instruction \p inst accounting for
4514  * some common regioning and execution control restrictions that apply to FPU
4515  * instructions.  These restrictions don't necessarily have any relevance to
4516  * instructions not executed by the FPU pipeline like extended math, control
4517  * flow or send message instructions.
4518  *
4519  * For virtual opcodes it's really up to the instruction -- In some cases
4520  * (e.g. where a virtual instruction unrolls into a simple sequence of FPU
4521  * instructions) it may simplify virtual instruction lowering if we can
4522  * enforce FPU-like regioning restrictions already on the virtual instruction,
4523  * in other cases (e.g. virtual send-like instructions) this may be
4524  * excessively restrictive.
4525  */
4526 static unsigned
4527 get_fpu_lowered_simd_width(const elk_fs_visitor *shader,
4528                            const elk_fs_inst *inst)
4529 {
4530    const struct elk_compiler *compiler = shader->compiler;
4531    const struct intel_device_info *devinfo = compiler->devinfo;
4532 
4533    /* Maximum execution size representable in the instruction controls. */
4534    unsigned max_width = MIN2(32, inst->exec_size);
4535 
4536    /* Number of channels per polygon handled by a multipolygon PS shader. */
4537    const unsigned poly_width = shader->dispatch_width /
4538                                MAX2(1, shader->max_polygons);
4539 
4540    /* Number of registers that will be read by an ATTR source if
4541     * present for multipolygon PS shaders, since the PS vertex setup
4542     * data for each polygon is stored in different contiguous GRFs.
4543     */
4544    const unsigned attr_reg_count = (shader->stage != MESA_SHADER_FRAGMENT ||
4545                                     shader->max_polygons < 2 ? 0 :
4546                                     DIV_ROUND_UP(inst->exec_size,
4547                                                  poly_width) * reg_unit(devinfo));
4548 
4549    /* According to the PRMs:
4550     *  "A. In Direct Addressing mode, a source cannot span more than 2
4551     *      adjacent GRF registers.
4552     *   B. A destination cannot span more than 2 adjacent GRF registers."
4553     *
4554     * Look for the source or destination with the largest register region
4555     * which is the one that is going to limit the overall execution size of
4556     * the instruction due to this rule.
4557     */
4558    unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
4559 
4560    for (unsigned i = 0; i < inst->sources; i++)
4561       reg_count = MAX3(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE),
4562                        (inst->src[i].file == ATTR ? attr_reg_count : 0));
4563 
4564    /* Calculate the maximum execution size of the instruction based on the
4565     * factor by which it goes over the hardware limit of 2 GRFs.
4566     */
4567    const unsigned max_reg_count = 2 * reg_unit(devinfo);
4568    if (reg_count > max_reg_count)
4569       max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, max_reg_count));
4570 
4571    /* According to the IVB PRMs:
4572     *  "When destination spans two registers, the source MUST span two
4573     *   registers. The exception to the above rule:
4574     *
4575     *    - When source is scalar, the source registers are not incremented.
4576     *    - When source is packed integer Word and destination is packed
4577     *      integer DWord, the source register is not incremented but the
4578     *      source sub register is incremented."
4579     *
4580     * The hardware specs from Gfx4 to Gfx7.5 mention similar regioning
4581     * restrictions.  The code below intentionally doesn't check whether the
4582     * destination type is integer because empirically the hardware doesn't
4583     * seem to care what the actual type is as long as it's dword-aligned.
4584     *
4585     * HSW PRMs also add a note to the second exception:
4586     *  "When lower 8 channels are disabled, the sub register of source1
4587     *   operand is not incremented. If the lower 8 channels are expected
4588     *   to be disabled, say by predication, the instruction must be split
4589     *   into pair of simd8 operations."
4590     *
4591     * We can't reliably know if the channels won't be disabled due to,
4592     * for example, IMASK. So, play it safe and disallow packed-word exception
4593     * for src1.
4594     */
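   /* For example, assuming the 32-byte GRFs of these platforms: a SIMD16 ADD
    * with a packed F destination writes two GRFs, so a stride-1 W source
    * that reads only one GRF trips this rule and the instruction is split
    * down to 16 / 2 = 8 channels, unless one of the exceptions above applies
    * (and the packed-word exception is deliberately not taken for src1).
    */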
4595    if (devinfo->ver < 8) {
4596       for (unsigned i = 0; i < inst->sources; i++) {
4597          /* IVB implements DF scalars as <0;2,1> regions. */
4598          const bool is_scalar_exception = is_uniform(inst->src[i]) &&
4599             (devinfo->platform == INTEL_PLATFORM_HSW || type_sz(inst->src[i].type) != 8);
4600          const bool is_packed_word_exception = i != 1 &&
4601             type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
4602             type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1;
4603 
4604          /* We check size_read(i) against size_written instead of REG_SIZE
4605           * because we want to properly handle SIMD32.  In SIMD32, you can end
4606           * up with writes to 4 registers and a source that reads 2 registers
4607           * and we may still need to lower all the way to SIMD8 in that case.
4608           */
4609          if (inst->size_written > REG_SIZE &&
4610              inst->size_read(i) != 0 &&
4611              inst->size_read(i) < inst->size_written &&
4612              !is_scalar_exception && !is_packed_word_exception) {
4613             const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
4614             max_width = MIN2(max_width, inst->exec_size / reg_count);
4615          }
4616       }
4617    }
4618 
4619    if (devinfo->ver < 6) {
4620       /* From the G45 PRM, Volume 4 Page 361:
4621        *
4622        *    "Operand Alignment Rule: With the exceptions listed below, a
4623        *     source/destination operand in general should be aligned to even
4624        *     256-bit physical register with a region size equal to two 256-bit
4625        *     physical registers."
4626        *
4627        * Normally we enforce this by allocating virtual registers to the
4628        * even-aligned class.  But we need to handle payload registers.
4629        */
4630       for (unsigned i = 0; i < inst->sources; i++) {
4631          if (inst->src[i].file == FIXED_GRF && (inst->src[i].nr & 1) &&
4632              inst->size_read(i) > REG_SIZE) {
4633             max_width = MIN2(max_width, 8);
4634          }
4635       }
4636    }
4637 
4638    /* From the IVB PRMs:
4639     *  "When an instruction is SIMD32, the low 16 bits of the execution mask
4640     *   are applied for both halves of the SIMD32 instruction. If different
4641     *   execution mask channels are required, split the instruction into two
4642     *   SIMD16 instructions."
4643     *
4644     * There is similar text in the HSW PRMs.  Gfx4-6 don't even implement
4645     * 32-wide control flow support in hardware and will behave similarly.
4646     */
4647    if (devinfo->ver < 8 && !inst->force_writemask_all)
4648       max_width = MIN2(max_width, 16);
4649 
4650    /* From the IVB PRMs (applies to HSW too):
4651     *  "Instructions with condition modifiers must not use SIMD32."
4652     *
4653     * From the BDW PRMs (applies to later hardware too):
4654     *  "Ternary instruction with condition modifiers must not use SIMD32."
4655     */
4656    if (inst->conditional_mod && (devinfo->ver < 8 ||
4657                                  (inst->elk_is_3src(compiler) && devinfo->ver < 12)))
4658       max_width = MIN2(max_width, 16);
4659 
4660    /* From the IVB PRMs (applies to other devices that don't have the
4661     * intel_device_info::supports_simd16_3src flag set):
4662     *  "In Align16 access mode, SIMD16 is not allowed for DW operations and
4663     *   SIMD8 is not allowed for DF operations."
4664     */
4665    if (inst->elk_is_3src(compiler) && !devinfo->supports_simd16_3src)
4666       max_width = MIN2(max_width, inst->exec_size / reg_count);
4667 
4668    /* Pre-Gfx8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is
4669     * the 8-bit quarter of the execution mask signals specified in the
4670     * instruction control fields) for the second compressed half of any
4671     * single-precision instruction (for double-precision instructions
4672     * it's hardwired to use NibCtrl+1, at least on HSW), which means that
4673     * the EU will apply the wrong execution controls for the second
4674     * sequential GRF write if the number of channels per GRF is not exactly
4675     * eight in single-precision mode (or four in double-float mode).
4676     *
4677     * In this situation we calculate the maximum size of the split
4678     * instructions so they only ever write to a single register.
4679     */
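   /* For example, a SIMD16 instruction with a 32-bit execution type whose
    * destination covers four GRFs (e.g. a dword destination with a stride
    * of 2) has only 16 / 4 = 4 channels per GRF instead of the 8 the
    * hardware assumes per compressed half, so it is split down to SIMD4
    * here.
    */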
4680    if (devinfo->ver < 8 && inst->size_written > REG_SIZE &&
4681        !inst->force_writemask_all) {
4682       const unsigned channels_per_grf = inst->exec_size /
4683          DIV_ROUND_UP(inst->size_written, REG_SIZE);
4684       const unsigned exec_type_size = get_exec_type_size(inst);
4685       assert(exec_type_size);
4686 
4687       /* The hardware shifts exactly 8 channels per compressed half of the
4688        * instruction in single-precision mode and exactly 4 in double-precision.
4689        */
4690       if (channels_per_grf != (exec_type_size == 8 ? 4 : 8))
4691          max_width = MIN2(max_width, channels_per_grf);
4692 
4693       /* Lower all non-force_writemask_all DF instructions to SIMD4 on IVB/BYT
4694        * because HW applies the same channel enable signals to both halves of
4695        * the compressed instruction which will be just wrong under
4696        * non-uniform control flow.
4697        */
4698       if (devinfo->verx10 == 70 &&
4699           (exec_type_size == 8 || type_sz(inst->dst.type) == 8))
4700          max_width = MIN2(max_width, 4);
4701    }
4702 
4703    /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
4704     * Float Operations:
4705     *
4706     *    "No SIMD16 in mixed mode when destination is f32. Instruction
4707     *     execution size must be no more than 8."
4708     *
4709     * FIXME: the simulator doesn't seem to complain if we don't do this and
4710     * empirical testing with existing CTS tests show that they pass just fine
4711     * without implementing this, however, since our interpretation of the PRM
4712     * is that conversion MOVs between HF and F are still mixed-float
4713     * instructions (and therefore subject to this restriction) we decided to
4714     * split them to be safe. Might be useful to do additional investigation to
4715     * lift the restriction if we can ensure that it is safe though, since these
4716     * conversions are common when half-float types are involved since many
4717     * instructions do not support HF types and conversions from/to F are
4718     * required.
4719     */
4720    if (is_mixed_float_with_fp32_dst(inst) && devinfo->ver < 20)
4721       max_width = MIN2(max_width, 8);
4722 
4723    /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
4724     * Float Operations:
4725     *
4726     *    "No SIMD16 in mixed mode when destination is packed f16 for both
4727     *     Align1 and Align16."
4728     */
4729    if (is_mixed_float_with_packed_fp16_dst(inst) && devinfo->ver < 20)
4730       max_width = MIN2(max_width, 8);
4731 
4732    /* Only power-of-two execution sizes are representable in the instruction
4733     * control fields.
4734     */
4735    return 1 << util_logbase2(max_width);
4736 }
4737 
4738 /**
4739  * Get the maximum allowed SIMD width for instruction \p inst accounting for
4740  * various payload size restrictions that apply to sampler message
4741  * instructions.
4742  *
4743  * This is only intended to provide a maximum theoretical bound for the
4744  * execution size of the message based on the number of argument components
4745  * alone, which in most cases will determine whether the SIMD8 or SIMD16
4746  * variant of the message can be used, though some messages may have
4747  * additional restrictions not accounted for here (e.g. pre-ILK hardware uses
4748  * the message length to determine the exact SIMD width and argument count,
4749  * which makes a number of sampler message combinations impossible to
4750  * represent).
4751  *
4752  * Note: Platforms with monolithic SIMD16 double the possible SIMD widths,
4753  * which change from (SIMD8, SIMD16) to (SIMD16, SIMD32).
4754  */
4755 static unsigned
4756 get_sampler_lowered_simd_width(const struct intel_device_info *devinfo,
4757                                const elk_fs_inst *inst)
4758 {
4759    /* If we have a min_lod parameter on anything other than a simple sample
4760     * message, it will push it over 5 arguments and we have to fall back to
4761     * SIMD8.
4762     */
4763    if (inst->opcode != ELK_SHADER_OPCODE_TEX &&
4764        inst->components_read(TEX_LOGICAL_SRC_MIN_LOD))
4765       return devinfo->ver < 20 ? 8 : 16;
4766 
4767    /* Calculate the number of coordinate components that have to be present
4768     * assuming that additional arguments follow the texel coordinates in the
4769     * message payload.  On IVB+ there is no need for padding, on ILK-SNB we
4770     * need to pad to four or three components depending on the message,
4771     * pre-ILK we need to pad to at most three components.
4772     */
4773    const unsigned req_coord_components =
4774       (devinfo->ver >= 7 ||
4775        !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 :
4776       (devinfo->ver >= 5 && inst->opcode != ELK_SHADER_OPCODE_TXF_LOGICAL &&
4777                             inst->opcode != ELK_SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 :
4778       3;
4779 
4780    /* On Gfx9+ the LOD argument is for free if we're able to use the LZ
4781     * variant of the TXL or TXF message.
4782     */
4783    const bool implicit_lod = devinfo->ver >= 9 &&
4784                              (inst->opcode == ELK_SHADER_OPCODE_TXL ||
4785                               inst->opcode == ELK_SHADER_OPCODE_TXF) &&
4786                              inst->src[TEX_LOGICAL_SRC_LOD].is_zero();
4787 
4788    /* Calculate the total number of argument components that need to be passed
4789     * to the sampler unit.
4790     */
4791    const unsigned num_payload_components =
4792       MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE),
4793            req_coord_components) +
4794       inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
4795       (implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) +
4796       inst->components_read(TEX_LOGICAL_SRC_LOD2) +
4797       inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
4798       (inst->opcode == ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
4799        inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
4800       inst->components_read(TEX_LOGICAL_SRC_MCS);
4801 
4802    const unsigned simd_limit = reg_unit(devinfo) *
4803       (num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);
4804 
4805    /* SIMD16 (SIMD32 on Xe2) messages with more than five arguments exceed the
4806     * maximum message size supported by the sampler, regardless of whether a
4807     * header is provided or not.
4808     */
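   /* For example, a shadow-compared TXL on a cube array needs 4 coordinate
    * components + 1 shadow reference + 1 LOD = 6 arguments and is therefore
    * limited to SIMD8 (SIMD16 on Xe2), while a plain 2D TEX with only 2
    * coordinate components keeps the wider width.
    */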
4809    return MIN2(inst->exec_size, simd_limit);
4810 }
4811 
4812 /**
4813  * Get the closest native SIMD width supported by the hardware for instruction
4814  * \p inst.  The instruction will be left untouched by
4815  * elk_fs_visitor::lower_simd_width() if the returned value is equal to the
4816  * original execution size.
4817  */
4818 static unsigned
4819 get_lowered_simd_width(const elk_fs_visitor *shader, const elk_fs_inst *inst)
4820 {
4821    const struct elk_compiler *compiler = shader->compiler;
4822    const struct intel_device_info *devinfo = compiler->devinfo;
4823 
4824    switch (inst->opcode) {
4825    case ELK_OPCODE_DP4A:
4826    case ELK_OPCODE_MOV:
4827    case ELK_OPCODE_SEL:
4828    case ELK_OPCODE_NOT:
4829    case ELK_OPCODE_AND:
4830    case ELK_OPCODE_OR:
4831    case ELK_OPCODE_XOR:
4832    case ELK_OPCODE_SHR:
4833    case ELK_OPCODE_SHL:
4834    case ELK_OPCODE_ASR:
4835    case ELK_OPCODE_ROR:
4836    case ELK_OPCODE_ROL:
4837    case ELK_OPCODE_CMPN:
4838    case ELK_OPCODE_CSEL:
4839    case ELK_OPCODE_F32TO16:
4840    case ELK_OPCODE_F16TO32:
4841    case ELK_OPCODE_BFREV:
4842    case ELK_OPCODE_BFE:
4843    case ELK_OPCODE_ADD:
4844    case ELK_OPCODE_MUL:
4845    case ELK_OPCODE_AVG:
4846    case ELK_OPCODE_FRC:
4847    case ELK_OPCODE_RNDU:
4848    case ELK_OPCODE_RNDD:
4849    case ELK_OPCODE_RNDE:
4850    case ELK_OPCODE_RNDZ:
4851    case ELK_OPCODE_LZD:
4852    case ELK_OPCODE_FBH:
4853    case ELK_OPCODE_FBL:
4854    case ELK_OPCODE_CBIT:
4855    case ELK_OPCODE_SAD2:
4856    case ELK_OPCODE_MAD:
4857    case ELK_OPCODE_LRP:
4858    case ELK_OPCODE_ADD3:
4859    case ELK_FS_OPCODE_PACK:
4860    case ELK_SHADER_OPCODE_SEL_EXEC:
4861    case ELK_SHADER_OPCODE_CLUSTER_BROADCAST:
4862    case ELK_SHADER_OPCODE_MOV_RELOC_IMM:
4863       return get_fpu_lowered_simd_width(shader, inst);
4864 
4865    case ELK_OPCODE_CMP: {
4866       /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that
4867        * when the destination is a GRF the dependency-clear bit on the flag
4868        * register is cleared early.
4869        *
4870        * Suggested workarounds are to disable coissuing CMP instructions
4871        * or to split CMP(16) instructions into two CMP(8) instructions.
4872        *
4873        * We choose to split into CMP(8) instructions since disabling
4874        * coissuing would affect CMP instructions not otherwise affected by
4875        * the errata.
4876        */
4877       const unsigned max_width = (devinfo->verx10 == 70 &&
4878                                   !inst->dst.is_null() ? 8 : ~0);
4879       return MIN2(max_width, get_fpu_lowered_simd_width(shader, inst));
4880    }
4881    case ELK_OPCODE_BFI1:
4882    case ELK_OPCODE_BFI2:
4883       /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
4884        * should
4885        *  "Force BFI instructions to be executed always in SIMD8."
4886        */
4887       return MIN2(devinfo->platform == INTEL_PLATFORM_HSW ? 8 : ~0u,
4888                   get_fpu_lowered_simd_width(shader, inst));
4889 
4890    case ELK_OPCODE_IF:
4891       assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16);
4892       return inst->exec_size;
4893 
4894    case ELK_SHADER_OPCODE_RCP:
4895    case ELK_SHADER_OPCODE_RSQ:
4896    case ELK_SHADER_OPCODE_SQRT:
4897    case ELK_SHADER_OPCODE_EXP2:
4898    case ELK_SHADER_OPCODE_LOG2:
4899    case ELK_SHADER_OPCODE_SIN:
4900    case ELK_SHADER_OPCODE_COS: {
4901       /* Unary extended math instructions are limited to SIMD8 on Gfx4 and
4902        * Gfx6. Extended Math Function is limited to SIMD8 with half-float.
4903        */
4904       if (devinfo->ver == 6 || devinfo->verx10 == 40)
4905          return MIN2(8, inst->exec_size);
4906       if (inst->dst.type == ELK_REGISTER_TYPE_HF)
4907          return MIN2(8, inst->exec_size);
4908       return MIN2(16, inst->exec_size);
4909    }
4910 
4911    case ELK_SHADER_OPCODE_POW: {
4912       /* SIMD16 is only allowed on Gfx7+. Extended Math Function is limited
4913        * to SIMD8 with half-float
4914        */
4915       if (devinfo->ver < 7)
4916          return MIN2(8, inst->exec_size);
4917       if (inst->dst.type == ELK_REGISTER_TYPE_HF)
4918          return MIN2(8, inst->exec_size);
4919       return MIN2(16, inst->exec_size);
4920    }
4921 
4922    case ELK_SHADER_OPCODE_USUB_SAT:
4923    case ELK_SHADER_OPCODE_ISUB_SAT:
4924       return get_fpu_lowered_simd_width(shader, inst);
4925 
4926    case ELK_SHADER_OPCODE_INT_QUOTIENT:
4927    case ELK_SHADER_OPCODE_INT_REMAINDER:
4928       /* Integer division is limited to SIMD8 on all generations. */
4929       return MIN2(8, inst->exec_size);
4930 
4931    case ELK_FS_OPCODE_LINTERP:
4932    case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
4933    case ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT:
4934    case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
4935    case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
4936    case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
4937       return MIN2(16, inst->exec_size);
4938 
4939    case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
4940       /* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch
4941        * message used to implement varying pull constant loads, so expand it
4942        * to SIMD16.  An alternative with longer message payload length but
4943        * shorter return payload would be to use the SIMD8 sampler message that
4944        * takes (header, u, v, r) as parameters instead of (header, u).
4945        */
4946       return (devinfo->ver == 4 ? 16 : MIN2(16, inst->exec_size));
4947 
4948    case ELK_FS_OPCODE_DDX_COARSE:
4949    case ELK_FS_OPCODE_DDX_FINE:
4950    case ELK_FS_OPCODE_DDY_COARSE:
4951    case ELK_FS_OPCODE_DDY_FINE:
4952       /* The implementation of this virtual opcode may require emitting
4953        * compressed Align16 instructions, which are severely limited on some
4954        * generations.
4955        *
4956        * From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
4957        * Region Restrictions):
4958        *
4959        *  "In Align16 access mode, SIMD16 is not allowed for DW operations
4960        *   and SIMD8 is not allowed for DF operations."
4961        *
4962        * In this context, "DW operations" means "operations acting on 32-bit
4963        * values", so it includes operations on floats.
4964        *
4965        * Gfx4 has a similar restriction.  From the i965 PRM, section 11.5.3
4966        * (Instruction Compression -> Rules and Restrictions):
4967        *
4968        *  "A compressed instruction must be in Align1 access mode. Align16
4969        *   mode instructions cannot be compressed."
4970        *
4971        * Similar text exists in the g45 PRM.
4972        *
4973        * Empirically, compressed align16 instructions using odd register
4974        * numbers don't appear to work on Sandybridge either.
4975        */
4976       return (devinfo->ver == 4 || devinfo->ver == 6 ||
4977               (devinfo->verx10 == 70) ?
4978               MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size));
4979 
4980    case ELK_SHADER_OPCODE_MULH:
4981       /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
4982        * is 8-wide on Gfx7+.
4983        */
4984       return (devinfo->ver >= 20 ? 16 :
4985               devinfo->ver >= 7 ? 8 :
4986               get_fpu_lowered_simd_width(shader, inst));
4987 
4988    case ELK_FS_OPCODE_FB_WRITE_LOGICAL:
4989       /* Gfx6 doesn't support SIMD16 depth writes but we cannot handle them
4990        * here.
4991        */
4992       assert(devinfo->ver != 6 ||
4993              inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE ||
4994              inst->exec_size == 8);
4995       /* Dual-source FB writes are unsupported in SIMD16 mode. */
4996       return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
4997               8 : MIN2(16, inst->exec_size));
4998 
4999    case ELK_FS_OPCODE_FB_READ_LOGICAL:
5000       return MIN2(16, inst->exec_size);
5001 
5002    case ELK_SHADER_OPCODE_TEX_LOGICAL:
5003    case ELK_SHADER_OPCODE_TXF_CMS_LOGICAL:
5004    case ELK_SHADER_OPCODE_TXF_UMS_LOGICAL:
5005    case ELK_SHADER_OPCODE_TXF_MCS_LOGICAL:
5006    case ELK_SHADER_OPCODE_LOD_LOGICAL:
5007    case ELK_SHADER_OPCODE_TG4_LOGICAL:
5008    case ELK_SHADER_OPCODE_SAMPLEINFO_LOGICAL:
5009    case ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL:
5010    case ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL:
5011       return get_sampler_lowered_simd_width(devinfo, inst);
5012 
5013    /* On gfx12 parameters are fixed to 16-bit values and therefore they
5014     * always fit regardless of the execution size.
5015     */
5016    case ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
5017       return MIN2(16, inst->exec_size);
5018 
5019    case ELK_SHADER_OPCODE_TXD_LOGICAL:
5020       /* TXD is unsupported in SIMD16 mode prior to Xe2.  SIMD32 is still
5021        * unsupported on Xe2.
5022        */
5023       return devinfo->ver < 20 ? 8 : 16;
5024 
5025    case ELK_SHADER_OPCODE_TXL_LOGICAL:
5026    case ELK_FS_OPCODE_TXB_LOGICAL:
5027       /* Only one execution size is representable pre-ILK depending on whether
5028        * the shadow reference argument is present.
5029        */
5030       if (devinfo->ver == 4)
5031          return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8;
5032       else
5033          return get_sampler_lowered_simd_width(devinfo, inst);
5034 
5035    case ELK_SHADER_OPCODE_TXF_LOGICAL:
5036    case ELK_SHADER_OPCODE_TXS_LOGICAL:
5037       /* Gfx4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
5038        * messages.  Use SIMD16 instead.
5039        */
5040       if (devinfo->ver == 4)
5041          return 16;
5042       else
5043          return get_sampler_lowered_simd_width(devinfo, inst);
5044 
5045    case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
5046    case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
5047    case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
5048       return 8;
5049 
5050    case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
5051    case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
5052    case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
5053    case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
5054    case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
5055    case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
5056    case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
5057       return MIN2(16, inst->exec_size);
5058 
5059    case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
5060    case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
5061    case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
5062    case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
5063       return devinfo->ver <= 8 ? 8 : MIN2(16, inst->exec_size);
5064 
5065    case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
5066    case ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
5067    case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
5068       assert(inst->exec_size <= 16);
5069       return inst->exec_size;
5070 
5071    case ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
5072       return devinfo->has_lsc ? MIN2(16, inst->exec_size) : 8;
5073 
5074    case ELK_SHADER_OPCODE_URB_READ_LOGICAL:
5075    case ELK_SHADER_OPCODE_URB_WRITE_LOGICAL:
5076       return MIN2(devinfo->ver < 20 ? 8 : 16, inst->exec_size);
5077 
5078    case ELK_SHADER_OPCODE_QUAD_SWIZZLE: {
5079       const unsigned swiz = inst->src[1].ud;
5080       return (is_uniform(inst->src[0]) ?
5081                  get_fpu_lowered_simd_width(shader, inst) :
5082               devinfo->ver < 11 && type_sz(inst->src[0].type) == 4 ? 8 :
5083               swiz == ELK_SWIZZLE_XYXY || swiz == ELK_SWIZZLE_ZWZW ? 4 :
5084               get_fpu_lowered_simd_width(shader, inst));
5085    }
5086    case ELK_SHADER_OPCODE_MOV_INDIRECT: {
5087       /* From IVB and HSW PRMs:
5088        *
5089        * "2. When the destination requires two registers and the sources
5090        *  are indirect, the sources must use 1x1 regioning mode."
5091        *
5092        * In case of DF instructions in HSW/IVB, the exec_size is limited by
5093        * the EU decompression logic not handling VxH indirect addressing
5094        * correctly.
5095        */
5096       const unsigned max_size = (devinfo->ver >= 8 ? 2 : 1) * REG_SIZE;
5097       /* Prior to Broadwell, we only have 8 address subregisters. */
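      /* E.g. on HSW a MOV_INDIRECT with a packed DF destination is limited
       * to 32 / 8 = 4 channels here, while on BDW and later a packed F
       * destination allows the full 64 / 4 = 16.
       */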
5098       return MIN3(devinfo->ver >= 8 ? 16 : 8,
5099                   max_size / (inst->dst.stride * type_sz(inst->dst.type)),
5100                   inst->exec_size);
5101    }
5102 
5103    case ELK_SHADER_OPCODE_LOAD_PAYLOAD: {
5104       const unsigned reg_count =
5105          DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
5106 
5107       if (reg_count > 2) {
5108          /* Only LOAD_PAYLOAD instructions with per-channel destination region
5109           * can be easily lowered (which excludes headers and heterogeneous
5110           * types).
5111           */
5112          assert(!inst->header_size);
5113          for (unsigned i = 0; i < inst->sources; i++)
5114             assert(type_sz(inst->dst.type) == type_sz(inst->src[i].type) ||
5115                    inst->src[i].file == BAD_FILE);
5116 
5117          return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
5118       } else {
5119          return inst->exec_size;
5120       }
5121    }
5122    default:
5123       return inst->exec_size;
5124    }
5125 }
5126 
5127 /**
5128  * Return true if splitting out the group of channels of instruction \p inst
5129  * given by lbld.group() requires allocating a temporary for the i-th source
5130  * of the lowered instruction.
5131  */
5132 static inline bool
5133 needs_src_copy(const fs_builder &lbld, const elk_fs_inst *inst, unsigned i)
5134 {
5135    return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
5136             (inst->components_read(i) == 1 &&
5137              lbld.dispatch_width() <= inst->exec_size)) ||
5138           (inst->flags_written(lbld.shader->devinfo) &
5139            flag_mask(inst->src[i], type_sz(inst->src[i].type)));
5140 }
5141 
5142 /**
5143  * Extract the data that would be consumed by the channel group given by
5144  * lbld.group() from the i-th source region of instruction \p inst and return
5145  * it as result in packed form.
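 *
 * For example, when a SIMD16 instruction is split into two SIMD8 halves, the
 * second half's lbld.group() is 8 above inst->group: a single-component VGRF
 * source is then simply offset by 8 channels, an invariant (periodic) source
 * such as a uniform is reused as-is, and anything else (e.g. a
 * multi-component payload source) is first copied into a packed temporary.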
5146  */
5147 static elk_fs_reg
5148 emit_unzip(const fs_builder &lbld, elk_fs_inst *inst, unsigned i)
5149 {
5150    assert(lbld.group() >= inst->group);
5151 
5152    /* Specified channel group from the source region. */
5153    const elk_fs_reg src = horiz_offset(inst->src[i], lbld.group() - inst->group);
5154 
5155    if (needs_src_copy(lbld, inst, i)) {
5156       /* Builder of the right width to perform the copy avoiding uninitialized
5157        * data if the lowered execution size is greater than the original
5158        * execution size of the instruction.
5159        */
5160       const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
5161                                               inst->exec_size), 0);
5162       const elk_fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i));
5163 
5164       for (unsigned k = 0; k < inst->components_read(i); ++k)
5165          cbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));
5166 
5167       return tmp;
5168 
5169    } else if (is_periodic(inst->src[i], lbld.dispatch_width())) {
5170       /* The source is invariant for all dispatch_width-wide groups of the
5171        * original region.
5172        */
5173       return inst->src[i];
5174 
5175    } else {
5176       /* We can just point the lowered instruction at the right channel group
5177        * from the original region.
5178        */
5179       return src;
5180    }
5181 }
5182 
5183 /**
5184  * Return true if splitting out the group of channels of instruction \p inst
5185  * given by lbld.group() requires allocating a temporary for the destination
5186  * of the lowered instruction and copying the data back to the original
5187  * destination region.
5188  */
5189 static inline bool
5190 needs_dst_copy(const fs_builder &lbld, const elk_fs_inst *inst)
5191 {
5192    if (inst->dst.is_null())
5193       return false;
5194 
5195    /* If the instruction writes more than one component we'll have to shuffle
5196     * the results of multiple lowered instructions in order to make sure that
5197     * they end up arranged correctly in the original destination region.
5198     */
5199    if (inst->size_written > inst->dst.component_size(inst->exec_size))
5200       return true;
5201 
5202    /* If the lowered execution size is larger than the original the result of
5203     * the instruction won't fit in the original destination, so we'll have to
5204     * allocate a temporary in any case.
5205     */
5206    if (lbld.dispatch_width() > inst->exec_size)
5207       return true;
5208 
5209    for (unsigned i = 0; i < inst->sources; i++) {
5210       /* If we already made a copy of the source for other reasons there won't
5211        * be any overlap with the destination.
5212        */
5213       if (needs_src_copy(lbld, inst, i))
5214          continue;
5215 
5216       /* In order to keep the logic simple we emit a copy whenever the
5217        * destination region doesn't exactly match an overlapping source, which
5218        * may point at the source and destination not being aligned group by
5219        * group which could cause one of the lowered instructions to overwrite
5220        * the data read from the same source by other lowered instructions.
5221        */
5222       if (regions_overlap(inst->dst, inst->size_written,
5223                           inst->src[i], inst->size_read(i)) &&
5224           !inst->dst.equals(inst->src[i]))
5225         return true;
5226    }
5227 
5228    return false;
5229 }
5230 
5231 /**
5232  * Insert data from a packed temporary into the channel group given by
5233  * lbld.group() of the destination region of instruction \p inst and return
5234  * the temporary as result.  Any copy instructions that are required for
5235  * unzipping the previous value (in the case of partial writes) will be
5236  * inserted using \p lbld_before and any copy instructions required for
5237  * zipping up the destination of \p inst will be inserted using \p lbld_after.
5238  */
5239 static elk_fs_reg
5240 emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after,
5241          elk_fs_inst *inst)
5242 {
5243    assert(lbld_before.dispatch_width() == lbld_after.dispatch_width());
5244    assert(lbld_before.group() == lbld_after.group());
5245    assert(lbld_after.group() >= inst->group);
5246 
5247    const struct intel_device_info *devinfo = lbld_before.shader->devinfo;
5248 
5249    /* Specified channel group from the destination region. */
5250    const elk_fs_reg dst = horiz_offset(inst->dst, lbld_after.group() - inst->group);
5251 
5252    if (!needs_dst_copy(lbld_after, inst)) {
5253       /* No need to allocate a temporary for the lowered instruction, just
5254        * take the right group of channels from the original region.
5255        */
5256       return dst;
5257    }
5258 
5259    /* Deal with the residency data part later */
5260    const unsigned residency_size = inst->has_sampler_residency() ?
5261       (reg_unit(devinfo) * REG_SIZE) : 0;
5262    const unsigned dst_size = (inst->size_written - residency_size) /
5263       inst->dst.component_size(inst->exec_size);
5264 
5265    const elk_fs_reg tmp = lbld_after.vgrf(inst->dst.type,
5266                                       dst_size + inst->has_sampler_residency());
5267 
5268    if (inst->predicate) {
5269       /* Handle predication by copying the original contents of the
5270        * destination into the temporary before emitting the lowered
5271        * instruction.
5272        */
5273       const fs_builder gbld_before =
5274          lbld_before.group(MIN2(lbld_before.dispatch_width(),
5275                                 inst->exec_size), 0);
5276       for (unsigned k = 0; k < dst_size; ++k) {
5277          gbld_before.MOV(offset(tmp, lbld_before, k),
5278                          offset(dst, inst->exec_size, k));
5279       }
5280    }
5281 
5282    const fs_builder gbld_after =
5283       lbld_after.group(MIN2(lbld_after.dispatch_width(),
5284                             inst->exec_size), 0);
5285    for (unsigned k = 0; k < dst_size; ++k) {
5286       /* Use a builder of the right width to perform the copy avoiding
5287        * uninitialized data if the lowered execution size is greater than the
5288        * original execution size of the instruction.
5289        */
5290       gbld_after.MOV(offset(dst, inst->exec_size, k),
5291                      offset(tmp, lbld_after, k));
5292    }
5293 
5294    if (inst->has_sampler_residency()) {
5295       /* Sampler messages with residency need special attention.  The
5296        * first lane of the last component holds the Pixel Null Mask
5297        * (bits 0:15) plus some upper bits we need to discard (bits 16:31).
5298        * We have to build a single 32-bit value for the SIMD32 message out
5299        * of two SIMD16 16-bit values.
5300        */
5301       const fs_builder rbld = gbld_after.exec_all().group(1, 0);
5302       elk_fs_reg local_res_reg = component(
5303          retype(offset(tmp, lbld_before, dst_size),
5304                 ELK_REGISTER_TYPE_UW), 0);
5305       elk_fs_reg final_res_reg =
5306          retype(byte_offset(inst->dst,
5307                             inst->size_written - residency_size +
5308                             gbld_after.group() / 8),
5309                 ELK_REGISTER_TYPE_UW);
5310       rbld.MOV(final_res_reg, local_res_reg);
5311    }
5312 
5313    return tmp;
5314 }
5315 
5316 bool
5317 elk_fs_visitor::lower_simd_width()
5318 {
5319    bool progress = false;
5320 
5321    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
5322       const unsigned lower_width = get_lowered_simd_width(this, inst);
5323 
5324       if (lower_width != inst->exec_size) {
5325          /* Builder matching the original instruction.  We may also need to
5326           * emit an instruction of width larger than the original, set the
5327           * execution size of the builder to the highest of both for now so
5328           * we're sure that both cases can be handled.
5329           */
5330          const unsigned max_width = MAX2(inst->exec_size, lower_width);
5331 
5332          const fs_builder bld = fs_builder(this).at_end();
5333          const fs_builder ibld = bld.at(block, inst)
5334                                     .exec_all(inst->force_writemask_all)
5335                                     .group(max_width, inst->group / max_width);
5336 
5337          /* Split the copies in chunks of the execution width of either the
5338           * original or the lowered instruction, whichever is lower.
5339           */
5340          const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
5341          const unsigned residency_size = inst->has_sampler_residency() ?
5342             (reg_unit(devinfo) * REG_SIZE) : 0;
5343          const unsigned dst_size =
5344             (inst->size_written - residency_size) /
5345             inst->dst.component_size(inst->exec_size);
5346 
5347          assert(!inst->writes_accumulator && !inst->mlen);
5348 
5349          /* Inserting the zip, unzip, and duplicated instructions in all of
5350           * the right spots is somewhat tricky.  All of the unzip and any
5351           * instructions from the zip which unzip the destination prior to
5352           * writing need to happen before all of the per-group instructions
5353           * and the zip instructions need to happen after.  In order to sort
5354           * this all out, we insert the unzip instructions before \p inst,
5355           * insert the per-group instructions after \p inst (i.e. before
5356           * inst->next), and insert the zip instructions before the
5357           * instruction after \p inst.  Since we are inserting instructions
5358           * after \p inst, inst->next is a moving target and we need to save
5359           * it off here so that we insert the zip instructions in the right
5360           * place.
5361           *
5362           * Since each split instruction is inserted right after \p inst, the
5363           * instructions end up in the reverse order that we insert them.
5364           * However, certain render target writes require that the low group
5365           * instructions come before the high group.  From the Ivy Bridge PRM
5366           * Vol. 4, Pt. 1, Section 3.9.11:
5367           *
5368           *    "If multiple SIMD8 Dual Source messages are delivered by the
5369           *    pixel shader thread, each SIMD8_DUALSRC_LO message must be
5370           *    issued before the SIMD8_DUALSRC_HI message with the same Slot
5371           *    Group Select setting."
5372           *
5373           * And, from Section 3.9.11.1 of the same PRM:
5374           *
5375           *    "When SIMD32 or SIMD16 PS threads send render target writes
5376           *    with multiple SIMD8 and SIMD16 messages, the following must
5377           *    hold:
5378           *
5379           *    All the slots (as described above) must have a corresponding
5380           *    render target write irrespective of the slot's validity. A slot
5381           *    is considered valid when at least one sample is enabled. For
5382           *    example, a SIMD16 PS thread must send two SIMD8 render target
5383           *    writes to cover all the slots.
5384           *
5385           *    PS thread must send SIMD render target write messages with
5386           *    increasing slot numbers. For example, SIMD16 thread has
5387           *    Slot[15:0] and if two SIMD8 render target writes are used, the
5388           *    first SIMD8 render target write must send Slot[7:0] and the
5389           *    next one must send Slot[15:8]."
5390           *
5391           * In order to make low group instructions come before high group
5392           * instructions (this is required for some render target writes), we
5393           * split from the highest group to lowest.
5394           */
5395          exec_node *const after_inst = inst->next;
5396          for (int i = n - 1; i >= 0; i--) {
5397             /* Emit a copy of the original instruction with the lowered width.
5398              * If the EOT flag was set throw it away except for the last
5399              * instruction to avoid killing the thread prematurely.
5400              */
5401             elk_fs_inst split_inst = *inst;
5402             split_inst.exec_size = lower_width;
5403             split_inst.eot = inst->eot && i == int(n - 1);
5404 
5405             /* Select the correct channel enables for the i-th group, then
5406              * transform the sources and destination and emit the lowered
5407              * instruction.
5408              */
5409             const fs_builder lbld = ibld.group(lower_width, i);
5410 
5411             for (unsigned j = 0; j < inst->sources; j++)
5412                split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);
5413 
5414             split_inst.dst = emit_zip(lbld.at(block, inst),
5415                                       lbld.at(block, after_inst), inst);
5416             split_inst.size_written =
5417                split_inst.dst.component_size(lower_width) * dst_size +
5418                residency_size;
5419 
5420             lbld.at(block, inst->next).emit(split_inst);
5421          }
5422 
5423          inst->remove(block);
5424          progress = true;
5425       }
5426    }
5427 
5428    if (progress)
5429       invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
5430 
5431    return progress;
5432 }
5433 
5434 /**
5435  * Transform barycentric vectors into the interleaved form expected by the PLN
5436  * instruction and returned by the Gfx7+ PI shared function.
5437  *
5438  * For channels 0-15 in SIMD16 mode they are expected to be laid out as
5439  * follows in the register file:
5440  *
5441  *    rN+0: X[0-7]
5442  *    rN+1: Y[0-7]
5443  *    rN+2: X[8-15]
5444  *    rN+3: Y[8-15]
5445  *
5446  * There is no need to handle SIMD32 here -- This is expected to be run after
5447  * SIMD lowering, since SIMD lowering relies on vectors having the standard
5448  * component layout.
5449  */
5450 bool
5451 elk_fs_visitor::lower_barycentrics()
5452 {
5453    const bool has_interleaved_layout = devinfo->has_pln ||
5454       (devinfo->ver >= 7 && devinfo->ver < 20);
5455    bool progress = false;
5456 
5457    if (stage != MESA_SHADER_FRAGMENT || !has_interleaved_layout)
5458       return false;
5459 
5460    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
5461       if (inst->exec_size < 16)
5462          continue;
5463 
5464       const fs_builder ibld(this, block, inst);
5465       const fs_builder ubld = ibld.exec_all().group(8, 0);
5466 
5467       switch (inst->opcode) {
5468       case ELK_FS_OPCODE_LINTERP : {
5469          assert(inst->exec_size == 16);
5470          const elk_fs_reg tmp = ibld.vgrf(inst->src[0].type, 2);
5471          elk_fs_reg srcs[4];
5472 
5473          for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
5474             srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2),
5475                                    8 * (i / 2));
5476 
5477          ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));
5478 
5479          inst->src[0] = tmp;
5480          progress = true;
5481          break;
5482       }
5483       case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
5484       case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
5485       case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
5486          assert(inst->exec_size == 16);
5487          const elk_fs_reg tmp = ibld.vgrf(inst->dst.type, 2);
5488 
5489          for (unsigned i = 0; i < 2; i++) {
5490             for (unsigned g = 0; g < inst->exec_size / 8; g++) {
5491                elk_fs_inst *mov = ibld.at(block, inst->next).group(8, g)
5492                                   .MOV(horiz_offset(offset(inst->dst, ibld, i),
5493                                                     8 * g),
5494                                        offset(tmp, ubld, 2 * g + i));
5495                mov->predicate = inst->predicate;
5496                mov->predicate_inverse = inst->predicate_inverse;
5497                mov->flag_subreg = inst->flag_subreg;
5498             }
5499          }
5500 
5501          inst->dst = tmp;
5502          progress = true;
5503          break;
5504       }
5505       default:
5506          break;
5507       }
5508    }
5509 
5510    if (progress)
5511       invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
5512 
5513    return progress;
5514 }
5515 
5516 /**
5517  * Lower a derivative instruction as the floating-point difference of two
5518  * swizzles of the source, specified as \p swz0 and \p swz1.
5519  */
5520 static bool
5521 lower_derivative(elk_fs_visitor *v, elk_bblock_t *block, elk_fs_inst *inst,
5522                  unsigned swz0, unsigned swz1)
5523 {
5524    const fs_builder ubld = fs_builder(v, block, inst).exec_all();
5525    const elk_fs_reg tmp0 = ubld.vgrf(inst->src[0].type);
5526    const elk_fs_reg tmp1 = ubld.vgrf(inst->src[0].type);
5527 
5528    ubld.emit(ELK_SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], elk_imm_ud(swz0));
5529    ubld.emit(ELK_SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], elk_imm_ud(swz1));
5530 
5531    inst->resize_sources(2);
5532    inst->src[0] = negate(tmp0);
5533    inst->src[1] = tmp1;
5534    inst->opcode = ELK_OPCODE_ADD;
5535 
5536    return true;
5537 }
5538 
5539 /**
5540  * Lower derivative instructions on platforms where codegen cannot implement
5541  * them efficiently (i.e. XeHP).
5542  */
5543 bool
5544 elk_fs_visitor::lower_derivatives()
5545 {
5546    bool progress = false;
5547 
5548    if (devinfo->verx10 < 125)
5549       return false;
5550 
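   /* Each derivative is lowered to tmp1 - tmp0, where tmp0 and tmp1 are quad
    * swizzles of the source.  Within a 2x2 subspan the channels are laid out
    * with (x, y) on the top row and (z, w) on the bottom row, so e.g.
    * DDX_FINE uses XXZZ/YYWW (per-row differences) while DDY_COARSE uses
    * XXXX/ZZZZ (bottom-left minus top-left).
    */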
5551    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
5552       if (inst->opcode == ELK_FS_OPCODE_DDX_COARSE)
5553          progress |= lower_derivative(this, block, inst,
5554                                       ELK_SWIZZLE_XXXX, ELK_SWIZZLE_YYYY);
5555 
5556       else if (inst->opcode == ELK_FS_OPCODE_DDX_FINE)
5557          progress |= lower_derivative(this, block, inst,
5558                                       ELK_SWIZZLE_XXZZ, ELK_SWIZZLE_YYWW);
5559 
5560       else if (inst->opcode == ELK_FS_OPCODE_DDY_COARSE)
5561          progress |= lower_derivative(this, block, inst,
5562                                       ELK_SWIZZLE_XXXX, ELK_SWIZZLE_ZZZZ);
5563 
5564       else if (inst->opcode == ELK_FS_OPCODE_DDY_FINE)
5565          progress |= lower_derivative(this, block, inst,
5566                                       ELK_SWIZZLE_XYXY, ELK_SWIZZLE_ZWZW);
5567    }
5568 
5569    if (progress)
5570       invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
5571 
5572    return progress;
5573 }
5574 
5575 bool
5576 elk_fs_visitor::lower_find_live_channel()
5577 {
5578    bool progress = false;
5579 
5580    if (devinfo->ver < 8)
5581       return false;
5582 
5583    bool packed_dispatch =
5584       elk_stage_has_packed_dispatch(devinfo, stage, max_polygons,
5585                                     stage_prog_data);
5586    bool vmask =
5587       stage == MESA_SHADER_FRAGMENT &&
5588       elk_wm_prog_data(stage_prog_data)->uses_vmask;
5589 
5590    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
5591       if (inst->opcode != ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL &&
5592           inst->opcode != ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL)
5593          continue;
5594 
5595       bool first = inst->opcode == ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL;
5596 
5597       /* Getting the first active channel index is easy on Gfx8: Just find
5598        * the first bit set in the execution mask.  The register exists on
5599        * HSW already but it reads back as all ones when the current
5600        * instruction has execution masking disabled, so it's kind of
5601        * useless there.
5602        */
5603       elk_fs_reg exec_mask(retype(elk_mask_reg(0), ELK_REGISTER_TYPE_UD));
5604 
5605       const fs_builder ibld(this, block, inst);
5606       if (!inst->is_partial_write())
5607          ibld.emit_undef_for_dst(inst);
5608 
5609       const fs_builder ubld = fs_builder(this, block, inst).exec_all().group(1, 0);
5610 
5611       /* ce0 doesn't consider the thread dispatch mask (DMask or VMask),
5612        * so combine the execution and dispatch masks to obtain the true mask.
5613        *
5614        * If we're looking for the first live channel, and we have packed
5615        * dispatch, we can skip this step, as we know all dispatched channels
5616        * will appear at the front of the mask.
5617        */
5618       if (!(first && packed_dispatch)) {
5619          elk_fs_reg mask = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5620          ubld.UNDEF(mask);
5621          ubld.emit(ELK_SHADER_OPCODE_READ_SR_REG, mask, elk_imm_ud(vmask ? 3 : 2));
5622 
5623          /* Quarter control has the effect of magically shifting the value of
5624           * ce0 so you'll get the first/last active channel relative to the
5625           * specified quarter control as the result.
5626           */
5627          if (inst->group > 0)
5628             ubld.SHR(mask, mask, elk_imm_ud(ALIGN(inst->group, 8)));
5629 
5630          ubld.AND(mask, exec_mask, mask);
5631          exec_mask = mask;
5632       }
5633 
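      /* FBL gives the index of the lowest set bit directly.  For the last
       * live channel, compute 31 - LZD(mask), which is the index of the
       * highest set bit of the combined mask.
       */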
5634       if (first) {
5635          ubld.FBL(inst->dst, exec_mask);
5636       } else {
5637          elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD, 1);
5638          ubld.UNDEF(tmp);
5639          ubld.LZD(tmp, exec_mask);
5640          ubld.ADD(inst->dst, negate(tmp), elk_imm_uw(31));
5641       }
5642 
5643       inst->remove(block);
5644       progress = true;
5645    }
5646 
5647    if (progress)
5648       invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
5649 
5650    return progress;
5651 }
5652 
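/**
 * Dump the whole program to \p file.  When a CFG is available each line is
 * prefixed with the register pressure at that instruction and indented
 * according to control-flow nesting depth.
 */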
5653 void
5654 elk_fs_visitor::dump_instructions_to_file(FILE *file) const
5655 {
5656    if (cfg) {
5657       const register_pressure &rp = regpressure_analysis.require();
5658       unsigned ip = 0, max_pressure = 0;
5659       unsigned cf_count = 0;
5660       foreach_block_and_inst(block, elk_backend_instruction, inst, cfg) {
5661          if (inst->is_control_flow_end())
5662             cf_count -= 1;
5663 
5664          max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
5665          fprintf(file, "{%3d} %4d: ", rp.regs_live_at_ip[ip], ip);
5666          for (unsigned i = 0; i < cf_count; i++)
5667             fprintf(file, "  ");
5668          dump_instruction(inst, file);
5669          ip++;
5670 
5671          if (inst->is_control_flow_begin())
5672             cf_count += 1;
5673       }
5674       fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
5675    } else {
5676       int ip = 0;
5677       foreach_in_list(elk_backend_instruction, inst, &instructions) {
5678          fprintf(file, "%4d: ", ip++);
5679          dump_instruction(inst, file);
5680       }
5681    }
5682 }
5683 
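/**
 * Print a single instruction in a human-readable form, roughly
 * "(+f0.0) add.sat(16) vgrf7:F, vgrf3:F, vgrf5:F".
 */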
5684 void
5685 elk_fs_visitor::dump_instruction_to_file(const elk_backend_instruction *be_inst, FILE *file) const
5686 {
5687    const elk_fs_inst *inst = (const elk_fs_inst *)be_inst;
5688 
5689    if (inst->predicate) {
5690       fprintf(file, "(%cf%d.%d) ",
5691               inst->predicate_inverse ? '-' : '+',
5692               inst->flag_subreg / 2,
5693               inst->flag_subreg % 2);
5694    }
5695 
5696    fprintf(file, "%s", elk_instruction_name(&compiler->isa, inst->opcode));
5697    if (inst->saturate)
5698       fprintf(file, ".sat");
5699    if (inst->conditional_mod) {
5700       fprintf(file, "%s", elk_conditional_modifier[inst->conditional_mod]);
5701       if (!inst->predicate &&
5702           (devinfo->ver < 5 || (inst->opcode != ELK_OPCODE_SEL &&
5703                                 inst->opcode != ELK_OPCODE_CSEL &&
5704                                 inst->opcode != ELK_OPCODE_IF &&
5705                                 inst->opcode != ELK_OPCODE_WHILE))) {
5706          fprintf(file, ".f%d.%d", inst->flag_subreg / 2,
5707                  inst->flag_subreg % 2);
5708       }
5709    }
5710    fprintf(file, "(%d) ", inst->exec_size);
5711 
5712    if (inst->mlen) {
5713       fprintf(file, "(mlen: %d) ", inst->mlen);
5714    }
5715 
5716    if (inst->ex_mlen) {
5717       fprintf(file, "(ex_mlen: %d) ", inst->ex_mlen);
5718    }
5719 
5720    if (inst->eot) {
5721       fprintf(file, "(EOT) ");
5722    }
5723 
5724    switch (inst->dst.file) {
5725    case VGRF:
5726       fprintf(file, "vgrf%d", inst->dst.nr);
5727       break;
5728    case FIXED_GRF:
5729       fprintf(file, "g%d", inst->dst.nr);
5730       break;
5731    case MRF:
5732       fprintf(file, "m%d", inst->dst.nr);
5733       break;
5734    case BAD_FILE:
5735       fprintf(file, "(null)");
5736       break;
5737    case UNIFORM:
5738       fprintf(file, "***u%d***", inst->dst.nr);
5739       break;
5740    case ATTR:
5741       fprintf(file, "***attr%d***", inst->dst.nr);
5742       break;
5743    case ARF:
5744       switch (inst->dst.nr) {
5745       case ELK_ARF_NULL:
5746          fprintf(file, "null");
5747          break;
5748       case ELK_ARF_ADDRESS:
5749          fprintf(file, "a0.%d", inst->dst.subnr);
5750          break;
5751       case ELK_ARF_ACCUMULATOR:
5752          fprintf(file, "acc%d", inst->dst.subnr);
5753          break;
5754       case ELK_ARF_FLAG:
5755          fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
5756          break;
5757       default:
5758          fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
5759          break;
5760       }
5761       break;
5762    case IMM:
5763       unreachable("not reached");
5764    }
5765 
5766    if (inst->dst.offset ||
5767        (inst->dst.file == VGRF &&
5768         alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
5769       const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
5770       fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
5771               inst->dst.offset % reg_size);
5772    }
5773 
5774    if (inst->dst.stride != 1)
5775       fprintf(file, "<%u>", inst->dst.stride);
5776    fprintf(file, ":%s, ", elk_reg_type_to_letters(inst->dst.type));
5777 
5778    for (int i = 0; i < inst->sources; i++) {
5779       if (inst->src[i].negate)
5780          fprintf(file, "-");
5781       if (inst->src[i].abs)
5782          fprintf(file, "|");
5783       switch (inst->src[i].file) {
5784       case VGRF:
5785          fprintf(file, "vgrf%d", inst->src[i].nr);
5786          break;
5787       case FIXED_GRF:
5788          fprintf(file, "g%d", inst->src[i].nr);
5789          break;
5790       case MRF:
5791          fprintf(file, "***m%d***", inst->src[i].nr);
5792          break;
5793       case ATTR:
5794          fprintf(file, "attr%d", inst->src[i].nr);
5795          break;
5796       case UNIFORM:
5797          fprintf(file, "u%d", inst->src[i].nr);
5798          break;
5799       case BAD_FILE:
5800          fprintf(file, "(null)");
5801          break;
5802       case IMM:
5803          switch (inst->src[i].type) {
5804          case ELK_REGISTER_TYPE_HF:
5805             fprintf(file, "%-ghf", _mesa_half_to_float(inst->src[i].ud & 0xffff));
5806             break;
5807          case ELK_REGISTER_TYPE_F:
5808             fprintf(file, "%-gf", inst->src[i].f);
5809             break;
5810          case ELK_REGISTER_TYPE_DF:
5811             fprintf(file, "%fdf", inst->src[i].df);
5812             break;
5813          case ELK_REGISTER_TYPE_W:
5814          case ELK_REGISTER_TYPE_D:
5815             fprintf(file, "%dd", inst->src[i].d);
5816             break;
5817          case ELK_REGISTER_TYPE_UW:
5818          case ELK_REGISTER_TYPE_UD:
5819             fprintf(file, "%uu", inst->src[i].ud);
5820             break;
5821          case ELK_REGISTER_TYPE_Q:
5822             fprintf(file, "%" PRId64 "q", inst->src[i].d64);
5823             break;
5824          case ELK_REGISTER_TYPE_UQ:
5825             fprintf(file, "%" PRIu64 "uq", inst->src[i].u64);
5826             break;
5827          case ELK_REGISTER_TYPE_VF:
5828             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
5829                     elk_vf_to_float((inst->src[i].ud >>  0) & 0xff),
5830                     elk_vf_to_float((inst->src[i].ud >>  8) & 0xff),
5831                     elk_vf_to_float((inst->src[i].ud >> 16) & 0xff),
5832                     elk_vf_to_float((inst->src[i].ud >> 24) & 0xff));
5833             break;
5834          case ELK_REGISTER_TYPE_V:
5835          case ELK_REGISTER_TYPE_UV:
5836             fprintf(file, "%08x%s", inst->src[i].ud,
5837                     inst->src[i].type == ELK_REGISTER_TYPE_V ? "V" : "UV");
5838             break;
5839          default:
5840             fprintf(file, "???");
5841             break;
5842          }
5843          break;
5844       case ARF:
5845          switch (inst->src[i].nr) {
5846          case ELK_ARF_NULL:
5847             fprintf(file, "null");
5848             break;
5849          case ELK_ARF_ADDRESS:
5850             fprintf(file, "a0.%d", inst->src[i].subnr);
5851             break;
5852          case ELK_ARF_ACCUMULATOR:
5853             fprintf(file, "acc%d", inst->src[i].subnr);
5854             break;
5855          case ELK_ARF_FLAG:
5856             fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
5857             break;
5858          default:
5859             fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
5860             break;
5861          }
5862          break;
5863       }
5864 
5865       if (inst->src[i].offset ||
5866           (inst->src[i].file == VGRF &&
5867            alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
5868          const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
5869          fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
5870                  inst->src[i].offset % reg_size);
5871       }
5872 
5873       if (inst->src[i].abs)
5874          fprintf(file, "|");
5875 
5876       if (inst->src[i].file != IMM) {
5877          unsigned stride;
5878          if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
5879             unsigned hstride = inst->src[i].hstride;
5880             stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
5881          } else {
5882             stride = inst->src[i].stride;
5883          }
5884          if (stride != 1)
5885             fprintf(file, "<%u>", stride);
5886 
5887          fprintf(file, ":%s", elk_reg_type_to_letters(inst->src[i].type));
5888       }
5889 
5890       if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
5891          fprintf(file, ", ");
5892    }
5893 
5894    fprintf(file, " ");
5895 
5896    if (inst->force_writemask_all)
5897       fprintf(file, "NoMask ");
5898 
5899    if (inst->exec_size != dispatch_width)
5900       fprintf(file, "group%d ", inst->group);
5901 
5902    fprintf(file, "\n");
5903 }
5904 
5905 elk::register_pressure::register_pressure(const elk_fs_visitor *v)
5906 {
5907    const fs_live_variables &live = v->live_analysis.require();
5908    const unsigned num_instructions = v->cfg->num_blocks ?
5909       v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;
5910 
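   /* regs_live_at_ip[ip] counts how many GRFs are live at instruction ip:
    * the sizes of all VGRFs whose live range covers ip, plus any payload
    * registers still in use at that point.
    */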
5911    regs_live_at_ip = new unsigned[num_instructions]();
5912 
5913    for (unsigned reg = 0; reg < v->alloc.count; reg++) {
5914       for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
5915          regs_live_at_ip[ip] += v->alloc.sizes[reg];
5916    }
5917 
5918    const unsigned payload_count = v->first_non_payload_grf;
5919 
5920    int *payload_last_use_ip = new int[payload_count];
5921    v->calculate_payload_ranges(payload_count, payload_last_use_ip);
5922 
5923    for (unsigned reg = 0; reg < payload_count; reg++) {
5924       for (int ip = 0; ip < payload_last_use_ip[reg]; ip++)
5925          ++regs_live_at_ip[ip];
5926    }
5927 
5928    delete[] payload_last_use_ip;
5929 }
5930 
5931 elk::register_pressure::~register_pressure()
5932 {
5933    delete[] regs_live_at_ip;
5934 }
5935 
5936 void
5937 elk_fs_visitor::invalidate_analysis(elk::analysis_dependency_class c)
5938 {
5939    elk_backend_shader::invalidate_analysis(c);
5940    live_analysis.invalidate(c);
5941    regpressure_analysis.invalidate(c);
5942 }
5943 
5944 void
5945 elk_fs_visitor::debug_optimizer(const nir_shader *nir,
5946                             const char *pass_name,
5947                             int iteration, int pass_num) const
5948 {
5949    if (!elk_should_print_shader(nir, DEBUG_OPTIMIZER))
5950       return;
5951 
5952    char *filename;
5953    int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
5954                       debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
5955                       _mesa_shader_stage_to_abbrev(stage), dispatch_width, nir->info.name,
5956                       iteration, pass_num, pass_name);
5957    if (ret == -1)
5958       return;
5959    dump_instructions(filename);
5960    free(filename);
5961 }
5962 
5963 void
5964 elk_fs_visitor::optimize()
5965 {
5966    debug_optimizer(nir, "start", 0, 0);
5967 
5968    /* Start by validating the shader we currently have. */
5969    validate();
5970 
5971    bool progress = false;
5972    int iteration = 0;
5973    int pass_num = 0;
5974 
5975 #define OPT(pass, args...) ({                                           \
5976       pass_num++;                                                       \
5977       bool this_progress = pass(args);                                  \
5978                                                                         \
5979       if (this_progress)                                                \
5980          debug_optimizer(nir, #pass, iteration, pass_num);              \
5981                                                                         \
5982       validate();                                                       \
5983                                                                         \
5984       progress = progress || this_progress;                             \
5985       this_progress;                                                    \
5986    })
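   /* OPT() evaluates to the progress flag of the pass it runs, so it can be
    * used directly in conditionals, e.g. "if (OPT(lower_pack)) ...".
    */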
5987 
5988    assign_constant_locations();
5989    OPT(lower_constant_loads);
5990 
5991    validate();
5992 
5993    OPT(split_virtual_grfs);
5994 
5995    /* Before anything else, eliminate dead code.  The results of some NIR
5996     * instructions may effectively be calculated twice.  Once when the
5997     * instruction is encountered, and again when the user of that result is
5998     * encountered.  Wipe those away before algebraic optimizations and
5999     * especially copy propagation can mix things up.
6000     */
6001    OPT(dead_code_eliminate);
6002 
6003    OPT(remove_extra_rounding_modes);
6004 
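   /* Main optimization loop: keep re-running these passes until none of
    * them makes any further progress.
    */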
6005    do {
6006       progress = false;
6007       pass_num = 0;
6008       iteration++;
6009 
6010       OPT(remove_duplicate_mrf_writes);
6011 
6012       OPT(opt_algebraic);
6013       OPT(opt_cse);
6014       OPT(opt_copy_propagation);
6015       OPT(elk_opt_predicated_break, this);
6016       OPT(opt_cmod_propagation);
6017       OPT(dead_code_eliminate);
6018       OPT(opt_peephole_sel);
6019       OPT(elk_dead_control_flow_eliminate, this);
6020       OPT(opt_saturate_propagation);
6021       OPT(register_coalesce);
6022       OPT(compute_to_mrf);
6023       OPT(eliminate_find_live_channel);
6024 
6025       OPT(compact_virtual_grfs);
6026    } while (progress);
6027 
6028    progress = false;
6029    pass_num = 0;
6030 
6031    if (OPT(lower_pack)) {
6032       OPT(register_coalesce);
6033       OPT(dead_code_eliminate);
6034    }
6035 
6036    OPT(lower_simd_width);
6037    OPT(lower_barycentrics);
6038    OPT(lower_logical_sends);
6039 
6040    /* After logical SEND lowering. */
6041 
6042    if (OPT(opt_copy_propagation))
6043       OPT(opt_algebraic);
6044 
6045    /* Identify trailing zeros in the LOAD_PAYLOAD of sampler messages.
6046     * Do this before splitting SENDs.
6047     */
6048    if (devinfo->ver >= 7) {
6049       if (OPT(opt_zero_samples) && OPT(opt_copy_propagation))
6050          OPT(opt_algebraic);
6051    }
6052 
6053    OPT(opt_split_sends);
6054    OPT(fixup_nomask_control_flow);
6055 
6056    if (progress) {
6057       if (OPT(opt_copy_propagation))
6058          OPT(opt_algebraic);
6059 
6060       /* Run after logical send lowering to give it a chance to CSE the
6061        * LOAD_PAYLOAD instructions created to construct the payloads of
6062        * e.g. texturing messages in cases where it wasn't possible to CSE the
6063        * whole logical instruction.
6064        */
6065       OPT(opt_cse);
6066       OPT(register_coalesce);
6067       OPT(compute_to_mrf);
6068       OPT(dead_code_eliminate);
6069       OPT(remove_duplicate_mrf_writes);
6070       OPT(opt_peephole_sel);
6071    }
6072 
6073    OPT(opt_redundant_halt);
6074 
6075    if (OPT(lower_load_payload)) {
6076       OPT(split_virtual_grfs);
6077 
6078       /* Lower 64 bit MOVs generated by payload lowering. */
6079       if (!devinfo->has_64bit_float || !devinfo->has_64bit_int)
6080          OPT(opt_algebraic);
6081 
6082       OPT(register_coalesce);
6083       OPT(lower_simd_width);
6084       OPT(compute_to_mrf);
6085       OPT(dead_code_eliminate);
6086    }
6087 
6088    OPT(opt_combine_constants);
6089    if (OPT(lower_integer_multiplication)) {
6090       /* If lower_integer_multiplication made progress, it may have produced
6091        * some 32x32-bit MULs in the process of lowering 64-bit MULs.  Run it
6092        * one more time to clean those up if they exist.
6093        */
6094       OPT(lower_integer_multiplication);
6095    }
6096    OPT(lower_sub_sat);
6097 
6098    if (devinfo->ver <= 5 && OPT(lower_minmax)) {
6099       OPT(opt_cmod_propagation);
6100       OPT(opt_cse);
6101       if (OPT(opt_copy_propagation))
6102          OPT(opt_algebraic);
6103       OPT(dead_code_eliminate);
6104    }
6105 
6106    progress = false;
6107    OPT(lower_derivatives);
6108    OPT(lower_regioning);
6109    if (progress) {
6110       if (OPT(opt_copy_propagation))
6111          OPT(opt_algebraic);
6112       OPT(dead_code_eliminate);
6113       OPT(lower_simd_width);
6114    }
6115 
6116    OPT(fixup_sends_duplicate_payload);
6117 
6118    OPT(lower_uniform_pull_constant_loads);
6119 
6120    OPT(lower_find_live_channel);
6121 
6122    validate();
6123 }
6124 
6125 /**
6126  * From the Skylake PRM Vol. 2a docs for sends:
6127  *
6128  *    "It is required that the second block of GRFs does not overlap with the
6129  *    first block."
6130  *
6131  * There are plenty of cases where we may accidentally violate this due to
6132  * having, for instance, both sources be the constant 0.  This little pass
6133  * just adds a new vgrf for the second payload and copies it over.
6134  */
6135 bool
6136 elk_fs_visitor::fixup_sends_duplicate_payload()
6137 {
6138    bool progress = false;
6139 
6140    foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
6141       if (inst->opcode == ELK_SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
6142           regions_overlap(inst->src[2], inst->mlen * REG_SIZE,
6143                           inst->src[3], inst->ex_mlen * REG_SIZE)) {
6144          elk_fs_reg tmp = elk_fs_reg(VGRF, alloc.allocate(inst->ex_mlen),
6145                              ELK_REGISTER_TYPE_UD);
6146          /* Sadly, we've lost all notion of channels and bit sizes at this
6147           * point.  Just WE_all it.
6148           */
6149          const fs_builder ibld = fs_builder(this, block, inst).exec_all().group(16, 0);
6150          elk_fs_reg copy_src = retype(inst->src[3], ELK_REGISTER_TYPE_UD);
6151          elk_fs_reg copy_dst = tmp;
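         /* Each SIMD16 UD MOV copies two whole GRFs of the second payload;
          * a trailing odd register is copied with a SIMD8 MOV instead.
          */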
6152          for (unsigned i = 0; i < inst->ex_mlen; i += 2) {
6153             if (inst->ex_mlen == i + 1) {
6154                /* Only one register left; do SIMD8 */
6155                ibld.group(8, 0).MOV(copy_dst, copy_src);
6156             } else {
6157                ibld.MOV(copy_dst, copy_src);
6158             }
6159             copy_src = offset(copy_src, ibld, 1);
6160             copy_dst = offset(copy_dst, ibld, 1);
6161          }
6162          inst->src[3] = tmp;
6163          progress = true;
6164       }
6165    }
6166 
6167    if (progress)
6168       invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
6169 
6170    return progress;
6171 }
6172 
6173 /**
6174  * Three-source instructions must have a GRF/MRF destination register.
6175  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
6176  */
6177 void
6178 elk_fs_visitor::fixup_3src_null_dest()
6179 {
6180    bool progress = false;
6181 
6182    foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
6183       if (inst->elk_is_3src(compiler) && inst->dst.is_null()) {
6184          inst->dst = elk_fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
6185                             inst->dst.type);
6186          progress = true;
6187       }
6188    }
6189 
6190    if (progress)
6191       invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
6192                           DEPENDENCY_VARIABLES);
6193 }
6194 
6195 static bool
6196 needs_dummy_fence(const intel_device_info *devinfo, elk_fs_inst *inst)
6197 {
6198    /* This workaround is about making sure that any instruction writing
6199     * through UGM has completed before we hit EOT.
6200     */
6201    if (inst->sfid != GFX12_SFID_UGM)
6202       return false;
6203 
6204    /* Any UGM, non-Scratch-surface Stores (not including Atomic) messages,
6205     * where the L1-cache override is NOT among {WB, WS, WT}
6206     */
6207    enum elk_lsc_opcode opcode = lsc_msg_desc_opcode(devinfo, inst->desc);
6208    if (elk_lsc_opcode_is_store(opcode)) {
6209       switch (lsc_msg_desc_cache_ctrl(devinfo, inst->desc)) {
6210       case LSC_CACHE_STORE_L1STATE_L3MOCS:
6211       case LSC_CACHE_STORE_L1WB_L3WB:
6212       case LSC_CACHE_STORE_L1S_L3UC:
6213       case LSC_CACHE_STORE_L1S_L3WB:
6214       case LSC_CACHE_STORE_L1WT_L3UC:
6215       case LSC_CACHE_STORE_L1WT_L3WB:
6216          return false;
6217 
6218       default:
6219          return true;
6220       }
6221    }
6222 
6223    /* Any UGM Atomic message WITHOUT return value */
6224    if (elk_lsc_opcode_is_atomic(opcode) && inst->dst.file == BAD_FILE)
6225       return true;
6226 
6227    return false;
6228 }
6229 
6230 /* Wa_14015360517
6231  *
6232  * The first instruction of any kernel should have non-zero emask.
6233  * Make sure this happens by introducing a dummy mov instruction.
6234  */
6235 void
6236 elk_fs_visitor::emit_dummy_mov_instruction()
6237 {
6238    if (!intel_needs_workaround(devinfo, 14015360517))
6239       return;
6240 
6241    struct elk_backend_instruction *first_inst =
6242       cfg->first_block()->start();
6243 
6244    /* We can skip the WA if first instruction is marked with
6245     * force_writemask_all or exec_size equals dispatch_width.
6246     */
6247    if (first_inst->force_writemask_all ||
6248        first_inst->exec_size == dispatch_width)
6249       return;
6250 
6251    /* Insert dummy mov as first instruction. */
6252    const fs_builder ubld =
6253       fs_builder(this, cfg->first_block(), (elk_fs_inst *)first_inst).exec_all().group(8, 0);
6254    ubld.MOV(ubld.null_reg_ud(), elk_imm_ud(0u));
6255 
6256    invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
6257 }
6258 
6259 /* Wa_22013689345
6260  *
6261  * We need to emit UGM fence message before EOT, if shader has any UGM write
6262  * or atomic message.
6263  *
6264  * TODO/FINISHME: According to Curro we could avoid the fence in some cases.
6265  *                We probably need a better criteria in needs_dummy_fence().
6266  */
6267 void
6268 elk_fs_visitor::emit_dummy_memory_fence_before_eot()
6269 {
6270    bool progress = false;
6271    bool has_ugm_write_or_atomic = false;
6272 
6273    if (!intel_needs_workaround(devinfo, 22013689345))
6274       return;
6275 
6276    foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
6277       if (!inst->eot) {
6278          if (needs_dummy_fence(devinfo, inst))
6279             has_ugm_write_or_atomic = true;
6280          continue;
6281       }
6282 
6283       if (!has_ugm_write_or_atomic)
6284          break;
6285 
6286       const fs_builder ibld(this, block, inst);
6287       const fs_builder ubld = ibld.exec_all().group(1, 0);
6288 
6289       elk_fs_reg dst = ubld.vgrf(ELK_REGISTER_TYPE_UD);
6290       elk_fs_inst *dummy_fence = ubld.emit(ELK_SHADER_OPCODE_MEMORY_FENCE,
6291                                        dst, elk_vec8_grf(0, 0),
6292                                        /* commit enable */ elk_imm_ud(1),
6293                                        /* bti */ elk_imm_ud(0));
6294       dummy_fence->sfid = GFX12_SFID_UGM;
6295       dummy_fence->desc = lsc_fence_msg_desc(devinfo, LSC_FENCE_TILE,
6296                                              LSC_FLUSH_TYPE_NONE_6, false);
6297       ubld.emit(ELK_FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), dst);
6298       progress = true;
6299       /* TODO: remove this break if we ever have shader with multiple EOT. */
6300       break;
6301    }
6302 
6303    if (progress) {
6304       invalidate_analysis(DEPENDENCY_INSTRUCTIONS |
6305                           DEPENDENCY_VARIABLES);
6306    }
6307 }
6308 
6309 /**
6310  * Find the first instruction in the program that might start a region of
6311  * divergent control flow due to a HALT jump.  There is no
6312  * find_halt_control_flow_region_end(); the region of divergence extends until
6313  * the only ELK_SHADER_OPCODE_HALT_TARGET in the program.
6314  */
6315 static const elk_fs_inst *
6316 find_halt_control_flow_region_start(const elk_fs_visitor *v)
6317 {
6318    foreach_block_and_inst(block, elk_fs_inst, inst, v->cfg) {
6319       if (inst->opcode == ELK_OPCODE_HALT ||
6320           inst->opcode == ELK_SHADER_OPCODE_HALT_TARGET)
6321          return inst;
6322    }
6323 
6324    return NULL;
6325 }
6326 
6327 /**
6328  * Work around the Gfx12 hardware bug filed as Wa_1407528679.  EU fusion
6329  * can cause a BB to be executed with all channels disabled, which will lead
6330  * to the execution of any NoMask instructions in it, even though any
6331  * execution-masked instructions will be correctly shot down.  This may break
6332  * assumptions of some NoMask SEND messages whose descriptor depends on data
6333  * generated by live invocations of the shader.
6334  *
6335  * This avoids the problem by predicating certain instructions on an ANY
6336  * horizontal predicate that makes sure that their execution is omitted when
6337  * all channels of the program are disabled.
6338  */
6339 bool
6340 elk_fs_visitor::fixup_nomask_control_flow()
6341 {
6342    if (devinfo->ver != 12)
6343       return false;
6344 
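   /* Predicate on "any channel enabled" across the whole dispatch width so
    * the fixed-up SENDs are skipped when every channel of the shader is
    * disabled.
    */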
6345    const elk_predicate pred = dispatch_width > 16 ? ELK_PREDICATE_ALIGN1_ANY32H :
6346                               dispatch_width > 8 ? ELK_PREDICATE_ALIGN1_ANY16H :
6347                               ELK_PREDICATE_ALIGN1_ANY8H;
6348    const elk_fs_inst *halt_start = find_halt_control_flow_region_start(this);
6349    unsigned depth = 0;
6350    bool progress = false;
6351 
6352    const fs_live_variables &live_vars = live_analysis.require();
6353 
6354    /* Scan the program backwards in order to be able to easily determine
6355     * whether the flag register is live at any point.
6356     */
6357    foreach_block_reverse_safe(block, cfg) {
6358       BITSET_WORD flag_liveout = live_vars.block_data[block->num]
6359                                                .flag_liveout[0];
6360       STATIC_ASSERT(ARRAY_SIZE(live_vars.block_data[0].flag_liveout) == 1);
6361 
6362       foreach_inst_in_block_reverse_safe(elk_fs_inst, inst, block) {
6363          if (!inst->predicate && inst->exec_size >= 8)
6364             flag_liveout &= ~inst->flags_written(devinfo);
6365 
6366          switch (inst->opcode) {
6367          case ELK_OPCODE_DO:
6368          case ELK_OPCODE_IF:
6369             /* Note that this doesn't handle ELK_OPCODE_HALT since only
6370              * the first one in the program closes the region of divergent
6371              * control flow due to any HALT instructions -- Instead this is
6372              * handled with the halt_start check below.
6373              */
6374             depth--;
6375             break;
6376 
6377          case ELK_OPCODE_WHILE:
6378          case ELK_OPCODE_ENDIF:
6379          case ELK_SHADER_OPCODE_HALT_TARGET:
6380             depth++;
6381             break;
6382 
6383          default:
6384             /* Note that the vast majority of NoMask SEND instructions in the
6385              * program are harmless while executed in a block with all
6386              * channels disabled, since any instructions with side effects we
6387              * could hit here should be execution-masked.
6388              *
6389              * The main concern is NoMask SEND instructions where the message
6390              * descriptor or header depends on data generated by live
6391              * invocations of the shader (RESINFO and
6392              * ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD with a dynamically
6393              * computed surface index seem to be the only examples right now
6394              * where this could easily lead to GPU hangs).  Unfortunately we
6395              * have no straightforward way to detect that currently, so just
6396              * predicate any NoMask SEND instructions we find under control
6397              * flow.
6398              *
6399              * If this proves to have a measurable performance impact it can
6400              * be easily extended with a whitelist of messages we know we can
6401              * safely omit the predication for.
6402              */
6403             if (depth && inst->force_writemask_all &&
6404                 is_send(inst) && !inst->predicate) {
6405                /* We need to load the execution mask into the flag register by
6406                 * using a builder with channel group matching the whole shader
6407                 * (rather than the default which is derived from the original
6408                 * instruction), in order to avoid getting a right-shifted
6409                 * value.
6410                 */
6411                const fs_builder ubld = fs_builder(this, block, inst)
6412                                        .exec_all().group(dispatch_width, 0);
6413                const elk_fs_reg flag = retype(elk_flag_reg(0, 0),
6414                                           ELK_REGISTER_TYPE_UD);
6415 
6416                /* Due to the lack of flag register allocation we need to save
6417                 * and restore the flag register if it's live.
6418                 */
6419                const bool save_flag = flag_liveout &
6420                                       flag_mask(flag, dispatch_width / 8);
6421                const elk_fs_reg tmp = ubld.group(8, 0).vgrf(flag.type);
6422 
6423                if (save_flag) {
6424                   ubld.group(8, 0).UNDEF(tmp);
6425                   ubld.group(1, 0).MOV(tmp, flag);
6426                }
6427 
6428                ubld.emit(ELK_FS_OPCODE_LOAD_LIVE_CHANNELS);
6429 
6430                set_predicate(pred, inst);
6431                inst->flag_subreg = 0;
6432                inst->predicate_trivial = true;
6433 
6434                if (save_flag)
6435                   ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp);
6436 
6437                progress = true;
6438             }
6439             break;
6440          }
6441 
6442          if (inst == halt_start)
6443             depth--;
6444 
6445          flag_liveout |= inst->flags_read(devinfo);
6446       }
6447    }
6448 
6449    if (progress)
6450       invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
6451 
6452    return progress;
6453 }
6454 
6455 uint32_t
6456 elk_fs_visitor::compute_max_register_pressure()
6457 {
6458    const register_pressure &rp = regpressure_analysis.require();
6459    uint32_t ip = 0, max_pressure = 0;
6460    foreach_block_and_inst(block, elk_backend_instruction, inst, cfg) {
6461       max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
6462       ip++;
6463    }
6464    return max_pressure;
6465 }
6466 
6467 static elk_fs_inst **
6468 save_instruction_order(const struct elk_cfg_t *cfg)
6469 {
6470    /* Before we schedule anything, stash off the instruction order as an array
6471     * of elk_fs_inst *.  This way, we can reset it between scheduling passes to
6472     * prevent dependencies between the different scheduling modes.
6473     */
6474    int num_insts = cfg->last_block()->end_ip + 1;
6475    elk_fs_inst **inst_arr = new elk_fs_inst * [num_insts];
6476 
6477    int ip = 0;
6478    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
6479       assert(ip >= block->start_ip && ip <= block->end_ip);
6480       inst_arr[ip++] = inst;
6481    }
6482    assert(ip == num_insts);
6483 
6484    return inst_arr;
6485 }
6486 
6487 static void
6488 restore_instruction_order(struct elk_cfg_t *cfg, elk_fs_inst **inst_arr)
6489 {
6490    ASSERTED int num_insts = cfg->last_block()->end_ip + 1;
6491 
6492    int ip = 0;
6493    foreach_block (block, cfg) {
6494       block->instructions.make_empty();
6495 
6496       assert(ip == block->start_ip);
6497       for (; ip <= block->end_ip; ip++)
6498          block->instructions.push_tail(inst_arr[ip]);
6499    }
6500    assert(ip == num_insts);
6501 }
6502 
6503 void
6504 elk_fs_visitor::allocate_registers(bool allow_spilling)
6505 {
6506    bool allocated;
6507 
6508    static const enum instruction_scheduler_mode pre_modes[] = {
6509       SCHEDULE_PRE,
6510       SCHEDULE_PRE_NON_LIFO,
6511       SCHEDULE_NONE,
6512       SCHEDULE_PRE_LIFO,
6513    };
6514 
6515    static const char *scheduler_mode_name[] = {
6516       [SCHEDULE_PRE] = "top-down",
6517       [SCHEDULE_PRE_NON_LIFO] = "non-lifo",
6518       [SCHEDULE_PRE_LIFO] = "lifo",
6519       [SCHEDULE_POST] = "post",
6520       [SCHEDULE_NONE] = "none",
6521    };
6522 
6523    uint32_t best_register_pressure = UINT32_MAX;
6524    enum instruction_scheduler_mode best_sched = SCHEDULE_NONE;
6525 
6526    compact_virtual_grfs();
6527 
6528    if (needs_register_pressure)
6529       shader_stats.max_register_pressure = compute_max_register_pressure();
6530 
6531    debug_optimizer(nir, "pre_register_allocate", 90, 90);
6532 
6533    bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
6534 
6535    /* Before we schedule anything, stash off the instruction order as an array
6536     * of elk_fs_inst *.  This way, we can reset it between scheduling passes to
6537     * prevent dependencies between the different scheduling modes.
6538     */
6539    elk_fs_inst **orig_order = save_instruction_order(cfg);
6540    elk_fs_inst **best_pressure_order = NULL;
6541 
6542    void *scheduler_ctx = ralloc_context(NULL);
6543    elk_fs_instruction_scheduler *sched = prepare_scheduler(scheduler_ctx);
6544 
6545    /* Try each scheduling heuristic to see if it can successfully register
6546     * allocate without spilling.  They should be ordered by decreasing
6547     * performance but increasing likelihood of allocating.
6548     */
6549    for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
6550       enum instruction_scheduler_mode sched_mode = pre_modes[i];
6551 
6552       schedule_instructions_pre_ra(sched, sched_mode);
6553       this->shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
6554 
6555       debug_optimizer(nir, shader_stats.scheduler_mode, 95, i);
6556 
6557       if (0) {
6558          assign_regs_trivial();
6559          allocated = true;
6560          break;
6561       }
6562 
6563       /* We should only spill registers on the last scheduling. */
6564       assert(!spilled_any_registers);
6565 
6566       allocated = assign_regs(false, spill_all);
6567       if (allocated)
6568          break;
6569 
6570       /* Save the maximum register pressure */
6571       uint32_t this_pressure = compute_max_register_pressure();
6572 
6573       if (0) {
6574          fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
6575                  scheduler_mode_name[sched_mode], this_pressure);
6576       }
6577 
6578       if (this_pressure < best_register_pressure) {
6579          best_register_pressure = this_pressure;
6580          best_sched = sched_mode;
6581          delete[] best_pressure_order;
6582          best_pressure_order = save_instruction_order(cfg);
6583       }
6584 
6585       /* Reset back to the original order before trying the next mode */
6586       restore_instruction_order(cfg, orig_order);
6587       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
6588    }
6589 
6590    ralloc_free(scheduler_ctx);
6591 
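   /* None of the scheduling heuristics register allocated without spilling;
    * fall back to the schedule with the lowest register pressure and retry,
    * allowing spilling if the caller permits it.
    */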
6592    if (!allocated) {
6593       if (0) {
6594          fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
6595                  scheduler_mode_name[best_sched]);
6596       }
6597       restore_instruction_order(cfg, best_pressure_order);
6598       shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
6599 
6600       allocated = assign_regs(allow_spilling, spill_all);
6601    }
6602 
6603    delete[] orig_order;
6604    delete[] best_pressure_order;
6605 
6606    if (!allocated) {
6607       fail("Failure to register allocate.  Reduce number of "
6608            "live scalar values to avoid this.");
6609    } else if (spilled_any_registers) {
6610       elk_shader_perf_log(compiler, log_data,
6611                           "%s shader triggered register spilling.  "
6612                           "Try reducing the number of live scalar "
6613                           "values to improve performance.\n",
6614                           _mesa_shader_stage_to_string(stage));
6615    }
6616 
6617    /* This must come after all optimization and register allocation, since
6618     * it inserts dead code that happens to have side effects, and it does
6619     * so based on the actual physical registers in use.
6620     */
6621    insert_gfx4_send_dependency_workarounds();
6622 
6623    if (failed)
6624       return;
6625 
6626    opt_bank_conflicts();
6627 
6628    schedule_instructions_post_ra();
6629 
6630    if (last_scratch > 0) {
6631       ASSERTED unsigned max_scratch_size = 2 * 1024 * 1024;
6632 
6633       /* Take the max of any previously compiled variant of the shader. In the
6634        * case of bindless shaders with return parts, this will also take the
6635        * max of all parts.
6636        */
6637       prog_data->total_scratch = MAX2(elk_get_scratch_size(last_scratch),
6638                                       prog_data->total_scratch);
6639 
6640       if (gl_shader_stage_is_compute(stage)) {
6641          if (devinfo->platform == INTEL_PLATFORM_HSW) {
6642             /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
6643              * field documentation, Haswell supports a minimum of 2kB of
6644              * scratch space for compute shaders, unlike every other stage
6645              * and platform.
6646              */
6647             prog_data->total_scratch = MAX2(prog_data->total_scratch, 2048);
6648          } else if (devinfo->ver <= 7) {
6649             /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
6650              * field documentation, platforms prior to Haswell measure scratch
6651              * size linearly with a range of [1kB, 12kB] and 1kB granularity.
6652              */
6653             prog_data->total_scratch = ALIGN(last_scratch, 1024);
6654             max_scratch_size = 12 * 1024;
6655          }
6656       }
6657 
6658       /* We currently only support up to 2MB of scratch space.  If we
6659        * need to support more eventually, the documentation suggests
6660        * that we could allocate a larger buffer, and partition it out
6661        * ourselves.  We'd just have to undo the hardware's address
6662        * calculation by subtracting (FFTID * Per Thread Scratch Space)
6663        * and then add FFTID * (Larger Per Thread Scratch Space).
6664        *
6665        * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
6666        * Thread Group Tracking > Local Memory/Scratch Space.
6667        */
6668       assert(prog_data->total_scratch < max_scratch_size);
6669    }
6670 }
6671 
6672 bool
6673 elk_fs_visitor::run_vs()
6674 {
6675    assert(stage == MESA_SHADER_VERTEX);
6676 
6677    payload_ = new elk_vs_thread_payload(*this);
6678 
6679    nir_to_elk(this);
6680 
6681    if (failed)
6682       return false;
6683 
6684    emit_urb_writes();
6685 
6686    calculate_cfg();
6687 
6688    optimize();
6689 
6690    assign_curb_setup();
6691    assign_vs_urb_setup();
6692 
6693    fixup_3src_null_dest();
6694    emit_dummy_memory_fence_before_eot();
6695 
6696    /* Wa_14015360517 */
6697    emit_dummy_mov_instruction();
6698 
6699    allocate_registers(true /* allow_spilling */);
6700 
6701    return !failed;
6702 }
6703 
6704 void
6705 elk_fs_visitor::set_tcs_invocation_id()
6706 {
6707    struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(prog_data);
6708    struct elk_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
6709    const fs_builder bld = fs_builder(this).at_end();
6710 
6711    const unsigned instance_id_mask =
6712       (devinfo->verx10 >= 125) ? INTEL_MASK(7, 0) :
6713       (devinfo->ver >= 11)     ? INTEL_MASK(22, 16) :
6714                                  INTEL_MASK(23, 17);
6715    const unsigned instance_id_shift =
6716       (devinfo->verx10 >= 125) ? 0 : (devinfo->ver >= 11) ? 16 : 17;
6717 
6718    /* Get instance number from g0.2 bits:
6719     *  * 7:0 on DG2+
6720     *  * 22:16 on gfx11+
6721     *  * 23:17 otherwise
6722     */
6723    elk_fs_reg t = bld.vgrf(ELK_REGISTER_TYPE_UD);
6724    bld.AND(t, elk_fs_reg(retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD)),
6725            elk_imm_ud(instance_id_mask));
6726 
6727    invocation_id = bld.vgrf(ELK_REGISTER_TYPE_UD);
6728 
6729    if (vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH) {
6730       /* gl_InvocationID is just the thread number */
6731       bld.SHR(invocation_id, t, elk_imm_ud(instance_id_shift));
6732       return;
6733    }
6734 
6735    assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH);
6736 
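   /* In SINGLE_PATCH mode each channel handles one invocation, so
    * gl_InvocationID is 8 * instance + channel: start from the immediate
    * vector <0, 1, 2, ..., 7> and add the instance number times 8.
    */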
6737    elk_fs_reg channels_uw = bld.vgrf(ELK_REGISTER_TYPE_UW);
6738    elk_fs_reg channels_ud = bld.vgrf(ELK_REGISTER_TYPE_UD);
6739    bld.MOV(channels_uw, elk_fs_reg(elk_imm_uv(0x76543210)));
6740    bld.MOV(channels_ud, channels_uw);
6741 
6742    if (tcs_prog_data->instances == 1) {
6743       invocation_id = channels_ud;
6744    } else {
6745       elk_fs_reg instance_times_8 = bld.vgrf(ELK_REGISTER_TYPE_UD);
6746       bld.SHR(instance_times_8, t, elk_imm_ud(instance_id_shift - 3));
6747       bld.ADD(invocation_id, instance_times_8, channels_ud);
6748    }
6749 }
6750 
6751 void
6752 elk_fs_visitor::emit_tcs_thread_end()
6753 {
6754    /* Try and tag the last URB write with EOT instead of emitting a whole
6755     * separate write just to finish the thread.  There isn't guaranteed to
6756     * be one, so this may not succeed.
6757     */
6758    if (devinfo->ver != 8 && mark_last_urb_write_with_eot())
6759       return;
6760 
6761    const fs_builder bld = fs_builder(this).at_end();
6762 
6763    /* Emit a URB write to end the thread.  On Broadwell, we use this to write
6764     * zero to the "TR DS Cache Disable" bit (we haven't implemented a fancy
6765     * algorithm to set it optimally).  On other platforms, we simply write
6766     * zero to a reserved/MBZ patch header DWord which has no consequence.
6767     */
6768    elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
6769    srcs[URB_LOGICAL_SRC_HANDLE] = tcs_payload().patch_urb_output;
6770    srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = elk_imm_ud(WRITEMASK_X << 16);
6771    srcs[URB_LOGICAL_SRC_DATA] = elk_imm_ud(0);
6772    srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(1);
6773    elk_fs_inst *inst = bld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL,
6774                             reg_undef, srcs, ARRAY_SIZE(srcs));
6775    inst->eot = true;
6776 }
6777 
6778 bool
6779 elk_fs_visitor::run_tcs()
6780 {
6781    assert(stage == MESA_SHADER_TESS_CTRL);
6782 
6783    struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(prog_data);
6784    const fs_builder bld = fs_builder(this).at_end();
6785 
6786    assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH ||
6787           vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH);
6788 
6789    payload_ = new elk_tcs_thread_payload(*this);
6790 
6791    /* Initialize gl_InvocationID */
6792    set_tcs_invocation_id();
6793 
6794    const bool fix_dispatch_mask =
6795       vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH &&
6796       (nir->info.tess.tcs_vertices_out % 8) != 0;
6797 
6798    /* Fix the dispatch mask */
6799    if (fix_dispatch_mask) {
6800       bld.CMP(bld.null_reg_ud(), invocation_id,
6801               elk_imm_ud(nir->info.tess.tcs_vertices_out), ELK_CONDITIONAL_L);
6802       bld.IF(ELK_PREDICATE_NORMAL);
6803    }
6804 
6805    nir_to_elk(this);
6806 
6807    if (fix_dispatch_mask) {
6808       bld.emit(ELK_OPCODE_ENDIF);
6809    }
6810 
6811    emit_tcs_thread_end();
6812 
6813    if (failed)
6814       return false;
6815 
6816    calculate_cfg();
6817 
6818    optimize();
6819 
6820    assign_curb_setup();
6821    assign_tcs_urb_setup();
6822 
6823    fixup_3src_null_dest();
6824    emit_dummy_memory_fence_before_eot();
6825 
6826    /* Wa_14015360517 */
6827    emit_dummy_mov_instruction();
6828 
6829    allocate_registers(true /* allow_spilling */);
6830 
6831    return !failed;
6832 }
6833 
6834 bool
6835 elk_fs_visitor::run_tes()
6836 {
6837    assert(stage == MESA_SHADER_TESS_EVAL);
6838 
6839    payload_ = new elk_tes_thread_payload(*this);
6840 
6841    nir_to_elk(this);
6842 
6843    if (failed)
6844       return false;
6845 
6846    emit_urb_writes();
6847 
6848    calculate_cfg();
6849 
6850    optimize();
6851 
6852    assign_curb_setup();
6853    assign_tes_urb_setup();
6854 
6855    fixup_3src_null_dest();
6856    emit_dummy_memory_fence_before_eot();
6857 
6858    /* Wa_14015360517 */
6859    emit_dummy_mov_instruction();
6860 
6861    allocate_registers(true /* allow_spilling */);
6862 
6863    return !failed;
6864 }
6865 
6866 bool
6867 elk_fs_visitor::run_gs()
6868 {
6869    assert(stage == MESA_SHADER_GEOMETRY);
6870 
6871    payload_ = new elk_gs_thread_payload(*this);
6872 
6873    this->final_gs_vertex_count = vgrf(glsl_uint_type());
6874 
6875    if (gs_compile->control_data_header_size_bits > 0) {
6876       /* Create a VGRF to store accumulated control data bits. */
6877       this->control_data_bits = vgrf(glsl_uint_type());
6878 
6879       /* If we're outputting more than 32 control data bits, then EmitVertex()
6880        * will set control_data_bits to 0 after emitting the first vertex.
6881        * Otherwise, we need to initialize it to 0 here.
6882        */
6883       if (gs_compile->control_data_header_size_bits <= 32) {
6884          const fs_builder bld = fs_builder(this).at_end();
6885          const fs_builder abld = bld.annotate("initialize control data bits");
6886          abld.MOV(this->control_data_bits, elk_imm_ud(0u));
6887       }
6888    }
6889 
6890    nir_to_elk(this);
6891 
6892    emit_gs_thread_end();
6893 
6894    if (failed)
6895       return false;
6896 
6897    calculate_cfg();
6898 
6899    optimize();
6900 
6901    assign_curb_setup();
6902    assign_gs_urb_setup();
6903 
6904    fixup_3src_null_dest();
6905    emit_dummy_memory_fence_before_eot();
6906 
6907    /* Wa_14015360517 */
6908    emit_dummy_mov_instruction();
6909 
6910    allocate_registers(true /* allow_spilling */);
6911 
6912    return !failed;
6913 }
6914 
6915 /* From the SKL PRM, Volume 16, Workarounds:
6916  *
6917  *   0877  3D   Pixel Shader Hang possible when pixel shader dispatched with
6918  *              only header phases (R0-R2)
6919  *
6920  *   WA: Enable a non-header phase (e.g. push constant) when dispatch would
6921  *       have been header only.
6922  *
6923  * Instead of enabling push constants one can alternatively enable one of the
6924  * inputs. Here one simply chooses "layer" which shouldn't impose much
6925  * overhead.
6926  */
6927 static void
6928 gfx9_ps_header_only_workaround(struct elk_wm_prog_data *wm_prog_data)
6929 {
6930    if (wm_prog_data->num_varying_inputs)
6931       return;
6932 
6933    if (wm_prog_data->base.curb_read_length)
6934       return;
6935 
6936    wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
6937    wm_prog_data->num_varying_inputs = 1;
6938 
6939    elk_compute_urb_setup_index(wm_prog_data);
6940 }
6941 
6942 bool
6943 elk_fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
6944 {
6945    struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(this->prog_data);
6946    elk_wm_prog_key *wm_key = (elk_wm_prog_key *) this->key;
6947    const fs_builder bld = fs_builder(this).at_end();
6948 
6949    assert(stage == MESA_SHADER_FRAGMENT);
6950 
6951    payload_ = new elk_fs_thread_payload(*this, source_depth_to_render_target,
6952                                     runtime_check_aads_emit);
6953 
6954    if (do_rep_send) {
6955       assert(dispatch_width == 16);
6956       emit_repclear_shader();
6957    } else {
6958       if (nir->info.inputs_read > 0 ||
6959           BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
6960           (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
6961          if (devinfo->ver < 6)
6962             emit_interpolation_setup_gfx4();
6963          else
6964             emit_interpolation_setup_gfx6();
6965       }
6966 
6967       /* We handle discards by keeping track of the still-live pixels in f0.1.
6968        * Initialize it with the dispatched pixels.
6969        */
6970       if (wm_prog_data->uses_kill) {
6971          const unsigned lower_width = MIN2(dispatch_width, 16);
6972          for (unsigned i = 0; i < dispatch_width / lower_width; i++) {
6973             /* According to the "PS Thread Payload for Normal
6974              * Dispatch" pages on the BSpec, the dispatch mask is
6975              * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on
6976              * gfx6+.
6977              */
6978             const elk_fs_reg dispatch_mask =
6979                devinfo->ver >= 20 ? xe2_vec1_grf(i, 15) :
6980                devinfo->ver >= 6 ? elk_vec1_grf(i + 1, 7) :
6981                elk_vec1_grf(0, 0);
6982             bld.exec_all().group(1, 0)
6983                .MOV(elk_sample_mask_reg(bld.group(lower_width, i)),
6984                     retype(dispatch_mask, ELK_REGISTER_TYPE_UW));
6985          }
6986       }
6987 
6988       if (nir->info.writes_memory)
6989          wm_prog_data->has_side_effects = true;
6990 
6991       nir_to_elk(this);
6992 
6993       if (failed)
6994          return false;
6995 
6996       if (wm_key->emit_alpha_test)
6997          emit_alpha_test();
6998 
6999       emit_fb_writes();
7000 
7001       calculate_cfg();
7002 
7003       optimize();
7004 
7005       assign_curb_setup();
7006 
7007       if (devinfo->ver == 9)
7008          gfx9_ps_header_only_workaround(wm_prog_data);
7009 
7010       assign_urb_setup();
7011 
7012       fixup_3src_null_dest();
7013       emit_dummy_memory_fence_before_eot();
7014 
7015       /* Wa_14015360517 */
7016       emit_dummy_mov_instruction();
7017 
7018       allocate_registers(allow_spilling);
7019    }
7020 
7021    return !failed;
7022 }
7023 
7024 bool
7025 elk_fs_visitor::run_cs(bool allow_spilling)
7026 {
7027    assert(gl_shader_stage_is_compute(stage));
7028    assert(devinfo->ver >= 7);
7029    const fs_builder bld = fs_builder(this).at_end();
7030 
7031    payload_ = new elk_cs_thread_payload(*this);
7032 
7033    if (devinfo->platform == INTEL_PLATFORM_HSW && prog_data->total_shared > 0) {
7034       /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
7035       const fs_builder abld = bld.exec_all().group(1, 0);
7036       abld.MOV(retype(elk_sr0_reg(1), ELK_REGISTER_TYPE_UW),
7037                suboffset(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UW), 1));
7038    }
7039 
7040    nir_to_elk(this);
7041 
7042    if (failed)
7043       return false;
7044 
7045    emit_cs_terminate();
7046 
7047    calculate_cfg();
7048 
7049    optimize();
7050 
7051    assign_curb_setup();
7052 
7053    fixup_3src_null_dest();
7054    emit_dummy_memory_fence_before_eot();
7055 
7056    /* Wa_14015360517 */
7057    emit_dummy_mov_instruction();
7058 
7059    allocate_registers(allow_spilling);
7060 
7061    return !failed;
7062 }
7063 
7064 static bool
7065 is_used_in_not_interp_frag_coord(nir_def *def)
7066 {
7067    nir_foreach_use_including_if(src, def) {
7068       if (nir_src_is_if(src))
7069          return true;
7070 
7071       if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic)
7072          return true;
7073 
7074       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(nir_src_parent_instr(src));
7075       if (intrin->intrinsic != nir_intrinsic_load_frag_coord)
7076          return true;
7077    }
7078 
7079    return false;
7080 }
7081 
7082 /**
7083  * Return a bitfield where bit n is set if barycentric interpolation mode n
7084  * (see enum elk_barycentric_mode) is needed by the fragment shader.
7085  *
7086  * We examine the load_barycentric intrinsics rather than looking at input
7087  * variables so that we catch interpolateAtCentroid() messages too, which
7088  * also need the ELK_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
7089  */
7090 static unsigned
7091 elk_compute_barycentric_interp_modes(const struct intel_device_info *devinfo,
7092                                      const nir_shader *shader)
7093 {
7094    unsigned barycentric_interp_modes = 0;
7095 
7096    nir_foreach_function_impl(impl, shader) {
7097       nir_foreach_block(block, impl) {
7098          nir_foreach_instr(instr, block) {
7099             if (instr->type != nir_instr_type_intrinsic)
7100                continue;
7101 
7102             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
7103             switch (intrin->intrinsic) {
7104             case nir_intrinsic_load_barycentric_pixel:
7105             case nir_intrinsic_load_barycentric_centroid:
7106             case nir_intrinsic_load_barycentric_sample:
7107             case nir_intrinsic_load_barycentric_at_sample:
7108             case nir_intrinsic_load_barycentric_at_offset:
7109                break;
7110             default:
7111                continue;
7112             }
7113 
7114             /* Ignore WPOS; it doesn't require interpolation. */
7115             if (!is_used_in_not_interp_frag_coord(&intrin->def))
7116                continue;
7117 
7118             nir_intrinsic_op bary_op = intrin->intrinsic;
7119             enum elk_barycentric_mode bary =
7120                elk_barycentric_mode(intrin);
7121 
7122             barycentric_interp_modes |= 1 << bary;
7123 
7124             if (devinfo->needs_unlit_centroid_workaround &&
7125                 bary_op == nir_intrinsic_load_barycentric_centroid)
7126                barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
7127          }
7128       }
7129    }
7130 
7131    return barycentric_interp_modes;
7132 }
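/* Editor's note (illustrative, not from the original source; assumes the
 * ELK_BARYCENTRIC_* enumerants mirror the BRW naming): a shader whose only
 * barycentric intrinsic is load_barycentric_pixel on a smooth input would
 * yield
 *
 *    barycentric_interp_modes == BITFIELD_BIT(ELK_BARYCENTRIC_PERSPECTIVE_PIXEL)
 *
 * while an additional interpolateAtCentroid() on the same input would also
 * set the corresponding *_CENTROID bit, as described in the comment above.
 */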
7133 
7134 static void
7135 elk_compute_flat_inputs(struct elk_wm_prog_data *prog_data,
7136                         const nir_shader *shader)
7137 {
7138    prog_data->flat_inputs = 0;
7139 
7140    nir_foreach_shader_in_variable(var, shader) {
7141       /* flat shading */
7142       if (var->data.interpolation != INTERP_MODE_FLAT)
7143          continue;
7144 
7145       if (var->data.per_primitive)
7146          continue;
7147 
7148       unsigned slots = glsl_count_attribute_slots(var->type, false);
7149       for (unsigned s = 0; s < slots; s++) {
7150          int input_index = prog_data->urb_setup[var->data.location + s];
7151 
7152          if (input_index >= 0)
7153             prog_data->flat_inputs |= 1 << input_index;
7154       }
7155    }
7156 }
7157 
7158 static uint8_t
7159 computed_depth_mode(const nir_shader *shader)
7160 {
7161    if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
7162       switch (shader->info.fs.depth_layout) {
7163       case FRAG_DEPTH_LAYOUT_NONE:
7164       case FRAG_DEPTH_LAYOUT_ANY:
7165          return ELK_PSCDEPTH_ON;
7166       case FRAG_DEPTH_LAYOUT_GREATER:
7167          return ELK_PSCDEPTH_ON_GE;
7168       case FRAG_DEPTH_LAYOUT_LESS:
7169          return ELK_PSCDEPTH_ON_LE;
7170       case FRAG_DEPTH_LAYOUT_UNCHANGED:
7171          /* We initially set this to OFF, but having the shader write the
7172           * depth means we allocate register space in the SEND message. The
7173           * difference between the SEND register count and the OFF state
7174           * programming makes the HW hang.
7175           *
7176           * Removing the depth writes also leads to test failures. So use
7177           * LesserThanOrEqual, which is consistent with writing back the same
7178           * (unchanged/equal) value.
7179           *
7180           */
7181          return ELK_PSCDEPTH_ON_LE;
7182       }
7183    }
7184    return ELK_PSCDEPTH_OFF;
7185 }
7186 
7187 /**
7188  * Move load_interpolated_input with simple (payload-based) barycentric modes
7189  * to the top of the program so we don't emit multiple PLNs for the same input.
7190  *
7191  * This works around CSE not being able to handle non-dominating cases
7192  * such as:
7193  *
7194  *    if (...) {
7195  *       interpolate input
7196  *    } else {
7197  *       interpolate the same exact input
7198  *    }
7199  *
7200  * This should be replaced by global value numbering someday.
7201  */
7202 bool
7203 elk_nir_move_interpolation_to_top(nir_shader *nir)
7204 {
7205    bool progress = false;
7206 
7207    nir_foreach_function_impl(impl, nir) {
7208       nir_block *top = nir_start_block(impl);
7209       nir_cursor cursor = nir_before_instr(nir_block_first_instr(top));
7210       bool impl_progress = false;
7211 
7212       for (nir_block *block = nir_block_cf_tree_next(top);
7213            block != NULL;
7214            block = nir_block_cf_tree_next(block)) {
7215 
7216          nir_foreach_instr_safe(instr, block) {
7217             if (instr->type != nir_instr_type_intrinsic)
7218                continue;
7219 
7220             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
7221             if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
7222                continue;
7223             nir_intrinsic_instr *bary_intrinsic =
7224                nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
7225             nir_intrinsic_op op = bary_intrinsic->intrinsic;
7226 
7227             /* Leave interpolateAtSample/Offset() where they are. */
7228             if (op == nir_intrinsic_load_barycentric_at_sample ||
7229                 op == nir_intrinsic_load_barycentric_at_offset)
7230                continue;
7231 
7232             nir_instr *move[3] = {
7233                &bary_intrinsic->instr,
7234                intrin->src[1].ssa->parent_instr,
7235                instr
7236             };
7237 
7238             for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
7239                if (move[i]->block != top) {
7240                   nir_instr_move(cursor, move[i]);
7241                   impl_progress = true;
7242                }
7243             }
7244          }
7245       }
7246 
7247       progress = progress || impl_progress;
7248 
7249       nir_metadata_preserve(impl, impl_progress ? (nir_metadata_block_index |
7250                                                       nir_metadata_dominance)
7251                                                    : nir_metadata_all);
7252    }
7253 
7254    return progress;
7255 }
7256 
7257 static void
7258 elk_nir_populate_wm_prog_data(nir_shader *shader,
7259                               const struct intel_device_info *devinfo,
7260                               const struct elk_wm_prog_key *key,
7261                               struct elk_wm_prog_data *prog_data)
7262 {
7263    /* key->emit_alpha_test means we are simulating alpha testing via discards,
7264     * so the shader definitely kills pixels.
7265     */
7266    prog_data->uses_kill = shader->info.fs.uses_discard ||
7267                           shader->info.fs.uses_demote ||
7268                           key->emit_alpha_test;
7269    prog_data->uses_omask = !key->ignore_sample_mask_out &&
7270       (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
7271    prog_data->color_outputs_written = key->color_outputs_valid;
7272    prog_data->max_polygons = 1;
7273    prog_data->computed_depth_mode = computed_depth_mode(shader);
7274    prog_data->computed_stencil =
7275       shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
7276 
7277    prog_data->sample_shading =
7278       shader->info.fs.uses_sample_shading ||
7279       shader->info.outputs_read;
7280 
7281    assert(key->multisample_fbo != ELK_NEVER ||
7282           key->persample_interp == ELK_NEVER);
7283 
7284    prog_data->persample_dispatch = key->persample_interp;
7285    if (prog_data->sample_shading)
7286       prog_data->persample_dispatch = ELK_ALWAYS;
7287 
7288    /* We can only use per-sample dispatch if we have a multisample FBO. */
7289    prog_data->persample_dispatch = MIN2(prog_data->persample_dispatch,
7290                                         key->multisample_fbo);
7291 
7292    /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If
7293     * persample_dispatch & multisample_fbo are not dynamic, Anv should be able
7294     * to definitively tell whether alpha_to_coverage is on or off.
7295     */
7296    prog_data->alpha_to_coverage = key->alpha_to_coverage;
7297    assert(prog_data->alpha_to_coverage != ELK_SOMETIMES ||
7298           prog_data->persample_dispatch == ELK_SOMETIMES);
7299 
7300    if (devinfo->ver >= 6) {
7301       prog_data->uses_sample_mask =
7302          BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);
7303 
7304       /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
7305        *
7306        *    "MSDISPMODE_PERSAMPLE is required in order to select
7307        *    POSOFFSET_SAMPLE"
7308        *
7309        * So we can only really get sample positions if we are doing real
7310        * per-sample dispatch.  If we need gl_SamplePosition and we don't have
7311        * persample dispatch, we hard-code it to 0.5.
7312        */
7313       prog_data->uses_pos_offset =
7314          prog_data->persample_dispatch != ELK_NEVER &&
7315          (BITSET_TEST(shader->info.system_values_read,
7316                       SYSTEM_VALUE_SAMPLE_POS) ||
7317           BITSET_TEST(shader->info.system_values_read,
7318                       SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));
7319    }
7320 
7321    prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
7322    prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
7323    prog_data->inner_coverage = shader->info.fs.inner_coverage;
7324 
7325    prog_data->barycentric_interp_modes =
7326       elk_compute_barycentric_interp_modes(devinfo, shader);
7327 
7328    /* From the BDW PRM documentation for 3DSTATE_WM:
7329     *
7330     *    "MSDISPMODE_PERSAMPLE is required in order to select Perspective
7331     *     Sample or Non- perspective Sample barycentric coordinates."
7332     *
7333     * So clean up any potentially set sample barycentric mode when not in
7334     * per-sample dispatch.
7335     */
7336    if (prog_data->persample_dispatch == ELK_NEVER) {
7337       prog_data->barycentric_interp_modes &=
7338          ~BITFIELD_BIT(ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE);
7339    }
7340 
7341    prog_data->uses_nonperspective_interp_modes |=
7342       (prog_data->barycentric_interp_modes &
7343       ELK_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0;
7344 
7345    /* The current VK_EXT_graphics_pipeline_library specification requires
7346     * coarse shading to be specified at compile time. But per-sample
7347     * interpolation can be dynamic. So we should never be in a situation where
7348     * coarse & persample_interp are respectively true & ELK_ALWAYS.
7349     *
7350     * Coarse will be dynamically turned off when persample_interp is active.
7351     */
7352    assert(!key->coarse_pixel || key->persample_interp != ELK_ALWAYS);
7353 
7354    prog_data->coarse_pixel_dispatch =
7355       elk_sometimes_invert(prog_data->persample_dispatch);
7356    if (!key->coarse_pixel ||
7357        prog_data->uses_omask ||
7358        prog_data->sample_shading ||
7359        prog_data->uses_sample_mask ||
7360        (prog_data->computed_depth_mode != ELK_PSCDEPTH_OFF) ||
7361        prog_data->computed_stencil) {
7362       prog_data->coarse_pixel_dispatch = ELK_NEVER;
7363    }
7364 
7365    /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater,
7366     * Message Descriptor :
7367     *
7368     *    "Message Type. Specifies the type of message being sent when
7369     *     pixel-rate evaluation is requested :
7370     *
7371     *     Format = U2
7372     *       0: Per Message Offset (eval_snapped with immediate offset)
7373     *       1: Sample Position Offset (eval_sindex)
7374     *       2: Centroid Position Offset (eval_centroid)
7375     *       3: Per Slot Offset (eval_snapped with register offset)
7376     *
7377     *     Message Type. Specifies the type of message being sent when
7378     *     coarse-rate evaluation is requested :
7379     *
7380     *     Format = U2
7381     *       0: Coarse to Pixel Mapping Message (internal message)
7382     *       1: Reserved
7383     *       2: Coarse Centroid Position (eval_centroid)
7384     *       3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)"
7385     *
7386     * The Sample Position Offset is marked as reserved for coarse rate
7387     * evaluation and leads to hangs if we try to use it. So disable coarse
7388     * pixel shading if we have any intrinsic that will result in a pixel
7389     * interpolater message at sample.
7390     */
7391    if (intel_nir_pulls_at_sample(shader))
7392       prog_data->coarse_pixel_dispatch = ELK_NEVER;
7393 
7394    /* We choose to always enable VMask prior to XeHP, as it would cause
7395     * us to lose out on the eliminate_find_live_channel() optimization.
7396     */
7397    prog_data->uses_vmask = devinfo->verx10 < 125 ||
7398                            shader->info.fs.needs_quad_helper_invocations ||
7399                            shader->info.uses_wide_subgroup_intrinsics ||
7400                            prog_data->coarse_pixel_dispatch != ELK_NEVER;
7401 
7402    prog_data->uses_src_w =
7403       BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
7404    prog_data->uses_src_depth =
7405       BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
7406       prog_data->coarse_pixel_dispatch != ELK_ALWAYS;
7407    prog_data->uses_depth_w_coefficients =
7408       BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
7409       prog_data->coarse_pixel_dispatch != ELK_NEVER;
7410 
7411    calculate_urb_setup(devinfo, key, prog_data, shader);
7412    elk_compute_flat_inputs(prog_data, shader);
7413 }
7414 
7415 /**
7416  * Pre-gfx6, the register file of the EUs was shared between threads,
7417  * and each thread used some subset allocated on a 16-register block
7418  * granularity.  The unit states wanted these block counts.
7419  */
7420 static inline int
7421 elk_register_blocks(int reg_count)
7422 {
7423    return ALIGN(reg_count, 16) / 16 - 1;
7424 }
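/* Editor's worked example (illustrative, not from the original source): with
 * the formula above, a shader using 20 GRFs reports ALIGN(20, 16) / 16 - 1 = 1
 * and one using 48 GRFs reports ALIGN(48, 16) / 16 - 1 = 2, i.e. the field
 * encodes "number of 16-register blocks, minus one".
 */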
7425 
7426 const unsigned *
7427 elk_compile_fs(const struct elk_compiler *compiler,
7428                struct elk_compile_fs_params *params)
7429 {
7430    struct nir_shader *nir = params->base.nir;
7431    const struct elk_wm_prog_key *key = params->key;
7432    struct elk_wm_prog_data *prog_data = params->prog_data;
7433    bool allow_spilling = params->allow_spilling;
7434    const bool debug_enabled =
7435       elk_should_print_shader(nir, params->base.debug_flag ?
7436                                    params->base.debug_flag : DEBUG_WM);
7437 
7438    prog_data->base.stage = MESA_SHADER_FRAGMENT;
7439    prog_data->base.ray_queries = nir->info.ray_queries;
7440    prog_data->base.total_scratch = 0;
7441 
7442    const struct intel_device_info *devinfo = compiler->devinfo;
7443    const unsigned max_subgroup_size = compiler->devinfo->ver >= 6 ? 32 : 16;
7444 
7445    elk_nir_apply_key(nir, compiler, &key->base, max_subgroup_size);
7446    elk_nir_lower_fs_inputs(nir, devinfo, key);
7447    elk_nir_lower_fs_outputs(nir);
7448 
7449    if (devinfo->ver < 6)
7450       elk_setup_vue_interpolation(params->vue_map, nir, prog_data);
7451 
7452    /* From the SKL PRM, Volume 7, "Alpha Coverage":
7453     *  "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in
7454     *   hardware, regardless of the state setting for this feature."
7455     */
7456    if (devinfo->ver > 6 && key->alpha_to_coverage != ELK_NEVER) {
7457       /* Run constant fold optimization in order to get the correct source
7458        * offset to determine render target 0 store instruction in
7459        * emit_alpha_to_coverage pass.
7460        */
7461       NIR_PASS(_, nir, nir_opt_constant_folding);
7462       NIR_PASS(_, nir, elk_nir_lower_alpha_to_coverage, key, prog_data);
7463    }
7464 
7465    NIR_PASS(_, nir, elk_nir_move_interpolation_to_top);
7466    elk_postprocess_nir(nir, compiler, debug_enabled,
7467                        key->base.robust_flags);
7468 
7469    elk_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data);
7470 
7471    std::unique_ptr<elk_fs_visitor> v8, v16, v32, vmulti;
7472    elk_cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL,
7473       *multi_cfg = NULL;
7474    float throughput = 0;
7475    bool has_spilled = false;
7476 
7477    if (devinfo->ver < 20) {
7478       v8 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
7479                                         prog_data, nir, 8, 1,
7480                                         params->base.stats != NULL,
7481                                         debug_enabled);
7482       if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) {
7483          params->base.error_str = ralloc_strdup(params->base.mem_ctx,
7484                                                 v8->fail_msg);
7485          return NULL;
7486       } else if (INTEL_SIMD(FS, 8)) {
7487          simd8_cfg = v8->cfg;
7488 
7489          assert(v8->payload().num_regs % reg_unit(devinfo) == 0);
7490          prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs / reg_unit(devinfo);
7491 
7492          prog_data->reg_blocks_8 = elk_register_blocks(v8->grf_used);
7493          const performance &perf = v8->performance_analysis.require();
7494          throughput = MAX2(throughput, perf.throughput);
7495          has_spilled = v8->spilled_any_registers;
7496          allow_spilling = false;
7497       }
7498    }
7499 
7500    /* Limit dispatch width to simd8 with dual source blending on gfx8.
7501     * See: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1917
7502     */
7503    if (devinfo->ver == 8 && prog_data->dual_src_blend &&
7504        INTEL_SIMD(FS, 8)) {
7505       assert(!params->use_rep_send);
7506       v8->limit_dispatch_width(8, "gfx8 workaround: "
7507                                "using SIMD8 when dual src blending.\n");
7508    }
7509 
7510    if (key->coarse_pixel && devinfo->ver < 20) {
7511       if (prog_data->dual_src_blend) {
7512          v8->limit_dispatch_width(8, "SIMD16 coarse pixel shading cannot"
7513                                   " use SIMD8 messages.\n");
7514       }
7515       v8->limit_dispatch_width(16, "SIMD32 not supported with coarse"
7516                                " pixel shading.\n");
7517    }
7518 
7519    if (nir->info.ray_queries > 0 && v8)
7520       v8->limit_dispatch_width(16, "SIMD32 with ray queries.\n");
7521 
7522    if (!has_spilled &&
7523        (!v8 || v8->max_dispatch_width >= 16) &&
7524        (INTEL_SIMD(FS, 16) || params->use_rep_send)) {
7525       /* Try a SIMD16 compile */
7526       v16 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
7527                                          prog_data, nir, 16, 1,
7528                                          params->base.stats != NULL,
7529                                          debug_enabled);
7530       if (v8)
7531          v16->import_uniforms(v8.get());
7532       if (!v16->run_fs(allow_spilling, params->use_rep_send)) {
7533          elk_shader_perf_log(compiler, params->base.log_data,
7534                              "SIMD16 shader failed to compile: %s\n",
7535                              v16->fail_msg);
7536       } else {
7537          simd16_cfg = v16->cfg;
7538 
7539          assert(v16->payload().num_regs % reg_unit(devinfo) == 0);
7540          prog_data->dispatch_grf_start_reg_16 = v16->payload().num_regs / reg_unit(devinfo);
7541 
7542          prog_data->reg_blocks_16 = elk_register_blocks(v16->grf_used);
7543          const performance &perf = v16->performance_analysis.require();
7544          throughput = MAX2(throughput, perf.throughput);
7545          has_spilled = v16->spilled_any_registers;
7546          allow_spilling = false;
7547       }
7548    }
7549 
7550    const bool simd16_failed = v16 && !simd16_cfg;
7551 
7552    /* Currently, the compiler only supports SIMD32 on SNB+ */
7553    if (!has_spilled &&
7554        (!v8 || v8->max_dispatch_width >= 32) &&
7555        (!v16 || v16->max_dispatch_width >= 32) && !params->use_rep_send &&
7556        devinfo->ver >= 6 && !simd16_failed &&
7557        INTEL_SIMD(FS, 32)) {
7558       /* Try a SIMD32 compile */
7559       v32 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
7560                                          prog_data, nir, 32, 1,
7561                                          params->base.stats != NULL,
7562                                          debug_enabled);
7563       if (v8)
7564          v32->import_uniforms(v8.get());
7565       else if (v16)
7566          v32->import_uniforms(v16.get());
7567 
7568       if (!v32->run_fs(allow_spilling, false)) {
7569          elk_shader_perf_log(compiler, params->base.log_data,
7570                              "SIMD32 shader failed to compile: %s\n",
7571                              v32->fail_msg);
7572       } else {
7573          const performance &perf = v32->performance_analysis.require();
7574 
7575          if (!INTEL_DEBUG(DEBUG_DO32) && throughput >= perf.throughput) {
7576             elk_shader_perf_log(compiler, params->base.log_data,
7577                                 "SIMD32 shader inefficient\n");
7578          } else {
7579             simd32_cfg = v32->cfg;
7580 
7581             assert(v32->payload().num_regs % reg_unit(devinfo) == 0);
7582             prog_data->dispatch_grf_start_reg_32 = v32->payload().num_regs / reg_unit(devinfo);
7583 
7584             prog_data->reg_blocks_32 = elk_register_blocks(v32->grf_used);
7585             throughput = MAX2(throughput, perf.throughput);
7586          }
7587       }
7588    }
7589 
7590    if (devinfo->ver >= 12 && !has_spilled &&
7591        params->max_polygons >= 2 && !key->coarse_pixel) {
7592       elk_fs_visitor *vbase = v8 ? v8.get() : v16 ? v16.get() : v32.get();
7593       assert(vbase);
7594 
7595       if (devinfo->ver >= 20 &&
7596           params->max_polygons >= 4 &&
7597           vbase->max_dispatch_width >= 32 &&
7598           4 * prog_data->num_varying_inputs <= MAX_VARYING &&
7599           INTEL_SIMD(FS, 4X8)) {
7600          /* Try a quad-SIMD8 compile */
7601          vmulti = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
7602                                                prog_data, nir, 32, 4,
7603                                                params->base.stats != NULL,
7604                                                debug_enabled);
7605          vmulti->import_uniforms(vbase);
7606          if (!vmulti->run_fs(false, params->use_rep_send)) {
7607             elk_shader_perf_log(compiler, params->base.log_data,
7608                                 "Quad-SIMD8 shader failed to compile: %s\n",
7609                                 vmulti->fail_msg);
7610          } else {
7611             multi_cfg = vmulti->cfg;
7612             assert(!vmulti->spilled_any_registers);
7613          }
7614       }
7615 
7616       if (!multi_cfg && devinfo->ver >= 20 &&
7617           vbase->max_dispatch_width >= 32 &&
7618           2 * prog_data->num_varying_inputs <= MAX_VARYING &&
7619           INTEL_SIMD(FS, 2X16)) {
7620          /* Try a dual-SIMD16 compile */
7621          vmulti = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
7622                                                prog_data, nir, 32, 2,
7623                                                params->base.stats != NULL,
7624                                                debug_enabled);
7625          vmulti->import_uniforms(vbase);
7626          if (!vmulti->run_fs(false, params->use_rep_send)) {
7627             elk_shader_perf_log(compiler, params->base.log_data,
7628                                 "Dual-SIMD16 shader failed to compile: %s\n",
7629                                 vmulti->fail_msg);
7630          } else {
7631             multi_cfg = vmulti->cfg;
7632             assert(!vmulti->spilled_any_registers);
7633          }
7634       }
7635 
7636       if (!multi_cfg && vbase->max_dispatch_width >= 16 &&
7637           2 * prog_data->num_varying_inputs <= MAX_VARYING &&
7638           INTEL_SIMD(FS, 2X8)) {
7639          /* Try a dual-SIMD8 compile */
7640          vmulti = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
7641                                                prog_data, nir, 16, 2,
7642                                                params->base.stats != NULL,
7643                                                debug_enabled);
7644          vmulti->import_uniforms(vbase);
7645          if (!vmulti->run_fs(allow_spilling, params->use_rep_send)) {
7646             elk_shader_perf_log(compiler, params->base.log_data,
7647                                 "Dual-SIMD8 shader failed to compile: %s\n",
7648                                 vmulti->fail_msg);
7649          } else {
7650             multi_cfg = vmulti->cfg;
7651          }
7652       }
7653 
7654       if (multi_cfg) {
7655          assert(vmulti->payload().num_regs % reg_unit(devinfo) == 0);
7656          prog_data->base.dispatch_grf_start_reg = vmulti->payload().num_regs / reg_unit(devinfo);
7657 
7658          prog_data->reg_blocks_8 = elk_register_blocks(vmulti->grf_used);
7659       }
7660    }
7661 
7662    /* When the caller requests a repclear shader, they want SIMD16-only */
7663    if (params->use_rep_send)
7664       simd8_cfg = NULL;
7665 
7666    /* Prior to Iron Lake, the PS had a single shader offset with a jump table
7667     * at the top to select the shader.  We've never implemented that.
7668     * Instead, we just give them exactly one shader and we pick the widest one
7669     * available.
7670     */
7671    if (compiler->devinfo->ver < 5) {
7672       if (simd32_cfg || simd16_cfg)
7673          simd8_cfg = NULL;
7674       if (simd32_cfg)
7675          simd16_cfg = NULL;
7676    }
7677 
7678    /* If computed depth is enabled, SNB only allows SIMD8. */
7679    if (compiler->devinfo->ver == 6 &&
7680        prog_data->computed_depth_mode != ELK_PSCDEPTH_OFF)
7681       assert(simd16_cfg == NULL && simd32_cfg == NULL);
7682 
7683    if (compiler->devinfo->ver <= 5 && !simd8_cfg) {
7684       /* Iron Lake and earlier only have one Dispatch GRF start field.  Make
7685        * the data available in the base prog data struct for convenience.
7686        */
7687       if (simd16_cfg) {
7688          prog_data->base.dispatch_grf_start_reg =
7689             prog_data->dispatch_grf_start_reg_16;
7690       } else if (simd32_cfg) {
7691          prog_data->base.dispatch_grf_start_reg =
7692             prog_data->dispatch_grf_start_reg_32;
7693       }
7694    }
7695 
7696    elk_fs_generator g(compiler, &params->base, &prog_data->base,
7697                   v8 && v8->runtime_check_aads_emit, MESA_SHADER_FRAGMENT);
7698 
7699    if (unlikely(debug_enabled)) {
7700       g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
7701                                      "%s fragment shader %s",
7702                                      nir->info.label ?
7703                                         nir->info.label : "unnamed",
7704                                      nir->info.name));
7705    }
7706 
7707    struct elk_compile_stats *stats = params->base.stats;
7708    uint32_t max_dispatch_width = 0;
7709 
7710    if (multi_cfg) {
7711       prog_data->dispatch_multi = vmulti->dispatch_width;
7712       prog_data->max_polygons = vmulti->max_polygons;
7713       g.generate_code(multi_cfg, vmulti->dispatch_width, vmulti->shader_stats,
7714                       vmulti->performance_analysis.require(),
7715                       stats, vmulti->max_polygons);
7716       stats = stats ? stats + 1 : NULL;
7717       max_dispatch_width = vmulti->dispatch_width;
7718 
7719    } else if (simd8_cfg) {
7720       prog_data->dispatch_8 = true;
7721       g.generate_code(simd8_cfg, 8, v8->shader_stats,
7722                       v8->performance_analysis.require(), stats, 1);
7723       stats = stats ? stats + 1 : NULL;
7724       max_dispatch_width = 8;
7725    }
7726 
7727    if (simd16_cfg) {
7728       prog_data->dispatch_16 = true;
7729       prog_data->prog_offset_16 = g.generate_code(
7730          simd16_cfg, 16, v16->shader_stats,
7731          v16->performance_analysis.require(), stats, 1);
7732       stats = stats ? stats + 1 : NULL;
7733       max_dispatch_width = 16;
7734    }
7735 
7736    if (simd32_cfg) {
7737       prog_data->dispatch_32 = true;
7738       prog_data->prog_offset_32 = g.generate_code(
7739          simd32_cfg, 32, v32->shader_stats,
7740          v32->performance_analysis.require(), stats, 1);
7741       stats = stats ? stats + 1 : NULL;
7742       max_dispatch_width = 32;
7743    }
7744 
7745    for (struct elk_compile_stats *s = params->base.stats; s != NULL && s != stats; s++)
7746       s->max_dispatch_width = max_dispatch_width;
7747 
7748    g.add_const_data(nir->constant_data, nir->constant_data_size);
7749    return g.get_assembly();
7750 }
7751 
7752 unsigned
7753 elk_cs_push_const_total_size(const struct elk_cs_prog_data *cs_prog_data,
7754                              unsigned threads)
7755 {
7756    assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
7757    assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
7758    return cs_prog_data->push.per_thread.size * threads +
7759           cs_prog_data->push.cross_thread.size;
7760 }
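/* Editor's worked example (illustrative values, assuming REG_SIZE == 32): a
 * per-thread block of one register (32 bytes), a cross-thread block of two
 * registers (64 bytes) and 8 threads give a total push constant size of
 * 32 * 8 + 64 = 320 bytes.
 */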
7761 
7762 static void
7763 fill_push_const_block_info(struct elk_push_const_block *block, unsigned dwords)
7764 {
7765    block->dwords = dwords;
7766    block->regs = DIV_ROUND_UP(dwords, 8);
7767    block->size = block->regs * 32;
7768 }
7769 
7770 static void
7771 cs_fill_push_const_info(const struct intel_device_info *devinfo,
7772                         struct elk_cs_prog_data *cs_prog_data)
7773 {
7774    const struct elk_stage_prog_data *prog_data = &cs_prog_data->base;
7775    int subgroup_id_index = elk_get_subgroup_id_param_index(devinfo, prog_data);
7776    bool cross_thread_supported = devinfo->verx10 >= 75;
7777 
7778    /* The thread ID should be stored in the last param dword */
7779    assert(subgroup_id_index == -1 ||
7780           subgroup_id_index == (int)prog_data->nr_params - 1);
7781 
7782    unsigned cross_thread_dwords, per_thread_dwords;
7783    if (!cross_thread_supported) {
7784       cross_thread_dwords = 0u;
7785       per_thread_dwords = prog_data->nr_params;
7786    } else if (subgroup_id_index >= 0) {
7787       /* Fill all but the last register with cross-thread payload */
7788       cross_thread_dwords = 8 * (subgroup_id_index / 8);
7789       per_thread_dwords = prog_data->nr_params - cross_thread_dwords;
7790       assert(per_thread_dwords > 0 && per_thread_dwords <= 8);
7791    } else {
7792       /* Fill all data using cross-thread payload */
7793       cross_thread_dwords = prog_data->nr_params;
7794       per_thread_dwords = 0u;
7795    }
7796 
7797    fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);
7798    fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);
7799 
7800    assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
7801           cs_prog_data->push.per_thread.size == 0);
7802    assert(cs_prog_data->push.cross_thread.dwords +
7803           cs_prog_data->push.per_thread.dwords ==
7804              prog_data->nr_params);
7805 }
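/* Editor's worked example (illustrative values, not from the original
 * source): with nr_params == 10 and the subgroup ID in the last param dword
 * (subgroup_id_index == 9), the split above gives
 * cross_thread_dwords = 8 * (9 / 8) = 8 and per_thread_dwords = 10 - 8 = 2,
 * i.e. one cross-thread register plus one per-thread register that is
 * replicated for every thread.
 */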
7806 
7807 static bool
7808 filter_simd(const nir_instr *instr, const void * /* options */)
7809 {
7810    if (instr->type != nir_instr_type_intrinsic)
7811       return false;
7812 
7813    switch (nir_instr_as_intrinsic(instr)->intrinsic) {
7814    case nir_intrinsic_load_simd_width_intel:
7815    case nir_intrinsic_load_subgroup_id:
7816       return true;
7817 
7818    default:
7819       return false;
7820    }
7821 }
7822 
7823 static nir_def *
7824 lower_simd(nir_builder *b, nir_instr *instr, void *options)
7825 {
7826    uintptr_t simd_width = (uintptr_t)options;
7827 
7828    switch (nir_instr_as_intrinsic(instr)->intrinsic) {
7829    case nir_intrinsic_load_simd_width_intel:
7830       return nir_imm_int(b, simd_width);
7831 
7832    case nir_intrinsic_load_subgroup_id:
7833       /* If the whole workgroup fits in one thread, we can lower subgroup_id
7834        * to a constant zero.
7835        */
7836       if (!b->shader->info.workgroup_size_variable) {
7837          unsigned local_workgroup_size = b->shader->info.workgroup_size[0] *
7838                                          b->shader->info.workgroup_size[1] *
7839                                          b->shader->info.workgroup_size[2];
7840          if (local_workgroup_size <= simd_width)
7841             return nir_imm_int(b, 0);
7842       }
7843       return NULL;
7844 
7845    default:
7846       return NULL;
7847    }
7848 }
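/* Editor's note (illustrative, not from the original source): for a fixed
 * 8x1x1 workgroup compiled at SIMD16, local_workgroup_size == 8 <= 16, so
 * load_subgroup_id folds to the constant 0 above; a 64x1x1 workgroup does
 * not satisfy the condition and the intrinsic is left in place.
 */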
7849 
7850 bool
7851 elk_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)
7852 {
7853    return nir_shader_lower_instructions(nir, filter_simd, lower_simd,
7854                                  (void *)(uintptr_t)dispatch_width);
7855 }
7856 
7857 const unsigned *
7858 elk_compile_cs(const struct elk_compiler *compiler,
7859                struct elk_compile_cs_params *params)
7860 {
7861    const nir_shader *nir = params->base.nir;
7862    const struct elk_cs_prog_key *key = params->key;
7863    struct elk_cs_prog_data *prog_data = params->prog_data;
7864 
7865    const bool debug_enabled =
7866       elk_should_print_shader(nir, params->base.debug_flag ?
7867                                    params->base.debug_flag : DEBUG_CS);
7868 
7869    prog_data->base.stage = MESA_SHADER_COMPUTE;
7870    prog_data->base.total_shared = nir->info.shared_size;
7871    prog_data->base.ray_queries = nir->info.ray_queries;
7872    prog_data->base.total_scratch = 0;
7873 
7874    if (!nir->info.workgroup_size_variable) {
7875       prog_data->local_size[0] = nir->info.workgroup_size[0];
7876       prog_data->local_size[1] = nir->info.workgroup_size[1];
7877       prog_data->local_size[2] = nir->info.workgroup_size[2];
7878    }
7879 
7880    elk_simd_selection_state simd_state{
7881       .devinfo = compiler->devinfo,
7882       .prog_data = prog_data,
7883       .required_width = elk_required_dispatch_width(&nir->info),
7884    };
7885 
7886    std::unique_ptr<elk_fs_visitor> v[3];
7887 
7888    for (unsigned simd = 0; simd < 3; simd++) {
7889       if (!elk_simd_should_compile(simd_state, simd))
7890          continue;
7891 
7892       const unsigned dispatch_width = 8u << simd;
7893 
7894       nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
7895       elk_nir_apply_key(shader, compiler, &key->base,
7896                         dispatch_width);
7897 
7898       NIR_PASS(_, shader, elk_nir_lower_simd, dispatch_width);
7899 
7900       /* Clean up after the local index and ID calculations. */
7901       NIR_PASS(_, shader, nir_opt_constant_folding);
7902       NIR_PASS(_, shader, nir_opt_dce);
7903 
7904       elk_postprocess_nir(shader, compiler, debug_enabled,
7905                           key->base.robust_flags);
7906 
7907       v[simd] = std::make_unique<elk_fs_visitor>(compiler, &params->base,
7908                                              &key->base,
7909                                              &prog_data->base,
7910                                              shader, dispatch_width,
7911                                              params->base.stats != NULL,
7912                                              debug_enabled);
7913 
7914       const int first = elk_simd_first_compiled(simd_state);
7915       if (first >= 0)
7916          v[simd]->import_uniforms(v[first].get());
7917 
7918       const bool allow_spilling = first < 0 || nir->info.workgroup_size_variable;
7919 
7920       if (v[simd]->run_cs(allow_spilling)) {
7921          cs_fill_push_const_info(compiler->devinfo, prog_data);
7922 
7923          elk_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
7924       } else {
7925          simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
7926          if (simd > 0) {
7927             elk_shader_perf_log(compiler, params->base.log_data,
7928                                 "SIMD%u shader failed to compile: %s\n",
7929                                 dispatch_width, v[simd]->fail_msg);
7930          }
7931       }
7932    }
7933 
7934    const int selected_simd = elk_simd_select(simd_state);
7935    if (selected_simd < 0) {
7936       params->base.error_str =
7937          ralloc_asprintf(params->base.mem_ctx,
7938                          "Can't compile shader: "
7939                          "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
7940                          simd_state.error[0], simd_state.error[1],
7941                          simd_state.error[2]);
7942       return NULL;
7943    }
7944 
7945    assert(selected_simd < 3);
7946    elk_fs_visitor *selected = v[selected_simd].get();
7947 
7948    if (!nir->info.workgroup_size_variable)
7949       prog_data->prog_mask = 1 << selected_simd;
7950 
7951    elk_fs_generator g(compiler, &params->base, &prog_data->base,
7952                   selected->runtime_check_aads_emit, MESA_SHADER_COMPUTE);
7953    if (unlikely(debug_enabled)) {
7954       char *name = ralloc_asprintf(params->base.mem_ctx,
7955                                    "%s compute shader %s",
7956                                    nir->info.label ?
7957                                    nir->info.label : "unnamed",
7958                                    nir->info.name);
7959       g.enable_debug(name);
7960    }
7961 
7962    uint32_t max_dispatch_width = 8u << (util_last_bit(prog_data->prog_mask) - 1);
7963 
7964    struct elk_compile_stats *stats = params->base.stats;
7965    for (unsigned simd = 0; simd < 3; simd++) {
7966       if (prog_data->prog_mask & (1u << simd)) {
7967          assert(v[simd]);
7968          prog_data->prog_offset[simd] =
7969             g.generate_code(v[simd]->cfg, 8u << simd, v[simd]->shader_stats,
7970                             v[simd]->performance_analysis.require(), stats);
7971          if (stats)
7972             stats->max_dispatch_width = max_dispatch_width;
7973          stats = stats ? stats + 1 : NULL;
7974          max_dispatch_width = 8u << simd;
7975       }
7976    }
7977 
7978    g.add_const_data(nir->constant_data, nir->constant_data_size);
7979 
7980    return g.get_assembly();
7981 }
7982 
7983 struct intel_cs_dispatch_info
7984 elk_cs_get_dispatch_info(const struct intel_device_info *devinfo,
7985                          const struct elk_cs_prog_data *prog_data,
7986                          const unsigned *override_local_size)
7987 {
7988    struct intel_cs_dispatch_info info = {};
7989 
7990    const unsigned *sizes =
7991       override_local_size ? override_local_size :
7992                             prog_data->local_size;
7993 
7994    const int simd = elk_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
7995    assert(simd >= 0 && simd < 3);
7996 
7997    info.group_size = sizes[0] * sizes[1] * sizes[2];
7998    info.simd_size = 8u << simd;
7999    info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);
8000 
8001    const uint32_t remainder = info.group_size & (info.simd_size - 1);
8002    if (remainder > 0)
8003       info.right_mask = ~0u >> (32 - remainder);
8004    else
8005       info.right_mask = ~0u >> (32 - info.simd_size);
8006 
8007    return info;
8008 }
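/* Editor's worked example (illustrative values): a 40x1x1 workgroup at
 * SIMD16 gives group_size = 40, threads = DIV_ROUND_UP(40, 16) = 3 and
 * remainder = 40 & 15 = 8, so right_mask = ~0u >> (32 - 8) = 0xff, i.e. only
 * the first 8 channels of the last thread are enabled.
 */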
8009 
8010 uint64_t
8011 elk_bsr(const struct intel_device_info *devinfo,
8012         uint32_t offset, uint8_t simd_size, uint8_t local_arg_offset)
8013 {
8014    assert(offset % 64 == 0);
8015    assert(simd_size == 8 || simd_size == 16);
8016    assert(local_arg_offset % 8 == 0);
8017 
8018    return offset |
8019           SET_BITS(simd_size == 8, 4, 4) |
8020           SET_BITS(local_arg_offset / 8, 2, 0);
8021 }
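/* Editor's worked example (illustrative values, assuming SET_BITS(v, high, low)
 * places v into bits [high:low]): elk_bsr(devinfo, 128, 8, 16) returns
 * 128 | (1 << 4) | (16 / 8) = 0x92, i.e. the 64-byte-aligned offset with the
 * SIMD8 flag in bit 4 and the local argument offset in 8-byte units in
 * bits 2:0.
 */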
8022 
8023 /**
8024  * Test the dispatch mask packing assumptions of
8025  * elk_stage_has_packed_dispatch().  Call this from e.g. the top of
8026  * elk_fs_visitor::emit_nir_code() to cause a GPU hang if any shader invocation is
8027  * executed with an unexpected dispatch mask.
8028  */
8029 static UNUSED void
8030 elk_fs_test_dispatch_packing(const fs_builder &bld)
8031 {
8032    const elk_fs_visitor *shader = static_cast<const elk_fs_visitor *>(bld.shader);
8033    const gl_shader_stage stage = shader->stage;
8034    const bool uses_vmask =
8035       stage == MESA_SHADER_FRAGMENT &&
8036       elk_wm_prog_data(shader->stage_prog_data)->uses_vmask;
8037 
8038    if (elk_stage_has_packed_dispatch(shader->devinfo, stage,
8039                                      shader->max_polygons,
8040                                      shader->stage_prog_data)) {
8041       const fs_builder ubld = bld.exec_all().group(1, 0);
8042       const elk_fs_reg tmp = component(bld.vgrf(ELK_REGISTER_TYPE_UD), 0);
8043       const elk_fs_reg mask = uses_vmask ? elk_vmask_reg() : elk_dmask_reg();
8044 
8045       ubld.ADD(tmp, mask, elk_imm_ud(1));
8046       ubld.AND(tmp, mask, tmp);
8047 
8048       /* This will loop forever if the dispatch mask doesn't have the expected
8049        * form '2^n-1', in which case tmp will be non-zero.
8050        */
8051       bld.emit(ELK_OPCODE_DO);
8052       bld.CMP(bld.null_reg_ud(), tmp, elk_imm_ud(0), ELK_CONDITIONAL_NZ);
8053       set_predicate(ELK_PREDICATE_NORMAL, bld.emit(ELK_OPCODE_WHILE));
8054    }
8055 }
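/* Editor's note (illustrative): the test above relies on a mask of the form
 * 2^n - 1 satisfying (mask + 1) & mask == 0.  For example 0b0111 + 1 = 0b1000
 * and 0b1000 & 0b0111 == 0, so the loop exits immediately, whereas a sparse
 * mask such as 0b0101 gives 0b0110 & 0b0101 == 0b0100 != 0 and the WHILE
 * loop spins forever, hanging the GPU as intended.
 */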
8056 
8057 unsigned
8058 elk_fs_visitor::workgroup_size() const
8059 {
8060    assert(gl_shader_stage_uses_workgroup(stage));
8061    const struct elk_cs_prog_data *cs = elk_cs_prog_data(prog_data);
8062    return cs->local_size[0] * cs->local_size[1] * cs->local_size[2];
8063 }
8064 
8065 bool elk_should_print_shader(const nir_shader *shader, uint64_t debug_flag)
8066 {
8067    return INTEL_DEBUG(debug_flag) && (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
8068 }
8069 
8070 namespace elk {
8071    elk_fs_reg
8072    fetch_payload_reg(const elk::fs_builder &bld, uint8_t regs[2],
8073                      elk_reg_type type, unsigned n)
8074    {
8075       if (!regs[0])
8076          return elk_fs_reg();
8077 
8078       if (bld.dispatch_width() > 16) {
8079          const elk_fs_reg tmp = bld.vgrf(type, n);
8080          const elk::fs_builder hbld = bld.exec_all().group(16, 0);
8081          const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
8082          elk_fs_reg *const components = new elk_fs_reg[m * n];
8083 
8084          for (unsigned c = 0; c < n; c++) {
8085             for (unsigned g = 0; g < m; g++)
8086                components[c * m + g] =
8087                   offset(retype(elk_vec8_grf(regs[g], 0), type), hbld, c);
8088          }
8089 
8090          hbld.LOAD_PAYLOAD(tmp, components, m * n, 0);
8091 
8092          delete[] components;
8093          return tmp;
8094 
8095       } else {
8096          return elk_fs_reg(retype(elk_vec8_grf(regs[0], 0), type));
8097       }
8098    }
8099 
8100    elk_fs_reg
8101    fetch_barycentric_reg(const elk::fs_builder &bld, uint8_t regs[2])
8102    {
8103       if (!regs[0])
8104          return elk_fs_reg();
8105       else if (bld.shader->devinfo->ver >= 20)
8106          return fetch_payload_reg(bld, regs, ELK_REGISTER_TYPE_F, 2);
8107 
8108       const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_F, 2);
8109       const elk::fs_builder hbld = bld.exec_all().group(8, 0);
8110       const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
8111       elk_fs_reg *const components = new elk_fs_reg[2 * m];
8112 
8113       for (unsigned c = 0; c < 2; c++) {
8114          for (unsigned g = 0; g < m; g++)
8115             components[c * m + g] = offset(elk_vec8_grf(regs[g / 2], 0),
8116                                            hbld, c + 2 * (g % 2));
8117       }
8118 
8119       hbld.LOAD_PAYLOAD(tmp, components, 2 * m, 0);
8120 
8121       delete[] components;
8122       return tmp;
8123    }
8124 
8125    void
8126    check_dynamic_msaa_flag(const fs_builder &bld,
8127                            const struct elk_wm_prog_data *wm_prog_data,
8128                            enum intel_msaa_flags flag)
8129    {
8130       elk_fs_inst *inst = bld.AND(bld.null_reg_ud(),
8131                               dynamic_msaa_flags(wm_prog_data),
8132                               elk_imm_ud(flag));
8133       inst->conditional_mod = ELK_CONDITIONAL_NZ;
8134    }
8135 }
8136