1 /*
2  * Copyright © 2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 /** @file elk_fs.cpp
25  *
26  * This file drives the GLSL IR -> LIR translation, contains the
27  * optimizations on the LIR, and drives the generation of native code
28  * from the LIR.
29  */
30 
31 #include "elk_eu.h"
32 #include "elk_fs.h"
33 #include "elk_fs_builder.h"
34 #include "elk_fs_live_variables.h"
35 #include "elk_nir.h"
36 #include "elk_vec4_gs_visitor.h"
37 #include "elk_cfg.h"
38 #include "elk_dead_control_flow.h"
39 #include "elk_private.h"
40 #include "../intel_nir.h"
41 #include "shader_enums.h"
42 #include "dev/intel_debug.h"
43 #include "dev/intel_wa.h"
44 #include "compiler/glsl_types.h"
45 #include "compiler/nir/nir_builder.h"
46 #include "util/u_math.h"
47 
48 #include <memory>
49 
50 using namespace elk;
51 
52 static unsigned get_lowered_simd_width(const elk_fs_visitor *shader,
53                                        const elk_fs_inst *inst);
54 
55 void
56 elk_fs_inst::init(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
57               const elk_fs_reg *src, unsigned sources)
58 {
59    memset((void*)this, 0, sizeof(*this));
60 
61    this->src = new elk_fs_reg[MAX2(sources, 3)];
62    for (unsigned i = 0; i < sources; i++)
63       this->src[i] = src[i];
64 
65    this->opcode = opcode;
66    this->dst = dst;
67    this->sources = sources;
68    this->exec_size = exec_size;
69    this->base_mrf = -1;
70 
71    assert(dst.file != IMM && dst.file != UNIFORM);
72 
73    assert(this->exec_size != 0);
74 
75    this->conditional_mod = ELK_CONDITIONAL_NONE;
76 
77    /* This will be the case for almost all instructions. */
78    switch (dst.file) {
79    case VGRF:
80    case ARF:
81    case FIXED_GRF:
82    case MRF:
83    case ATTR:
84       this->size_written = dst.component_size(exec_size);
85       break;
86    case BAD_FILE:
87       this->size_written = 0;
88       break;
89    case IMM:
90    case UNIFORM:
91       unreachable("Invalid destination register file");
92    }
93 
94    this->writes_accumulator = false;
95 }
96 
97 elk_fs_inst::elk_fs_inst()
98 {
99    init(ELK_OPCODE_NOP, 8, dst, NULL, 0);
100 }
101 
102 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size)
103 {
104    init(opcode, exec_size, reg_undef, NULL, 0);
105 }
106 
107 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst)
108 {
109    init(opcode, exec_size, dst, NULL, 0);
110 }
111 
112 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
113                  const elk_fs_reg &src0)
114 {
115    const elk_fs_reg src[1] = { src0 };
116    init(opcode, exec_size, dst, src, 1);
117 }
118 
119 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
120                  const elk_fs_reg &src0, const elk_fs_reg &src1)
121 {
122    const elk_fs_reg src[2] = { src0, src1 };
123    init(opcode, exec_size, dst, src, 2);
124 }
125 
126 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
127                  const elk_fs_reg &src0, const elk_fs_reg &src1, const elk_fs_reg &src2)
128 {
129    const elk_fs_reg src[3] = { src0, src1, src2 };
130    init(opcode, exec_size, dst, src, 3);
131 }
132 
133 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_width, const elk_fs_reg &dst,
134                  const elk_fs_reg src[], unsigned sources)
135 {
136    init(opcode, exec_width, dst, src, sources);
137 }
138 
139 elk_fs_inst::elk_fs_inst(const elk_fs_inst &that)
140 {
141    memcpy((void*)this, &that, sizeof(that));
142 
143    this->src = new elk_fs_reg[MAX2(that.sources, 3)];
144 
145    for (unsigned i = 0; i < that.sources; i++)
146       this->src[i] = that.src[i];
147 }
148 
149 elk_fs_inst::~elk_fs_inst()
150 {
151    delete[] this->src;
152 }
153 
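/**
 * Change the number of sources of this instruction, preserving as many of
 * the existing sources as still fit and releasing the old source array.
 */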
154 void
155 elk_fs_inst::resize_sources(uint8_t num_sources)
156 {
157    if (this->sources != num_sources) {
158       elk_fs_reg *src = new elk_fs_reg[MAX2(num_sources, 3)];
159 
160       for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
161          src[i] = this->src[i];
162 
163       delete[] this->src;
164       this->src = src;
165       this->sources = num_sources;
166    }
167 }
168 
169 void
170 elk_fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
171                                        const elk_fs_reg &dst,
172                                        const elk_fs_reg &surface,
173                                        const elk_fs_reg &surface_handle,
174                                        const elk_fs_reg &varying_offset,
175                                        uint32_t const_offset,
176                                        uint8_t alignment,
177                                        unsigned components)
178 {
179    assert(components <= 4);
180 
181    /* We have our constant surface use a pitch of 4 bytes, so our index can
182     * be any component of a vector, and then we load 4 contiguous
183     * components starting from that.  TODO: Support loading fewer than 4.
184     */
185    elk_fs_reg total_offset = vgrf(glsl_uint_type());
186    bld.ADD(total_offset, varying_offset, elk_imm_ud(const_offset));
187 
188    /* The pull load message will load a vec4 (16 bytes). If we are loading
189     * a double this means we are only loading 2 elements worth of data.
190     * We also want to use a 32-bit data type for the dst of the load operation
191     * so other parts of the driver don't get confused about the size of the
192     * result.
193     */
194    elk_fs_reg vec4_result = bld.vgrf(ELK_REGISTER_TYPE_F, 4);
195 
196    elk_fs_reg srcs[PULL_VARYING_CONSTANT_SRCS];
197    srcs[PULL_VARYING_CONSTANT_SRC_SURFACE]        = surface;
198    srcs[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
199    srcs[PULL_VARYING_CONSTANT_SRC_OFFSET]         = total_offset;
200    srcs[PULL_VARYING_CONSTANT_SRC_ALIGNMENT]      = elk_imm_ud(alignment);
201 
202    elk_fs_inst *inst = bld.emit(ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
203                             vec4_result, srcs, PULL_VARYING_CONSTANT_SRCS);
204    inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
205 
206    elk_shuffle_from_32bit_read(bld, dst, vec4_result, 0, components);
207 }
208 
209 /**
210  * A helper for MOV generation for fixing up broken hardware SEND dependency
211  * handling.
212  */
213 void
214 elk_fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
215 {
216    /* The caller always wants an uncompressed instruction, to emit minimal extra
217     * dependencies and to avoid having to deal with aligning its regs to 2.
218     */
219    const fs_builder ubld = bld.annotate("send dependency resolve")
220                               .quarter(0);
221 
222    ubld.MOV(ubld.null_reg_f(), elk_fs_reg(VGRF, grf, ELK_REGISTER_TYPE_F));
223 }
224 
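/**
 * Return true if this instruction is (or is generated as) a SEND whose
 * message payload is sourced from the GRF rather than from MRF space.
 */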
225 bool
226 elk_fs_inst::is_send_from_grf() const
227 {
228    switch (opcode) {
229    case ELK_SHADER_OPCODE_SEND:
230    case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
231    case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
232    case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
233    case ELK_SHADER_OPCODE_INTERLOCK:
234    case ELK_SHADER_OPCODE_MEMORY_FENCE:
235    case ELK_SHADER_OPCODE_BARRIER:
236       return true;
237    case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
238       return src[1].file == VGRF;
239    case ELK_FS_OPCODE_FB_WRITE:
240       return src[0].file == VGRF;
241    default:
242       return false;
243    }
244 }
245 
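/**
 * Return true if source arg only carries control information for the
 * message (e.g. a surface index, sampler index or send descriptor) rather
 * than per-channel data read by the instruction.
 */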
246 bool
247 elk_fs_inst::is_control_source(unsigned arg) const
248 {
249    switch (opcode) {
250    case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
251    case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
252       return arg == 0;
253 
254    case ELK_SHADER_OPCODE_BROADCAST:
255    case ELK_SHADER_OPCODE_SHUFFLE:
256    case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
257    case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
258    case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
259    case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
260       return arg == 1;
261 
262    case ELK_SHADER_OPCODE_MOV_INDIRECT:
263    case ELK_SHADER_OPCODE_CLUSTER_BROADCAST:
264    case ELK_SHADER_OPCODE_TEX:
265    case ELK_FS_OPCODE_TXB:
266    case ELK_SHADER_OPCODE_TXD:
267    case ELK_SHADER_OPCODE_TXF:
268    case ELK_SHADER_OPCODE_TXF_LZ:
269    case ELK_SHADER_OPCODE_TXF_CMS:
270    case ELK_SHADER_OPCODE_TXF_CMS_W:
271    case ELK_SHADER_OPCODE_TXF_UMS:
272    case ELK_SHADER_OPCODE_TXF_MCS:
273    case ELK_SHADER_OPCODE_TXL:
274    case ELK_SHADER_OPCODE_TXL_LZ:
275    case ELK_SHADER_OPCODE_TXS:
276    case ELK_SHADER_OPCODE_LOD:
277    case ELK_SHADER_OPCODE_TG4:
278    case ELK_SHADER_OPCODE_TG4_OFFSET:
279    case ELK_SHADER_OPCODE_SAMPLEINFO:
280       return arg == 1 || arg == 2;
281 
282    case ELK_SHADER_OPCODE_SEND:
283       return arg == 0;
284 
285    default:
286       return false;
287    }
288 }
289 
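/**
 * Return true if source arg is the message payload of this send-like
 * instruction.
 */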
290 bool
291 elk_fs_inst::is_payload(unsigned arg) const
292 {
293    switch (opcode) {
294    case ELK_FS_OPCODE_FB_WRITE:
295    case ELK_VEC4_OPCODE_UNTYPED_ATOMIC:
296    case ELK_VEC4_OPCODE_UNTYPED_SURFACE_READ:
297    case ELK_VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
298    case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
299    case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
300    case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
301    case ELK_SHADER_OPCODE_INTERLOCK:
302    case ELK_SHADER_OPCODE_MEMORY_FENCE:
303    case ELK_SHADER_OPCODE_BARRIER:
304    case ELK_SHADER_OPCODE_TEX:
305    case ELK_FS_OPCODE_TXB:
306    case ELK_SHADER_OPCODE_TXD:
307    case ELK_SHADER_OPCODE_TXF:
308    case ELK_SHADER_OPCODE_TXF_LZ:
309    case ELK_SHADER_OPCODE_TXF_CMS:
310    case ELK_SHADER_OPCODE_TXF_CMS_W:
311    case ELK_SHADER_OPCODE_TXF_UMS:
312    case ELK_SHADER_OPCODE_TXF_MCS:
313    case ELK_SHADER_OPCODE_TXL:
314    case ELK_SHADER_OPCODE_TXL_LZ:
315    case ELK_SHADER_OPCODE_TXS:
316    case ELK_SHADER_OPCODE_LOD:
317    case ELK_SHADER_OPCODE_TG4:
318    case ELK_SHADER_OPCODE_TG4_OFFSET:
319    case ELK_SHADER_OPCODE_SAMPLEINFO:
320       return arg == 0;
321 
322    case ELK_SHADER_OPCODE_SEND:
323       return arg == 1;
324 
325    default:
326       return false;
327    }
328 }
329 
330 /**
331  * Returns true if this instruction's sources and destinations cannot
332  * safely be the same register.
333  *
334  * In most cases, a register can be written over safely by the same
335  * instruction that is its last use.  For a single instruction, the
336  * sources are dereferenced before writing of the destination starts
337  * (naturally).
338  *
339  * However, there are a few cases where this can be problematic:
340  *
341  * - Virtual opcodes that translate to multiple instructions in the
342  *   code generator: if src == dst and one instruction writes the
343  *   destination before a later instruction reads the source, then
344  *   src will have been clobbered.
345  *
346  * - SIMD16 compressed instructions with certain regioning (see below).
347  *
348  * The register allocator uses this information to set up conflicts between
349  * GRF sources and the destination.
350  */
351 bool
352 elk_fs_inst::has_source_and_destination_hazard() const
353 {
354    switch (opcode) {
355    case ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT:
356       /* Multiple partial writes to the destination */
357       return true;
358    case ELK_SHADER_OPCODE_SHUFFLE:
359       /* This instruction returns an arbitrary channel from the source and
360        * gets split into smaller instructions in the generator.  It's possible
361        * that one of the instructions will read from a channel corresponding
362        * to an earlier instruction.
363        */
364    case ELK_SHADER_OPCODE_SEL_EXEC:
365       /* This is implemented as
366        *
367        * mov(16)      g4<1>D      0D            { align1 WE_all 1H };
368        * mov(16)      g4<1>D      g5<8,8,1>D    { align1 1H }
369        *
370        * Because the source is only read in the second instruction, the first
371        * may stomp all over it.
372        */
373       return true;
374    case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
375       switch (src[1].ud) {
376       case ELK_SWIZZLE_XXXX:
377       case ELK_SWIZZLE_YYYY:
378       case ELK_SWIZZLE_ZZZZ:
379       case ELK_SWIZZLE_WWWW:
380       case ELK_SWIZZLE_XXZZ:
381       case ELK_SWIZZLE_YYWW:
382       case ELK_SWIZZLE_XYXY:
383       case ELK_SWIZZLE_ZWZW:
384          /* These can be implemented as a single Align1 region on all
385           * platforms, so there's never a hazard between source and
386           * destination.  C.f. elk_fs_generator::generate_quad_swizzle().
387           */
388          return false;
389       default:
390          return !is_uniform(src[0]);
391       }
392    default:
393       /* The SIMD16 compressed instruction
394        *
395        * add(16)      g4<1>F      g4<8,8,1>F   g6<8,8,1>F
396        *
397        * is actually decoded in hardware as:
398        *
399        * add(8)       g4<1>F      g4<8,8,1>F   g6<8,8,1>F
400        * add(8)       g5<1>F      g5<8,8,1>F   g7<8,8,1>F
401        *
402        * Which is safe.  However, if we have uniform accesses
403        * happening, we get into trouble:
404        *
405        * add(8)       g4<1>F      g4<0,1,0>F   g6<8,8,1>F
406        * add(8)       g5<1>F      g4<0,1,0>F   g7<8,8,1>F
407        *
408        * Now our destination for the first instruction overwrote the
409        * second instruction's src0, and we get garbage for those 8
410        * pixels.  There's a similar issue for the pre-gfx6
411        * pixel_x/pixel_y, which are registers of 16-bit values and thus
412        * would get stomped by the first decode as well.
413        */
414       if (exec_size == 16) {
415          for (int i = 0; i < sources; i++) {
416             if (src[i].file == VGRF && (src[i].stride == 0 ||
417                                         src[i].type == ELK_REGISTER_TYPE_UW ||
418                                         src[i].type == ELK_REGISTER_TYPE_W ||
419                                         src[i].type == ELK_REGISTER_TYPE_UB ||
420                                         src[i].type == ELK_REGISTER_TYPE_B)) {
421                return true;
422             }
423          }
424       }
425       return false;
426    }
427 }
428 
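/**
 * Return whether source modifiers (negate/abs) may be applied to the
 * sources of this instruction.  Math instructions on Gfx6 and sends from
 * the GRF cannot take them.
 */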
429 bool
430 elk_fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
431 {
432    if (devinfo->ver == 6 && is_math())
433       return false;
434 
435    if (is_send_from_grf())
436       return false;
437 
438    return elk_backend_instruction::can_do_source_mods();
439 }
440 
441 bool
442 elk_fs_inst::can_do_cmod()
443 {
444    if (!elk_backend_instruction::can_do_cmod())
445       return false;
446 
447    /* The accumulator result appears to get used for the conditional modifier
448     * generation.  When negating a UD value, there is a 33rd bit generated for
449     * the sign in the accumulator value, so now you can't check, for example,
450     * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
451     */
452    for (unsigned i = 0; i < sources; i++) {
453       if (elk_reg_type_is_unsigned_integer(src[i].type) && src[i].negate)
454          return false;
455    }
456 
457    return true;
458 }
459 
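/**
 * Return whether the register types of this MOV or SEL can be changed
 * freely, i.e. no type conversion, source modifiers or saturation are
 * involved.
 */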
460 bool
461 elk_fs_inst::can_change_types() const
462 {
463    return dst.type == src[0].type &&
464           !src[0].abs && !src[0].negate && !saturate && src[0].file != ATTR &&
465           (opcode == ELK_OPCODE_MOV ||
466            (opcode == ELK_OPCODE_SEL &&
467             dst.type == src[1].type &&
468             predicate != ELK_PREDICATE_NONE &&
469             !src[1].abs && !src[1].negate && src[1].file != ATTR));
470 }
471 
472 void
473 elk_fs_reg::init()
474 {
475    memset((void*)this, 0, sizeof(*this));
476    type = ELK_REGISTER_TYPE_UD;
477    stride = 1;
478 }
479 
480 /** Generic unset register constructor. */
481 elk_fs_reg::elk_fs_reg()
482 {
483    init();
484    this->file = BAD_FILE;
485 }
486 
487 elk_fs_reg::elk_fs_reg(struct ::elk_reg reg) :
488    elk_backend_reg(reg)
489 {
490    this->offset = 0;
491    this->stride = 1;
492    if (this->file == IMM &&
493        (this->type != ELK_REGISTER_TYPE_V &&
494         this->type != ELK_REGISTER_TYPE_UV &&
495         this->type != ELK_REGISTER_TYPE_VF)) {
496       this->stride = 0;
497    }
498 }
499 
500 bool
501 elk_fs_reg::equals(const elk_fs_reg &r) const
502 {
503    return (this->elk_backend_reg::equals(r) &&
504            stride == r.stride);
505 }
506 
507 bool
508 elk_fs_reg::negative_equals(const elk_fs_reg &r) const
509 {
510    return (this->elk_backend_reg::negative_equals(r) &&
511            stride == r.stride);
512 }
513 
514 bool
515 elk_fs_reg::is_contiguous() const
516 {
517    switch (file) {
518    case ARF:
519    case FIXED_GRF:
520       return hstride == ELK_HORIZONTAL_STRIDE_1 &&
521              vstride == width + hstride;
522    case MRF:
523    case VGRF:
524    case ATTR:
525       return stride == 1;
526    case UNIFORM:
527    case IMM:
528    case BAD_FILE:
529       return true;
530    }
531 
532    unreachable("Invalid register file");
533 }
534 
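/**
 * Return the size in bytes of the region covered by this register for an
 * execution width of the given number of channels, taking its stride (or
 * vstride/width/hstride for ARF and FIXED_GRF files) into account.
 */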
535 unsigned
536 elk_fs_reg::component_size(unsigned width) const
537 {
538    if (file == ARF || file == FIXED_GRF) {
539       const unsigned w = MIN2(width, 1u << this->width);
540       const unsigned h = width >> this->width;
541       const unsigned vs = vstride ? 1 << (vstride - 1) : 0;
542       const unsigned hs = hstride ? 1 << (hstride - 1) : 0;
543       assert(w > 0);
544       return ((MAX2(1, h) - 1) * vs + (w - 1) * hs + 1) * type_sz(type);
545    } else {
546       return MAX2(width * stride, 1) * type_sz(type);
547    }
548 }
549 
550 void
551 elk_fs_visitor::vfail(const char *format, va_list va)
552 {
553    char *msg;
554 
555    if (failed)
556       return;
557 
558    failed = true;
559 
560    msg = ralloc_vasprintf(mem_ctx, format, va);
561    msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
562          dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg);
563 
564    this->fail_msg = msg;
565 
566    if (unlikely(debug_enabled)) {
567       fprintf(stderr, "%s",  msg);
568    }
569 }
570 
571 void
572 elk_fs_visitor::fail(const char *format, ...)
573 {
574    va_list va;
575 
576    va_start(va, format);
577    vfail(format, va);
578    va_end(va);
579 }
580 
581 /**
582  * Mark this program as impossible to compile with dispatch width greater
583  * than n.
584  *
585  * During the SIMD8 compile (which happens first), we can detect and flag
586  * things that are unsupported in SIMD16+ mode, so the compiler can skip the
587  * SIMD16+ compile altogether.
588  *
589  * During a compile of dispatch width greater than n (if one happens anyway),
590  * this just calls fail().
591  */
592 void
593 elk_fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
594 {
595    if (dispatch_width > n) {
596       fail("%s", msg);
597    } else {
598       max_dispatch_width = MIN2(max_dispatch_width, n);
599       elk_shader_perf_log(compiler, log_data,
600                           "Shader dispatch width limited to SIMD%d: %s\n",
601                           n, msg);
602    }
603 }
604 
605 /**
606  * Returns true if the instruction has a flag that means it won't
607  * update an entire destination register.
608  *
609  * For example, dead code elimination and live variable analysis want to know
610  * when a write to a variable screens off any preceding values that were in
611  * it.
612  */
613 bool
614 elk_fs_inst::is_partial_write() const
615 {
616    if (this->predicate && !this->predicate_trivial &&
617        this->opcode != ELK_OPCODE_SEL)
618       return true;
619 
620    if (this->dst.offset % REG_SIZE != 0)
621       return true;
622 
623    /* SEND instructions always write whole registers */
624    if (this->opcode == ELK_SHADER_OPCODE_SEND)
625       return false;
626 
627    /* Special case UNDEF since a lot of places in the backend do things like this:
628     *
629     *  fs_builder ubld = bld.exec_all().group(1, 0);
630     *  elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD);
631     *  ubld.UNDEF(tmp); <- partial write, even though the whole register is concerned
632     */
633    if (this->opcode == ELK_SHADER_OPCODE_UNDEF) {
634       assert(this->dst.is_contiguous());
635       return this->size_written < 32;
636    }
637 
638    return this->exec_size * type_sz(this->dst.type) < 32 ||
639           !this->dst.is_contiguous();
640 }
641 
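/**
 * Return the number of logical vector components read from source i, based
 * on the per-opcode layout of the sources (e.g. coordinate and gradient
 * component counts passed as immediates to the logical texture and surface
 * opcodes).
 */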
642 unsigned
643 elk_fs_inst::components_read(unsigned i) const
644 {
645    /* Return zero if the source is not present. */
646    if (src[i].file == BAD_FILE)
647       return 0;
648 
649    switch (opcode) {
650    case ELK_FS_OPCODE_LINTERP:
651       if (i == 0)
652          return 2;
653       else
654          return 1;
655 
656    case ELK_FS_OPCODE_PIXEL_X:
657    case ELK_FS_OPCODE_PIXEL_Y:
658       assert(i < 2);
659       if (i == 0)
660          return 2;
661       else
662          return 1;
663 
664    case ELK_FS_OPCODE_FB_WRITE_LOGICAL:
665       assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
666       /* First/second FB write color. */
667       if (i < 2)
668          return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
669       else
670          return 1;
671 
672    case ELK_SHADER_OPCODE_TEX_LOGICAL:
673    case ELK_SHADER_OPCODE_TXD_LOGICAL:
674    case ELK_SHADER_OPCODE_TXF_LOGICAL:
675    case ELK_SHADER_OPCODE_TXL_LOGICAL:
676    case ELK_SHADER_OPCODE_TXS_LOGICAL:
677    case ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
678    case ELK_FS_OPCODE_TXB_LOGICAL:
679    case ELK_SHADER_OPCODE_TXF_CMS_LOGICAL:
680    case ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL:
681    case ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
682    case ELK_SHADER_OPCODE_TXF_UMS_LOGICAL:
683    case ELK_SHADER_OPCODE_TXF_MCS_LOGICAL:
684    case ELK_SHADER_OPCODE_LOD_LOGICAL:
685    case ELK_SHADER_OPCODE_TG4_LOGICAL:
686    case ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL:
687    case ELK_SHADER_OPCODE_SAMPLEINFO_LOGICAL:
688       assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
689              src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
690              src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
691       /* Texture coordinates. */
692       if (i == TEX_LOGICAL_SRC_COORDINATE)
693          return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
694       /* Texture derivatives. */
695       else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
696                opcode == ELK_SHADER_OPCODE_TXD_LOGICAL)
697          return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
698       /* Texture offset. */
699       else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
700          return 2;
701       /* MCS */
702       else if (i == TEX_LOGICAL_SRC_MCS) {
703          if (opcode == ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL)
704             return 2;
705          else if (opcode == ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL)
706             return 4;
707          else
708             return 1;
709       } else
710          return 1;
711 
712    case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
713    case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
714       assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM);
715       /* Surface coordinates. */
716       if (i == SURFACE_LOGICAL_SRC_ADDRESS)
717          return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
718       /* Surface operation source (ignored for reads). */
719       else if (i == SURFACE_LOGICAL_SRC_DATA)
720          return 0;
721       else
722          return 1;
723 
724    case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
725    case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
726       assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
727              src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
728       /* Surface coordinates. */
729       if (i == SURFACE_LOGICAL_SRC_ADDRESS)
730          return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
731       /* Surface operation source. */
732       else if (i == SURFACE_LOGICAL_SRC_DATA)
733          return src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
734       else
735          return 1;
736 
737    case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
738    case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
739    case ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
740       assert(src[A64_LOGICAL_ARG].file == IMM);
741       return 1;
742 
743    case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
744       assert(src[A64_LOGICAL_ARG].file == IMM);
745       if (i == A64_LOGICAL_SRC) { /* data to write */
746          const unsigned comps = src[A64_LOGICAL_ARG].ud / exec_size;
747          assert(comps > 0);
748          return comps;
749       } else {
750          return 1;
751       }
752 
753    case ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
754       assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
755       return 1;
756 
757    case ELK_SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
758       assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
759       if (i == SURFACE_LOGICAL_SRC_DATA) {
760          const unsigned comps = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud / exec_size;
761          assert(comps > 0);
762          return comps;
763       } else {
764          return 1;
765       }
766 
767    case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
768       assert(src[A64_LOGICAL_ARG].file == IMM);
769       return i == A64_LOGICAL_SRC ? src[A64_LOGICAL_ARG].ud : 1;
770 
771    case ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
772       assert(src[A64_LOGICAL_ARG].file == IMM);
773       return i == A64_LOGICAL_SRC ?
774              lsc_op_num_data_values(src[A64_LOGICAL_ARG].ud) : 1;
775 
776    case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
777    case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
778       /* Scattered logical opcodes use the following params:
779        * src[0] Surface coordinates
780        * src[1] Surface operation source (ignored for reads)
781        * src[2] Surface
782        * src[3] IMM with always 1 dimension.
783        * src[4] IMM with arg bitsize for scattered read/write 8, 16, 32
784        */
785       assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
786              src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
787       return i == SURFACE_LOGICAL_SRC_DATA ? 0 : 1;
788 
789    case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
790    case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
791       assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
792              src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
793       return 1;
794 
795    case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
796    case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
797       assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
798              src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
799       const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
800       /* Surface coordinates. */
801       if (i == SURFACE_LOGICAL_SRC_ADDRESS)
802          return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
803       /* Surface operation source. */
804       else if (i == SURFACE_LOGICAL_SRC_DATA)
805          return lsc_op_num_data_values(op);
806       else
807          return 1;
808    }
809    case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
810       return (i == 0 ? 2 : 1);
811 
812    case ELK_SHADER_OPCODE_URB_WRITE_LOGICAL:
813       assert(src[URB_LOGICAL_SRC_COMPONENTS].file == IMM);
814 
815       if (i == URB_LOGICAL_SRC_DATA)
816          return src[URB_LOGICAL_SRC_COMPONENTS].ud;
817       else
818          return 1;
819 
820    default:
821       return 1;
822    }
823 }
824 
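/**
 * Return the number of bytes read from source arg by this instruction,
 * either from per-opcode knowledge (e.g. message lengths) or from the
 * register region of the source.
 */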
825 unsigned
826 elk_fs_inst::size_read(int arg) const
827 {
828    switch (opcode) {
829    case ELK_SHADER_OPCODE_SEND:
830       if (arg == 1) {
831          return mlen * REG_SIZE;
832       }
833       break;
834 
835    case ELK_FS_OPCODE_FB_WRITE:
836    case ELK_FS_OPCODE_REP_FB_WRITE:
837       if (arg == 0) {
838          if (base_mrf >= 0)
839             return src[0].file == BAD_FILE ? 0 : 2 * REG_SIZE;
840          else
841             return mlen * REG_SIZE;
842       }
843       break;
844 
845    case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
846    case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
847       if (arg == 0)
848          return mlen * REG_SIZE;
849       break;
850 
851    case ELK_FS_OPCODE_SET_SAMPLE_ID:
852       if (arg == 1)
853          return 1;
854       break;
855 
856    case ELK_FS_OPCODE_LINTERP:
857       if (arg == 1)
858          return 16;
859       break;
860 
861    case ELK_SHADER_OPCODE_LOAD_PAYLOAD:
862       if (arg < this->header_size)
863          return retype(src[arg], ELK_REGISTER_TYPE_UD).component_size(8);
864       break;
865 
866    case ELK_CS_OPCODE_CS_TERMINATE:
867    case ELK_SHADER_OPCODE_BARRIER:
868       return REG_SIZE;
869 
870    case ELK_SHADER_OPCODE_MOV_INDIRECT:
871       if (arg == 0) {
872          assert(src[2].file == IMM);
873          return src[2].ud;
874       }
875       break;
876 
877    case ELK_SHADER_OPCODE_TEX:
878    case ELK_FS_OPCODE_TXB:
879    case ELK_SHADER_OPCODE_TXD:
880    case ELK_SHADER_OPCODE_TXF:
881    case ELK_SHADER_OPCODE_TXF_LZ:
882    case ELK_SHADER_OPCODE_TXF_CMS:
883    case ELK_SHADER_OPCODE_TXF_CMS_W:
884    case ELK_SHADER_OPCODE_TXF_UMS:
885    case ELK_SHADER_OPCODE_TXF_MCS:
886    case ELK_SHADER_OPCODE_TXL:
887    case ELK_SHADER_OPCODE_TXL_LZ:
888    case ELK_SHADER_OPCODE_TXS:
889    case ELK_SHADER_OPCODE_LOD:
890    case ELK_SHADER_OPCODE_TG4:
891    case ELK_SHADER_OPCODE_TG4_OFFSET:
892    case ELK_SHADER_OPCODE_SAMPLEINFO:
893       if (arg == 0 && src[0].file == VGRF)
894          return mlen * REG_SIZE;
895       break;
896 
897    default:
898       break;
899    }
900 
901    switch (src[arg].file) {
902    case UNIFORM:
903    case IMM:
904       return components_read(arg) * type_sz(src[arg].type);
905    case BAD_FILE:
906    case ARF:
907    case FIXED_GRF:
908    case VGRF:
909    case ATTR:
910       return components_read(arg) * src[arg].component_size(exec_size);
911    case MRF:
912       unreachable("MRF registers are not allowed as sources");
913    }
914    return 0;
915 }
916 
917 namespace {
918    unsigned
919    predicate_width(const intel_device_info *devinfo, elk_predicate predicate)
920    {
921       switch (predicate) {
922       case ELK_PREDICATE_NONE:            return 1;
923       case ELK_PREDICATE_NORMAL:          return 1;
924       case ELK_PREDICATE_ALIGN1_ANY2H:    return 2;
925       case ELK_PREDICATE_ALIGN1_ALL2H:    return 2;
926       case ELK_PREDICATE_ALIGN1_ANY4H:    return 4;
927       case ELK_PREDICATE_ALIGN1_ALL4H:    return 4;
928       case ELK_PREDICATE_ALIGN1_ANY8H:    return 8;
929       case ELK_PREDICATE_ALIGN1_ALL8H:    return 8;
930       case ELK_PREDICATE_ALIGN1_ANY16H:   return 16;
931       case ELK_PREDICATE_ALIGN1_ALL16H:   return 16;
932       case ELK_PREDICATE_ALIGN1_ANY32H:   return 32;
933       case ELK_PREDICATE_ALIGN1_ALL32H:   return 32;
934       default: unreachable("Unsupported predicate");
935       }
936    }
937 
938    /* Return the subset of flag registers that an instruction could
939     * potentially read or write based on the execution controls and flag
940     * subregister number of the instruction.
941     */
942    unsigned
943    flag_mask(const elk_fs_inst *inst, unsigned width)
944    {
945       assert(util_is_power_of_two_nonzero(width));
946       const unsigned start = (inst->flag_subreg * 16 + inst->group) &
947                              ~(width - 1);
948       const unsigned end = start + ALIGN(inst->exec_size, width);
949       return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1);
950    }
951 
952    unsigned
953    bit_mask(unsigned n)
954    {
955       return (n >= CHAR_BIT * sizeof(bit_mask(n)) ? ~0u : (1u << n) - 1);
956    }
957 
958    unsigned
959    flag_mask(const elk_fs_reg &r, unsigned sz)
960    {
961       if (r.file == ARF) {
962          const unsigned start = (r.nr - ELK_ARF_FLAG) * 4 + r.subnr;
963          const unsigned end = start + sz;
964          return bit_mask(end) & ~bit_mask(start);
965       } else {
966          return 0;
967       }
968    }
969 }
970 
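/**
 * Return a mask of the flag register chunks read by this instruction, in
 * the same units as flag_mask() above.
 */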
971 unsigned
972 elk_fs_inst::flags_read(const intel_device_info *devinfo) const
973 {
974    if (predicate == ELK_PREDICATE_ALIGN1_ANYV ||
975        predicate == ELK_PREDICATE_ALIGN1_ALLV) {
976       /* The vertical predication modes combine corresponding bits from
977        * f0.0 and f1.0 on Gfx7+, and f0.0 and f0.1 on older hardware.
978        */
979       const unsigned shift = devinfo->ver >= 7 ? 4 : 2;
980       return flag_mask(this, 1) << shift | flag_mask(this, 1);
981    } else if (predicate) {
982       return flag_mask(this, predicate_width(devinfo, predicate));
983    } else {
984       unsigned mask = 0;
985       for (int i = 0; i < sources; i++) {
986          mask |= flag_mask(src[i], size_read(i));
987       }
988       return mask;
989    }
990 }
991 
992 unsigned
993 elk_fs_inst::flags_written(const intel_device_info *devinfo) const
994 {
995    /* On Gfx4 and Gfx5, sel.l (for min) and sel.ge (for max) are implemented
996     * using a separate cmpn and sel instruction.  This lowering occurs in
997     * elk_fs_visitor::lower_minmax which is called very, very late.
998     */
999    if ((conditional_mod && ((opcode != ELK_OPCODE_SEL || devinfo->ver <= 5) &&
1000                             opcode != ELK_OPCODE_CSEL &&
1001                             opcode != ELK_OPCODE_IF &&
1002                             opcode != ELK_OPCODE_WHILE)) ||
1003        opcode == ELK_FS_OPCODE_FB_WRITE) {
1004       return flag_mask(this, 1);
1005    } else if (opcode == ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL ||
1006               opcode == ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL ||
1007               opcode == ELK_FS_OPCODE_LOAD_LIVE_CHANNELS) {
1008       return flag_mask(this, 32);
1009    } else {
1010       return flag_mask(dst, size_written);
1011    }
1012 }
1013 
1014 /**
1015  * Returns how many MRFs an FS opcode will write over.
1016  *
1017  * Note that this is not the 0 or 1 implied writes in an actual gen
1018  * instruction -- the FS opcodes often generate MOVs in addition.
1019  */
1020 unsigned
1021 elk_fs_inst::implied_mrf_writes() const
1022 {
1023    if (mlen == 0)
1024       return 0;
1025 
1026    if (base_mrf == -1)
1027       return 0;
1028 
1029    switch (opcode) {
1030    case ELK_SHADER_OPCODE_RCP:
1031    case ELK_SHADER_OPCODE_RSQ:
1032    case ELK_SHADER_OPCODE_SQRT:
1033    case ELK_SHADER_OPCODE_EXP2:
1034    case ELK_SHADER_OPCODE_LOG2:
1035    case ELK_SHADER_OPCODE_SIN:
1036    case ELK_SHADER_OPCODE_COS:
1037       return 1 * exec_size / 8;
1038    case ELK_SHADER_OPCODE_POW:
1039    case ELK_SHADER_OPCODE_INT_QUOTIENT:
1040    case ELK_SHADER_OPCODE_INT_REMAINDER:
1041       return 2 * exec_size / 8;
1042    case ELK_SHADER_OPCODE_TEX:
1043    case ELK_FS_OPCODE_TXB:
1044    case ELK_SHADER_OPCODE_TXD:
1045    case ELK_SHADER_OPCODE_TXF:
1046    case ELK_SHADER_OPCODE_TXF_CMS:
1047    case ELK_SHADER_OPCODE_TXF_MCS:
1048    case ELK_SHADER_OPCODE_TG4:
1049    case ELK_SHADER_OPCODE_TG4_OFFSET:
1050    case ELK_SHADER_OPCODE_TXL:
1051    case ELK_SHADER_OPCODE_TXS:
1052    case ELK_SHADER_OPCODE_LOD:
1053    case ELK_SHADER_OPCODE_SAMPLEINFO:
1054       return 1;
1055    case ELK_FS_OPCODE_FB_WRITE:
1056    case ELK_FS_OPCODE_REP_FB_WRITE:
1057       return src[0].file == BAD_FILE ? 0 : 2;
1058    case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1059    case ELK_SHADER_OPCODE_GFX4_SCRATCH_READ:
1060       return 1;
1061    case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
1062       return mlen;
1063    case ELK_SHADER_OPCODE_GFX4_SCRATCH_WRITE:
1064       return mlen;
1065    default:
1066       unreachable("not reached");
1067    }
1068 }
1069 
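/**
 * Return whether this sampler message requests the residency information
 * used for sparse texture feedback.
 */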
1070 bool
1071 elk_fs_inst::has_sampler_residency() const
1072 {
1073    switch (opcode) {
1074    case ELK_SHADER_OPCODE_TEX_LOGICAL:
1075    case ELK_FS_OPCODE_TXB_LOGICAL:
1076    case ELK_SHADER_OPCODE_TXL_LOGICAL:
1077    case ELK_SHADER_OPCODE_TXD_LOGICAL:
1078    case ELK_SHADER_OPCODE_TXF_LOGICAL:
1079    case ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
1080    case ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL:
1081    case ELK_SHADER_OPCODE_TXF_CMS_LOGICAL:
1082    case ELK_SHADER_OPCODE_TXS_LOGICAL:
1083    case ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL:
1084    case ELK_SHADER_OPCODE_TG4_LOGICAL:
1085       assert(src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
1086       return src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
1087    default:
1088       return false;
1089    }
1090 }
1091 
1092 elk_fs_reg
1093 elk_fs_visitor::vgrf(const glsl_type *const type)
1094 {
1095    int reg_width = dispatch_width / 8;
1096    return elk_fs_reg(VGRF,
1097                  alloc.allocate(glsl_count_dword_slots(type, false) * reg_width),
1098                  elk_type_for_base_type(type));
1099 }
1100 
1101 elk_fs_reg::elk_fs_reg(enum elk_reg_file file, unsigned nr)
1102 {
1103    init();
1104    this->file = file;
1105    this->nr = nr;
1106    this->type = ELK_REGISTER_TYPE_F;
1107    this->stride = (file == UNIFORM ? 0 : 1);
1108 }
1109 
1110 elk_fs_reg::elk_fs_reg(enum elk_reg_file file, unsigned nr, enum elk_reg_type type)
1111 {
1112    init();
1113    this->file = file;
1114    this->nr = nr;
1115    this->type = type;
1116    this->stride = (file == UNIFORM ? 0 : 1);
1117 }
1118 
1119 /* For SIMD16, we need to reuse the uniform setup from the SIMD8 dispatch.
1120  * This brings in those uniform definitions.
1121  */
1122 void
1123 elk_fs_visitor::import_uniforms(elk_fs_visitor *v)
1124 {
1125    this->push_constant_loc = v->push_constant_loc;
1126    this->uniforms = v->uniforms;
1127 }
1128 
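/**
 * Map a NIR barycentric load intrinsic and its interpolation mode to the
 * corresponding ELK_BARYCENTRIC_* mode.
 */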
1129 enum elk_barycentric_mode
1130 elk_barycentric_mode(nir_intrinsic_instr *intr)
1131 {
1132    const glsl_interp_mode mode =
1133       (enum glsl_interp_mode) nir_intrinsic_interp_mode(intr);
1134 
1135    /* Barycentric modes don't make sense for flat inputs. */
1136    assert(mode != INTERP_MODE_FLAT);
1137 
1138    unsigned bary;
1139    switch (intr->intrinsic) {
1140    case nir_intrinsic_load_barycentric_pixel:
1141    case nir_intrinsic_load_barycentric_at_offset:
1142       bary = ELK_BARYCENTRIC_PERSPECTIVE_PIXEL;
1143       break;
1144    case nir_intrinsic_load_barycentric_centroid:
1145       bary = ELK_BARYCENTRIC_PERSPECTIVE_CENTROID;
1146       break;
1147    case nir_intrinsic_load_barycentric_sample:
1148    case nir_intrinsic_load_barycentric_at_sample:
1149       bary = ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE;
1150       break;
1151    default:
1152       unreachable("invalid intrinsic");
1153    }
1154 
1155    if (mode == INTERP_MODE_NOPERSPECTIVE)
1156       bary += 3;
1157 
1158    return (enum elk_barycentric_mode) bary;
1159 }
1160 
1161 /**
1162  * Turn one of the two CENTROID barycentric modes into PIXEL mode.
1163  */
1164 static enum elk_barycentric_mode
1165 centroid_to_pixel(enum elk_barycentric_mode bary)
1166 {
1167    assert(bary == ELK_BARYCENTRIC_PERSPECTIVE_CENTROID ||
1168           bary == ELK_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
1169    return (enum elk_barycentric_mode) ((unsigned) bary - 1);
1170 }
1171 
1172 /**
1173  * Walk backwards from the end of the program looking for a URB write that
1174  * isn't in control flow, and mark it with EOT.
1175  *
1176  * Return true if successful or false if a separate EOT write is needed.
1177  */
1178 bool
1179 elk_fs_visitor::mark_last_urb_write_with_eot()
1180 {
1181    foreach_in_list_reverse(elk_fs_inst, prev, &this->instructions) {
1182       if (prev->opcode == ELK_SHADER_OPCODE_URB_WRITE_LOGICAL) {
1183          prev->eot = true;
1184 
1185          /* Delete now dead instructions. */
1186          foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
1187             if (dead == prev)
1188                break;
1189             dead->remove();
1190          }
1191          return true;
1192       } else if (prev->is_control_flow() || prev->has_side_effects()) {
1193          break;
1194       }
1195    }
1196 
1197    return false;
1198 }
1199 
1200 void
1201 elk_fs_visitor::emit_gs_thread_end()
1202 {
1203    assert(stage == MESA_SHADER_GEOMETRY);
1204 
1205    struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(prog_data);
1206 
1207    if (gs_compile->control_data_header_size_bits > 0) {
1208       emit_gs_control_data_bits(this->final_gs_vertex_count);
1209    }
1210 
1211    const fs_builder abld = fs_builder(this).at_end().annotate("thread end");
1212    elk_fs_inst *inst;
1213 
1214    if (gs_prog_data->static_vertex_count != -1) {
1215       /* Try and tag the last URB write with EOT instead of emitting a whole
1216        * separate write just to finish the thread.
1217        */
1218       if (mark_last_urb_write_with_eot())
1219          return;
1220 
1221       elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1222       srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
1223       srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(0);
1224       inst = abld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
1225                        srcs, ARRAY_SIZE(srcs));
1226    } else {
1227       elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1228       srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
1229       srcs[URB_LOGICAL_SRC_DATA] = this->final_gs_vertex_count;
1230       srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(1);
1231       inst = abld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
1232                        srcs, ARRAY_SIZE(srcs));
1233    }
1234    inst->eot = true;
1235    inst->offset = 0;
1236 }
1237 
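/**
 * Map UNIFORM file registers to the fixed payload GRFs holding the push
 * constants (the CURB), and emit code to zero any used push registers
 * flagged in zero_push_reg based on the run-time push register mask.
 */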
1238 void
1239 elk_fs_visitor::assign_curb_setup()
1240 {
1241    unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
1242 
1243    unsigned ubo_push_length = 0;
1244    unsigned ubo_push_start[4];
1245    for (int i = 0; i < 4; i++) {
1246       ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
1247       ubo_push_length += stage_prog_data->ubo_ranges[i].length;
1248    }
1249 
1250    prog_data->curb_read_length = uniform_push_length + ubo_push_length;
1251 
1252    uint64_t used = 0;
1253 
1254    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1255    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1256       for (unsigned int i = 0; i < inst->sources; i++) {
1257 	 if (inst->src[i].file == UNIFORM) {
1258             int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
1259             int constant_nr;
1260             if (inst->src[i].nr >= UBO_START) {
1261                /* constant_nr is in 32-bit units, the rest are in bytes */
1262                constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
1263                              inst->src[i].offset / 4;
1264             } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1265                constant_nr = push_constant_loc[uniform_nr];
1266             } else {
1267                /* Section 5.11 of the OpenGL 4.1 spec says:
1268                 * "Out-of-bounds reads return undefined values, which include
1269                 *  values from other variables of the active program or zero."
1270                 * Just return the first push constant.
1271                 */
1272                constant_nr = 0;
1273             }
1274 
1275             assert(constant_nr / 8 < 64);
1276             used |= BITFIELD64_BIT(constant_nr / 8);
1277 
1278 	    struct elk_reg elk_reg = elk_vec1_grf(payload().num_regs +
1279 						  constant_nr / 8,
1280 						  constant_nr % 8);
1281             elk_reg.abs = inst->src[i].abs;
1282             elk_reg.negate = inst->src[i].negate;
1283 
1284             assert(inst->src[i].stride == 0);
1285             inst->src[i] = byte_offset(
1286                retype(elk_reg, inst->src[i].type),
1287                inst->src[i].offset % 4);
1288 	 }
1289       }
1290    }
1291 
1292    uint64_t want_zero = used & stage_prog_data->zero_push_reg;
1293    if (want_zero) {
1294       fs_builder ubld = fs_builder(this, 8).exec_all().at(
1295          cfg->first_block(), cfg->first_block()->start());
1296 
1297       /* push_reg_mask_param is in 32-bit units */
1298       unsigned mask_param = stage_prog_data->push_reg_mask_param;
1299       struct elk_reg mask = elk_vec1_grf(payload().num_regs + mask_param / 8,
1300                                                               mask_param % 8);
1301 
1302       elk_fs_reg b32;
1303       for (unsigned i = 0; i < 64; i++) {
1304          if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
1305             elk_fs_reg shifted = ubld.vgrf(ELK_REGISTER_TYPE_W, 2);
1306             ubld.SHL(horiz_offset(shifted, 8),
1307                      byte_offset(retype(mask, ELK_REGISTER_TYPE_W), i / 8),
1308                      elk_imm_v(0x01234567));
1309             ubld.SHL(shifted, horiz_offset(shifted, 8), elk_imm_w(8));
1310 
1311             fs_builder ubld16 = ubld.group(16, 0);
1312             b32 = ubld16.vgrf(ELK_REGISTER_TYPE_D);
1313             ubld16.group(16, 0).ASR(b32, shifted, elk_imm_w(15));
1314          }
1315 
1316          if (want_zero & BITFIELD64_BIT(i)) {
1317             assert(i < prog_data->curb_read_length);
1318             struct elk_reg push_reg =
1319                retype(elk_vec8_grf(payload().num_regs + i, 0),
1320                       ELK_REGISTER_TYPE_D);
1321 
1322             ubld.AND(push_reg, push_reg, component(b32, i % 16));
1323          }
1324       }
1325 
1326       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1327    }
1328 
1329    /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
1330    this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length;
1331 }
1332 
1333 /*
1334  * Build up an array of indices referencing the active entries of the
1335  * urb_setup array.
1336  * Used to accelerate walking the active entries of urb_setup on each
1337  * upload.
1338  */
1339 void
1340 elk_compute_urb_setup_index(struct elk_wm_prog_data *wm_prog_data)
1341 {
1342    /* Make sure uint8_t is sufficient */
1343    STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
1344    uint8_t index = 0;
1345    for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
1346       if (wm_prog_data->urb_setup[attr] >= 0) {
1347          wm_prog_data->urb_setup_attribs[index++] = attr;
1348       }
1349    }
1350    wm_prog_data->urb_setup_attribs_count = index;
1351 }
1352 
1353 static void
1354 calculate_urb_setup(const struct intel_device_info *devinfo,
1355                     const struct elk_wm_prog_key *key,
1356                     struct elk_wm_prog_data *prog_data,
1357                     const nir_shader *nir)
1358 {
1359    memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
1360    memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel));
1361 
1362    int urb_next = 0; /* in vec4s */
1363 
1364    const uint64_t inputs_read =
1365       nir->info.inputs_read & ~nir->info.per_primitive_inputs;
1366 
1367    /* Figure out where each of the incoming setup attributes lands. */
1368    if (devinfo->ver >= 6) {
1369       assert(!nir->info.per_primitive_inputs);
1370 
1371       uint64_t vue_header_bits =
1372          VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT;
1373 
1374       uint64_t unique_fs_attrs = inputs_read & ELK_FS_VARYING_INPUT_MASK;
1375 
1376       /* VUE header fields all live in the same URB slot, so we pass them
1377        * as a single FS input attribute.  We want to only count them once.
1378        */
1379       if (inputs_read & vue_header_bits) {
1380          unique_fs_attrs &= ~vue_header_bits;
1381          unique_fs_attrs |= VARYING_BIT_PSIZ;
1382       }
1383 
1384       if (util_bitcount64(unique_fs_attrs) <= 16) {
1385          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1386           * first 16 varying inputs, so we can put them wherever we want.
1387           * Just put them in order.
1388           *
1389           * This is useful because it means that (a) inputs not used by the
1390           * fragment shader won't take up valuable register space, and (b) we
1391           * won't have to recompile the fragment shader if it gets paired with
1392           * a different vertex (or geometry) shader.
1393           *
1394           * VUE header fields share the same FS input attribute.
1395           */
1396          if (inputs_read & vue_header_bits) {
1397             if (inputs_read & VARYING_BIT_PSIZ)
1398                prog_data->urb_setup[VARYING_SLOT_PSIZ] = urb_next;
1399             if (inputs_read & VARYING_BIT_LAYER)
1400                prog_data->urb_setup[VARYING_SLOT_LAYER] = urb_next;
1401             if (inputs_read & VARYING_BIT_VIEWPORT)
1402                prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = urb_next;
1403 
1404             urb_next++;
1405          }
1406 
1407          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1408             if (inputs_read & ELK_FS_VARYING_INPUT_MASK & ~vue_header_bits &
1409                 BITFIELD64_BIT(i)) {
1410                prog_data->urb_setup[i] = urb_next++;
1411             }
1412          }
1413       } else {
1414          /* We have enough input varyings that the SF/SBE pipeline stage can't
1415           * arbitrarily rearrange them to suit our whim; we have to put them
1416           * in an order that matches the output of the previous pipeline stage
1417           * (geometry or vertex shader).
1418           */
1419 
1420          /* Re-compute the VUE map here in the case that the one coming from
1421           * geometry has more than one position slot (used for Primitive
1422           * Replication).
1423           */
1424          struct intel_vue_map prev_stage_vue_map;
1425          elk_compute_vue_map(devinfo, &prev_stage_vue_map,
1426                              key->input_slots_valid,
1427                              nir->info.separate_shader, 1);
1428 
1429          int first_slot =
1430             elk_compute_first_urb_slot_required(inputs_read,
1431                                                 &prev_stage_vue_map);
1432 
1433          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1434          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1435               slot++) {
1436             int varying = prev_stage_vue_map.slot_to_varying[slot];
1437             if (varying != ELK_VARYING_SLOT_PAD &&
1438                 (inputs_read & ELK_FS_VARYING_INPUT_MASK &
1439                  BITFIELD64_BIT(varying))) {
1440                prog_data->urb_setup[varying] = slot - first_slot;
1441             }
1442          }
1443          urb_next = prev_stage_vue_map.num_slots - first_slot;
1444       }
1445    } else {
1446       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1447       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1448          /* Point size is packed into the header, not as a general attribute */
1449          if (i == VARYING_SLOT_PSIZ)
1450             continue;
1451 
1452 	 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1453 	    /* The back color slot is skipped when the front color is
1454 	     * also written to.  In addition, some slots can be
1455 	     * written in the vertex shader and not read in the
1456 	     * fragment shader.  So the register number must always be
1457 	     * incremented, mapped or not.
1458 	     */
1459 	    if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1460 	       prog_data->urb_setup[i] = urb_next;
1461             urb_next++;
1462 	 }
1463       }
1464 
1465       /*
1466        * It's an FS-only attribute, and we did the interpolation for this
1467        * attribute in the SF thread.  So count it here, too.
1468        *
1469        * See compile_sf_prog() for more info.
1470        */
1471       if (inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1472          prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1473    }
1474 
1475    prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs;
1476    prog_data->inputs = inputs_read;
1477 
1478    elk_compute_urb_setup_index(prog_data);
1479 }
1480 
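/**
 * Replace ATTR file sources with the fixed payload GRFs where the SF/SBE
 * unit placed the vertex setup data, now that the push constant layout is
 * known.
 */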
1481 void
1482 elk_fs_visitor::assign_urb_setup()
1483 {
1484    assert(stage == MESA_SHADER_FRAGMENT);
1485    struct elk_wm_prog_data *prog_data = elk_wm_prog_data(this->prog_data);
1486 
1487    int urb_start = payload().num_regs + prog_data->base.curb_read_length;
1488 
1489    /* Offset all the urb_setup[] index by the actual position of the
1490     * setup regs, now that the location of the constants has been chosen.
1491     */
1492    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1493       for (int i = 0; i < inst->sources; i++) {
1494          if (inst->src[i].file == ATTR) {
1495             /* ATTR elk_fs_reg::nr in the FS is in units of logical scalar
1496              * inputs each of which consumes 16B on Gfx4-Gfx12.  In
1497              * single polygon mode this leads to the following layout
1498              * of the vertex setup plane parameters in the ATTR
1499              * register file:
1500              *
1501              *  elk_fs_reg::nr   Input   Comp0  Comp1  Comp2  Comp3
1502              *      0       Attr0.x  a1-a0  a2-a0   N/A    a0
1503              *      1       Attr0.y  a1-a0  a2-a0   N/A    a0
1504              *      2       Attr0.z  a1-a0  a2-a0   N/A    a0
1505              *      3       Attr0.w  a1-a0  a2-a0   N/A    a0
1506              *      4       Attr1.x  a1-a0  a2-a0   N/A    a0
1507              *     ...
1508              */
1509             const unsigned param_width = 1;
1510 
1511             /* Size of a single scalar component of a plane parameter
1512              * in bytes.
1513              */
1514             const unsigned chan_sz = 4;
1515             struct elk_reg reg;
1516 
1517             /* Calculate the base register on the thread payload of
1518              * either the block of vertex setup data or the block of
1519              * per-primitive constant data depending on whether we're
1520              * accessing a primitive or vertex input.  Also calculate
1521              * the index of the input within that block.
1522              */
1523             const bool per_prim = inst->src[i].nr < prog_data->num_per_primitive_inputs;
1524             const unsigned base = urb_start +
1525                (per_prim ? 0 :
1526                 ALIGN(prog_data->num_per_primitive_inputs / 2,
1527                       reg_unit(devinfo)));
1528             const unsigned idx = per_prim ? inst->src[i].nr :
1529                inst->src[i].nr - prog_data->num_per_primitive_inputs;
1530 
1531             /* Translate the offset within the param_width-wide
1532              * representation described above into an offset and a
1533              * grf, which contains the plane parameters for the first
1534              * polygon processed by the thread.
1535              *
1536              * Earlier platforms and per-primitive block pack 2 logical
1537              * input components per 32B register.
1538              */
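            /* For illustration, assuming REG_SIZE == 32: a vertex input
             * with idx == 5 and a 4-byte source offset lands in
             * grf == base + 2 at a byte offset (delta) of 1 * 16 + 4 == 20
             * within that GRF.
             */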
1539             const unsigned grf = base + idx / 2;
1540             assert(inst->src[i].offset / param_width < REG_SIZE / 2);
1541             const unsigned delta = (idx % 2) * (REG_SIZE / 2) +
1542                inst->src[i].offset / (param_width * chan_sz) * chan_sz +
1543                inst->src[i].offset % chan_sz;
1544             reg = byte_offset(retype(elk_vec8_grf(grf, 0), inst->src[i].type),
1545                               delta);
1546 
1547             const unsigned width = inst->src[i].stride == 0 ?
1548                1 : MIN2(inst->exec_size, 8);
1549             reg = stride(reg, width * inst->src[i].stride,
1550                          width, inst->src[i].stride);
1551 
1552             reg.abs = inst->src[i].abs;
1553             reg.negate = inst->src[i].negate;
1554             inst->src[i] = reg;
1555          }
1556       }
1557    }
1558 
1559    /* Each attribute is 4 setup channels, each of which is half a reg,
1560     * but they may be replicated multiple times for multipolygon
1561     * dispatch.
1562     */
1563    this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
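   /* E.g. with 8 varying inputs and no multipolygon replication, this
    * advances first_non_payload_grf by 8 * 2 == 16 GRFs of setup data.
    */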
1564 
1565    /* Unlike regular attributes, per-primitive attributes have all 4 channels
1566     * in the same slot, so each GRF can store two slots.
1567     */
1568    assert(prog_data->num_per_primitive_inputs % 2 == 0);
1569    this->first_non_payload_grf += prog_data->num_per_primitive_inputs / 2;
1570 }
1571 
1572 void
1573 elk_fs_visitor::convert_attr_sources_to_hw_regs(elk_fs_inst *inst)
1574 {
1575    for (int i = 0; i < inst->sources; i++) {
1576       if (inst->src[i].file == ATTR) {
1577          assert(inst->src[i].nr == 0);
1578          int grf = payload().num_regs +
1579                    prog_data->curb_read_length +
1580                    inst->src[i].offset / REG_SIZE;
1581 
1582          /* As explained at elk_reg_from_fs_reg, From the Haswell PRM:
1583           *
1584           * VertStride must be used to cross GRF register boundaries. This
1585           * rule implies that elements within a 'Width' cannot cross GRF
1586           * boundaries.
1587           *
1588           * So, for registers that are large enough, we have to split the exec
1589           * size in two and trust the compression state to sort it out.
1590           */
1591          unsigned total_size = inst->exec_size *
1592                                inst->src[i].stride *
1593                                type_sz(inst->src[i].type);
1594 
1595          assert(total_size <= 2 * REG_SIZE);
1596          const unsigned exec_size =
1597             (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
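         /* For illustration: a SIMD16 float source with stride 1 covers
          * 16 * 1 * 4 == 64 bytes, i.e. two GRFs, so exec_size drops from
          * 16 to 8 and the compression state covers the second half.
          */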
1598 
1599          unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
1600          struct elk_reg reg =
1601             stride(byte_offset(retype(elk_vec8_grf(grf, 0), inst->src[i].type),
1602                                inst->src[i].offset % REG_SIZE),
1603                    exec_size * inst->src[i].stride,
1604                    width, inst->src[i].stride);
1605          reg.abs = inst->src[i].abs;
1606          reg.negate = inst->src[i].negate;
1607 
1608          inst->src[i] = reg;
1609       }
1610    }
1611 }
1612 
1613 void
1614 elk_fs_visitor::assign_vs_urb_setup()
1615 {
1616    struct elk_vs_prog_data *vs_prog_data = elk_vs_prog_data(prog_data);
1617 
1618    assert(stage == MESA_SHADER_VERTEX);
1619 
1620    /* Each attribute is 4 regs. */
1621    this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;
1622 
1623    assert(vs_prog_data->base.urb_read_length <= 15);
1624 
1625    /* Rewrite all ATTR file references to the hw grf that they land in. */
1626    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1627       convert_attr_sources_to_hw_regs(inst);
1628    }
1629 }
1630 
1631 void
1632 elk_fs_visitor::assign_tcs_urb_setup()
1633 {
1634    assert(stage == MESA_SHADER_TESS_CTRL);
1635 
1636    /* Rewrite all ATTR file references to HW_REGs. */
1637    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1638       convert_attr_sources_to_hw_regs(inst);
1639    }
1640 }
1641 
1642 void
1643 elk_fs_visitor::assign_tes_urb_setup()
1644 {
1645    assert(stage == MESA_SHADER_TESS_EVAL);
1646 
1647    struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(prog_data);
1648 
1649    first_non_payload_grf += 8 * vue_prog_data->urb_read_length;
1650 
1651    /* Rewrite all ATTR file references to HW_REGs. */
1652    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1653       convert_attr_sources_to_hw_regs(inst);
1654    }
1655 }
1656 
1657 void
1658 elk_fs_visitor::assign_gs_urb_setup()
1659 {
1660    assert(stage == MESA_SHADER_GEOMETRY);
1661 
1662    struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(prog_data);
1663 
1664    first_non_payload_grf +=
1665       8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
1666 
1667    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1668       /* Rewrite all ATTR file references to GRFs. */
1669       convert_attr_sources_to_hw_regs(inst);
1670    }
1671 }
1672 
1673 
1674 /**
1675  * Split large virtual GRFs into separate components if we can.
1676  *
1677  * This pass aggressively splits VGRFs into as small a chunk as possible,
1678  * down to single registers if it can.  If no VGRFs can be split, we return
1679  * false so this pass can safely be used inside an optimization loop.  We
1680  * want to split, because virtual GRFs are what we register allocate and
1681  * spill (due to contiguousness requirements for some instructions), and
1682  * they're what we naturally generate in the codegen process, but most
1683  * virtual GRFs don't actually need to be contiguous sets of GRFs.  If we
1684  * split, we'll end up with reduced live intervals and better dead code
1685  * elimination and coalescing.
1686  */
1687 bool
1688 elk_fs_visitor::split_virtual_grfs()
1689 {
1690    /* Compact the register file so we eliminate dead vgrfs.  This
1691     * only defines split points for live registers, so any overly
1692     * large dead registers left around would hit assertions later.
1693     */
1694    compact_virtual_grfs();
1695 
1696    unsigned num_vars = this->alloc.count;
1697 
1698    /* Count the total number of registers */
1699    unsigned reg_count = 0;
1700    unsigned *vgrf_to_reg = new unsigned[num_vars];
1701    for (unsigned i = 0; i < num_vars; i++) {
1702       vgrf_to_reg[i] = reg_count;
1703       reg_count += alloc.sizes[i];
1704    }
1705 
1706    /* An array of "split points".  For each register slot, this indicates
1707     * if this slot can be separated from the previous slot.  Every time an
1708     * instruction uses multiple elements of a register (as a source or
1709     * destination), we mark the used slots as inseparable.  Then we go
1710     * through and split the registers into the smallest pieces we can.
1711     */
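   /* For illustration: a 4-register VGRF whose registers are only ever
    * accessed one at a time keeps split_points[] set for slots 1..3 and
    * gets split into four single-register VGRFs, whereas a VGRF written
    * by one 4-register instruction has those bits cleared and stays whole.
    */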
1712    bool *split_points = new bool[reg_count];
1713    memset(split_points, 0, reg_count * sizeof(*split_points));
1714 
1715    /* Mark all used registers as fully splittable */
1716    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1717       if (inst->dst.file == VGRF) {
1718          unsigned reg = vgrf_to_reg[inst->dst.nr];
1719          for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
1720             split_points[reg + j] = true;
1721       }
1722 
1723       for (unsigned i = 0; i < inst->sources; i++) {
1724          if (inst->src[i].file == VGRF) {
1725             unsigned reg = vgrf_to_reg[inst->src[i].nr];
1726             for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
1727                split_points[reg + j] = true;
1728          }
1729       }
1730    }
1731 
1732    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1733       /* We fix up undef instructions later */
1734       if (inst->opcode == ELK_SHADER_OPCODE_UNDEF) {
1735          assert(inst->dst.file == VGRF);
1736          continue;
1737       }
1738 
1739       if (inst->dst.file == VGRF) {
1740          unsigned reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
1741          for (unsigned j = 1; j < regs_written(inst); j++)
1742             split_points[reg + j] = false;
1743       }
1744       for (unsigned i = 0; i < inst->sources; i++) {
1745          if (inst->src[i].file == VGRF) {
1746             unsigned reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
1747             for (unsigned j = 1; j < regs_read(inst, i); j++)
1748                split_points[reg + j] = false;
1749          }
1750       }
1751    }
1752 
1753    /* Bitset of which registers have been split */
1754    bool *vgrf_has_split = new bool[num_vars];
1755    memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
1756 
1757    unsigned *new_virtual_grf = new unsigned[reg_count];
1758    unsigned *new_reg_offset = new unsigned[reg_count];
1759 
1760    unsigned reg = 0;
1761    bool has_splits = false;
1762    for (unsigned i = 0; i < num_vars; i++) {
1763       /* The first one should always be 0 as a quick sanity check. */
1764       assert(split_points[reg] == false);
1765 
1766       /* j = 0 case */
1767       new_reg_offset[reg] = 0;
1768       reg++;
1769       unsigned offset = 1;
1770 
1771       /* j > 0 case */
1772       for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1773          /* If this is a split point, reset the offset to 0 and allocate a
1774           * new virtual GRF covering the preceding 'offset' registers.
1775           */
1776          if (split_points[reg]) {
1777             has_splits = true;
1778             vgrf_has_split[i] = true;
1779             assert(offset <= MAX_VGRF_SIZE(devinfo));
1780             unsigned grf = alloc.allocate(offset);
1781             for (unsigned k = reg - offset; k < reg; k++)
1782                new_virtual_grf[k] = grf;
1783             offset = 0;
1784          }
1785          new_reg_offset[reg] = offset;
1786          offset++;
1787          reg++;
1788       }
1789 
1790       /* The last one gets the original register number */
1791       assert(offset <= MAX_VGRF_SIZE(devinfo));
1792       alloc.sizes[i] = offset;
1793       for (unsigned k = reg - offset; k < reg; k++)
1794          new_virtual_grf[k] = i;
1795    }
1796    assert(reg == reg_count);
1797 
1798    bool progress;
1799    if (!has_splits) {
1800       progress = false;
1801       goto cleanup;
1802    }
1803 
1804    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
1805       if (inst->opcode == ELK_SHADER_OPCODE_UNDEF) {
1806          assert(inst->dst.file == VGRF);
1807          if (vgrf_has_split[inst->dst.nr]) {
1808             const fs_builder ibld(this, block, inst);
1809             assert(inst->size_written % REG_SIZE == 0);
1810             unsigned reg_offset = inst->dst.offset / REG_SIZE;
1811             unsigned size_written = 0;
1812             while (size_written < inst->size_written) {
1813                reg = vgrf_to_reg[inst->dst.nr] + reg_offset + size_written / REG_SIZE;
1814                elk_fs_inst *undef =
1815                   ibld.UNDEF(
1816                      byte_offset(elk_fs_reg(VGRF, new_virtual_grf[reg], inst->dst.type),
1817                                  new_reg_offset[reg] * REG_SIZE));
1818                undef->size_written =
1819                   MIN2(inst->size_written - size_written, undef->size_written);
1820                assert(undef->size_written % REG_SIZE == 0);
1821                size_written += undef->size_written;
1822             }
1823             inst->remove(block);
1824          } else {
1825             reg = vgrf_to_reg[inst->dst.nr];
1826             assert(new_reg_offset[reg] == 0);
1827             assert(new_virtual_grf[reg] == inst->dst.nr);
1828          }
1829          continue;
1830       }
1831 
1832       if (inst->dst.file == VGRF) {
1833          reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
1834          if (vgrf_has_split[inst->dst.nr]) {
1835             inst->dst.nr = new_virtual_grf[reg];
1836             inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
1837                                inst->dst.offset % REG_SIZE;
1838             assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1839          } else {
1840             assert(new_reg_offset[reg] == inst->dst.offset / REG_SIZE);
1841             assert(new_virtual_grf[reg] == inst->dst.nr);
1842          }
1843       }
1844       for (unsigned i = 0; i < inst->sources; i++) {
1845 	 if (inst->src[i].file != VGRF)
1846             continue;
1847 
1848          reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
1849          if (vgrf_has_split[inst->src[i].nr]) {
1850             inst->src[i].nr = new_virtual_grf[reg];
1851             inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
1852                                   inst->src[i].offset % REG_SIZE;
1853             assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1854          } else {
1855             assert(new_reg_offset[reg] == inst->src[i].offset / REG_SIZE);
1856             assert(new_virtual_grf[reg] == inst->src[i].nr);
1857          }
1858       }
1859    }
1860    invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
1861 
1862    progress = true;
1863 
1864 cleanup:
1865    delete[] split_points;
1866    delete[] vgrf_has_split;
1867    delete[] new_virtual_grf;
1868    delete[] new_reg_offset;
1869    delete[] vgrf_to_reg;
1870 
1871    return progress;
1872 }
1873 
1874 /**
1875  * Remove unused virtual GRFs and compact the vgrf_* arrays.
1876  *
1877  * During code generation, we create tons of temporary variables, many of
1878  * which get immediately killed and are never used again.  Yet, in later
1879  * optimization and analysis passes, such as compute_live_intervals, we need
1880  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1881  * overhead.
1882  */
1883 bool
1884 elk_fs_visitor::compact_virtual_grfs()
1885 {
1886    bool progress = false;
1887    int *remap_table = new int[this->alloc.count];
1888    memset(remap_table, -1, this->alloc.count * sizeof(int));
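   /* remap_table[i] == -1 means VGRF i is unused; any other value is the
    * compacted index it will be renumbered to.  E.g. if VGRFs 0 and 2 are
    * used but 1 is not, the table ends up as { 0, -1, 1 }.
    */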
1889 
1890    /* Mark which virtual GRFs are used. */
1891    foreach_block_and_inst(block, const elk_fs_inst, inst, cfg) {
1892       if (inst->dst.file == VGRF)
1893          remap_table[inst->dst.nr] = 0;
1894 
1895       for (int i = 0; i < inst->sources; i++) {
1896          if (inst->src[i].file == VGRF)
1897             remap_table[inst->src[i].nr] = 0;
1898       }
1899    }
1900 
1901    /* Compact the GRF arrays. */
1902    int new_index = 0;
1903    for (unsigned i = 0; i < this->alloc.count; i++) {
1904       if (remap_table[i] == -1) {
1905          /* We just found an unused register.  This means that we are
1906           * actually going to compact something.
1907           */
1908          progress = true;
1909       } else {
1910          remap_table[i] = new_index;
1911          alloc.sizes[new_index] = alloc.sizes[i];
1912          invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
1913          ++new_index;
1914       }
1915    }
1916 
1917    this->alloc.count = new_index;
1918 
1919    /* Patch all the instructions to use the newly renumbered registers */
1920    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1921       if (inst->dst.file == VGRF)
1922          inst->dst.nr = remap_table[inst->dst.nr];
1923 
1924       for (int i = 0; i < inst->sources; i++) {
1925          if (inst->src[i].file == VGRF)
1926             inst->src[i].nr = remap_table[inst->src[i].nr];
1927       }
1928    }
1929 
1930    /* Patch all the references to delta_xy, since they're used in register
1931     * allocation.  If they're unused, switch them to BAD_FILE so we don't
1932     * think some random VGRF is delta_xy.
1933     */
1934    for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
1935       if (delta_xy[i].file == VGRF) {
1936          if (remap_table[delta_xy[i].nr] != -1) {
1937             delta_xy[i].nr = remap_table[delta_xy[i].nr];
1938          } else {
1939             delta_xy[i].file = BAD_FILE;
1940          }
1941       }
1942    }
1943 
1944    delete[] remap_table;
1945 
1946    return progress;
1947 }
1948 
1949 int
1950 elk_get_subgroup_id_param_index(const intel_device_info *devinfo,
1951                                 const elk_stage_prog_data *prog_data)
1952 {
1953    if (prog_data->nr_params == 0)
1954       return -1;
1955 
1956    /* The local thread id is always the last parameter in the list */
1957    uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
1958    if (last_param == ELK_PARAM_BUILTIN_SUBGROUP_ID)
1959       return prog_data->nr_params - 1;
1960 
1961    return -1;
1962 }
1963 
1964 /**
1965  * Assign UNIFORM file registers to either push constants or pull constants.
1966  *
1967  * We allow a fragment shader to have more than the specified
1968  * minimum-maximum number of fragment shader uniform components (64).
1969  * If there are too many of these, they'd fill up all of the register space.
1970  * So, this will push some of them out to the pull constant buffer and
1971  * update the program to load them.
1972  */
1973 void
1974 elk_fs_visitor::assign_constant_locations()
1975 {
1976    /* Only the first compile gets to decide on locations. */
1977    if (push_constant_loc)
1978       return;
1979 
1980    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1981    for (unsigned u = 0; u < uniforms; u++)
1982       push_constant_loc[u] = u;
1983 
1984    /* Now that we know how many regular uniforms we'll push, reduce the
1985     * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
1986     */
1987    /* For gen4/5:
1988     * Only allow 16 registers (128 uniform components) as push constants.
1989     *
1990     * If changing this value, note the limitation about total_regs in
1991     * elk_curbe.c/crocus_state.c
1992     */
1993    const unsigned max_push_length = compiler->devinfo->ver < 6 ? 16 : 64;
1994    unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
1995    for (int i = 0; i < 4; i++) {
1996       struct elk_ubo_range *range = &prog_data->ubo_ranges[i];
1997 
1998       if (push_length + range->length > max_push_length)
1999          range->length = max_push_length - push_length;
2000 
2001       push_length += range->length;
2002    }
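   /* For illustration: on Gfx6+ (max_push_length == 64), if regular
    * uniforms already take 60 registers and the first UBO range wants 6,
    * it gets trimmed to 4 so the total stays at 64, and any remaining
    * ranges are trimmed to 0.
    */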
2003    assert(push_length <= max_push_length);
2004 }
2005 
2006 bool
2007 elk_fs_visitor::get_pull_locs(const elk_fs_reg &src,
2008                           unsigned *out_surf_index,
2009                           unsigned *out_pull_index)
2010 {
2011    assert(src.file == UNIFORM);
2012 
2013    if (src.nr < UBO_START)
2014       return false;
2015 
2016    const struct elk_ubo_range *range =
2017       &prog_data->ubo_ranges[src.nr - UBO_START];
2018 
2019    /* If this access is in our (reduced) range, use the push data. */
2020    if (src.offset / 32 < range->length)
2021       return false;
2022 
2023    *out_surf_index = range->block;
2024    *out_pull_index = (32 * range->start + src.offset) / 4;
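   /* For illustration: with range->start == 2 and src.offset == 16 this
    * yields *out_pull_index == (32 * 2 + 16) / 4 == 20, a dword-granular
    * offset consumed by the pull constant load emitted later.
    */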
2025 
2026    prog_data->has_ubo_pull = true;
2027 
2028    return true;
2029 }
2030 
2031 /**
2032  * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2033  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2034  */
2035 bool
2036 elk_fs_visitor::lower_constant_loads()
2037 {
2038    unsigned index, pull_index;
2039    bool progress = false;
2040 
2041    foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
2042       /* Set up the annotation tracking for new generated instructions. */
2043       const fs_builder ibld(this, block, inst);
2044 
2045       for (int i = 0; i < inst->sources; i++) {
2046 	 if (inst->src[i].file != UNIFORM)
2047 	    continue;
2048 
2049          /* We'll handle this case later */
2050          if (inst->opcode == ELK_SHADER_OPCODE_MOV_INDIRECT && i == 0)
2051             continue;
2052 
2053          if (!get_pull_locs(inst->src[i], &index, &pull_index))
2054 	    continue;
2055 
2056          assert(inst->src[i].stride == 0);
2057 
2058          const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
2059          const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
2060          const elk_fs_reg dst = ubld.vgrf(ELK_REGISTER_TYPE_UD);
2061          const unsigned base = pull_index * 4;
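         /* For illustration: with pull_index == 20, base is 80 bytes, the
          * aligned cacheline fetched below starts at 80 & ~63 == 64, and
          * the rewritten source reads from byte 16 of that block.
          */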
2062 
2063          elk_fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
2064          srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = elk_imm_ud(index);
2065          srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET]  = elk_imm_ud(base & ~(block_sz - 1));
2066          srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE]    = elk_imm_ud(block_sz);
2067 
2068 
2069          ubld.emit(ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst,
2070                    srcs, PULL_UNIFORM_CONSTANT_SRCS);
2071 
2072          /* Rewrite the instruction to use the temporary VGRF. */
2073          inst->src[i].file = VGRF;
2074          inst->src[i].nr = dst.nr;
2075          inst->src[i].offset = (base & (block_sz - 1)) +
2076                                inst->src[i].offset % 4;
2077 
2078          progress = true;
2079       }
2080 
2081       if (inst->opcode == ELK_SHADER_OPCODE_MOV_INDIRECT &&
2082           inst->src[0].file == UNIFORM) {
2083 
2084          if (!get_pull_locs(inst->src[0], &index, &pull_index))
2085             continue;
2086 
2087          VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
2088                                     elk_imm_ud(index),
2089                                     elk_fs_reg() /* surface_handle */,
2090                                     inst->src[1],
2091                                     pull_index * 4, 4, 1);
2092          inst->remove(block);
2093 
2094          progress = true;
2095       }
2096    }
2097    invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
2098 
2099    return progress;
2100 }
2101 
2102 static uint64_t
2103 src_as_uint(const elk_fs_reg &src)
2104 {
2105    assert(src.file == IMM);
2106 
2107    switch (src.type) {
2108    case ELK_REGISTER_TYPE_W:
2109       return (uint64_t)(int16_t)(src.ud & 0xffff);
2110 
2111    case ELK_REGISTER_TYPE_UW:
2112       return (uint64_t)(uint16_t)(src.ud & 0xffff);
2113 
2114    case ELK_REGISTER_TYPE_D:
2115       return (uint64_t)src.d;
2116 
2117    case ELK_REGISTER_TYPE_UD:
2118       return (uint64_t)src.ud;
2119 
2120    case ELK_REGISTER_TYPE_Q:
2121       return src.d64;
2122 
2123    case ELK_REGISTER_TYPE_UQ:
2124       return src.u64;
2125 
2126    default:
2127       unreachable("Invalid integer type.");
2128    }
2129 }
2130 
2131 static elk_fs_reg
2132 elk_imm_for_type(uint64_t value, enum elk_reg_type type)
2133 {
2134    switch (type) {
2135    case ELK_REGISTER_TYPE_W:
2136       return elk_imm_w(value);
2137 
2138    case ELK_REGISTER_TYPE_UW:
2139       return elk_imm_uw(value);
2140 
2141    case ELK_REGISTER_TYPE_D:
2142       return elk_imm_d(value);
2143 
2144    case ELK_REGISTER_TYPE_UD:
2145       return elk_imm_ud(value);
2146 
2147    case ELK_REGISTER_TYPE_Q:
2148       return elk_imm_d(value);
2149 
2150    case ELK_REGISTER_TYPE_UQ:
2151       return elk_imm_uq(value);
2152 
2153    default:
2154       unreachable("Invalid integer type.");
2155    }
2156 }
2157 
2158 bool
2159 elk_fs_visitor::opt_algebraic()
2160 {
2161    bool progress = false;
2162 
2163    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
2164       switch (inst->opcode) {
2165       case ELK_OPCODE_MOV:
2166          if (!devinfo->has_64bit_float &&
2167              inst->dst.type == ELK_REGISTER_TYPE_DF) {
2168             assert(inst->dst.type == inst->src[0].type);
2169             assert(!inst->saturate);
2170             assert(!inst->src[0].abs);
2171             assert(!inst->src[0].negate);
2172             const elk::fs_builder ibld(this, block, inst);
2173 
2174             if (!inst->is_partial_write())
2175                ibld.emit_undef_for_dst(inst);
2176 
2177             ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_F, 1),
2178                      subscript(inst->src[0], ELK_REGISTER_TYPE_F, 1));
2179             ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_F, 0),
2180                      subscript(inst->src[0], ELK_REGISTER_TYPE_F, 0));
2181 
2182             inst->remove(block);
2183             progress = true;
2184          }
2185 
2186          if (!devinfo->has_64bit_int &&
2187              (inst->dst.type == ELK_REGISTER_TYPE_UQ ||
2188               inst->dst.type == ELK_REGISTER_TYPE_Q)) {
2189             assert(inst->dst.type == inst->src[0].type);
2190             assert(!inst->saturate);
2191             assert(!inst->src[0].abs);
2192             assert(!inst->src[0].negate);
2193             const elk::fs_builder ibld(this, block, inst);
2194 
2195             if (!inst->is_partial_write())
2196                ibld.emit_undef_for_dst(inst);
2197 
2198             ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 1),
2199                      subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 1));
2200             ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 0),
2201                      subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0));
2202 
2203             inst->remove(block);
2204             progress = true;
2205          }
2206 
2207          if ((inst->conditional_mod == ELK_CONDITIONAL_Z ||
2208               inst->conditional_mod == ELK_CONDITIONAL_NZ) &&
2209              inst->dst.is_null() &&
2210              (inst->src[0].abs || inst->src[0].negate)) {
2211             inst->src[0].abs = false;
2212             inst->src[0].negate = false;
2213             progress = true;
2214             break;
2215          }
2216 
2217          if (inst->src[0].file != IMM)
2218             break;
2219 
2220          if (inst->saturate) {
2221             /* Full mixed-type saturates don't happen.  However, we can end up
2222              * with things like:
2223              *
2224              *    mov.sat(8) g21<1>DF       -1F
2225              *
2226              * Other mixed-size-but-same-base-type cases may also be possible.
2227              */
2228             if (inst->dst.type != inst->src[0].type &&
2229                 inst->dst.type != ELK_REGISTER_TYPE_DF &&
2230                 inst->src[0].type != ELK_REGISTER_TYPE_F)
2231                assert(!"unimplemented: saturate mixed types");
2232 
2233             if (elk_saturate_immediate(inst->src[0].type,
2234                                        &inst->src[0].as_elk_reg())) {
2235                inst->saturate = false;
2236                progress = true;
2237             }
2238          }
2239          break;
2240 
2241       case ELK_OPCODE_MUL:
2242          if (inst->src[1].file != IMM)
2243             continue;
2244 
2245          if (elk_reg_type_is_floating_point(inst->src[1].type))
2246             break;
2247 
2248          /* From the BDW PRM, Vol 2a, "mul - Multiply":
2249           *
2250           *    "When multiplying integer datatypes, if src0 is DW and src1
2251           *    is W, irrespective of the destination datatype, the
2252           *    accumulator maintains full 48-bit precision."
2253           *    ...
2254           *    "When multiplying integer data types, if one of the sources
2255           *    is a DW, the resulting full precision data is stored in
2256           *    the accumulator."
2257           *
2258           * There are also similar notes in earlier PRMs.
2259           *
2260           * The MOV instruction can copy the bits of the source, but it
2261           * does not clear the higher bits of the accumulator. So, because
2262           * we might use the full accumulator in the MUL/MACH macro, we
2263           * shouldn't replace such MULs with MOVs.
2264           */
2265          if ((elk_reg_type_to_size(inst->src[0].type) == 4 ||
2266               elk_reg_type_to_size(inst->src[1].type) == 4) &&
2267              (inst->dst.is_accumulator() ||
2268               inst->writes_accumulator_implicitly(devinfo)))
2269             break;
2270 
2271          /* a * 1.0 = a */
2272          if (inst->src[1].is_one()) {
2273             inst->opcode = ELK_OPCODE_MOV;
2274             inst->sources = 1;
2275             inst->src[1] = reg_undef;
2276             progress = true;
2277             break;
2278          }
2279 
2280          /* a * -1.0 = -a */
2281          if (inst->src[1].is_negative_one()) {
2282             inst->opcode = ELK_OPCODE_MOV;
2283             inst->sources = 1;
2284             inst->src[0].negate = !inst->src[0].negate;
2285             inst->src[1] = reg_undef;
2286             progress = true;
2287             break;
2288          }
2289 
2290          break;
2291       case ELK_OPCODE_ADD:
2292          if (inst->src[1].file != IMM)
2293             continue;
2294 
2295          if (elk_reg_type_is_integer(inst->src[1].type) &&
2296              inst->src[1].is_zero()) {
2297             inst->opcode = ELK_OPCODE_MOV;
2298             inst->sources = 1;
2299             inst->src[1] = reg_undef;
2300             progress = true;
2301             break;
2302          }
2303 
2304          if (inst->src[0].file == IMM) {
2305             assert(inst->src[0].type == ELK_REGISTER_TYPE_F);
2306             inst->opcode = ELK_OPCODE_MOV;
2307             inst->sources = 1;
2308             inst->src[0].f += inst->src[1].f;
2309             inst->src[1] = reg_undef;
2310             progress = true;
2311             break;
2312          }
2313          break;
2314 
2315       case ELK_OPCODE_AND:
2316          if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
2317             const uint64_t src0 = src_as_uint(inst->src[0]);
2318             const uint64_t src1 = src_as_uint(inst->src[1]);
2319 
2320             inst->opcode = ELK_OPCODE_MOV;
2321             inst->sources = 1;
2322             inst->src[0] = elk_imm_for_type(src0 & src1, inst->dst.type);
2323             inst->src[1] = reg_undef;
2324             progress = true;
2325             break;
2326          }
2327 
2328          break;
2329 
2330       case ELK_OPCODE_OR:
2331          if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
2332             const uint64_t src0 = src_as_uint(inst->src[0]);
2333             const uint64_t src1 = src_as_uint(inst->src[1]);
2334 
2335             inst->opcode = ELK_OPCODE_MOV;
2336             inst->sources = 1;
2337             inst->src[0] = elk_imm_for_type(src0 | src1, inst->dst.type);
2338             inst->src[1] = reg_undef;
2339             progress = true;
2340             break;
2341          }
2342 
2343          if (inst->src[0].equals(inst->src[1]) ||
2344              inst->src[1].is_zero()) {
2345             /* On Gfx8+, the OR instruction can have a source modifier that
2346              * performs logical not on the operand.  Cases of 'OR r0, ~r1, 0'
2347              * or 'OR r0, ~r1, ~r1' should become a NOT instead of a MOV.
2348              */
2349             if (inst->src[0].negate) {
2350                inst->opcode = ELK_OPCODE_NOT;
2351                inst->sources = 1;
2352                inst->src[0].negate = false;
2353             } else {
2354                inst->opcode = ELK_OPCODE_MOV;
2355                inst->sources = 1;
2356             }
2357             inst->src[1] = reg_undef;
2358             progress = true;
2359             break;
2360          }
2361          break;
2362       case ELK_OPCODE_CMP:
2363          if ((inst->conditional_mod == ELK_CONDITIONAL_Z ||
2364               inst->conditional_mod == ELK_CONDITIONAL_NZ) &&
2365              inst->src[1].is_zero() &&
2366              (inst->src[0].abs || inst->src[0].negate)) {
2367             inst->src[0].abs = false;
2368             inst->src[0].negate = false;
2369             progress = true;
2370             break;
2371          }
2372          break;
2373       case ELK_OPCODE_SEL:
2374          if (!devinfo->has_64bit_float &&
2375              !devinfo->has_64bit_int &&
2376              (inst->dst.type == ELK_REGISTER_TYPE_DF ||
2377               inst->dst.type == ELK_REGISTER_TYPE_UQ ||
2378               inst->dst.type == ELK_REGISTER_TYPE_Q)) {
2379             assert(inst->dst.type == inst->src[0].type);
2380             assert(!inst->saturate);
2381             assert(!inst->src[0].abs && !inst->src[0].negate);
2382             assert(!inst->src[1].abs && !inst->src[1].negate);
2383             const elk::fs_builder ibld(this, block, inst);
2384 
2385             if (!inst->is_partial_write())
2386                ibld.emit_undef_for_dst(inst);
2387 
2388             set_predicate(inst->predicate,
2389                           ibld.SEL(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 0),
2390                                    subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
2391                                    subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0)));
2392             set_predicate(inst->predicate,
2393                           ibld.SEL(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 1),
2394                                    subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 1),
2395                                    subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 1)));
2396 
2397             inst->remove(block);
2398             progress = true;
2399          }
2400          if (inst->src[0].equals(inst->src[1])) {
2401             inst->opcode = ELK_OPCODE_MOV;
2402             inst->sources = 1;
2403             inst->src[1] = reg_undef;
2404             inst->predicate = ELK_PREDICATE_NONE;
2405             inst->predicate_inverse = false;
2406             progress = true;
2407          } else if (inst->saturate && inst->src[1].file == IMM) {
2408             switch (inst->conditional_mod) {
2409             case ELK_CONDITIONAL_LE:
2410             case ELK_CONDITIONAL_L:
2411                switch (inst->src[1].type) {
2412                case ELK_REGISTER_TYPE_F:
2413                   if (inst->src[1].f >= 1.0f) {
2414                      inst->opcode = ELK_OPCODE_MOV;
2415                      inst->sources = 1;
2416                      inst->src[1] = reg_undef;
2417                      inst->conditional_mod = ELK_CONDITIONAL_NONE;
2418                      progress = true;
2419                   }
2420                   break;
2421                default:
2422                   break;
2423                }
2424                break;
2425             case ELK_CONDITIONAL_GE:
2426             case ELK_CONDITIONAL_G:
2427                switch (inst->src[1].type) {
2428                case ELK_REGISTER_TYPE_F:
2429                   if (inst->src[1].f <= 0.0f) {
2430                      inst->opcode = ELK_OPCODE_MOV;
2431                      inst->sources = 1;
2432                      inst->src[1] = reg_undef;
2433                      inst->conditional_mod = ELK_CONDITIONAL_NONE;
2434                      progress = true;
2435                   }
2436                   break;
2437                default:
2438                   break;
2439                }
2440             default:
2441                break;
2442             }
2443          }
2444          break;
2445       case ELK_OPCODE_MAD:
2446          if (inst->src[0].type != ELK_REGISTER_TYPE_F ||
2447              inst->src[1].type != ELK_REGISTER_TYPE_F ||
2448              inst->src[2].type != ELK_REGISTER_TYPE_F)
2449             break;
2450          if (inst->src[1].is_one()) {
2451             inst->opcode = ELK_OPCODE_ADD;
2452             inst->sources = 2;
2453             inst->src[1] = inst->src[2];
2454             inst->src[2] = reg_undef;
2455             progress = true;
2456          } else if (inst->src[2].is_one()) {
2457             inst->opcode = ELK_OPCODE_ADD;
2458             inst->sources = 2;
2459             inst->src[2] = reg_undef;
2460             progress = true;
2461          }
2462          break;
2463       case ELK_OPCODE_SHL:
2464          if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
2465             /* It's not currently possible to generate this, and this constant
2466              * folding does not handle it.
2467              */
2468             assert(!inst->saturate);
2469 
2470             elk_fs_reg result;
2471 
2472             switch (type_sz(inst->src[0].type)) {
2473             case 2:
2474                result = elk_imm_uw(0x0ffff & (inst->src[0].ud << (inst->src[1].ud & 0x1f)));
2475                break;
2476             case 4:
2477                result = elk_imm_ud(inst->src[0].ud << (inst->src[1].ud & 0x1f));
2478                break;
2479             case 8:
2480                result = elk_imm_uq(inst->src[0].u64 << (inst->src[1].ud & 0x3f));
2481                break;
2482             default:
2483                /* Just in case a future platform re-enables B or UB types. */
2484                unreachable("Invalid source size.");
2485             }
2486 
2487             inst->opcode = ELK_OPCODE_MOV;
2488             inst->src[0] = retype(result, inst->dst.type);
2489             inst->src[1] = reg_undef;
2490             inst->sources = 1;
2491 
2492             progress = true;
2493          }
2494          break;
2495 
2496       case ELK_SHADER_OPCODE_BROADCAST:
2497          if (is_uniform(inst->src[0])) {
2498             inst->opcode = ELK_OPCODE_MOV;
2499             inst->sources = 1;
2500             inst->force_writemask_all = true;
2501             progress = true;
2502          } else if (inst->src[1].file == IMM) {
2503             inst->opcode = ELK_OPCODE_MOV;
2504             /* It's possible that the selected component will be too large and
2505              * overflow the register.  This can happen if someone does a
2506              * readInvocation() from GLSL or SPIR-V and provides an OOB
2507              * invocationIndex.  If this happens and we somehow manage
2508              * to constant fold it in and get here, then component() may cause
2509              * us to start reading outside of the VGRF which will lead to an
2510              * assert later.  Instead, just let it wrap around if it goes over
2511              * exec_size.
2512              */
2513             const unsigned comp = inst->src[1].ud & (inst->exec_size - 1);
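            /* E.g. with exec_size == 16 an out-of-bounds index of 35 wraps
             * to component 35 & 15 == 3 instead of reading past the VGRF.
             */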
2514             inst->src[0] = component(inst->src[0], comp);
2515             inst->sources = 1;
2516             inst->force_writemask_all = true;
2517             progress = true;
2518          }
2519          break;
2520 
2521       case ELK_SHADER_OPCODE_SHUFFLE:
2522          if (is_uniform(inst->src[0])) {
2523             inst->opcode = ELK_OPCODE_MOV;
2524             inst->sources = 1;
2525             progress = true;
2526          } else if (inst->src[1].file == IMM) {
2527             inst->opcode = ELK_OPCODE_MOV;
2528             inst->src[0] = component(inst->src[0],
2529                                      inst->src[1].ud);
2530             inst->sources = 1;
2531             progress = true;
2532          }
2533          break;
2534 
2535       default:
2536 	 break;
2537       }
2538 
2539       /* Ensure that the correct source has the immediate value. 2-source
2540        * instructions must have the immediate in src[1]. On Gfx12 and later,
2541        * some 3-source instructions can have the immediate in src[0] or
2542        * src[2]. It's complicated, so don't mess with 3-source instructions
2543        * here.
2544        */
2545       if (progress && inst->sources == 2 && inst->is_commutative()) {
2546          if (inst->src[0].file == IMM) {
2547             elk_fs_reg tmp = inst->src[1];
2548             inst->src[1] = inst->src[0];
2549             inst->src[0] = tmp;
2550          }
2551       }
2552    }
2553 
2554    if (progress)
2555       invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
2556                           DEPENDENCY_INSTRUCTION_DETAIL);
2557 
2558    return progress;
2559 }
2560 
2561 static unsigned
2562 load_payload_sources_read_for_size(elk_fs_inst *lp, unsigned size_read)
2563 {
2564    assert(lp->opcode == ELK_SHADER_OPCODE_LOAD_PAYLOAD);
2565    assert(size_read >= lp->header_size * REG_SIZE);
2566 
2567    unsigned i;
2568    unsigned size = lp->header_size * REG_SIZE;
2569    for (i = lp->header_size; size < size_read && i < lp->sources; i++)
2570       size += lp->exec_size * type_sz(lp->src[i].type);
2571 
2572    /* Size read must cover exactly a subset of sources. */
2573    assert(size == size_read);
2574    return i;
2575 }
2576 
2577 /**
2578  * Optimize sample messages that have constant zero values for the trailing
2579  * parameters. We can just reduce the message length for these
2580  * instructions instead of reserving registers for them.  Trailing parameters
2581  * that aren't sent default to zero anyway. This will cause the dead code
2582  * eliminator to remove the MOV instruction that would otherwise be emitted to
2583  * set up the zero value.
2584  */
2585 bool
2586 elk_fs_visitor::opt_zero_samples()
2587 {
2588    /* Implementation supports only SENDs, so applicable to Gfx7+ only. */
2589    assert(devinfo->ver >= 7);
2590 
2591    bool progress = false;
2592 
2593    foreach_block_and_inst(block, elk_fs_inst, send, cfg) {
2594       if (send->opcode != ELK_SHADER_OPCODE_SEND ||
2595           send->sfid != ELK_SFID_SAMPLER)
2596          continue;
2597 
2598       /* Wa_14012688258:
2599        *
2600        * Don't trim zeros at the end of payload for sample operations
2601        * in cube and cube arrays.
2602        */
2603       if (send->keep_payload_trailing_zeros)
2604          continue;
2605 
2606       elk_fs_inst *lp = (elk_fs_inst *) send->prev;
2607 
2608       if (lp->is_head_sentinel() || lp->opcode != ELK_SHADER_OPCODE_LOAD_PAYLOAD)
2609          continue;
2610 
2611       /* How much of the payload is actually read by this SEND. */
2612       const unsigned params =
2613          load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);
2614 
2615       /* We don't want to remove the message header or the first parameter.
2616        * Removing the first parameter is not allowed, see the Haswell PRM
2617        * volume 7, page 149:
2618        *
2619        *     "Parameter 0 is required except for the sampleinfo message, which
2620        *      has no parameter 0"
2621        */
2622       const unsigned first_param_idx = lp->header_size;
2623       unsigned zero_size = 0;
2624       for (unsigned i = params - 1; i > first_param_idx; i--) {
2625          if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
2626             break;
2627          zero_size += lp->exec_size * type_sz(lp->src[i].type) * lp->dst.stride;
2628       }
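      /* For illustration, assuming a destination stride of 1: a SIMD8
       * payload whose last two float parameters are immediate zeros
       * accumulates zero_size == 2 * 8 * 4 == 64 bytes, i.e. two GRFs
       * that can be dropped from mlen below.
       */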
2629 
2630       const unsigned zero_len = zero_size / (reg_unit(devinfo) * REG_SIZE);
2631       if (zero_len > 0) {
2632          send->mlen -= zero_len;
2633          progress = true;
2634       }
2635    }
2636 
2637    if (progress)
2638       invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
2639 
2640    return progress;
2641 }
2642 
2643 /**
2644  * Remove redundant or useless halts.
2645  *
2646  * For example, we can eliminate halts in the following sequence:
2647  *
2648  * halt        (redundant with the next halt)
2649  * halt        (useless; jumps to the next instruction)
2650  * halt-target
2651  */
2652 bool
2653 elk_fs_visitor::opt_redundant_halt()
2654 {
2655    bool progress = false;
2656 
2657    unsigned halt_count = 0;
2658    elk_fs_inst *halt_target = NULL;
2659    elk_bblock_t *halt_target_block = NULL;
2660    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
2661       if (inst->opcode == ELK_OPCODE_HALT)
2662          halt_count++;
2663 
2664       if (inst->opcode == ELK_SHADER_OPCODE_HALT_TARGET) {
2665          halt_target = inst;
2666          halt_target_block = block;
2667          break;
2668       }
2669    }
2670 
2671    if (!halt_target) {
2672       assert(halt_count == 0);
2673       return false;
2674    }
2675 
2676    /* Delete any HALTs immediately before the halt target. */
2677    for (elk_fs_inst *prev = (elk_fs_inst *) halt_target->prev;
2678         !prev->is_head_sentinel() && prev->opcode == ELK_OPCODE_HALT;
2679         prev = (elk_fs_inst *) halt_target->prev) {
2680       prev->remove(halt_target_block);
2681       halt_count--;
2682       progress = true;
2683    }
2684 
2685    if (halt_count == 0) {
2686       halt_target->remove(halt_target_block);
2687       progress = true;
2688    }
2689 
2690    if (progress)
2691       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
2692 
2693    return progress;
2694 }
2695 
2696 /**
2697  * Compute a bitmask with GRF granularity with a bit set for each GRF starting
2698  * from \p r.offset which overlaps the region starting at \p s.offset and
2699  * spanning \p ds bytes.
2700  */
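/* For illustration, with REG_SIZE == 32: if s starts 32 bytes past r and
 * spans 40 bytes, the overlap covers GRFs 1 and 2 relative to r, so the
 * returned mask is 0b110.
 */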
2701 static inline unsigned
2702 mask_relative_to(const elk_fs_reg &r, const elk_fs_reg &s, unsigned ds)
2703 {
2704    const int rel_offset = reg_offset(s) - reg_offset(r);
2705    const int shift = rel_offset / REG_SIZE;
2706    const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE);
2707    assert(reg_space(r) == reg_space(s) &&
2708           shift >= 0 && shift < int(8 * sizeof(unsigned)));
2709    return ((1 << n) - 1) << shift;
2710 }
2711 
2712 bool
2713 elk_fs_visitor::compute_to_mrf()
2714 {
2715    bool progress = false;
2716    int next_ip = 0;
2717 
2718    /* No MRFs on Gen >= 7. */
2719    if (devinfo->ver >= 7)
2720       return false;
2721 
2722    const fs_live_variables &live = live_analysis.require();
2723 
2724    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
2725       int ip = next_ip;
2726       next_ip++;
2727 
2728       if (inst->opcode != ELK_OPCODE_MOV ||
2729 	  inst->is_partial_write() ||
2730 	  inst->dst.file != MRF || inst->src[0].file != VGRF ||
2731 	  inst->dst.type != inst->src[0].type ||
2732 	  inst->src[0].abs || inst->src[0].negate ||
2733           !inst->src[0].is_contiguous() ||
2734           inst->src[0].offset % REG_SIZE != 0)
2735 	 continue;
2736 
2737       /* Can't compute-to-MRF this GRF if someone else was going to
2738        * read it later.
2739        */
2740       if (live.vgrf_end[inst->src[0].nr] > ip)
2741 	 continue;
2742 
2743       /* Found a move of a GRF to an MRF.  Let's see if we can rewrite the
2744        * instructions that computed the value of all GRFs of the source region.  The
2745        * regs_left bitset keeps track of the registers we haven't yet found a
2746        * generating instruction for.
2747        */
2748       unsigned regs_left = (1 << regs_read(inst, 0)) - 1;
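      /* E.g. a source region spanning two GRFs starts with
       * regs_left == 0b11; each bit is cleared once the instruction that
       * generates the corresponding GRF has been found.
       */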
2749 
2750       foreach_inst_in_block_reverse_starting_from(elk_fs_inst, scan_inst, inst) {
2751          if (regions_overlap(scan_inst->dst, scan_inst->size_written,
2752                              inst->src[0], inst->size_read(0))) {
2753 	    /* Found the last thing to write our reg we want to turn
2754 	     * into a compute-to-MRF.
2755 	     */
2756 
2757 	    /* If this one instruction didn't populate all the
2758 	     * channels, bail.  We might be able to rewrite everything
2759 	     * that writes that reg, but it would require smarter
2760 	     * tracking.
2761 	     */
2762 	    if (scan_inst->is_partial_write())
2763 	       break;
2764 
2765             /* Handling things not fully contained in the source of the copy
2766              * would need us to understand coalescing out more than one MOV at
2767              * a time.
2768              */
2769             if (!region_contained_in(scan_inst->dst, scan_inst->size_written,
2770                                      inst->src[0], inst->size_read(0)))
2771                break;
2772 
2773 	    /* SEND instructions can't have MRF as a destination. */
2774 	    if (scan_inst->mlen)
2775 	       break;
2776 
2777 	    if (devinfo->ver == 6) {
2778 	       /* gfx6 math instructions must have the destination be
2779 		* GRF, so no compute-to-MRF for them.
2780 		*/
2781 	       if (scan_inst->is_math()) {
2782 		  break;
2783 	       }
2784 	    }
2785 
2786             /* Clear the bits for any registers this instruction overwrites. */
2787             regs_left &= ~mask_relative_to(
2788                inst->src[0], scan_inst->dst, scan_inst->size_written);
2789             if (!regs_left)
2790                break;
2791 	 }
2792 
2793 	 /* We don't handle control flow here.  Most computation of
2794 	  * values that end up in MRFs happens shortly before the MRF
2795 	  * write anyway.
2796 	  */
2797 	 if (block->start() == scan_inst)
2798 	    break;
2799 
2800 	 /* You can't read from an MRF, so if someone else reads our
2801 	  * MRF's source GRF that we wanted to rewrite, that stops us.
2802 	  */
2803 	 bool interfered = false;
2804 	 for (int i = 0; i < scan_inst->sources; i++) {
2805             if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i),
2806                                 inst->src[0], inst->size_read(0))) {
2807 	       interfered = true;
2808 	    }
2809 	 }
2810 	 if (interfered)
2811 	    break;
2812 
2813          if (regions_overlap(scan_inst->dst, scan_inst->size_written,
2814                              inst->dst, inst->size_written)) {
2815 	    /* If somebody else writes our MRF here, we can't
2816 	     * compute-to-MRF before that.
2817 	     */
2818             break;
2819          }
2820 
2821          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 &&
2822              regions_overlap(elk_fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE,
2823                              inst->dst, inst->size_written)) {
2824 	    /* Found a SEND instruction, which means that there are
2825 	     * live values in MRFs from base_mrf to base_mrf +
2826 	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2827 	     * above it.
2828 	     */
2829             break;
2830          }
2831       }
2832 
2833       if (regs_left)
2834          continue;
2835 
2836       /* Found all generating instructions of our MRF's source value, so it
2837        * should be safe to rewrite them to point to the MRF directly.
2838        */
2839       regs_left = (1 << regs_read(inst, 0)) - 1;
2840 
2841       foreach_inst_in_block_reverse_starting_from(elk_fs_inst, scan_inst, inst) {
2842          if (regions_overlap(scan_inst->dst, scan_inst->size_written,
2843                              inst->src[0], inst->size_read(0))) {
2844             /* Clear the bits for any registers this instruction overwrites. */
2845             regs_left &= ~mask_relative_to(
2846                inst->src[0], scan_inst->dst, scan_inst->size_written);
2847 
2848             const unsigned rel_offset = reg_offset(scan_inst->dst) -
2849                                         reg_offset(inst->src[0]);
2850 
2851             if (inst->dst.nr & ELK_MRF_COMPR4) {
2852                /* Apply the same address transformation done by the hardware
2853                 * for COMPR4 MRF writes.
2854                 */
2855                assert(rel_offset < 2 * REG_SIZE);
2856                scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4;
2857 
2858                /* Clear the COMPR4 bit if the generating instruction is not
2859                 * compressed.
2860                 */
2861                if (scan_inst->size_written < 2 * REG_SIZE)
2862                   scan_inst->dst.nr &= ~ELK_MRF_COMPR4;
2863 
2864             } else {
2865                /* Calculate the MRF number the result of this instruction is
2866                 * ultimately written to.
2867                 */
2868                scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE;
2869             }
2870 
2871             scan_inst->dst.file = MRF;
2872             scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE;
2873             scan_inst->saturate |= inst->saturate;
2874             if (!regs_left)
2875                break;
2876          }
2877       }
2878 
2879       assert(!regs_left);
2880       inst->remove(block);
2881       progress = true;
2882    }
2883 
2884    if (progress)
2885       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
2886 
2887    return progress;
2888 }
2889 
2890 /**
2891  * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2892  * flow.  We could probably do better here with some form of divergence
2893  * analysis.
2894  */
2895 bool
2896 elk_fs_visitor::eliminate_find_live_channel()
2897 {
2898    bool progress = false;
2899    unsigned depth = 0;
2900 
2901    if (!elk_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
2902       /* The optimization below assumes that channel zero is live on thread
2903        * dispatch, which may not be the case if the fixed function dispatches
2904        * threads sparsely.
2905        */
2906       return false;
2907    }
2908 
2909    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
2910       switch (inst->opcode) {
2911       case ELK_OPCODE_IF:
2912       case ELK_OPCODE_DO:
2913          depth++;
2914          break;
2915 
2916       case ELK_OPCODE_ENDIF:
2917       case ELK_OPCODE_WHILE:
2918          depth--;
2919          break;
2920 
2921       case ELK_OPCODE_HALT:
2922          /* This can potentially make control flow non-uniform until the end
2923           * of the program.
2924           */
2925          goto out;
2926 
2927       case ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL:
2928          if (depth == 0) {
2929             inst->opcode = ELK_OPCODE_MOV;
2930             inst->src[0] = elk_imm_ud(0u);
2931             inst->sources = 1;
2932             inst->force_writemask_all = true;
2933             progress = true;
2934          }
2935          break;
2936 
2937       default:
2938          break;
2939       }
2940    }
2941 
2942 out:
2943    if (progress)
2944       invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
2945 
2946    return progress;
2947 }
2948 
2949 /**
2950  * Once we've generated code, try to convert normal ELK_FS_OPCODE_FB_WRITE
2951  * instructions to ELK_FS_OPCODE_REP_FB_WRITE.
2952  */
2953 void
2954 elk_fs_visitor::emit_repclear_shader()
2955 {
2956    elk_wm_prog_key *key = (elk_wm_prog_key*) this->key;
2957    elk_fs_inst *write = NULL;
2958 
2959    assert(uniforms == 0);
2960    assume(key->nr_color_regions > 0);
2961 
2962    elk_fs_reg color_output, header;
2963    if (devinfo->ver >= 7) {
2964       color_output = retype(elk_vec4_grf(127, 0), ELK_REGISTER_TYPE_UD);
2965       header = retype(elk_vec8_grf(125, 0), ELK_REGISTER_TYPE_UD);
2966    } else {
2967       color_output = retype(elk_vec4_reg(MRF, 2, 0), ELK_REGISTER_TYPE_UD);
2968       header = retype(elk_vec8_reg(MRF, 0, 0), ELK_REGISTER_TYPE_UD);
2969    }
2970 
2971    /* We pass the clear color as a flat input.  Copy it to the output. */
2972    elk_fs_reg color_input =
2973       elk_reg(ELK_GENERAL_REGISTER_FILE, 2, 3, 0, 0, ELK_REGISTER_TYPE_UD,
2974               ELK_VERTICAL_STRIDE_8, ELK_WIDTH_2, ELK_HORIZONTAL_STRIDE_4,
2975               ELK_SWIZZLE_XYZW, WRITEMASK_XYZW);
2976 
2977    const fs_builder bld = fs_builder(this).at_end();
2978    bld.exec_all().group(4, 0).MOV(color_output, color_input);
2979 
2980    if (key->nr_color_regions > 1) {
2981       /* Copy g0..g1 as the message header */
2982       bld.exec_all().group(16, 0)
2983          .MOV(header, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
2984    }
2985 
2986    for (int i = 0; i < key->nr_color_regions; ++i) {
2987       if (i > 0)
2988          bld.exec_all().group(1, 0).MOV(component(header, 2), elk_imm_ud(i));
2989 
2990       if (devinfo->ver >= 7) {
2991          write = bld.emit(ELK_SHADER_OPCODE_SEND);
2992          write->resize_sources(2);
2993          write->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
2994          write->src[0] = elk_imm_ud(0);
2995          write->src[1] = i == 0 ? color_output : header;
2996          write->check_tdr = true;
2997          write->send_has_side_effects = true;
2998          write->desc = elk_fb_write_desc(devinfo, i,
2999             ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED,
3000             i == key->nr_color_regions - 1, false);
3001       } else {
3002          write = bld.emit(ELK_FS_OPCODE_REP_FB_WRITE);
3003          write->target = i;
3004          write->base_mrf = i == 0 ? color_output.nr : header.nr;
3005       }
3006 
3007       /* We can use a headerless message for the first render target */
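      /* That is, mlen below ends up as 1 (just the color payload) for render
       * target 0 and 3 (2 header GRFs plus the color payload) for the rest.
       */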
3008       write->header_size = i == 0 ? 0 : 2;
3009       write->mlen = 1 + write->header_size;
3010    }
3011    write->eot = true;
3012    write->last_rt = true;
3013 
3014    calculate_cfg();
3015 
3016    this->first_non_payload_grf = payload().num_regs;
3017 }
3018 
3019 /**
3020  * Walks through basic blocks, looking for repeated MRF writes and
3021  * removing the later ones.
3022  */
3023 bool
3024 elk_fs_visitor::remove_duplicate_mrf_writes()
3025 {
3026    elk_fs_inst *last_mrf_move[ELK_MAX_MRF_ALL];
3027    bool progress = false;
3028 
3029    /* The MRF tracking below doesn't handle compressed (SIMD16) instructions, so bail. */
3030    if (dispatch_width >= 16)
3031       return false;
3032 
3033    memset(last_mrf_move, 0, sizeof(last_mrf_move));
3034 
3035    foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
3036       if (inst->is_control_flow()) {
3037 	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3038       }
3039 
3040       if (inst->opcode == ELK_OPCODE_MOV &&
3041 	  inst->dst.file == MRF) {
3042          elk_fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
3043 	 if (prev_inst && prev_inst->opcode == ELK_OPCODE_MOV &&
3044              inst->dst.equals(prev_inst->dst) &&
3045              inst->src[0].equals(prev_inst->src[0]) &&
3046              inst->saturate == prev_inst->saturate &&
3047              inst->predicate == prev_inst->predicate &&
3048              inst->conditional_mod == prev_inst->conditional_mod &&
3049              inst->exec_size == prev_inst->exec_size) {
3050 	    inst->remove(block);
3051 	    progress = true;
3052 	    continue;
3053 	 }
3054       }
3055 
3056       /* Clear out the last-write records for MRFs that were overwritten. */
3057       if (inst->dst.file == MRF) {
3058          last_mrf_move[inst->dst.nr] = NULL;
3059       }
3060 
3061       if (inst->mlen > 0 && inst->base_mrf != -1) {
3062 	 /* Found a SEND instruction, which will include two or fewer
3063 	  * implied MRF writes.  We could do better here.
3064 	  */
3065 	 for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
3066 	    last_mrf_move[inst->base_mrf + i] = NULL;
3067 	 }
3068       }
3069 
3070       /* Clear out any MRF move records whose sources got overwritten. */
3071       for (unsigned i = 0; i < ELK_MAX_MRF(devinfo->ver); i++) {
3072          if (last_mrf_move[i] &&
3073              regions_overlap(inst->dst, inst->size_written,
3074                              last_mrf_move[i]->src[0],
3075                              last_mrf_move[i]->size_read(0))) {
3076             last_mrf_move[i] = NULL;
3077          }
3078       }
3079 
3080       if (inst->opcode == ELK_OPCODE_MOV &&
3081 	  inst->dst.file == MRF &&
3082 	  inst->src[0].file != ARF &&
3083 	  !inst->is_partial_write()) {
3084          last_mrf_move[inst->dst.nr] = inst;
3085       }
3086    }
3087 
3088    if (progress)
3089       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3090 
3091    return progress;
3092 }
3093 
3094 /**
3095  * The rounding mode is specified per conversion instruction, but in the
3096  * hardware it is persistent state, so once it has been set there is no
3097  * need to set it again for subsequent conversions.
3098  *
3099  * This is useful for vector/matrix conversions, where setting the mode
3100  * once is enough for the whole vector/matrix.
3101  */
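/* For example, in a sequence like
 *
 *    rnd_mode RTNE; cvt ...; rnd_mode RTNE; cvt ...
 *
 * the second RND_MODE instruction is redundant and is removed by this pass.
 */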
3102 bool
3103 elk_fs_visitor::remove_extra_rounding_modes()
3104 {
3105    bool progress = false;
3106    unsigned execution_mode = this->nir->info.float_controls_execution_mode;
3107 
3108    elk_rnd_mode base_mode = ELK_RND_MODE_UNSPECIFIED;
3109    if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
3110         FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
3111         FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
3112        execution_mode)
3113       base_mode = ELK_RND_MODE_RTNE;
3114    if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
3115         FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
3116         FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
3117        execution_mode)
3118       base_mode = ELK_RND_MODE_RTZ;
3119 
3120    foreach_block (block, cfg) {
3121       elk_rnd_mode prev_mode = base_mode;
3122 
3123       foreach_inst_in_block_safe (elk_fs_inst, inst, block) {
3124          if (inst->opcode == ELK_SHADER_OPCODE_RND_MODE) {
3125             assert(inst->src[0].file == ELK_IMMEDIATE_VALUE);
3126             const elk_rnd_mode mode = (elk_rnd_mode) inst->src[0].d;
3127             if (mode == prev_mode) {
3128                inst->remove(block);
3129                progress = true;
3130             } else {
3131                prev_mode = mode;
3132             }
3133          }
3134       }
3135    }
3136 
3137    if (progress)
3138       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3139 
3140    return progress;
3141 }
3142 
3143 static void
3144 clear_deps_for_inst_src(elk_fs_inst *inst, bool *deps, int first_grf, int grf_len)
3145 {
3146    /* Clear the flag for registers that actually got read (as expected). */
3147    for (int i = 0; i < inst->sources; i++) {
3148       int grf;
3149       if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) {
3150          grf = inst->src[i].nr;
3151       } else {
3152          continue;
3153       }
3154 
3155       if (grf >= first_grf &&
3156           grf < first_grf + grf_len) {
3157          deps[grf - first_grf] = false;
3158          if (inst->exec_size == 16)
3159             deps[grf - first_grf + 1] = false;
3160       }
3161    }
3162 }
3163 
3164 /**
3165  * Implements this workaround for the original 965:
3166  *
3167  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3168  *      check for post destination dependencies on this instruction, software
3169  *      must ensure that there is no destination hazard for the case of ‘write
3170  *      followed by a posted write’ shown in the following example.
3171  *
3172  *      1. mov r3 0
3173  *      2. send r3.xy <rest of send instruction>
3174  *      3. mov r2 r3
3175  *
3176  *      Due to no post-destination dependency check on the ‘send’, the above
3177  *      code sequence could have two instructions (1 and 2) in flight at the
3178  *      same time that both consider ‘r3’ as the target of their final writes."
3179  */
3180 void
3181 elk_fs_visitor::insert_gfx4_pre_send_dependency_workarounds(elk_bblock_t *block,
3182                                                         elk_fs_inst *inst)
3183 {
3184    int write_len = regs_written(inst);
3185    int first_write_grf = inst->dst.nr;
3186    bool needs_dep[ELK_MAX_MRF_ALL];
3187    assert(write_len < ELK_MAX_MRF(devinfo->ver) - 1);
3188 
3189    memset(needs_dep, false, sizeof(needs_dep));
3190    memset(needs_dep, true, write_len);
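   /* needs_dep[i] is true while the i-th GRF of the SEND's destination may
    * still have an unresolved earlier write that the hardware won't wait for.
    */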
3191 
3192    clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3193 
3194    /* Walk backwards looking for writes to registers we're writing which
3195     * aren't read since being written.  If we hit the start of the program,
3196     * we assume that there are no outstanding dependencies on entry to the
3197     * program.
3198     */
3199    foreach_inst_in_block_reverse_starting_from(elk_fs_inst, scan_inst, inst) {
3200       /* If we hit control flow, assume that there *are* outstanding
3201        * dependencies, and force their cleanup before our instruction.
3202        */
3203       if (block->start() == scan_inst && block->num != 0) {
3204          for (int i = 0; i < write_len; i++) {
3205             if (needs_dep[i])
3206                DEP_RESOLVE_MOV(fs_builder(this, block, inst),
3207                                first_write_grf + i);
3208          }
3209          return;
3210       }
3211 
3212       /* We insert our reads as late as possible on the assumption that any
3213        * instruction but a MOV that might have left us an outstanding
3214        * dependency has more latency than a MOV.
3215        */
3216       if (scan_inst->dst.file == VGRF) {
3217          for (unsigned i = 0; i < regs_written(scan_inst); i++) {
3218             int reg = scan_inst->dst.nr + i;
3219 
3220             if (reg >= first_write_grf &&
3221                 reg < first_write_grf + write_len &&
3222                 needs_dep[reg - first_write_grf]) {
3223                DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
3224                needs_dep[reg - first_write_grf] = false;
3225                if (scan_inst->exec_size == 16)
3226                   needs_dep[reg - first_write_grf + 1] = false;
3227             }
3228          }
3229       }
3230 
3231       /* Clear the flag for registers that actually got read (as expected). */
3232       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3233 
3234       /* Continue the loop only if we haven't resolved all the dependencies */
3235       int i;
3236       for (i = 0; i < write_len; i++) {
3237          if (needs_dep[i])
3238             break;
3239       }
3240       if (i == write_len)
3241          return;
3242    }
3243 }
3244 
3245 /**
3246  * Implements this workaround for the original 965:
3247  *
3248  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
3249  *      used as a destination register until after it has been sourced by an
3250  *      instruction with a different destination register."
3251  */
3252 void
3253 elk_fs_visitor::insert_gfx4_post_send_dependency_workarounds(elk_bblock_t *block, elk_fs_inst *inst)
3254 {
3255    int write_len = regs_written(inst);
3256    unsigned first_write_grf = inst->dst.nr;
3257    bool needs_dep[ELK_MAX_MRF_ALL];
3258    assert(write_len < ELK_MAX_MRF(devinfo->ver) - 1);
3259 
3260    memset(needs_dep, false, sizeof(needs_dep));
3261    memset(needs_dep, true, write_len);
3262    /* Walk forwards looking for writes to registers we're writing which aren't
3263     * read before being written.
3264     */
3265    foreach_inst_in_block_starting_from(elk_fs_inst, scan_inst, inst) {
3266       /* If we hit control flow, force resolve all remaining dependencies. */
3267       if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) {
3268          for (int i = 0; i < write_len; i++) {
3269             if (needs_dep[i])
3270                DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
3271                                first_write_grf + i);
3272          }
3273          return;
3274       }
3275 
3276       /* Clear the flag for registers that actually got read (as expected). */
3277       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3278 
3279       /* We insert our reads as late as possible since they're reading the
3280        * result of a SEND, which has massive latency.
3281        */
3282       if (scan_inst->dst.file == VGRF &&
3283           scan_inst->dst.nr >= first_write_grf &&
3284           scan_inst->dst.nr < first_write_grf + write_len &&
3285           needs_dep[scan_inst->dst.nr - first_write_grf]) {
3286          DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
3287                          scan_inst->dst.nr);
3288          needs_dep[scan_inst->dst.nr - first_write_grf] = false;
3289       }
3290 
3291       /* Continue the loop only if we haven't resolved all the dependencies */
3292       int i;
3293       for (i = 0; i < write_len; i++) {
3294          if (needs_dep[i])
3295             break;
3296       }
3297       if (i == write_len)
3298          return;
3299    }
3300 }
3301 
3302 void
3303 elk_fs_visitor::insert_gfx4_send_dependency_workarounds()
3304 {
3305    if (devinfo->ver != 4 || devinfo->platform == INTEL_PLATFORM_G4X)
3306       return;
3307 
3308    bool progress = false;
3309 
3310    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
3311       if (inst->mlen != 0 && inst->dst.file == VGRF) {
3312          insert_gfx4_pre_send_dependency_workarounds(block, inst);
3313          insert_gfx4_post_send_dependency_workarounds(block, inst);
3314          progress = true;
3315       }
3316    }
3317 
3318    if (progress)
3319       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3320 }
3321 
3322 /**
3323  * flags_read() and flags_written() return flag access with byte granularity,
3324  * but the PRM lists "Access Granularity: Word" for the Flag Register, so we can
3325  * assume that accessing any part of a word clears the whole word's dependency.
3326  */
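/* For example, a byte bitmask of 0b10 (only the second flag byte accessed)
 * expands to the word bitmask 0b11, marking the whole first word as accessed.
 */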
3327 static unsigned
3328 bytes_bitmask_to_words(unsigned b)
3329 {
3330    unsigned first_byte_mask = b & 0x55555555;
3331    unsigned second_byte_mask = b & 0xaaaaaaaa;
3332    return first_byte_mask |
3333           (first_byte_mask << 1) |
3334           second_byte_mask |
3335           (second_byte_mask >> 1);
3336 }
3337 
3338 /**
3339  * WaClearArfDependenciesBeforeEot
3340  *
3341  * Flag register dependencies are not cleared after EOT, so we have to source
3342  * the flags before EOT. We can do this with a simple `mov(1) nullUD, f{0,1}UD`.
3343  *
3344  * To avoid emitting MOVs when they're not needed, check whether each block reads
3345  * all the flags it sets. We might falsely consider a flag unread if it is only
3346  * accessed in a later block, but this should still be good enough.
3347  */
3348 bool
3349 elk_fs_visitor::workaround_source_arf_before_eot()
3350 {
3351    bool progress = false;
3352 
3353    if (devinfo->platform != INTEL_PLATFORM_CHV)
3354       return false;
3355 
3356    unsigned flags_unread = 0;
3357 
3358    foreach_block(block, cfg) {
3359       unsigned flags_unread_in_block = 0;
3360 
3361       foreach_inst_in_block(elk_fs_inst, inst, block) {
3362          /* An instruction can both read and write the same flag, so the order here is important. */
3363          flags_unread_in_block &= ~bytes_bitmask_to_words(inst->flags_read(devinfo));
3364          flags_unread_in_block |= bytes_bitmask_to_words(inst->flags_written(devinfo));
3365 
3366          /* HALT does not start its block even though it can leave a dependency */
3367          if (inst->opcode == ELK_OPCODE_HALT ||
3368              inst->opcode == ELK_SHADER_OPCODE_HALT_TARGET) {
3369             flags_unread |= flags_unread_in_block;
3370             flags_unread_in_block = 0;
3371          }
3372       }
3373 
3374       flags_unread |= flags_unread_in_block;
3375 
3376       if ((flags_unread & 0x0f) && (flags_unread & 0xf0))
3377          break;
3378    }
3379 
3380    if (flags_unread) {
3381       int eot_count = 0;
3382 
3383       foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg)
3384       {
3385          if (!inst->eot)
3386             continue;
3387 
3388          /* Currently, we always emit only one EOT per program,
3389           * this WA should be updated if it ever changes.
3390           */
3391          ++eot_count;
3392          assert(eot_count == 1);
3393 
3394          const fs_builder ibld(this, block, inst);
3395          const fs_builder ubld = ibld.exec_all().group(1, 0);
3396 
3397          if (flags_unread & 0x0f)
3398             ubld.MOV(ubld.null_reg_ud(), retype(elk_flag_reg(0, 0), ELK_REGISTER_TYPE_UD));
3399 
3400          if (flags_unread & 0xf0)
3401             ubld.MOV(ubld.null_reg_ud(), retype(elk_flag_reg(1, 0), ELK_REGISTER_TYPE_UD));
3402       }
3403 
3404       progress = true;
3405       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3406    }
3407 
3408    return progress;
3409 }
3410 
3411 bool
3412 elk_fs_visitor::lower_load_payload()
3413 {
3414    bool progress = false;
3415 
3416    foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
3417       if (inst->opcode != ELK_SHADER_OPCODE_LOAD_PAYLOAD)
3418          continue;
3419 
3420       assert(inst->dst.file == MRF || inst->dst.file == VGRF);
3421       assert(inst->saturate == false);
3422       elk_fs_reg dst = inst->dst;
3423 
3424       /* Get rid of COMPR4.  We'll add it back in if we need it */
3425       if (dst.file == MRF)
3426          dst.nr = dst.nr & ~ELK_MRF_COMPR4;
3427 
3428       const fs_builder ibld(this, block, inst);
3429       const fs_builder ubld = ibld.exec_all();
3430 
3431       for (uint8_t i = 0; i < inst->header_size;) {
3432          /* Number of header GRFs to initialize at once with a single MOV
3433           * instruction.
3434           */
3435          const unsigned n =
3436             (i + 1 < inst->header_size && inst->src[i].stride == 1 &&
3437              inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ?
3438             2 : 1;
3439 
3440          if (inst->src[i].file != BAD_FILE)
3441             ubld.group(8 * n, 0).MOV(retype(dst, ELK_REGISTER_TYPE_UD),
3442                                      retype(inst->src[i], ELK_REGISTER_TYPE_UD));
3443 
3444          dst = byte_offset(dst, n * REG_SIZE);
3445          i += n;
3446       }
3447 
3448       if (inst->dst.file == MRF && (inst->dst.nr & ELK_MRF_COMPR4) &&
3449           inst->exec_size > 8) {
3450          /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3451           * a straightforward copy.  Instead, the result of the
3452           * LOAD_PAYLOAD is treated as interleaved and the first four
3453           * non-header sources are unpacked as:
3454           *
3455           * m + 0: r0
3456           * m + 1: g0
3457           * m + 2: b0
3458           * m + 3: a0
3459           * m + 4: r1
3460           * m + 5: g1
3461           * m + 6: b1
3462           * m + 7: a1
3463           *
3464           * This is used for gen <= 5 fb writes.
3465           */
3466          assert(inst->exec_size == 16);
3467          assert(inst->header_size + 4 <= inst->sources);
3468          for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3469             if (inst->src[i].file != BAD_FILE) {
3470                if (devinfo->has_compr4) {
3471                   elk_fs_reg compr4_dst = retype(dst, inst->src[i].type);
3472                   compr4_dst.nr |= ELK_MRF_COMPR4;
3473                   ibld.MOV(compr4_dst, inst->src[i]);
3474                } else {
3475                   /* Platform doesn't have COMPR4.  We have to fake it */
3476                   elk_fs_reg mov_dst = retype(dst, inst->src[i].type);
3477                   ibld.quarter(0).MOV(mov_dst, quarter(inst->src[i], 0));
3478                   mov_dst.nr += 4;
3479                   ibld.quarter(1).MOV(mov_dst, quarter(inst->src[i], 1));
3480                }
3481             }
3482 
3483             dst.nr++;
3484          }
3485 
3486          /* The loop above only ever incremented us through the first set
3487           * of 4 registers.  However, thanks to the magic of COMPR4, we
3488           * actually wrote to the first 8 registers, so we need to take
3489           * that into account now.
3490           */
3491          dst.nr += 4;
3492 
3493          /* The COMPR4 code took care of the first 4 sources.  We'll let
3494           * the regular path handle any remaining sources.  Yes, we are
3495           * modifying the instruction but we're about to delete it so
3496           * this really doesn't hurt anything.
3497           */
3498          inst->header_size += 4;
3499       }
3500 
3501       for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3502          dst.type = inst->src[i].type;
3503          if (inst->src[i].file != BAD_FILE) {
3504             ibld.MOV(dst, inst->src[i]);
3505          }
3506          dst = offset(dst, ibld, 1);
3507       }
3508 
3509       inst->remove(block);
3510       progress = true;
3511    }
3512 
3513    if (progress)
3514       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3515 
3516    return progress;
3517 }
3518 
3519 /**
3520  * Factor an unsigned 32-bit integer.
3521  *
3522  * Attempts to factor \c x into two values that are at most 0xFFFF.  If no
3523  * such factorization is possible, either because the value is too large or is
3524  * prime, both \c result_a and \c result_b will be zero.
3525  */
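/* Worked example (illustrative): factor_uint32(1000000) selects p = 5, the
 * largest prime in the table dividing x, then finds d = 4 and q = 50000, and
 * returns result_a = 50000, result_b = 20, since 50000 * 20 == 1000000 and
 * both factors fit in 16 bits.
 */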
3526 static void
3527 factor_uint32(uint32_t x, unsigned *result_a, unsigned *result_b)
3528 {
3529    /* This is necessary to prevent various opportunities for division by zero
3530     * below.
3531     */
3532    assert(x > 0xffff);
3533 
3534    /* This represents the actual expected constraints on the input.  Namely,
3535     * both the upper and lower words should be > 1.
3536     */
3537    assert(x >= 0x00020002);
3538 
3539    *result_a = 0;
3540    *result_b = 0;
3541 
3542    /* The value is too large to factor with the constraints. */
3543    if (x > (0xffffu * 0xffffu))
3544       return;
3545 
3546    /* A non-prime number will have the form p*q*d where p is some prime
3547     * number, q > 1, and 1 <= d <= q.  To meet the constraints of this
3548     * function, (p*d) < 0x10000.  This implies d <= floor(0xffff / p).
3549     * Furthermore, since q < 0x10000, d >= floor(x / (0xffff * p)).  Finally,
3550     * floor(x / (0xffff * p)) <= d <= floor(0xffff / p).
3551     *
3552     * The observation is finding the largest possible value of p reduces the
3553     * possible range of d.  After selecting p, all values of d in this range
3554     * are tested until a factorization is found.  The size of the range of
3555     * possible values of d sets an upper bound on the run time of the
3556     * function.
3557     */
3558    static const uint16_t primes[256] = {
3559          2,    3,    5,    7,   11,   13,   17,   19,
3560         23,   29,   31,   37,   41,   43,   47,   53,
3561         59,   61,   67,   71,   73,   79,   83,   89,
3562         97,  101,  103,  107,  109,  113,  127,  131,  /*  32 */
3563        137,  139,  149,  151,  157,  163,  167,  173,
3564        179,  181,  191,  193,  197,  199,  211,  223,
3565        227,  229,  233,  239,  241,  251,  257,  263,
3566        269,  271,  277,  281,  283,  293,  307,  311,  /*  64 */
3567        313,  317,  331,  337,  347,  349,  353,  359,
3568        367,  373,  379,  383,  389,  397,  401,  409,
3569        419,  421,  431,  433,  439,  443,  449,  457,
3570        461,  463,  467,  479,  487,  491,  499,  503,  /*  96 */
3571        509,  521,  523,  541,  547,  557,  563,  569,
3572        571,  577,  587,  593,  599,  601,  607,  613,
3573        617,  619,  631,  641,  643,  647,  653,  659,
3574        661,  673,  677,  683,  691,  701,  709,  719,   /* 128 */
3575        727,  733,  739,  743,  751,  757,  761,  769,
3576        773,  787,  797,  809,  811,  821,  823,  827,
3577        829,  839,  853,  857,  859,  863,  877,  881,
3578        883,  887,  907,  911,  919,  929,  937,  941,  /* 160 */
3579        947,  953,  967,  971,  977,  983,  991,  997,
3580       1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049,
3581       1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097,
3582       1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163,  /* 192 */
3583       1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223,
3584       1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283,
3585       1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
3586       1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423,  /* 224 */
3587       1427, 1429, 1433, 1439, 1447, 1451, 1453, 1459,
3588       1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511,
3589       1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571,
3590       1579, 1583, 1597, 1601, 1607, 1609, 1613, 1619,  /* 256 */
3591    };
3592 
3593    unsigned p;
3594    unsigned x_div_p;
3595 
3596    for (int i = ARRAY_SIZE(primes) - 1; i >= 0; i--) {
3597       p = primes[i];
3598       x_div_p = x / p;
3599 
3600       if ((x_div_p * p) == x)
3601          break;
3602    }
3603 
3604    /* A prime factor was not found. */
3605    if (x_div_p * p != x)
3606       return;
3607 
3608    /* Terminate early if d=1 is a solution. */
3609    if (x_div_p < 0x10000) {
3610       *result_a = x_div_p;
3611       *result_b = p;
3612       return;
3613    }
3614 
3615    /* Pick the maximum possible value for 'd'.  It's important that the loop
3616     * below execute while d <= max_d because max_d is a valid value.  Having
3617     * the wrong loop bound would cause 1627*1367*47 (0x063b0c83) to be
3618     * incorrectly reported as not being factorable.  The problem would occur
3619     * with any value that is a factor of two primes in the table and one prime
3620     * not in the table.
3621     */
3622    const unsigned max_d = 0xffff / p;
3623 
3624    /* Pick an initial value of 'd' that (combined with rejecting too large
3625     * values above) guarantees that 'q' will always be small enough.
3626     * DIV_ROUND_UP is used to prevent 'd' from being zero.
3627     */
3628    for (unsigned d = DIV_ROUND_UP(x_div_p, 0xffff); d <= max_d; d++) {
3629       unsigned q = x_div_p / d;
3630 
3631       if ((q * d) == x_div_p) {
3632          assert(p * d * q == x);
3633          assert((p * d) < 0x10000);
3634 
3635          *result_a = q;
3636          *result_b = p * d;
3637          break;
3638       }
3639 
3640       /* Since every value of 'd' is tried, as soon as 'd' is larger
3641        * than 'q', we're just re-testing combinations that have
3642        * already been tested.
3643        */
3644       if (d > q)
3645          break;
3646    }
3647 }
3648 
3649 void
3650 elk_fs_visitor::lower_mul_dword_inst(elk_fs_inst *inst, elk_bblock_t *block)
3651 {
3652    const fs_builder ibld(this, block, inst);
3653 
3654    /* It is correct to use inst->src[1].d at both ends of the comparison.
3655     * Using .ud in the UINT16_MAX comparison would cause any negative value to
3656     * fail the check.
3657     */
3658    if (inst->src[1].file == IMM &&
3659        (inst->src[1].d >= INT16_MIN && inst->src[1].d <= UINT16_MAX)) {
3660       /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3661        * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3662        * src1 are used.
3663        *
3664        * If multiplying by an immediate value that fits in 16-bits, do a
3665        * single MUL instruction with that value in the proper location.
3666        */
3667       const bool ud = (inst->src[1].d >= 0);
3668       if (devinfo->ver < 7) {
3669          elk_fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type);
3670          ibld.MOV(imm, inst->src[1]);
3671          ibld.MUL(inst->dst, imm, inst->src[0]);
3672       } else {
3673          ibld.MUL(inst->dst, inst->src[0],
3674                   ud ? elk_imm_uw(inst->src[1].ud)
3675                      : elk_imm_w(inst->src[1].d));
3676       }
3677    } else {
3678       /* Gen < 8 (and some Gfx8+ low-power parts like Cherryview) cannot
3679        * do 32-bit integer multiplication in one instruction, but instead
3680        * must do a sequence (which actually calculates a 64-bit result):
3681        *
3682        *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
3683        *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
3684        *    mov(8)  g2<1>D     acc0<8,8,1>D
3685        *
3686        * But on Gen > 6, the ability to use second accumulator register
3687        * (acc1) for non-float data types was removed, preventing a simple
3688        * implementation in SIMD16. A 16-channel result can be calculated by
3689        * executing the three instructions twice in SIMD8, once with quarter
3690        * control of 1Q for the first eight channels and again with 2Q for
3691        * the second eight channels.
3692        *
3693        * Which accumulator register is implicitly accessed (by AccWrEnable
3694        * for instance) is determined by the quarter control. Unfortunately
3695        * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3696        * implicit accumulator access by an instruction with 2Q will access
3697        * acc1 regardless of whether the data type is usable in acc1.
3698        *
3699        * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3700        * integer data types.
3701        *
3702        * Since we only want the low 32-bits of the result, we can do two
3703        * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3704        * adjust the high result and add them (like the mach is doing):
3705        *
3706        *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
3707        *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
3708        *    shl(8)  g9<1>D     g8<8,8,1>D      16D
3709        *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
3710        *
3711        * We avoid the shl instruction by realizing that we only want to add
3712        * the low 16-bits of the "high" result to the high 16-bits of the
3713        * "low" result and using proper regioning on the add:
3714        *
3715        *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
3716        *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
3717        *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
3718        *
3719        * Since it does not use the (single) accumulator register, we can
3720        * schedule multi-component multiplications much better.
3721        */
3722 
3723       bool needs_mov = false;
3724       elk_fs_reg orig_dst = inst->dst;
3725 
3726       /* Get a new VGRF for the "low" 32x16-bit multiplication result if
3727        * reusing the original destination is impossible due to hardware
3728        * restrictions, source/destination overlap, or it being the null
3729        * register.
3730        */
3731       elk_fs_reg low = inst->dst;
3732       if (orig_dst.is_null() || orig_dst.file == MRF ||
3733           regions_overlap(inst->dst, inst->size_written,
3734                           inst->src[0], inst->size_read(0)) ||
3735           regions_overlap(inst->dst, inst->size_written,
3736                           inst->src[1], inst->size_read(1)) ||
3737           inst->dst.stride >= 4) {
3738          needs_mov = true;
3739          low = elk_fs_reg(VGRF, alloc.allocate(regs_written(inst)),
3740                       inst->dst.type);
3741       }
3742 
3743       /* Get a new VGRF but keep the same stride as inst->dst */
3744       elk_fs_reg high(VGRF, alloc.allocate(regs_written(inst)), inst->dst.type);
3745       high.stride = inst->dst.stride;
3746       high.offset = inst->dst.offset % REG_SIZE;
3747 
3748       bool do_addition = true;
3749       if (devinfo->ver >= 7) {
3750          if (inst->src[1].abs)
3751             lower_src_modifiers(this, block, inst, 1);
3752 
3753          if (inst->src[1].file == IMM) {
3754             unsigned a;
3755             unsigned b;
3756 
3757             /* If the immediate value can be factored into two values, A and
3758              * B, that each fit in 16-bits, the multiplication result can
3759              * instead be calculated as (src0 * (A * B)) = ((src0 * A) * B).
3760              * This saves an operation (the addition) and a temporary register
3761              * (high).
3762              *
3763              * Skip the optimization if either the high word or the low word
3764              * is 0 or 1.  In these conditions, at least one of the
3765              * multiplications generated by the straightforward method will be
3766              * eliminated anyway.
3767              */
3768             if (inst->src[1].ud > 0x0001ffff &&
3769                 (inst->src[1].ud & 0xffff) > 1) {
3770                factor_uint32(inst->src[1].ud, &a, &b);
3771 
3772                if (a != 0) {
3773                   ibld.MUL(low, inst->src[0], elk_imm_uw(a));
3774                   ibld.MUL(low, low, elk_imm_uw(b));
3775                   do_addition = false;
3776                }
3777             }
3778 
3779             if (do_addition) {
3780                ibld.MUL(low, inst->src[0],
3781                         elk_imm_uw(inst->src[1].ud & 0xffff));
3782                ibld.MUL(high, inst->src[0],
3783                         elk_imm_uw(inst->src[1].ud >> 16));
3784             }
3785          } else {
3786             ibld.MUL(low, inst->src[0],
3787                      subscript(inst->src[1], ELK_REGISTER_TYPE_UW, 0));
3788             ibld.MUL(high, inst->src[0],
3789                      subscript(inst->src[1], ELK_REGISTER_TYPE_UW, 1));
3790          }
3791       } else {
3792          if (inst->src[0].abs)
3793             lower_src_modifiers(this, block, inst, 0);
3794 
3795          ibld.MUL(low, subscript(inst->src[0], ELK_REGISTER_TYPE_UW, 0),
3796                   inst->src[1]);
3797          ibld.MUL(high, subscript(inst->src[0], ELK_REGISTER_TYPE_UW, 1),
3798                   inst->src[1]);
3799       }
3800 
3801       if (do_addition) {
3802          ibld.ADD(subscript(low, ELK_REGISTER_TYPE_UW, 1),
3803                   subscript(low, ELK_REGISTER_TYPE_UW, 1),
3804                   subscript(high, ELK_REGISTER_TYPE_UW, 0));
3805       }
3806 
3807       if (needs_mov || inst->conditional_mod)
3808          set_condmod(inst->conditional_mod, ibld.MOV(orig_dst, low));
3809    }
3810 }
3811 
3812 void
3813 elk_fs_visitor::lower_mul_qword_inst(elk_fs_inst *inst, elk_bblock_t *block)
3814 {
3815    const fs_builder ibld(this, block, inst);
3816 
3817    /* Considering two 64-bit integers ab and cd where each letter        ab
3818     * corresponds to 32 bits, we get a 128-bit result WXYZ. We         * cd
3819     * only need to provide the YZ part of the result.               -------
3820     *                                                                    BD
3821     *  Only BD needs to be 64 bits. For AD and BC we only care       +  AD
3822     *  about the lower 32 bits (since they are part of the upper     +  BC
3823     *  32 bits of our result). AC is not needed since it starts      + AC
3824     *  on the 65th bit of the result.                               -------
3825     *                                                                  WXYZ
3826     */
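   /* Equivalently, writing src0 = a:b and src1 = c:d in 32-bit halves, the
    * low 64 bits of the product are BD + ((AD + BC) << 32), where only the
    * low 32 bits of AD, BC and their sum are kept.
    */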
3827    unsigned int q_regs = regs_written(inst);
3828    unsigned int d_regs = (q_regs + 1) / 2;
3829 
3830    elk_fs_reg bd(VGRF, alloc.allocate(q_regs), ELK_REGISTER_TYPE_UQ);
3831    elk_fs_reg ad(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
3832    elk_fs_reg bc(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
3833 
3834    /* Here we need the full 64 bit result for 32b * 32b. */
3835    if (devinfo->has_integer_dword_mul) {
3836       ibld.MUL(bd, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
3837                subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0));
3838    } else {
3839       elk_fs_reg bd_high(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
3840       elk_fs_reg bd_low(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
3841       const unsigned acc_width = reg_unit(devinfo) * 8;
3842       elk_fs_reg acc = suboffset(retype(elk_acc_reg(inst->exec_size), ELK_REGISTER_TYPE_UD),
3843                              inst->group % acc_width);
3844 
3845       elk_fs_inst *mul = ibld.MUL(acc,
3846                             subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
3847                             subscript(inst->src[1], ELK_REGISTER_TYPE_UW, 0));
3848       mul->writes_accumulator = true;
3849 
3850       ibld.MACH(bd_high, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
3851                 subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0));
3852       ibld.MOV(bd_low, acc);
3853 
3854       ibld.UNDEF(bd);
3855       ibld.MOV(subscript(bd, ELK_REGISTER_TYPE_UD, 0), bd_low);
3856       ibld.MOV(subscript(bd, ELK_REGISTER_TYPE_UD, 1), bd_high);
3857    }
3858 
3859    ibld.MUL(ad, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 1),
3860             subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0));
3861    ibld.MUL(bc, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
3862             subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 1));
3863 
3864    ibld.ADD(ad, ad, bc);
3865    ibld.ADD(subscript(bd, ELK_REGISTER_TYPE_UD, 1),
3866             subscript(bd, ELK_REGISTER_TYPE_UD, 1), ad);
3867 
3868    if (devinfo->has_64bit_int) {
3869       ibld.MOV(inst->dst, bd);
3870    } else {
3871       if (!inst->is_partial_write())
3872          ibld.emit_undef_for_dst(inst);
3873       ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 0),
3874                subscript(bd, ELK_REGISTER_TYPE_UD, 0));
3875       ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 1),
3876                subscript(bd, ELK_REGISTER_TYPE_UD, 1));
3877    }
3878 }
3879 
3880 void
3881 elk_fs_visitor::lower_mulh_inst(elk_fs_inst *inst, elk_bblock_t *block)
3882 {
3883    const fs_builder ibld(this, block, inst);
3884 
3885    /* According to the BDW+ BSpec page for the "Multiply Accumulate
3886     * High" instruction:
3887     *
3888     *  "An added preliminary mov is required for source modification on
3889     *   src1:
3890     *      mov (8) r3.0<1>:d -r3<8;8,1>:d
3891     *      mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw
3892     *      mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d"
3893     */
3894    if (devinfo->ver >= 8 && (inst->src[1].negate || inst->src[1].abs))
3895       lower_src_modifiers(this, block, inst, 1);
3896 
3897    /* Should have been lowered to 8-wide. */
3898    assert(inst->exec_size <= get_lowered_simd_width(this, inst));
3899    const unsigned acc_width = reg_unit(devinfo) * 8;
3900    const elk_fs_reg acc = suboffset(retype(elk_acc_reg(inst->exec_size), inst->dst.type),
3901                                 inst->group % acc_width);
3902    elk_fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
3903    elk_fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
3904 
3905    if (devinfo->ver >= 8) {
3906       /* Until Gfx8, integer multiplies read 32-bits from one source,
3907        * and 16-bits from the other, relying on the MACH instruction
3908        * to generate the high bits of the result.
3909        *
3910        * On Gfx8, the multiply instruction does a full 32x32-bit
3911        * multiply, but in order to do a 64-bit multiply we can simulate
3912        * the previous behavior and then use a MACH instruction.
3913        */
3914       assert(mul->src[1].type == ELK_REGISTER_TYPE_D ||
3915              mul->src[1].type == ELK_REGISTER_TYPE_UD);
3916       mul->src[1].type = ELK_REGISTER_TYPE_UW;
3917       mul->src[1].stride *= 2;
3918 
3919       if (mul->src[1].file == IMM) {
3920          mul->src[1] = elk_imm_uw(mul->src[1].ud);
3921       }
3922    } else if (devinfo->verx10 == 70 &&
3923               inst->group > 0) {
3924       /* Among other things the quarter control bits influence which
3925        * accumulator register is used by the hardware for instructions
3926        * that access the accumulator implicitly (e.g. MACH).  A
3927        * second-half instruction would normally map to acc1, which
3928        * doesn't exist on Gfx7 and up (the hardware does emulate it for
3929        * floating-point instructions *only* by taking advantage of the
3930        * extra precision of acc0 not normally used for floating point
3931        * arithmetic).
3932        *
3933        * HSW and up are careful enough not to try to access an
3934        * accumulator register that doesn't exist, but on earlier Gfx7
3935        * hardware we need to make sure that the quarter control bits are
3936        * zero to avoid non-deterministic behaviour and emit an extra MOV
3937        * to get the result masked correctly according to the current
3938        * channel enables.
3939        */
3940       mach->group = 0;
3941       mach->force_writemask_all = true;
3942       mach->dst = ibld.vgrf(inst->dst.type);
3943       ibld.MOV(inst->dst, mach->dst);
3944    }
3945 }
3946 
3947 bool
3948 elk_fs_visitor::lower_integer_multiplication()
3949 {
3950    bool progress = false;
3951 
3952    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
3953       if (inst->opcode == ELK_OPCODE_MUL) {
3954          /* If the instruction is already in a form that does not need lowering,
3955           * skip it.
3956           */
3957          if (devinfo->ver >= 7) {
3958             if (type_sz(inst->src[1].type) < 4 && type_sz(inst->src[0].type) <= 4)
3959                continue;
3960          } else {
3961             if (type_sz(inst->src[0].type) < 4 && type_sz(inst->src[1].type) <= 4)
3962                continue;
3963          }
3964 
3965          if ((inst->dst.type == ELK_REGISTER_TYPE_Q ||
3966               inst->dst.type == ELK_REGISTER_TYPE_UQ) &&
3967              (inst->src[0].type == ELK_REGISTER_TYPE_Q ||
3968               inst->src[0].type == ELK_REGISTER_TYPE_UQ) &&
3969              (inst->src[1].type == ELK_REGISTER_TYPE_Q ||
3970               inst->src[1].type == ELK_REGISTER_TYPE_UQ)) {
3971             lower_mul_qword_inst(inst, block);
3972             inst->remove(block);
3973             progress = true;
3974          } else if (!inst->dst.is_accumulator() &&
3975                     (inst->dst.type == ELK_REGISTER_TYPE_D ||
3976                      inst->dst.type == ELK_REGISTER_TYPE_UD) &&
3977                     !devinfo->has_integer_dword_mul) {
3978             lower_mul_dword_inst(inst, block);
3979             inst->remove(block);
3980             progress = true;
3981          }
3982       } else if (inst->opcode == ELK_SHADER_OPCODE_MULH) {
3983          lower_mulh_inst(inst, block);
3984          inst->remove(block);
3985          progress = true;
3986       }
3987 
3988    }
3989 
3990    if (progress)
3991       invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
3992 
3993    return progress;
3994 }
3995 
3996 bool
3997 elk_fs_visitor::lower_minmax()
3998 {
3999    assert(devinfo->ver < 6);
4000 
4001    bool progress = false;
4002 
4003    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
4004       const fs_builder ibld(this, block, inst);
4005 
4006       if (inst->opcode == ELK_OPCODE_SEL &&
4007           inst->predicate == ELK_PREDICATE_NONE) {
4008          /* If src1 is an immediate that is known not to be NaN, CMPN's NaN
4009           * semantics are unnecessary.  In that case, emit CMP because it is
4010           * much better for cmod propagation.  Likewise if src1 is not float.
4011           * Gfx4 and Gfx5 don't support HF or DF, so no need to check for those.
4012           */
4013          if (inst->src[1].type != ELK_REGISTER_TYPE_F ||
4014              (inst->src[1].file == IMM && !isnan(inst->src[1].f))) {
4015             ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
4016                      inst->conditional_mod);
4017          } else {
4018             ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1],
4019                       inst->conditional_mod);
4020          }
4021          inst->predicate = ELK_PREDICATE_NORMAL;
4022          inst->conditional_mod = ELK_CONDITIONAL_NONE;
4023 
4024          progress = true;
4025       }
4026    }
4027 
4028    if (progress)
4029       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
4030 
4031    return progress;
4032 }
4033 
4034 bool
4035 elk_fs_visitor::lower_sub_sat()
4036 {
4037    bool progress = false;
4038 
4039    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
4040       const fs_builder ibld(this, block, inst);
4041 
4042       if (inst->opcode == ELK_SHADER_OPCODE_USUB_SAT ||
4043           inst->opcode == ELK_SHADER_OPCODE_ISUB_SAT) {
4044          /* The fundamental problem is the hardware performs source negation
4045           * at the bit width of the source.  If the source is 0x80000000D, the
4046           * negation is 0x80000000D.  As a result, subtractSaturate(0,
4047           * 0x80000000) will produce 0x80000000 instead of 0x7fffffff.  There
4048           * are at least three ways to resolve this:
4049           *
4050           * 1. Use the accumulator for the negated source.  The accumulator is
4051           *    33 bits, so our source 0x80000000 is sign-extended to
4052           *    0x180000000, the negation of which is 0x080000000.  This
4053           *    doesn't help for 64-bit integers (which are already bigger than
4054           *    33 bits).  There are also only 8 accumulators, so SIMD16 or
4055           *    SIMD32 instructions would have to be split into multiple SIMD8
4056           *    instructions.
4057           *
4058           * 2. Use slightly different math.  For any n-bit value x, we know (x
4059           *    >> 1) != -(x >> 1).  We can use this fact to only do
4060           *    subtractions involving (x >> 1).  subtractSaturate(a, b) ==
4061           *    subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)).
4062           *
4063           * 3. For unsigned sources, it is sufficient to replace the
4064           *    subtractSaturate with (a > b) ? a - b : 0.
4065           *
4066           * It may also be possible to use the SUBB instruction.  This
4067           * implicitly writes the accumulator, so it could only be used in the
4068           * same situations as #1 above.  It is further limited by only
4069           * allowing UD sources.
4070           */
4071          if (inst->exec_size == 8 && inst->src[0].type != ELK_REGISTER_TYPE_Q &&
4072              inst->src[0].type != ELK_REGISTER_TYPE_UQ) {
4073             elk_fs_reg acc(ARF, ELK_ARF_ACCUMULATOR, inst->src[1].type);
4074 
4075             ibld.MOV(acc, inst->src[1]);
4076             elk_fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]);
4077             add->saturate = true;
4078             add->src[0].negate = true;
4079          } else if (inst->opcode == ELK_SHADER_OPCODE_ISUB_SAT) {
4080             /* tmp = src1 >> 1;
4081              * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp));
4082              */
4083             elk_fs_reg tmp1 = ibld.vgrf(inst->src[0].type);
4084             elk_fs_reg tmp2 = ibld.vgrf(inst->src[0].type);
4085             elk_fs_reg tmp3 = ibld.vgrf(inst->src[0].type);
4086             elk_fs_inst *add;
4087 
4088             ibld.SHR(tmp1, inst->src[1], elk_imm_d(1));
4089 
4090             add = ibld.ADD(tmp2, inst->src[1], tmp1);
4091             add->src[1].negate = true;
4092 
4093             add = ibld.ADD(tmp3, inst->src[0], tmp1);
4094             add->src[1].negate = true;
4095             add->saturate = true;
4096 
4097             add = ibld.ADD(inst->dst, tmp3, tmp2);
4098             add->src[1].negate = true;
4099             add->saturate = true;
4100          } else {
4101             /* a > b ? a - b : 0 */
4102             ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
4103                      ELK_CONDITIONAL_G);
4104 
4105             elk_fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]);
4106             add->src[1].negate = !add->src[1].negate;
4107 
4108             ibld.SEL(inst->dst, inst->dst, elk_imm_ud(0))
4109                ->predicate = ELK_PREDICATE_NORMAL;
4110          }
4111 
4112          inst->remove(block);
4113          progress = true;
4114       }
4115    }
4116 
4117    if (progress)
4118       invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
4119 
4120    return progress;
4121 }
4122 
4123 /**
4124  * Get the mask of SIMD channels enabled during dispatch and not yet disabled
4125  * by discard.  Due to the layout of the sample mask in the fragment shader
4126  * thread payload, \p bld is required to have a dispatch_width() not greater
4127  * than 16 for fragment shaders.
4128  */
4129 elk_fs_reg
4130 elk_sample_mask_reg(const fs_builder &bld)
4131 {
4132    const elk_fs_visitor &s = *bld.shader;
4133 
4134    if (s.stage != MESA_SHADER_FRAGMENT) {
4135       return elk_imm_ud(0xffffffff);
4136    } else if (elk_wm_prog_data(s.stage_prog_data)->uses_kill) {
4137       assert(bld.dispatch_width() <= 16);
4138       return elk_flag_subreg(sample_mask_flag_subreg(s) + bld.group() / 16);
4139    } else {
4140       assert(s.devinfo->ver >= 6 && bld.dispatch_width() <= 16);
4141       return retype(elk_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
4142                     ELK_REGISTER_TYPE_UW);
4143    }
4144 }
4145 
4146 uint32_t
4147 elk_fb_write_msg_control(const elk_fs_inst *inst,
4148                          const struct elk_wm_prog_data *prog_data)
4149 {
4150    uint32_t mctl;
4151 
4152    if (inst->opcode == ELK_FS_OPCODE_REP_FB_WRITE) {
4153       assert(inst->group == 0 && inst->exec_size == 16);
4154       mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
4155    } else if (prog_data->dual_src_blend) {
4156       assert(inst->exec_size == 8);
4157 
4158       if (inst->group % 16 == 0)
4159          mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
4160       else if (inst->group % 16 == 8)
4161          mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
4162       else
4163          unreachable("Invalid dual-source FB write instruction group");
4164    } else {
4165       assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
4166 
4167       if (inst->exec_size == 16)
4168          mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
4169       else if (inst->exec_size == 8)
4170          mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
4171       else
4172          unreachable("Invalid FB write execution size");
4173    }
4174 
4175    return mctl;
4176 }
4177 
4178 /**
4179  * Predicate the specified instruction on the sample mask.
4180  */
4181 void
4182 elk_emit_predicate_on_sample_mask(const fs_builder &bld, elk_fs_inst *inst)
4183 {
4184    assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
4185           bld.group() == inst->group &&
4186           bld.dispatch_width() == inst->exec_size);
4187 
4188    const elk_fs_visitor &s = *bld.shader;
4189    const elk_fs_reg sample_mask = elk_sample_mask_reg(bld);
4190    const unsigned subreg = sample_mask_flag_subreg(s);
4191 
4192    if (elk_wm_prog_data(s.stage_prog_data)->uses_kill) {
4193       assert(sample_mask.file == ARF &&
4194              sample_mask.nr == elk_flag_subreg(subreg).nr &&
4195              sample_mask.subnr == elk_flag_subreg(
4196                 subreg + inst->group / 16).subnr);
4197    } else {
4198       bld.group(1, 0).exec_all()
4199          .MOV(elk_flag_subreg(subreg + inst->group / 16), sample_mask);
4200    }
4201 
4202    if (inst->predicate) {
4203       assert(inst->predicate == ELK_PREDICATE_NORMAL);
4204       assert(!inst->predicate_inverse);
4205       assert(inst->flag_subreg == 0);
4206       /* Combine the sample mask with the existing predicate by using a
4207        * vertical predication mode.
4208        */
4209       inst->predicate = ELK_PREDICATE_ALIGN1_ALLV;
4210    } else {
4211       inst->flag_subreg = subreg;
4212       inst->predicate = ELK_PREDICATE_NORMAL;
4213       inst->predicate_inverse = false;
4214    }
4215 }
4216 
4217 static bool
4218 is_mixed_float_with_fp32_dst(const elk_fs_inst *inst)
4219 {
4220    /* This opcode sometimes uses :W type on the source even if the operand is
4221     * a :HF, because in gfx7 there is no support for :HF, and thus it uses :W.
4222     */
4223    if (inst->opcode == ELK_OPCODE_F16TO32)
4224       return true;
4225 
4226    if (inst->dst.type != ELK_REGISTER_TYPE_F)
4227       return false;
4228 
4229    for (int i = 0; i < inst->sources; i++) {
4230       if (inst->src[i].type == ELK_REGISTER_TYPE_HF)
4231          return true;
4232    }
4233 
4234    return false;
4235 }
4236 
4237 static bool
4238 is_mixed_float_with_packed_fp16_dst(const elk_fs_inst *inst)
4239 {
4240    /* This opcode sometimes uses :W type on the destination even if the
4241     * destination is a :HF, because in gfx7 there is no support for :HF, and
4242     * thus it uses :W.
4243     */
4244    if (inst->opcode == ELK_OPCODE_F32TO16 &&
4245        inst->dst.stride == 1)
4246       return true;
4247 
4248    if (inst->dst.type != ELK_REGISTER_TYPE_HF ||
4249        inst->dst.stride != 1)
4250       return false;
4251 
4252    for (int i = 0; i < inst->sources; i++) {
4253       if (inst->src[i].type == ELK_REGISTER_TYPE_F)
4254          return true;
4255    }
4256 
4257    return false;
4258 }
4259 
4260 /**
4261  * Get the closest allowed SIMD width for instruction \p inst accounting for
4262  * some common regioning and execution control restrictions that apply to FPU
4263  * instructions.  These restrictions don't necessarily have any relevance to
4264  * instructions not executed by the FPU pipeline like extended math, control
4265  * flow or send message instructions.
4266  *
4267  * For virtual opcodes it's really up to the instruction -- In some cases
4268  * (e.g. where a virtual instruction unrolls into a simple sequence of FPU
4269  * instructions) it may simplify virtual instruction lowering if we can
4270  * enforce FPU-like regioning restrictions already on the virtual instruction,
4271  * in other cases (e.g. virtual send-like instructions) this may be
4272  * excessively restrictive.
4273  */
4274 static unsigned
4275 get_fpu_lowered_simd_width(const elk_fs_visitor *shader,
4276                            const elk_fs_inst *inst)
4277 {
4278    const struct elk_compiler *compiler = shader->compiler;
4279    const struct intel_device_info *devinfo = compiler->devinfo;
4280 
4281    /* Maximum execution size representable in the instruction controls. */
4282    unsigned max_width = MIN2(32, inst->exec_size);
4283 
4284    /* According to the PRMs:
4285     *  "A. In Direct Addressing mode, a source cannot span more than 2
4286     *      adjacent GRF registers.
4287     *   B. A destination cannot span more than 2 adjacent GRF registers."
4288     *
4289     * Look for the source or destination with the largest register region
4290     * which is the one that is going to limit the overall execution size of
4291     * the instruction due to this rule.
4292     */
4293    unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
4294 
4295    for (unsigned i = 0; i < inst->sources; i++)
4296       reg_count = MAX2(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
4297 
4298    /* Calculate the maximum execution size of the instruction based on the
4299     * factor by which it goes over the hardware limit of 2 GRFs.
4300     */
4301    const unsigned max_reg_count = 2 * reg_unit(devinfo);
4302    if (reg_count > max_reg_count)
4303       max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, max_reg_count));
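   /* Worked example (illustrative): a SIMD16 MOV with a :DF destination
    * writes 16 * 8 = 128 B, i.e. reg_count = 4 GRFs, so with max_reg_count
    * = 2 the execution size is limited to 16 / DIV_ROUND_UP(4, 2) = 8.
    */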
4304 
4305    /* According to the IVB PRMs:
4306     *  "When destination spans two registers, the source MUST span two
4307     *   registers. The exception to the above rule:
4308     *
4309     *    - When source is scalar, the source registers are not incremented.
4310     *    - When source is packed integer Word and destination is packed
4311     *      integer DWord, the source register is not incremented but the
4312     *      source sub register is incremented."
4313     *
4314     * The hardware specs from Gfx4 to Gfx7.5 mention similar regioning
4315     * restrictions.  The code below intentionally doesn't check whether the
4316     * destination type is integer because empirically the hardware doesn't
4317     * seem to care what the actual type is as long as it's dword-aligned.
4318     *
4319     * HSW PRMs also add a note to the second exception:
4320     *  "When lower 8 channels are disabled, the sub register of source1
4321     *   operand is not incremented. If the lower 8 channels are expected
4322     *   to be disabled, say by predication, the instruction must be split
4323     *   into pair of simd8 operations."
4324     *
4325     * We can't reliably know if the channels won't be disabled due to,
4326     * for example, IMASK. So, play it safe and disallow packed-word exception
4327     * for src1.
4328     */
4329    if (devinfo->ver < 8) {
4330       for (unsigned i = 0; i < inst->sources; i++) {
4331          /* IVB implements DF scalars as <0;2,1> regions. */
4332          const bool is_scalar_exception = is_uniform(inst->src[i]) &&
4333             (devinfo->platform == INTEL_PLATFORM_HSW || type_sz(inst->src[i].type) != 8);
4334          const bool is_packed_word_exception = i != 1 &&
4335             type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
4336             type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1;
4337 
4338          /* We check size_read(i) against size_written instead of REG_SIZE
4339           * because we want to properly handle SIMD32.  In SIMD32, you can end
4340           * up with writes to 4 registers and a source that reads 2 registers
4341           * and we may still need to lower all the way to SIMD8 in that case.
4342           */
4343          if (inst->size_written > REG_SIZE &&
4344              inst->size_read(i) != 0 &&
4345              inst->size_read(i) < inst->size_written &&
4346              !is_scalar_exception && !is_packed_word_exception) {
4347             const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
4348             max_width = MIN2(max_width, inst->exec_size / reg_count);
4349          }
4350       }
4351    }
4352 
4353    if (devinfo->ver < 6) {
4354       /* From the G45 PRM, Volume 4 Page 361:
4355        *
4356        *    "Operand Alignment Rule: With the exceptions listed below, a
4357        *     source/destination operand in general should be aligned to even
4358        *     256-bit physical register with a region size equal to two 256-bit
4359        *     physical registers."
4360        *
4361        * Normally we enforce this by allocating virtual registers to the
4362        * even-aligned class.  But we need to handle payload registers.
4363        */
4364       for (unsigned i = 0; i < inst->sources; i++) {
4365          if (inst->src[i].file == FIXED_GRF && (inst->src[i].nr & 1) &&
4366              inst->size_read(i) > REG_SIZE) {
4367             max_width = MIN2(max_width, 8);
4368          }
4369       }
4370    }
4371 
4372    /* From the IVB PRMs:
4373     *  "When an instruction is SIMD32, the low 16 bits of the execution mask
4374     *   are applied for both halves of the SIMD32 instruction. If different
4375     *   execution mask channels are required, split the instruction into two
4376     *   SIMD16 instructions."
4377     *
4378     * There is similar text in the HSW PRMs.  Gfx4-6 don't even implement
4379     * 32-wide control flow support in hardware and will behave similarly.
4380     */
4381    if (devinfo->ver < 8 && !inst->force_writemask_all)
4382       max_width = MIN2(max_width, 16);
4383 
4384    /* From the IVB PRMs (applies to HSW too):
4385     *  "Instructions with condition modifiers must not use SIMD32."
4386     *
4387     * From the BDW PRMs (applies to later hardware too):
4388     *  "Ternary instruction with condition modifiers must not use SIMD32."
4389     */
4390    if (inst->conditional_mod && (devinfo->ver < 8 ||
4391                                  inst->elk_is_3src(compiler)))
4392       max_width = MIN2(max_width, 16);
4393 
4394    /* From the IVB PRMs (applies to other devices that don't have the
4395     * intel_device_info::supports_simd16_3src flag set):
4396     *  "In Align16 access mode, SIMD16 is not allowed for DW operations and
4397     *   SIMD8 is not allowed for DF operations."
4398     */
4399    if (inst->elk_is_3src(compiler) && !devinfo->supports_simd16_3src)
4400       max_width = MIN2(max_width, inst->exec_size / reg_count);
4401 
4402    /* Pre-Gfx8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is
4403     * the 8-bit quarter of the execution mask signals specified in the
4404     * instruction control fields) for the second compressed half of any
4405     * single-precision instruction (for double-precision instructions
4406     * it's hardwired to use NibCtrl+1, at least on HSW), which means that
4407     * the EU will apply the wrong execution controls for the second
4408     * sequential GRF write if the number of channels per GRF is not exactly
4409     * eight in single-precision mode (or four in double-float mode).
4410     *
4411     * In this situation we calculate the maximum size of the split
4412     * instructions so they only ever write to a single register.
4413     */
4414    if (devinfo->ver < 8 && inst->size_written > REG_SIZE &&
4415        !inst->force_writemask_all) {
4416       const unsigned channels_per_grf = inst->exec_size /
4417          DIV_ROUND_UP(inst->size_written, REG_SIZE);
4418       const unsigned exec_type_size = get_exec_type_size(inst);
4419       assert(exec_type_size);
4420 
4421       /* The hardware shifts exactly 8 channels per compressed half of the
4422        * instruction in single-precision mode and exactly 4 in double-precision.
4423        */
4424       if (channels_per_grf != (exec_type_size == 8 ? 4 : 8))
4425          max_width = MIN2(max_width, channels_per_grf);
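      /* E.g. (illustrative): a SIMD16 instruction with a :F destination of
       * stride 2 writes 4 GRFs, giving channels_per_grf = 16 / 4 = 4 != 8,
       * so it is limited to SIMD4 here.
       */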
4426 
4427       /* Lower all non-force_writemask_all DF instructions to SIMD4 on IVB/BYT
4428        * because HW applies the same channel enable signals to both halves of
4429        * the compressed instruction which will be just wrong under
4430        * non-uniform control flow.
4431        */
4432       if (devinfo->verx10 == 70 &&
4433           (exec_type_size == 8 || type_sz(inst->dst.type) == 8))
4434          max_width = MIN2(max_width, 4);
4435    }
4436 
4437    /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
4438     * Float Operations:
4439     *
4440     *    "No SIMD16 in mixed mode when destination is f32. Instruction
4441     *     execution size must be no more than 8."
4442     *
4443     * FIXME: the simulator doesn't seem to complain if we don't do this, and
4444     * empirical testing with existing CTS tests shows that they pass just fine
4445     * without implementing it.  However, since our interpretation of the PRM
4446     * is that conversion MOVs between HF and F are still mixed-float
4447     * instructions (and therefore subject to this restriction), we decided to
4448     * split them to be safe.  It might be useful to investigate further and
4449     * lift the restriction if we can ensure that it is safe, since these
4450     * conversions are common when half-float types are involved: many
4451     * instructions do not support HF types, so conversions from/to F are
4452     * required.
4453     */
4454    if (is_mixed_float_with_fp32_dst(inst))
4455       max_width = MIN2(max_width, 8);
4456 
4457    /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
4458     * Float Operations:
4459     *
4460     *    "No SIMD16 in mixed mode when destination is packed f16 for both
4461     *     Align1 and Align16."
4462     */
4463    if (is_mixed_float_with_packed_fp16_dst(inst))
4464       max_width = MIN2(max_width, 8);
4465 
4466    /* Only power-of-two execution sizes are representable in the instruction
4467     * control fields.
4468     */
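   /* E.g. a computed max_width of 12 is rounded down to 8 (1 << 3). */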
4469    return 1 << util_logbase2(max_width);
4470 }
4471 
4472 /**
4473  * Get the maximum allowed SIMD width for instruction \p inst accounting for
4474  * various payload size restrictions that apply to sampler message
4475  * instructions.
4476  *
4477  * This is only intended to provide a maximum theoretical bound for the
4478  * execution size of the message based on the number of argument components
4479  * alone, which in most cases will determine whether the SIMD8 or SIMD16
4480  * variant of the message can be used, though some messages may have
4481  * additional restrictions not accounted for here (e.g. pre-ILK hardware uses
4482  * the message length to determine the exact SIMD width and argument count,
4483  * which makes a number of sampler message combinations impossible to
4484  * represent).
4485  *
4486  * Note: Platforms with monolithic SIMD16 double the possible SIMD widths,
4487  * so they change from (SIMD8, SIMD16) to (SIMD16, SIMD32).
4488  */
4489 static unsigned
4490 get_sampler_lowered_simd_width(const struct intel_device_info *devinfo,
4491                                const elk_fs_inst *inst)
4492 {
4493    /* If we have a min_lod parameter on anything other than a simple sample
4494     * message, it will push it over 5 arguments and we have to fall back to
4495     * SIMD8.
4496     */
4497    if (inst->opcode != ELK_SHADER_OPCODE_TEX &&
4498        inst->components_read(TEX_LOGICAL_SRC_MIN_LOD))
4499       return 8;
4500 
4501    /* Calculate the number of coordinate components that have to be present
4502     * assuming that additional arguments follow the texel coordinates in the
4503     * message payload.  On IVB+ there is no need for padding; on ILK-SNB we
4504     * need to pad to four or three components depending on the message;
4505     * pre-ILK we need to pad to at most three components.
4506     */
4507    const unsigned req_coord_components =
4508       (devinfo->ver >= 7 ||
4509        !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 :
4510       (devinfo->ver >= 5 && inst->opcode != ELK_SHADER_OPCODE_TXF_LOGICAL &&
4511                             inst->opcode != ELK_SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 :
4512       3;
4513 
4514    /* Calculate the total number of argument components that need to be passed
4515     * to the sampler unit.
4516     */
4517    const unsigned num_payload_components =
4518       MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE),
4519            req_coord_components) +
4520       inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
4521       inst->components_read(TEX_LOGICAL_SRC_LOD) +
4522       inst->components_read(TEX_LOGICAL_SRC_LOD2) +
4523       inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
4524       (inst->opcode == ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
4525        inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
4526       inst->components_read(TEX_LOGICAL_SRC_MCS);
4527 
4528    const unsigned simd_limit = reg_unit(devinfo) *
4529       (num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);
4530 
4531    /* SIMD16 (SIMD32 on Xe2) messages with more than five arguments exceed the
4532     * maximum message size supported by the sampler, regardless of whether a
4533     * header is provided or not.
4534     */
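   /* Illustrative example: a cube array shadow sample reads four coordinate
    * components plus a shadow comparator and an LOD, i.e. six payload
    * components, which exceeds the five-argument budget above and limits the
    * message to SIMD8 (SIMD16 on Xe2).
    */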
4535    return MIN2(inst->exec_size, simd_limit);
4536 }
4537 
4538 /**
4539  * Get the closest native SIMD width supported by the hardware for instruction
4540  * \p inst.  The instruction will be left untouched by
4541  * elk_fs_visitor::lower_simd_width() if the returned value is equal to the
4542  * original execution size.
4543  */
4544 static unsigned
4545 get_lowered_simd_width(const elk_fs_visitor *shader, const elk_fs_inst *inst)
4546 {
4547    const struct elk_compiler *compiler = shader->compiler;
4548    const struct intel_device_info *devinfo = compiler->devinfo;
4549 
4550    switch (inst->opcode) {
4551    case ELK_OPCODE_MOV:
4552    case ELK_OPCODE_SEL:
4553    case ELK_OPCODE_NOT:
4554    case ELK_OPCODE_AND:
4555    case ELK_OPCODE_OR:
4556    case ELK_OPCODE_XOR:
4557    case ELK_OPCODE_SHR:
4558    case ELK_OPCODE_SHL:
4559    case ELK_OPCODE_ASR:
4560    case ELK_OPCODE_CMPN:
4561    case ELK_OPCODE_CSEL:
4562    case ELK_OPCODE_F32TO16:
4563    case ELK_OPCODE_F16TO32:
4564    case ELK_OPCODE_BFREV:
4565    case ELK_OPCODE_BFE:
4566    case ELK_OPCODE_ADD:
4567    case ELK_OPCODE_MUL:
4568    case ELK_OPCODE_AVG:
4569    case ELK_OPCODE_FRC:
4570    case ELK_OPCODE_RNDU:
4571    case ELK_OPCODE_RNDD:
4572    case ELK_OPCODE_RNDE:
4573    case ELK_OPCODE_RNDZ:
4574    case ELK_OPCODE_LZD:
4575    case ELK_OPCODE_FBH:
4576    case ELK_OPCODE_FBL:
4577    case ELK_OPCODE_CBIT:
4578    case ELK_OPCODE_SAD2:
4579    case ELK_OPCODE_MAD:
4580    case ELK_OPCODE_LRP:
4581    case ELK_FS_OPCODE_PACK:
4582    case ELK_SHADER_OPCODE_SEL_EXEC:
4583    case ELK_SHADER_OPCODE_CLUSTER_BROADCAST:
4584    case ELK_SHADER_OPCODE_MOV_RELOC_IMM:
4585       return get_fpu_lowered_simd_width(shader, inst);
4586 
4587    case ELK_OPCODE_CMP: {
4588       /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that
4589        * when the destination is a GRF the dependency-clear bit on the flag
4590        * register is cleared early.
4591        *
4592        * Suggested workarounds are to disable coissuing CMP instructions
4593        * or to split CMP(16) instructions into two CMP(8) instructions.
4594        *
4595        * We choose to split into CMP(8) instructions since disabling
4596        * coissuing would affect CMP instructions not otherwise affected by
4597        * the errata.
4598        */
4599       const unsigned max_width = (devinfo->verx10 == 70 &&
4600                                   !inst->dst.is_null() ? 8 : ~0);
4601       return MIN2(max_width, get_fpu_lowered_simd_width(shader, inst));
4602    }
4603    case ELK_OPCODE_BFI1:
4604    case ELK_OPCODE_BFI2:
4605       /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
4606        * should
4607        *  "Force BFI instructions to be executed always in SIMD8."
4608        */
4609       return MIN2(devinfo->platform == INTEL_PLATFORM_HSW ? 8 : ~0u,
4610                   get_fpu_lowered_simd_width(shader, inst));
4611 
4612    case ELK_OPCODE_IF:
4613       assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16);
4614       return inst->exec_size;
4615 
4616    case ELK_SHADER_OPCODE_RCP:
4617    case ELK_SHADER_OPCODE_RSQ:
4618    case ELK_SHADER_OPCODE_SQRT:
4619    case ELK_SHADER_OPCODE_EXP2:
4620    case ELK_SHADER_OPCODE_LOG2:
4621    case ELK_SHADER_OPCODE_SIN:
4622    case ELK_SHADER_OPCODE_COS: {
4623       /* Unary extended math instructions are limited to SIMD8 on Gfx4 and
4624        * Gfx6. Extended Math Function is limited to SIMD8 with half-float.
4625        */
4626       if (devinfo->ver == 6 || devinfo->verx10 == 40)
4627          return MIN2(8, inst->exec_size);
4628       if (inst->dst.type == ELK_REGISTER_TYPE_HF)
4629          return MIN2(8, inst->exec_size);
4630       return MIN2(16, inst->exec_size);
4631    }
4632 
4633    case ELK_SHADER_OPCODE_POW: {
4634       /* SIMD16 is only allowed on Gfx7+. Extended Math Function is limited
4635        * to SIMD8 with half-float
4636        */
4637       if (devinfo->ver < 7)
4638          return MIN2(8, inst->exec_size);
4639       if (inst->dst.type == ELK_REGISTER_TYPE_HF)
4640          return MIN2(8, inst->exec_size);
4641       return MIN2(16, inst->exec_size);
4642    }
4643 
4644    case ELK_SHADER_OPCODE_USUB_SAT:
4645    case ELK_SHADER_OPCODE_ISUB_SAT:
4646       return get_fpu_lowered_simd_width(shader, inst);
4647 
4648    case ELK_SHADER_OPCODE_INT_QUOTIENT:
4649    case ELK_SHADER_OPCODE_INT_REMAINDER:
4650       /* Integer division is limited to SIMD8 on all generations. */
4651       return MIN2(8, inst->exec_size);
4652 
4653    case ELK_FS_OPCODE_LINTERP:
4654    case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
4655    case ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT:
4656    case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
4657    case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
4658    case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
4659       return MIN2(16, inst->exec_size);
4660 
4661    case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
4662       /* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch
4663        * message used to implement varying pull constant loads, so expand it
4664        * to SIMD16.  An alternative with longer message payload length but
4665        * shorter return payload would be to use the SIMD8 sampler message that
4666        * takes (header, u, v, r) as parameters instead of (header, u).
4667        */
4668       return (devinfo->ver == 4 ? 16 : MIN2(16, inst->exec_size));
4669 
4670    case ELK_FS_OPCODE_DDX_COARSE:
4671    case ELK_FS_OPCODE_DDX_FINE:
4672    case ELK_FS_OPCODE_DDY_COARSE:
4673    case ELK_FS_OPCODE_DDY_FINE:
4674       /* The implementation of this virtual opcode may require emitting
4675        * compressed Align16 instructions, which are severely limited on some
4676        * generations.
4677        *
4678        * From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
4679        * Region Restrictions):
4680        *
4681        *  "In Align16 access mode, SIMD16 is not allowed for DW operations
4682        *   and SIMD8 is not allowed for DF operations."
4683        *
4684        * In this context, "DW operations" means "operations acting on 32-bit
4685        * values", so it includes operations on floats.
4686        *
4687        * Gfx4 has a similar restriction.  From the i965 PRM, section 11.5.3
4688        * (Instruction Compression -> Rules and Restrictions):
4689        *
4690        *  "A compressed instruction must be in Align1 access mode. Align16
4691        *   mode instructions cannot be compressed."
4692        *
4693        * Similar text exists in the g45 PRM.
4694        *
4695        * Empirically, compressed align16 instructions using odd register
4696        * numbers don't appear to work on Sandybridge either.
4697        */
4698       return (devinfo->ver == 4 || devinfo->ver == 6 ||
4699               (devinfo->verx10 == 70) ?
4700               MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size));
4701 
4702    case ELK_SHADER_OPCODE_MULH:
4703       /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
4704        * is 8-wide on Gfx7+.
4705        */
4706       return (devinfo->ver >= 7 ? 8 :
4707               get_fpu_lowered_simd_width(shader, inst));
4708 
4709    case ELK_FS_OPCODE_FB_WRITE_LOGICAL:
4710       /* Gfx6 doesn't support SIMD16 depth writes but we cannot handle them
4711        * here.
4712        */
4713       assert(devinfo->ver != 6 ||
4714              inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE ||
4715              inst->exec_size == 8);
4716       /* Dual-source FB writes are unsupported in SIMD16 mode. */
4717       return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
4718               8 : MIN2(16, inst->exec_size));
4719 
4720    case ELK_SHADER_OPCODE_TEX_LOGICAL:
4721    case ELK_SHADER_OPCODE_TXF_CMS_LOGICAL:
4722    case ELK_SHADER_OPCODE_TXF_UMS_LOGICAL:
4723    case ELK_SHADER_OPCODE_TXF_MCS_LOGICAL:
4724    case ELK_SHADER_OPCODE_LOD_LOGICAL:
4725    case ELK_SHADER_OPCODE_TG4_LOGICAL:
4726    case ELK_SHADER_OPCODE_SAMPLEINFO_LOGICAL:
4727    case ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL:
4728    case ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL:
4729       return get_sampler_lowered_simd_width(devinfo, inst);
4730 
4731    /* On gfx12 parameters are fixed to 16-bit values and therefore they all
4732     * always fit regardless of the execution size.
4733     */
4734    case ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
4735       return MIN2(16, inst->exec_size);
4736 
4737    case ELK_SHADER_OPCODE_TXD_LOGICAL:
4738       /* TXD is unsupported in SIMD16 mode previous to Xe2. SIMD32 is still
4739       /* TXD is unsupported in SIMD16 mode prior to Xe2.  SIMD32 is still
4740        * unsupported on Xe2.
4741       return 8;
4742 
4743    case ELK_SHADER_OPCODE_TXL_LOGICAL:
4744    case ELK_FS_OPCODE_TXB_LOGICAL:
4745       /* Only one execution size is representable pre-ILK depending on whether
4746        * the shadow reference argument is present.
4747        */
4748       if (devinfo->ver == 4)
4749          return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8;
4750       else
4751          return get_sampler_lowered_simd_width(devinfo, inst);
4752 
4753    case ELK_SHADER_OPCODE_TXF_LOGICAL:
4754    case ELK_SHADER_OPCODE_TXS_LOGICAL:
4755       /* Gfx4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
4756        * messages.  Use SIMD16 instead.
4757        */
4758       if (devinfo->ver == 4)
4759          return 16;
4760       else
4761          return get_sampler_lowered_simd_width(devinfo, inst);
4762 
4763    case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
4764    case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
4765    case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
4766       return 8;
4767 
4768    case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
4769    case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
4770    case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
4771    case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
4772    case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
4773    case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
4774    case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
4775       return MIN2(16, inst->exec_size);
4776 
4777    case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
4778    case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
4779    case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
4780    case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
4781       return devinfo->ver <= 8 ? 8 : MIN2(16, inst->exec_size);
4782 
4783    case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
4784    case ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
4785    case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
4786       assert(inst->exec_size <= 16);
4787       return inst->exec_size;
4788 
4789    case ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
4790       return devinfo->has_lsc ? MIN2(16, inst->exec_size) : 8;
4791 
4792    case ELK_SHADER_OPCODE_URB_READ_LOGICAL:
4793    case ELK_SHADER_OPCODE_URB_WRITE_LOGICAL:
4794       return MIN2(8, inst->exec_size);
4795 
4796    case ELK_SHADER_OPCODE_QUAD_SWIZZLE: {
4797       const unsigned swiz = inst->src[1].ud;
4798       return (is_uniform(inst->src[0]) ?
4799                  get_fpu_lowered_simd_width(shader, inst) :
4800               type_sz(inst->src[0].type) == 4 ? 8 :
4801               swiz == ELK_SWIZZLE_XYXY || swiz == ELK_SWIZZLE_ZWZW ? 4 :
4802               get_fpu_lowered_simd_width(shader, inst));
4803    }
4804    case ELK_SHADER_OPCODE_MOV_INDIRECT: {
4805       /* From IVB and HSW PRMs:
4806        *
4807        * "2. When the destination requires two registers and the sources are
4808        *  indirect, the sources must use 1x1 regioning mode."
4809        *
4810        * In case of DF instructions in HSW/IVB, the exec_size is limited by
4811        * the EU decompression logic not handling VxH indirect addressing
4812        * correctly.
4813        */
4814       const unsigned max_size = (devinfo->ver >= 8 ? 2 : 1) * REG_SIZE;
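      /* E.g. (illustrative): a :D destination with stride 1 allows
       * 64 / 4 = 16 channels on Gfx8+ but only 32 / 4 = 8 on IVB/HSW, and a
       * :DF destination on IVB/HSW is limited to 32 / 8 = 4 channels.
       */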
4815       /* Prior to Broadwell, we only have 8 address subregisters. */
4816       return MIN3(devinfo->ver >= 8 ? 16 : 8,
4817                   max_size / (inst->dst.stride * type_sz(inst->dst.type)),
4818                   inst->exec_size);
4819    }
4820 
4821    case ELK_SHADER_OPCODE_LOAD_PAYLOAD: {
4822       const unsigned reg_count =
4823          DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
4824 
4825       if (reg_count > 2) {
4826          /* Only LOAD_PAYLOAD instructions with per-channel destination region
4827           * can be easily lowered (which excludes headers and heterogeneous
4828           * types).
4829           */
4830          assert(!inst->header_size);
4831          for (unsigned i = 0; i < inst->sources; i++)
4832             assert(type_sz(inst->dst.type) == type_sz(inst->src[i].type) ||
4833                    inst->src[i].file == BAD_FILE);
4834 
4835          return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
4836       } else {
4837          return inst->exec_size;
4838       }
4839    }
4840    default:
4841       return inst->exec_size;
4842    }
4843 }
4844 
4845 /**
4846  * Return true if splitting out the group of channels of instruction \p inst
4847  * given by lbld.group() requires allocating a temporary for the i-th source
4848  * of the lowered instruction.
4849  */
4850 static inline bool
4851 needs_src_copy(const fs_builder &lbld, const elk_fs_inst *inst, unsigned i)
4852 {
4853    return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
4854             (inst->components_read(i) == 1 &&
4855              lbld.dispatch_width() <= inst->exec_size)) ||
4856           (inst->flags_written(lbld.shader->devinfo) &
4857            flag_mask(inst->src[i], type_sz(inst->src[i].type)));
4858 }
4859 
4860 /**
4861  * Extract the data that would be consumed by the channel group given by
4862  * lbld.group() from the i-th source region of instruction \p inst and return
4863  * it as result in packed form.
4864  */
4865 static elk_fs_reg
4866 emit_unzip(const fs_builder &lbld, elk_fs_inst *inst, unsigned i)
4867 {
4868    assert(lbld.group() >= inst->group);
4869 
4870    /* Specified channel group from the source region. */
4871    const elk_fs_reg src = horiz_offset(inst->src[i], lbld.group() - inst->group);
4872 
4873    if (needs_src_copy(lbld, inst, i)) {
4874       /* Builder of the right width to perform the copy avoiding uninitialized
4875        * data if the lowered execution size is greater than the original
4876        * execution size of the instruction.
4877        */
4878       const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
4879                                               inst->exec_size), 0);
4880       const elk_fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i));
4881 
4882       for (unsigned k = 0; k < inst->components_read(i); ++k)
4883          cbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));
4884 
4885       return tmp;
4886 
4887    } else if (is_periodic(inst->src[i], lbld.dispatch_width())) {
4888       /* The source is invariant for all dispatch_width-wide groups of the
4889        * original region.
4890        */
4891       return inst->src[i];
4892 
4893    } else {
4894       /* We can just point the lowered instruction at the right channel group
4895        * from the original region.
4896        */
4897       return src;
4898    }
4899 }
4900 
4901 /**
4902  * Return true if splitting out the group of channels of instruction \p inst
4903  * given by lbld.group() requires allocating a temporary for the destination
4904  * of the lowered instruction and copying the data back to the original
4905  * destination region.
4906  */
4907 static inline bool
4908 needs_dst_copy(const fs_builder &lbld, const elk_fs_inst *inst)
4909 {
4910    if (inst->dst.is_null())
4911       return false;
4912 
4913    /* If the instruction writes more than one component we'll have to shuffle
4914     * the results of multiple lowered instructions in order to make sure that
4915     * they end up arranged correctly in the original destination region.
4916     */
4917    if (inst->size_written > inst->dst.component_size(inst->exec_size))
4918       return true;
4919 
4920    /* If the lowered execution size is larger than the original the result of
4921     * the instruction won't fit in the original destination, so we'll have to
4922     * allocate a temporary in any case.
4923     */
4924    if (lbld.dispatch_width() > inst->exec_size)
4925       return true;
4926 
4927    for (unsigned i = 0; i < inst->sources; i++) {
4928       /* If we already made a copy of the source for other reasons there won't
4929        * be any overlap with the destination.
4930        */
4931       if (needs_src_copy(lbld, inst, i))
4932          continue;
4933 
4934       /* In order to keep the logic simple we emit a copy whenever the
4935        * destination region doesn't exactly match an overlapping source, which
4936        * may point at the source and destination not being aligned group by
4937        * group which could cause one of the lowered instructions to overwrite
4938        * the data read from the same source by other lowered instructions.
4939        */
4940       if (regions_overlap(inst->dst, inst->size_written,
4941                           inst->src[i], inst->size_read(i)) &&
4942           !inst->dst.equals(inst->src[i]))
4943         return true;
4944    }
4945 
4946    return false;
4947 }
4948 
4949 /**
4950  * Insert data from a packed temporary into the channel group given by
4951  * lbld.group() of the destination region of instruction \p inst and return
4952  * the temporary as result.  Any copy instructions that are required for
4953  * unzipping the previous value (in the case of partial writes) will be
4954  * inserted using \p lbld_before and any copy instructions required for
4955  * zipping up the destination of \p inst will be inserted using \p lbld_after.
4956  */
4957 static elk_fs_reg
4958 emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after,
4959          elk_fs_inst *inst)
4960 {
4961    assert(lbld_before.dispatch_width() == lbld_after.dispatch_width());
4962    assert(lbld_before.group() == lbld_after.group());
4963    assert(lbld_after.group() >= inst->group);
4964 
4965    const struct intel_device_info *devinfo = lbld_before.shader->devinfo;
4966 
4967    /* Specified channel group from the destination region. */
4968    const elk_fs_reg dst = horiz_offset(inst->dst, lbld_after.group() - inst->group);
4969 
4970    if (!needs_dst_copy(lbld_after, inst)) {
4971       /* No need to allocate a temporary for the lowered instruction, just
4972        * take the right group of channels from the original region.
4973        */
4974       return dst;
4975    }
4976 
4977    /* Deal with the residency data part later */
4978    const unsigned residency_size = inst->has_sampler_residency() ?
4979       (reg_unit(devinfo) * REG_SIZE) : 0;
4980    const unsigned dst_size = (inst->size_written - residency_size) /
4981       inst->dst.component_size(inst->exec_size);
4982 
4983    const elk_fs_reg tmp = lbld_after.vgrf(inst->dst.type,
4984                                       dst_size + inst->has_sampler_residency());
4985 
4986    if (inst->predicate) {
4987       /* Handle predication by copying the original contents of the
4988        * destination into the temporary before emitting the lowered
4989        * instruction.
4990        */
4991       const fs_builder gbld_before =
4992          lbld_before.group(MIN2(lbld_before.dispatch_width(),
4993                                 inst->exec_size), 0);
4994       for (unsigned k = 0; k < dst_size; ++k) {
4995          gbld_before.MOV(offset(tmp, lbld_before, k),
4996                          offset(dst, inst->exec_size, k));
4997       }
4998    }
4999 
5000    const fs_builder gbld_after =
5001       lbld_after.group(MIN2(lbld_after.dispatch_width(),
5002                             inst->exec_size), 0);
5003    for (unsigned k = 0; k < dst_size; ++k) {
5004       /* Use a builder of the right width to perform the copy avoiding
5005        * uninitialized data if the lowered execution size is greater than the
5006        * original execution size of the instruction.
5007        */
5008       gbld_after.MOV(offset(dst, inst->exec_size, k),
5009                      offset(tmp, lbld_after, k));
5010    }
5011 
5012    if (inst->has_sampler_residency()) {
5013       /* Sampler messages with residency need special attention.  The first
5014        * lane of the last component holds the Pixel Null Mask (bits 0:15) and
5015        * some upper bits we need to discard (bits 16:31).  We have to build a
5016        * single 32-bit value for the SIMD32 message out of two SIMD16 16-bit
5017        * values.
5018        */
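      /* (Illustrative: for the high SIMD16 half of a SIMD32 message,
       * gbld_after.group() is 16, so the UW result lands at byte offset 2 of
       * the residency dword, while the low half writes byte offset 0.)
       */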
5019       const fs_builder rbld = gbld_after.exec_all().group(1, 0);
5020       elk_fs_reg local_res_reg = component(
5021          retype(offset(tmp, lbld_before, dst_size),
5022                 ELK_REGISTER_TYPE_UW), 0);
5023       elk_fs_reg final_res_reg =
5024          retype(byte_offset(inst->dst,
5025                             inst->size_written - residency_size +
5026                             gbld_after.group() / 8),
5027                 ELK_REGISTER_TYPE_UW);
5028       rbld.MOV(final_res_reg, local_res_reg);
5029    }
5030 
5031    return tmp;
5032 }
5033 
5034 bool
5035 elk_fs_visitor::lower_simd_width()
5036 {
5037    bool progress = false;
5038 
5039    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
5040       const unsigned lower_width = get_lowered_simd_width(this, inst);
5041 
5042       if (lower_width != inst->exec_size) {
5043          /* Builder matching the original instruction.  We may also need to
5044           * emit an instruction of width larger than the original, set the
5045           * execution size of the builder to the highest of both for now so
5046           * we're sure that both cases can be handled.
5047           */
5048          const unsigned max_width = MAX2(inst->exec_size, lower_width);
5049 
5050          const fs_builder bld =
5051             fs_builder(this, MAX2(max_width, dispatch_width)).at_end();
5052          const fs_builder ibld = bld.at(block, inst)
5053                                     .exec_all(inst->force_writemask_all)
5054                                     .group(max_width, inst->group / max_width);
5055 
5056          /* Split the copies in chunks of the execution width of either the
5057           * original or the lowered instruction, whichever is lower.
5058           */
5059          const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
5060          const unsigned residency_size = inst->has_sampler_residency() ?
5061             (reg_unit(devinfo) * REG_SIZE) : 0;
5062          const unsigned dst_size =
5063             (inst->size_written - residency_size) /
5064             inst->dst.component_size(inst->exec_size);
5065 
5066          assert(!inst->writes_accumulator && !inst->mlen);
5067 
5068          /* Inserting the zip, unzip, and duplicated instructions in all of
5069           * the right spots is somewhat tricky.  All of the unzip and any
5070           * instructions from the zip which unzip the destination prior to
5071           * writing need to happen before all of the per-group instructions
5072           * and the zip instructions need to happen after.  In order to sort
5073           * this all out, we insert the unzip instructions before \p inst,
5074           * insert the per-group instructions after \p inst (i.e. before
5075           * inst->next), and insert the zip instructions before the
5076           * instruction after \p inst.  Since we are inserting instructions
5077           * after \p inst, inst->next is a moving target and we need to save
5078           * it off here so that we insert the zip instructions in the right
5079           * place.
5080           *
5081           * Since we're inserting split instructions after after_inst, the
5082           * instructions will end up in the reverse order that we insert them.
5083           * However, certain render target writes require that the low group
5084           * instructions come before the high group.  From the Ivy Bridge PRM
5085           * Vol. 4, Pt. 1, Section 3.9.11:
5086           *
5087           *    "If multiple SIMD8 Dual Source messages are delivered by the
5088           *    pixel shader thread, each SIMD8_DUALSRC_LO message must be
5089           *    issued before the SIMD8_DUALSRC_HI message with the same Slot
5090           *    Group Select setting."
5091           *
5092           * And, from Section 3.9.11.1 of the same PRM:
5093           *
5094           *    "When SIMD32 or SIMD16 PS threads send render target writes
5095           *    with multiple SIMD8 and SIMD16 messages, the following must
5096           *    hold:
5097           *
5098           *    All the slots (as described above) must have a corresponding
5099           *    render target write irrespective of the slot's validity. A slot
5100           *    is considered valid when at least one sample is enabled. For
5101           *    example, a SIMD16 PS thread must send two SIMD8 render target
5102           *    writes to cover all the slots.
5103           *
5104           *    PS thread must send SIMD render target write messages with
5105           *    increasing slot numbers. For example, SIMD16 thread has
5106           *    Slot[15:0] and if two SIMD8 render target writes are used, the
5107           *    first SIMD8 render target write must send Slot[7:0] and the
5108           *    next one must send Slot[15:8]."
5109           *
5110           * In order to make low group instructions come before high group
5111           * instructions (this is required for some render target writes), we
5112           * split from the highest group to lowest.
5113           */
5114          exec_node *const after_inst = inst->next;
5115          for (int i = n - 1; i >= 0; i--) {
5116             /* Emit a copy of the original instruction with the lowered width.
5117              * If the EOT flag was set throw it away except for the last
5118              * instruction to avoid killing the thread prematurely.
5119              */
5120             elk_fs_inst split_inst = *inst;
5121             split_inst.exec_size = lower_width;
5122             split_inst.eot = inst->eot && i == int(n - 1);
5123 
5124             /* Select the correct channel enables for the i-th group, then
5125              * transform the sources and destination and emit the lowered
5126              * instruction.
5127              */
5128             const fs_builder lbld = ibld.group(lower_width, i);
5129 
5130             for (unsigned j = 0; j < inst->sources; j++)
5131                split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);
5132 
5133             split_inst.dst = emit_zip(lbld.at(block, inst),
5134                                       lbld.at(block, after_inst), inst);
5135             split_inst.size_written =
5136                split_inst.dst.component_size(lower_width) * dst_size +
5137                residency_size;
5138 
5139             lbld.at(block, inst->next).emit(split_inst);
5140          }
5141 
5142          inst->remove(block);
5143          progress = true;
5144       }
5145    }
5146 
5147    if (progress)
5148       invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
5149 
5150    return progress;
5151 }
5152 
5153 /**
5154  * Transform barycentric vectors into the interleaved form expected by the PLN
5155  * instruction and returned by the Gfx7+ PI shared function.
5156  *
5157  * For channels 0-15 in SIMD16 mode they are expected to be laid out as
5158  * follows in the register file:
5159  *
5160  *    rN+0: X[0-7]
5161  *    rN+1: Y[0-7]
5162  *    rN+2: X[8-15]
5163  *    rN+3: Y[8-15]
5164  *
5165  * There is no need to handle SIMD32 here -- This is expected to be run after
5166  * SIMD lowering, since SIMD lowering relies on vectors having the standard
5167  * component layout.
5168  */
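 *
 * (For reference, assuming the standard packed vector layout, the
 * non-interleaved input would instead be rN+0: X[0-7], rN+1: X[8-15],
 * rN+2: Y[0-7], rN+3: Y[8-15].)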
5169 bool
5170 elk_fs_visitor::lower_barycentrics()
5171 {
5172    const bool has_interleaved_layout = devinfo->has_pln ||
5173                                        devinfo->ver >= 7;
5174    bool progress = false;
5175 
5176    if (stage != MESA_SHADER_FRAGMENT || !has_interleaved_layout)
5177       return false;
5178 
5179    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
5180       if (inst->exec_size < 16)
5181          continue;
5182 
5183       const fs_builder ibld(this, block, inst);
5184       const fs_builder ubld = ibld.exec_all().group(8, 0);
5185 
5186       switch (inst->opcode) {
5187       case ELK_FS_OPCODE_LINTERP : {
5188          assert(inst->exec_size == 16);
5189          const elk_fs_reg tmp = ibld.vgrf(inst->src[0].type, 2);
5190          elk_fs_reg srcs[4];
5191 
5192          for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
5193             srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2),
5194                                    8 * (i / 2));
5195 
5196          ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));
5197 
5198          inst->src[0] = tmp;
5199          progress = true;
5200          break;
5201       }
5202       case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
5203       case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
5204       case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
5205          assert(inst->exec_size == 16);
5206          const elk_fs_reg tmp = ibld.vgrf(inst->dst.type, 2);
5207 
5208          for (unsigned i = 0; i < 2; i++) {
5209             for (unsigned g = 0; g < inst->exec_size / 8; g++) {
5210                elk_fs_inst *mov = ibld.at(block, inst->next).group(8, g)
5211                                   .MOV(horiz_offset(offset(inst->dst, ibld, i),
5212                                                     8 * g),
5213                                        offset(tmp, ubld, 2 * g + i));
5214                mov->predicate = inst->predicate;
5215                mov->predicate_inverse = inst->predicate_inverse;
5216                mov->flag_subreg = inst->flag_subreg;
5217             }
5218          }
5219 
5220          inst->dst = tmp;
5221          progress = true;
5222          break;
5223       }
5224       default:
5225          break;
5226       }
5227    }
5228 
5229    if (progress)
5230       invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
5231 
5232    return progress;
5233 }
5234 
5235 bool
5236 elk_fs_visitor::lower_find_live_channel()
5237 {
5238    bool progress = false;
5239 
5240    if (devinfo->ver < 8)
5241       return false;
5242 
5243    bool packed_dispatch =
5244       elk_stage_has_packed_dispatch(devinfo, stage, stage_prog_data);
5245    bool vmask =
5246       stage == MESA_SHADER_FRAGMENT &&
5247       elk_wm_prog_data(stage_prog_data)->uses_vmask;
5248 
5249    foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
5250       if (inst->opcode != ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL &&
5251           inst->opcode != ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL)
5252          continue;
5253 
5254       bool first = inst->opcode == ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL;
5255 
5256       /* Getting the first active channel index is easy on Gfx8: Just find
5257        * the first bit set in the execution mask.  The register exists on
5258        * HSW already but it reads back as all ones when the current
5259        * instruction has execution masking disabled, so it's kind of
5260        * useless there.
5261        */
5262       elk_fs_reg exec_mask(retype(elk_mask_reg(0), ELK_REGISTER_TYPE_UD));
5263 
5264       const fs_builder ibld(this, block, inst);
5265       if (!inst->is_partial_write())
5266          ibld.emit_undef_for_dst(inst);
5267 
5268       const fs_builder ubld = fs_builder(this, block, inst).exec_all().group(1, 0);
5269 
5270       /* ce0 doesn't consider the thread dispatch mask (DMask or VMask),
5271        * so combine the execution and dispatch masks to obtain the true mask.
5272        *
5273        * If we're looking for the first live channel, and we have packed
5274        * dispatch, we can skip this step, as we know all dispatched channels
5275        * will appear at the front of the mask.
5276        */
5277       if (!(first && packed_dispatch)) {
5278          elk_fs_reg mask = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5279          ubld.UNDEF(mask);
5280          ubld.emit(ELK_SHADER_OPCODE_READ_SR_REG, mask, elk_imm_ud(vmask ? 3 : 2));
5281 
5282          /* Quarter control has the effect of magically shifting the value of
5283           * ce0 so you'll get the first/last active channel relative to the
5284           * specified quarter control as result.
5285           */
5286          if (inst->group > 0)
5287             ubld.SHR(mask, mask, elk_imm_ud(ALIGN(inst->group, 8)));
5288 
5289          ubld.AND(mask, exec_mask, mask);
5290          exec_mask = mask;
5291       }
5292 
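      /* Illustrative example: with a combined mask of 0x0000f00f, FBL below
       * yields 0 for the first live channel and 31 - LZD = 31 - 16 = 15 for
       * the last one.
       */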
5293       if (first) {
5294          ubld.FBL(inst->dst, exec_mask);
5295       } else {
5296          elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD, 1);
5297          ubld.UNDEF(tmp);
5298          ubld.LZD(tmp, exec_mask);
5299          ubld.ADD(inst->dst, negate(tmp), elk_imm_uw(31));
5300       }
5301 
5302       inst->remove(block);
5303       progress = true;
5304    }
5305 
5306    if (progress)
5307       invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
5308 
5309    return progress;
5310 }
5311 
5312 void
5313 elk_fs_visitor::dump_instructions_to_file(FILE *file) const
5314 {
5315    if (cfg) {
5316       const register_pressure &rp = regpressure_analysis.require();
5317       unsigned ip = 0, max_pressure = 0;
5318       unsigned cf_count = 0;
5319       foreach_block_and_inst(block, elk_backend_instruction, inst, cfg) {
5320          if (inst->is_control_flow_end())
5321             cf_count -= 1;
5322 
5323          max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
5324          fprintf(file, "{%3d} %4d: ", rp.regs_live_at_ip[ip], ip);
5325          for (unsigned i = 0; i < cf_count; i++)
5326             fprintf(file, "  ");
5327          dump_instruction(inst, file);
5328          ip++;
5329 
5330          if (inst->is_control_flow_begin())
5331             cf_count += 1;
5332       }
5333       fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
5334    } else {
5335       int ip = 0;
5336       foreach_in_list(elk_backend_instruction, inst, &instructions) {
5337          fprintf(file, "%4d: ", ip++);
5338          dump_instruction(inst, file);
5339       }
5340    }
5341 }
5342 
5343 void
5344 elk_fs_visitor::dump_instruction_to_file(const elk_backend_instruction *be_inst, FILE *file) const
5345 {
5346    const elk_fs_inst *inst = (const elk_fs_inst *)be_inst;
5347 
5348    if (inst->predicate) {
5349       fprintf(file, "(%cf%d.%d) ",
5350               inst->predicate_inverse ? '-' : '+',
5351               inst->flag_subreg / 2,
5352               inst->flag_subreg % 2);
5353    }
5354 
5355    fprintf(file, "%s", elk_instruction_name(&compiler->isa, inst->opcode));
5356    if (inst->saturate)
5357       fprintf(file, ".sat");
5358    if (inst->conditional_mod) {
5359       fprintf(file, "%s", elk_conditional_modifier[inst->conditional_mod]);
5360       if (!inst->predicate &&
5361           (devinfo->ver < 5 || (inst->opcode != ELK_OPCODE_SEL &&
5362                                 inst->opcode != ELK_OPCODE_CSEL &&
5363                                 inst->opcode != ELK_OPCODE_IF &&
5364                                 inst->opcode != ELK_OPCODE_WHILE))) {
5365          fprintf(file, ".f%d.%d", inst->flag_subreg / 2,
5366                  inst->flag_subreg % 2);
5367       }
5368    }
5369    fprintf(file, "(%d) ", inst->exec_size);
5370 
5371    if (inst->mlen) {
5372       fprintf(file, "(mlen: %d) ", inst->mlen);
5373    }
5374 
5375    if (inst->eot) {
5376       fprintf(file, "(EOT) ");
5377    }
5378 
5379    switch (inst->dst.file) {
5380    case VGRF:
5381       fprintf(file, "vgrf%d", inst->dst.nr);
5382       break;
5383    case FIXED_GRF:
5384       fprintf(file, "g%d", inst->dst.nr);
5385       break;
5386    case MRF:
5387       fprintf(file, "m%d", inst->dst.nr);
5388       break;
5389    case BAD_FILE:
5390       fprintf(file, "(null)");
5391       break;
5392    case UNIFORM:
5393       fprintf(file, "***u%d***", inst->dst.nr);
5394       break;
5395    case ATTR:
5396       fprintf(file, "***attr%d***", inst->dst.nr);
5397       break;
5398    case ARF:
5399       switch (inst->dst.nr) {
5400       case ELK_ARF_NULL:
5401          fprintf(file, "null");
5402          break;
5403       case ELK_ARF_ADDRESS:
5404          fprintf(file, "a0.%d", inst->dst.subnr);
5405          break;
5406       case ELK_ARF_ACCUMULATOR:
5407          fprintf(file, "acc%d", inst->dst.subnr);
5408          break;
5409       case ELK_ARF_FLAG:
5410          fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
5411          break;
5412       default:
5413          fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
5414          break;
5415       }
5416       break;
5417    case IMM:
5418       unreachable("not reached");
5419    }
5420 
5421    if (inst->dst.offset ||
5422        (inst->dst.file == VGRF &&
5423         alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
5424       const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
5425       fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
5426               inst->dst.offset % reg_size);
5427    }
5428 
5429    if (inst->dst.stride != 1)
5430       fprintf(file, "<%u>", inst->dst.stride);
5431    fprintf(file, ":%s, ", elk_reg_type_to_letters(inst->dst.type));
5432 
5433    for (int i = 0; i < inst->sources; i++) {
5434       if (inst->src[i].negate)
5435          fprintf(file, "-");
5436       if (inst->src[i].abs)
5437          fprintf(file, "|");
5438       switch (inst->src[i].file) {
5439       case VGRF:
5440          fprintf(file, "vgrf%d", inst->src[i].nr);
5441          break;
5442       case FIXED_GRF:
5443          fprintf(file, "g%d", inst->src[i].nr);
5444          break;
5445       case MRF:
5446          fprintf(file, "***m%d***", inst->src[i].nr);
5447          break;
5448       case ATTR:
5449          fprintf(file, "attr%d", inst->src[i].nr);
5450          break;
5451       case UNIFORM:
5452          fprintf(file, "u%d", inst->src[i].nr);
5453          break;
5454       case BAD_FILE:
5455          fprintf(file, "(null)");
5456          break;
5457       case IMM:
5458          switch (inst->src[i].type) {
5459          case ELK_REGISTER_TYPE_HF:
5460             fprintf(file, "%-ghf", _mesa_half_to_float(inst->src[i].ud & 0xffff));
5461             break;
5462          case ELK_REGISTER_TYPE_F:
5463             fprintf(file, "%-gf", inst->src[i].f);
5464             break;
5465          case ELK_REGISTER_TYPE_DF:
5466             fprintf(file, "%fdf", inst->src[i].df);
5467             break;
5468          case ELK_REGISTER_TYPE_W:
5469          case ELK_REGISTER_TYPE_D:
5470             fprintf(file, "%dd", inst->src[i].d);
5471             break;
5472          case ELK_REGISTER_TYPE_UW:
5473          case ELK_REGISTER_TYPE_UD:
5474             fprintf(file, "%uu", inst->src[i].ud);
5475             break;
5476          case ELK_REGISTER_TYPE_Q:
5477             fprintf(file, "%" PRId64 "q", inst->src[i].d64);
5478             break;
5479          case ELK_REGISTER_TYPE_UQ:
5480             fprintf(file, "%" PRIu64 "uq", inst->src[i].u64);
5481             break;
5482          case ELK_REGISTER_TYPE_VF:
5483             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
5484                     elk_vf_to_float((inst->src[i].ud >>  0) & 0xff),
5485                     elk_vf_to_float((inst->src[i].ud >>  8) & 0xff),
5486                     elk_vf_to_float((inst->src[i].ud >> 16) & 0xff),
5487                     elk_vf_to_float((inst->src[i].ud >> 24) & 0xff));
5488             break;
5489          case ELK_REGISTER_TYPE_V:
5490          case ELK_REGISTER_TYPE_UV:
5491             fprintf(file, "%08x%s", inst->src[i].ud,
5492                     inst->src[i].type == ELK_REGISTER_TYPE_V ? "V" : "UV");
5493             break;
5494          default:
5495             fprintf(file, "???");
5496             break;
5497          }
5498          break;
5499       case ARF:
5500          switch (inst->src[i].nr) {
5501          case ELK_ARF_NULL:
5502             fprintf(file, "null");
5503             break;
5504          case ELK_ARF_ADDRESS:
5505             fprintf(file, "a0.%d", inst->src[i].subnr);
5506             break;
5507          case ELK_ARF_ACCUMULATOR:
5508             fprintf(file, "acc%d", inst->src[i].subnr);
5509             break;
5510          case ELK_ARF_FLAG:
5511             fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
5512             break;
5513          default:
5514             fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
5515             break;
5516          }
5517          break;
5518       }
5519 
5520       if (inst->src[i].offset ||
5521           (inst->src[i].file == VGRF &&
5522            alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
5523          const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
5524          fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
5525                  inst->src[i].offset % reg_size);
5526       }
5527 
5528       if (inst->src[i].abs)
5529          fprintf(file, "|");
5530 
5531       if (inst->src[i].file != IMM) {
5532          unsigned stride;
5533          if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
5534             unsigned hstride = inst->src[i].hstride;
5535             stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
5536          } else {
5537             stride = inst->src[i].stride;
5538          }
5539          if (stride != 1)
5540             fprintf(file, "<%u>", stride);
5541 
5542          fprintf(file, ":%s", elk_reg_type_to_letters(inst->src[i].type));
5543       }
5544 
5545       if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
5546          fprintf(file, ", ");
5547    }
5548 
5549    fprintf(file, " ");
5550 
5551    if (inst->force_writemask_all)
5552       fprintf(file, "NoMask ");
5553 
5554    if (inst->exec_size != dispatch_width)
5555       fprintf(file, "group%d ", inst->group);
5556 
5557    fprintf(file, "\n");
5558 }
5559 
5560 elk::register_pressure::register_pressure(const elk_fs_visitor *v)
5561 {
5562    const fs_live_variables &live = v->live_analysis.require();
5563    const unsigned num_instructions = v->cfg->num_blocks ?
5564       v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;
5565 
5566    regs_live_at_ip = new unsigned[num_instructions]();
5567 
5568    for (unsigned reg = 0; reg < v->alloc.count; reg++) {
5569       for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
5570          regs_live_at_ip[ip] += v->alloc.sizes[reg];
5571    }
5572 
5573    const unsigned payload_count = v->first_non_payload_grf;
5574 
5575    int *payload_last_use_ip = new int[payload_count];
5576    v->calculate_payload_ranges(payload_count, payload_last_use_ip);
5577 
5578    for (unsigned reg = 0; reg < payload_count; reg++) {
5579       for (int ip = 0; ip < payload_last_use_ip[reg]; ip++)
5580          ++regs_live_at_ip[ip];
5581    }
5582 
5583    delete[] payload_last_use_ip;
5584 }
5585 
5586 elk::register_pressure::~register_pressure()
5587 {
5588    delete[] regs_live_at_ip;
5589 }
5590 
5591 void
5592 elk_fs_visitor::invalidate_analysis(elk::analysis_dependency_class c)
5593 {
5594    elk_backend_shader::invalidate_analysis(c);
5595    live_analysis.invalidate(c);
5596    regpressure_analysis.invalidate(c);
5597 }
5598 
5599 void
5600 elk_fs_visitor::debug_optimizer(const nir_shader *nir,
5601                             const char *pass_name,
5602                             int iteration, int pass_num) const
5603 {
5604    if (!elk_should_print_shader(nir, DEBUG_OPTIMIZER))
5605       return;
5606 
5607    char *filename;
5608    int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
5609                       debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
5610                       _mesa_shader_stage_to_abbrev(stage), dispatch_width, nir->info.name,
5611                       iteration, pass_num, pass_name);
5612    if (ret == -1)
5613       return;
5614    dump_instructions(filename);
5615    free(filename);
5616 }
5617 
5618 void
5619 elk_fs_visitor::optimize()
5620 {
5621    debug_optimizer(nir, "start", 0, 0);
5622 
5623    /* Start by validating the shader we currently have. */
5624    validate();
5625 
5626    bool progress = false;
5627    int iteration = 0;
5628    int pass_num = 0;
5629 
5630 #define OPT(pass, args...) ({                                           \
5631       pass_num++;                                                       \
5632       bool this_progress = pass(args);                                  \
5633                                                                         \
5634       if (this_progress)                                                \
5635          debug_optimizer(nir, #pass, iteration, pass_num);              \
5636                                                                         \
5637       validate();                                                       \
5638                                                                         \
5639       progress = progress || this_progress;                             \
5640       this_progress;                                                    \
5641    })
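
   /* Usage note (pattern taken from the passes below): because OPT()
    * evaluates to the pass's own progress flag, callers can chain passes on
    * it, e.g. `if (OPT(opt_copy_propagation)) OPT(opt_algebraic);` re-runs
    * algebraic simplification only when copy propagation changed something.
    */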
5642 
5643    assign_constant_locations();
5644    OPT(lower_constant_loads);
5645 
5646    validate();
5647 
5648    OPT(split_virtual_grfs);
5649 
5650    /* Before anything else, eliminate dead code.  The results of some NIR
5651     * instructions may effectively be calculated twice: once when the
5652     * instruction is encountered, and again when the user of that result is
5653     * encountered.  Wipe those away before algebraic optimizations and
5654     * especially copy propagation can mix things up.
5655     */
5656    OPT(dead_code_eliminate);
5657 
5658    OPT(remove_extra_rounding_modes);
5659 
5660    do {
5661       progress = false;
5662       pass_num = 0;
5663       iteration++;
5664 
5665       OPT(remove_duplicate_mrf_writes);
5666 
5667       OPT(opt_algebraic);
5668       OPT(opt_cse);
5669       OPT(opt_copy_propagation);
5670       OPT(elk_opt_predicated_break, this);
5671       OPT(opt_cmod_propagation);
5672       OPT(dead_code_eliminate);
5673       OPT(opt_peephole_sel);
5674       OPT(elk_dead_control_flow_eliminate, this);
5675       OPT(opt_saturate_propagation);
5676       OPT(register_coalesce);
5677       OPT(compute_to_mrf);
5678       OPT(eliminate_find_live_channel);
5679 
5680       OPT(compact_virtual_grfs);
5681    } while (progress);
5682 
5683    progress = false;
5684    pass_num = 0;
5685 
5686    if (OPT(lower_pack)) {
5687       OPT(register_coalesce);
5688       OPT(dead_code_eliminate);
5689    }
5690 
5691    OPT(lower_simd_width);
5692    OPT(lower_barycentrics);
5693    OPT(lower_logical_sends);
5694 
5695    /* After logical SEND lowering. */
5696 
5697    if (OPT(opt_copy_propagation))
5698       OPT(opt_algebraic);
5699 
5700    /* Identify trailing zeros in LOAD_PAYLOADs of sampler messages.
5701     * Do this before splitting SENDs.
5702     */
5703    if (devinfo->ver >= 7) {
5704       if (OPT(opt_zero_samples) && OPT(opt_copy_propagation))
5705          OPT(opt_algebraic);
5706    }
5707 
5708    if (progress) {
5709       if (OPT(opt_copy_propagation))
5710          OPT(opt_algebraic);
5711 
5712       /* Run after logical send lowering to give it a chance to CSE the
5713        * LOAD_PAYLOAD instructions created to construct the payloads of
5714        * e.g. texturing messages in cases where it wasn't possible to CSE the
5715        * whole logical instruction.
5716        */
5717       OPT(opt_cse);
5718       OPT(register_coalesce);
5719       OPT(compute_to_mrf);
5720       OPT(dead_code_eliminate);
5721       OPT(remove_duplicate_mrf_writes);
5722       OPT(opt_peephole_sel);
5723    }
5724 
5725    OPT(opt_redundant_halt);
5726 
5727    if (OPT(lower_load_payload)) {
5728       OPT(split_virtual_grfs);
5729 
5730       /* Lower 64 bit MOVs generated by payload lowering. */
5731       if (!devinfo->has_64bit_float || !devinfo->has_64bit_int)
5732          OPT(opt_algebraic);
5733 
5734       OPT(register_coalesce);
5735       OPT(lower_simd_width);
5736       OPT(compute_to_mrf);
5737       OPT(dead_code_eliminate);
5738    }
5739 
5740    OPT(opt_combine_constants);
5741    if (OPT(lower_integer_multiplication)) {
5742       /* If lower_integer_multiplication made progress, it may have produced
5743        * some 32x32-bit MULs in the process of lowering 64-bit MULs.  Run it
5744        * one more time to clean those up if they exist.
5745        */
5746       OPT(lower_integer_multiplication);
5747    }
5748    OPT(lower_sub_sat);
5749 
5750    if (devinfo->ver <= 5 && OPT(lower_minmax)) {
5751       OPT(opt_cmod_propagation);
5752       OPT(opt_cse);
5753       if (OPT(opt_copy_propagation))
5754          OPT(opt_algebraic);
5755       OPT(dead_code_eliminate);
5756    }
5757 
5758    progress = false;
5759    OPT(lower_regioning);
5760    if (progress) {
5761       if (OPT(opt_copy_propagation))
5762          OPT(opt_algebraic);
5763       OPT(dead_code_eliminate);
5764       OPT(lower_simd_width);
5765    }
5766 
5767    OPT(lower_uniform_pull_constant_loads);
5768 
5769    OPT(lower_find_live_channel);
5770 
5771    validate();
5772 }
5773 
5774 /**
5775  * Three-source instructions must have a GRF/MRF destination register.
5776  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
5777  */
5778 void
5779 elk_fs_visitor::fixup_3src_null_dest()
5780 {
5781    bool progress = false;
5782 
5783    foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
5784       if (inst->elk_is_3src(compiler) && inst->dst.is_null()) {
5785          inst->dst = elk_fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
5786                             inst->dst.type);
5787          progress = true;
5788       }
5789    }
5790 
5791    if (progress)
5792       invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
5793                           DEPENDENCY_VARIABLES);
5794 }
5795 
5796 uint32_t
5797 elk_fs_visitor::compute_max_register_pressure()
5798 {
5799    const register_pressure &rp = regpressure_analysis.require();
5800    uint32_t ip = 0, max_pressure = 0;
5801    foreach_block_and_inst(block, elk_backend_instruction, inst, cfg) {
5802       max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
5803       ip++;
5804    }
5805    return max_pressure;
5806 }
5807 
5808 static elk_fs_inst **
5809 save_instruction_order(const struct elk_cfg_t *cfg)
5810 {
5811    /* Before we schedule anything, stash off the instruction order as an array
5812     * of elk_fs_inst *.  This way, we can reset it between scheduling passes to
5813     * prevent dependencies between the different scheduling modes.
5814     */
5815    int num_insts = cfg->last_block()->end_ip + 1;
5816    elk_fs_inst **inst_arr = new elk_fs_inst * [num_insts];
5817 
5818    int ip = 0;
5819    foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
5820       assert(ip >= block->start_ip && ip <= block->end_ip);
5821       inst_arr[ip++] = inst;
5822    }
5823    assert(ip == num_insts);
5824 
5825    return inst_arr;
5826 }
5827 
5828 static void
5829 restore_instruction_order(struct elk_cfg_t *cfg, elk_fs_inst **inst_arr)
5830 {
5831    ASSERTED int num_insts = cfg->last_block()->end_ip + 1;
5832 
5833    int ip = 0;
5834    foreach_block (block, cfg) {
5835       block->instructions.make_empty();
5836 
5837       assert(ip == block->start_ip);
5838       for (; ip <= block->end_ip; ip++)
5839          block->instructions.push_tail(inst_arr[ip]);
5840    }
5841    assert(ip == num_insts);
5842 }
5843 
5844 void
5845 elk_fs_visitor::allocate_registers(bool allow_spilling)
5846 {
5847    bool allocated;
5848 
5849    static const enum instruction_scheduler_mode pre_modes[] = {
5850       SCHEDULE_PRE,
5851       SCHEDULE_PRE_NON_LIFO,
5852       SCHEDULE_NONE,
5853       SCHEDULE_PRE_LIFO,
5854    };
5855 
5856    static const char *scheduler_mode_name[] = {
5857       [SCHEDULE_PRE] = "top-down",
5858       [SCHEDULE_PRE_NON_LIFO] = "non-lifo",
5859       [SCHEDULE_PRE_LIFO] = "lifo",
5860       [SCHEDULE_POST] = "post",
5861       [SCHEDULE_NONE] = "none",
5862    };
5863 
5864    uint32_t best_register_pressure = UINT32_MAX;
5865    enum instruction_scheduler_mode best_sched = SCHEDULE_NONE;
5866 
5867    compact_virtual_grfs();
5868 
5869    if (needs_register_pressure)
5870       shader_stats.max_register_pressure = compute_max_register_pressure();
5871 
5872    debug_optimizer(nir, "pre_register_allocate", 90, 90);
5873 
5874    bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
5875 
5876    /* Before we schedule anything, stash off the instruction order as an array
5877     * of elk_fs_inst *.  This way, we can reset it between scheduling passes to
5878     * prevent dependencies between the different scheduling modes.
5879     */
5880    elk_fs_inst **orig_order = save_instruction_order(cfg);
5881    elk_fs_inst **best_pressure_order = NULL;
5882 
5883    void *scheduler_ctx = ralloc_context(NULL);
5884    elk_fs_instruction_scheduler *sched = prepare_scheduler(scheduler_ctx);
5885 
5886    /* Try each scheduling heuristic to see if it can successfully register
5887     * allocate without spilling.  They should be ordered by decreasing
5888     * performance but increasing likelihood of allocating.
5889     */
5890    for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
5891       enum instruction_scheduler_mode sched_mode = pre_modes[i];
5892 
5893       schedule_instructions_pre_ra(sched, sched_mode);
5894       this->shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
5895 
5896       debug_optimizer(nir, shader_stats.scheduler_mode, 95, i);
5897 
5898       if (0) {
5899          assign_regs_trivial();
5900          allocated = true;
5901          break;
5902       }
5903 
5904       /* We should only spill registers on the last scheduling. */
5905       assert(!spilled_any_registers);
5906 
5907       allocated = assign_regs(false, spill_all);
5908       if (allocated)
5909          break;
5910 
5911       /* Save the maximum register pressure */
5912       uint32_t this_pressure = compute_max_register_pressure();
5913 
5914       if (0) {
5915          fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
5916                  scheduler_mode_name[sched_mode], this_pressure);
5917       }
5918 
5919       if (this_pressure < best_register_pressure) {
5920          best_register_pressure = this_pressure;
5921          best_sched = sched_mode;
5922          delete[] best_pressure_order;
5923          best_pressure_order = save_instruction_order(cfg);
5924       }
5925 
5926       /* Reset back to the original order before trying the next mode */
5927       restore_instruction_order(cfg, orig_order);
5928       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
5929    }
5930 
5931    ralloc_free(scheduler_ctx);
5932 
5933    if (!allocated) {
5934       if (0) {
5935          fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
5936                  scheduler_mode_name[best_sched]);
5937       }
5938       restore_instruction_order(cfg, best_pressure_order);
5939       shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
5940 
5941       allocated = assign_regs(allow_spilling, spill_all);
5942    }
5943 
5944    delete[] orig_order;
5945    delete[] best_pressure_order;
5946 
5947    if (!allocated) {
5948       fail("Failure to register allocate.  Reduce number of "
5949            "live scalar values to avoid this.");
5950    } else if (spilled_any_registers) {
5951       elk_shader_perf_log(compiler, log_data,
5952                           "%s shader triggered register spilling.  "
5953                           "Try reducing the number of live scalar "
5954                           "values to improve performance.\n",
5955                           _mesa_shader_stage_to_string(stage));
5956    }
5957 
5958    /* This must come after all optimization and register allocation, since
5959     * it inserts dead code that happens to have side effects, and it does
5960     * so based on the actual physical registers in use.
5961     */
5962    insert_gfx4_send_dependency_workarounds();
5963 
5964    if (failed)
5965       return;
5966 
5967    opt_bank_conflicts();
5968 
5969    schedule_instructions_post_ra();
5970 
5971    if (last_scratch > 0) {
5972       ASSERTED unsigned max_scratch_size = 2 * 1024 * 1024;
5973 
5974       /* Take the max of any previously compiled variant of the shader. In the
5975        * case of bindless shaders with return parts, this will also take the
5976        * max of all parts.
5977        */
5978       prog_data->total_scratch = MAX2(elk_get_scratch_size(last_scratch),
5979                                       prog_data->total_scratch);
5980 
5981       if (gl_shader_stage_is_compute(stage)) {
5982          if (devinfo->platform == INTEL_PLATFORM_HSW) {
5983             /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
5984              * field documentation, Haswell supports a minimum of 2kB of
5985              * scratch space for compute shaders, unlike every other stage
5986              * and platform.
5987              */
5988             prog_data->total_scratch = MAX2(prog_data->total_scratch, 2048);
5989          } else if (devinfo->ver <= 7) {
5990             /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
5991              * field documentation, platforms prior to Haswell measure scratch
5992              * size linearly with a range of [1kB, 12kB] and 1kB granularity.
5993              */
5994             prog_data->total_scratch = ALIGN(last_scratch, 1024);
5995             max_scratch_size = 12 * 1024;
5996          }
5997       }
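
      /* Worked example (sketch only): a pre-Haswell compute shader using
       * 1.5kB of scratch per thread gets ALIGN(1536, 1024) = 2kB, which is
       * within the 12kB limit checked below.
       */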
5998 
5999       /* We currently only support up to 2MB of scratch space.  If we
6000        * need to support more eventually, the documentation suggests
6001        * that we could allocate a larger buffer, and partition it out
6002        * ourselves.  We'd just have to undo the hardware's address
6003        * calculation by subtracting (FFTID * Per Thread Scratch Space)
6004        * and then add FFTID * (Larger Per Thread Scratch Space).
6005        *
6006        * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
6007        * Thread Group Tracking > Local Memory/Scratch Space.
6008        */
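      /* Sketch of that repartitioning (not implemented here; larger_size is
       * a hypothetical bigger per-thread allocation):
       *
       *    addr' = addr - FFTID * per_thread_scratch_space
       *                 + FFTID * larger_size
       *
       * i.e. rebase each thread's slice into the larger buffer.
       */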
6009       assert(prog_data->total_scratch < max_scratch_size);
6010    }
6011 }
6012 
6013 bool
6014 elk_fs_visitor::run_vs()
6015 {
6016    assert(stage == MESA_SHADER_VERTEX);
6017 
6018    payload_ = new elk_vs_thread_payload(*this);
6019 
6020    nir_to_elk(this);
6021 
6022    if (failed)
6023       return false;
6024 
6025    emit_urb_writes();
6026 
6027    calculate_cfg();
6028 
6029    optimize();
6030 
6031    assign_curb_setup();
6032    assign_vs_urb_setup();
6033 
6034    fixup_3src_null_dest();
6035 
6036    allocate_registers(true /* allow_spilling */);
6037 
6038    workaround_source_arf_before_eot();
6039 
6040    return !failed;
6041 }
6042 
6043 void
6044 elk_fs_visitor::set_tcs_invocation_id()
6045 {
6046    struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(prog_data);
6047    struct elk_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
6048    const fs_builder bld = fs_builder(this).at_end();
6049 
6050    const unsigned instance_id_mask = INTEL_MASK(23, 17);
6051    const unsigned instance_id_shift = 17;
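
   /* Illustrative arithmetic only: with the instance number held in bits
    * 23:17 of the payload dword read below, a masked value of 0x00060000
    * shifted right by 17 is instance 3.
    */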
6052 
6053    elk_fs_reg t = bld.vgrf(ELK_REGISTER_TYPE_UD);
6054    bld.AND(t, elk_fs_reg(retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD)),
6055            elk_imm_ud(instance_id_mask));
6056 
6057    invocation_id = bld.vgrf(ELK_REGISTER_TYPE_UD);
6058 
6059    if (vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH) {
6060       /* gl_InvocationID is just the thread number */
6061       bld.SHR(invocation_id, t, elk_imm_ud(instance_id_shift));
6062       return;
6063    }
6064 
6065    assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH);
6066 
6067    elk_fs_reg channels_uw = bld.vgrf(ELK_REGISTER_TYPE_UW);
6068    elk_fs_reg channels_ud = bld.vgrf(ELK_REGISTER_TYPE_UD);
6069    bld.MOV(channels_uw, elk_fs_reg(elk_imm_uv(0x76543210)));
6070    bld.MOV(channels_ud, channels_uw);
6071 
6072    if (tcs_prog_data->instances == 1) {
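   /* In SINGLE_PATCH mode each instance covers 8 channels, so the net effect
    * below is (descriptive sketch only) invocation_id = instance * 8 + channel;
    * e.g. instance 2 maps its channels to gl_InvocationID 16..23.
    */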
6073       invocation_id = channels_ud;
6074    } else {
6075       elk_fs_reg instance_times_8 = bld.vgrf(ELK_REGISTER_TYPE_UD);
6076       bld.SHR(instance_times_8, t, elk_imm_ud(instance_id_shift - 3));
6077       bld.ADD(invocation_id, instance_times_8, channels_ud);
6078    }
6079 }
6080 
6081 void
6082 elk_fs_visitor::emit_tcs_thread_end()
6083 {
6084    /* Try to tag the last URB write with EOT instead of emitting a whole
6085     * separate write just to finish the thread.  There isn't guaranteed to
6086     * be one, so this may not succeed.
6087     */
6088    if (devinfo->ver != 8 && mark_last_urb_write_with_eot())
6089       return;
6090 
6091    const fs_builder bld = fs_builder(this).at_end();
6092 
6093    /* Emit a URB write to end the thread.  On Broadwell, we use this to write
6094     * zero to the "TR DS Cache Disable" bit (we haven't implemented a fancy
6095     * algorithm to set it optimally).  On other platforms, we simply write
6096     * zero to a reserved/MBZ patch header DWord which has no consequence.
6097     */
6098    elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
6099    srcs[URB_LOGICAL_SRC_HANDLE] = tcs_payload().patch_urb_output;
6100    srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = elk_imm_ud(WRITEMASK_X << 16);
6101    srcs[URB_LOGICAL_SRC_DATA] = elk_imm_ud(0);
6102    srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(1);
6103    elk_fs_inst *inst = bld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL,
6104                             reg_undef, srcs, ARRAY_SIZE(srcs));
6105    inst->eot = true;
6106 }
6107 
6108 bool
6109 elk_fs_visitor::run_tcs()
6110 {
6111    assert(stage == MESA_SHADER_TESS_CTRL);
6112 
6113    struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(prog_data);
6114    const fs_builder bld = fs_builder(this).at_end();
6115 
6116    assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH ||
6117           vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH);
6118 
6119    payload_ = new elk_tcs_thread_payload(*this);
6120 
6121    /* Initialize gl_InvocationID */
6122    set_tcs_invocation_id();
6123 
6124    const bool fix_dispatch_mask =
6125       vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH &&
6126       (nir->info.tess.tcs_vertices_out % 8) != 0;
6127 
6128    /* Fix the dispatch mask */
6129    if (fix_dispatch_mask) {
6130       bld.CMP(bld.null_reg_ud(), invocation_id,
6131               elk_imm_ud(nir->info.tess.tcs_vertices_out), ELK_CONDITIONAL_L);
6132       bld.IF(ELK_PREDICATE_NORMAL);
6133    }
6134 
6135    nir_to_elk(this);
6136 
6137    if (fix_dispatch_mask) {
6138       bld.emit(ELK_OPCODE_ENDIF);
6139    }
6140 
6141    emit_tcs_thread_end();
6142 
6143    if (failed)
6144       return false;
6145 
6146    calculate_cfg();
6147 
6148    optimize();
6149 
6150    assign_curb_setup();
6151    assign_tcs_urb_setup();
6152 
6153    fixup_3src_null_dest();
6154 
6155    allocate_registers(true /* allow_spilling */);
6156 
6157    workaround_source_arf_before_eot();
6158 
6159    return !failed;
6160 }
6161 
6162 bool
6163 elk_fs_visitor::run_tes()
6164 {
6165    assert(stage == MESA_SHADER_TESS_EVAL);
6166 
6167    payload_ = new elk_tes_thread_payload(*this);
6168 
6169    nir_to_elk(this);
6170 
6171    if (failed)
6172       return false;
6173 
6174    emit_urb_writes();
6175 
6176    calculate_cfg();
6177 
6178    optimize();
6179 
6180    assign_curb_setup();
6181    assign_tes_urb_setup();
6182 
6183    fixup_3src_null_dest();
6184 
6185    allocate_registers(true /* allow_spilling */);
6186 
6187    workaround_source_arf_before_eot();
6188 
6189    return !failed;
6190 }
6191 
6192 bool
6193 elk_fs_visitor::run_gs()
6194 {
6195    assert(stage == MESA_SHADER_GEOMETRY);
6196 
6197    payload_ = new elk_gs_thread_payload(*this);
6198 
6199    this->final_gs_vertex_count = vgrf(glsl_uint_type());
6200 
6201    if (gs_compile->control_data_header_size_bits > 0) {
6202       /* Create a VGRF to store accumulated control data bits. */
6203       this->control_data_bits = vgrf(glsl_uint_type());
6204 
6205       /* If we're outputting more than 32 control data bits, then EmitVertex()
6206        * will set control_data_bits to 0 after emitting the first vertex.
6207        * Otherwise, we need to initialize it to 0 here.
6208        */
6209       if (gs_compile->control_data_header_size_bits <= 32) {
6210          const fs_builder bld = fs_builder(this).at_end();
6211          const fs_builder abld = bld.annotate("initialize control data bits");
6212          abld.MOV(this->control_data_bits, elk_imm_ud(0u));
6213       }
6214    }
6215 
6216    nir_to_elk(this);
6217 
6218    emit_gs_thread_end();
6219 
6220    if (failed)
6221       return false;
6222 
6223    calculate_cfg();
6224 
6225    optimize();
6226 
6227    assign_curb_setup();
6228    assign_gs_urb_setup();
6229 
6230    fixup_3src_null_dest();
6231 
6232    allocate_registers(true /* allow_spilling */);
6233 
6234    workaround_source_arf_before_eot();
6235 
6236    return !failed;
6237 }
6238 
6239 bool
6240 elk_fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
6241 {
6242    struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(this->prog_data);
6243    elk_wm_prog_key *wm_key = (elk_wm_prog_key *) this->key;
6244    const fs_builder bld = fs_builder(this).at_end();
6245 
6246    assert(stage == MESA_SHADER_FRAGMENT);
6247 
6248    payload_ = new elk_fs_thread_payload(*this, source_depth_to_render_target,
6249                                     runtime_check_aads_emit);
6250 
6251    if (do_rep_send) {
6252       assert(dispatch_width == 16);
6253       emit_repclear_shader();
6254    } else {
6255       if (nir->info.inputs_read > 0 ||
6256           BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
6257           (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
6258          if (devinfo->ver < 6)
6259             emit_interpolation_setup_gfx4();
6260          else
6261             emit_interpolation_setup_gfx6();
6262       }
6263 
6264       /* We handle discards by keeping track of the still-live pixels in f0.1.
6265        * Initialize it with the dispatched pixels.
6266        */
6267       if (wm_prog_data->uses_kill) {
6268          const unsigned lower_width = MIN2(dispatch_width, 16);
6269          for (unsigned i = 0; i < dispatch_width / lower_width; i++) {
6270             /* According to the "PS Thread Payload for Normal
6271              * Dispatch" pages on the BSpec, the dispatch mask is
6272              * stored in R1.7/R2.7 on gfx6+.
6273              */
6274             const elk_fs_reg dispatch_mask =
6275                devinfo->ver >= 6 ? elk_vec1_grf(i + 1, 7) :
6276                elk_vec1_grf(0, 0);
6277             bld.exec_all().group(1, 0)
6278                .MOV(elk_sample_mask_reg(bld.group(lower_width, i)),
6279                     retype(dispatch_mask, ELK_REGISTER_TYPE_UW));
6280          }
6281       }
6282 
6283       if (nir->info.writes_memory)
6284          wm_prog_data->has_side_effects = true;
6285 
6286       nir_to_elk(this);
6287 
6288       if (failed)
6289          return false;
6290 
6291       if (wm_key->emit_alpha_test)
6292          emit_alpha_test();
6293 
6294       emit_fb_writes();
6295 
6296       calculate_cfg();
6297 
6298       optimize();
6299 
6300       assign_curb_setup();
6301 
6302       assign_urb_setup();
6303 
6304       fixup_3src_null_dest();
6305 
6306       allocate_registers(allow_spilling);
6307 
6308       workaround_source_arf_before_eot();
6309    }
6310 
6311    return !failed;
6312 }
6313 
6314 bool
6315 elk_fs_visitor::run_cs(bool allow_spilling)
6316 {
6317    assert(gl_shader_stage_is_compute(stage));
6318    assert(devinfo->ver >= 7);
6319    const fs_builder bld = fs_builder(this).at_end();
6320 
6321    payload_ = new elk_cs_thread_payload(*this);
6322 
6323    if (devinfo->platform == INTEL_PLATFORM_HSW && prog_data->total_shared > 0) {
6324       /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
6325       const fs_builder abld = bld.exec_all().group(1, 0);
6326       abld.MOV(retype(elk_sr0_reg(1), ELK_REGISTER_TYPE_UW),
6327                suboffset(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UW), 1));
6328    }
6329 
6330    nir_to_elk(this);
6331 
6332    if (failed)
6333       return false;
6334 
6335    emit_cs_terminate();
6336 
6337    calculate_cfg();
6338 
6339    optimize();
6340 
6341    assign_curb_setup();
6342 
6343    fixup_3src_null_dest();
6344 
6345    allocate_registers(allow_spilling);
6346 
6347    workaround_source_arf_before_eot();
6348 
6349    return !failed;
6350 }
6351 
6352 static bool
6353 is_used_in_not_interp_frag_coord(nir_def *def)
6354 {
6355    nir_foreach_use_including_if(src, def) {
6356       if (nir_src_is_if(src))
6357          return true;
6358 
6359       if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic)
6360          return true;
6361 
6362       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(nir_src_parent_instr(src));
6363       if (intrin->intrinsic != nir_intrinsic_load_frag_coord)
6364          return true;
6365    }
6366 
6367    return false;
6368 }
6369 
6370 /**
6371  * Return a bitfield where bit n is set if barycentric interpolation mode n
6372  * (see enum elk_barycentric_mode) is needed by the fragment shader.
6373  *
6374  * We examine the load_barycentric intrinsics rather than looking at input
6375  * variables so that we catch interpolateAtCentroid() messages too, which
6376  * also need the ELK_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
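 *
 * For example, a fragment shader that reads one smooth input both normally
 * and through interpolateAtCentroid() would set both the perspective pixel
 * and perspective centroid bits in the returned mask.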
6377  */
6378 static unsigned
6379 elk_compute_barycentric_interp_modes(const struct intel_device_info *devinfo,
6380                                      const nir_shader *shader)
6381 {
6382    unsigned barycentric_interp_modes = 0;
6383 
6384    nir_foreach_function_impl(impl, shader) {
6385       nir_foreach_block(block, impl) {
6386          nir_foreach_instr(instr, block) {
6387             if (instr->type != nir_instr_type_intrinsic)
6388                continue;
6389 
6390             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
6391             switch (intrin->intrinsic) {
6392             case nir_intrinsic_load_barycentric_pixel:
6393             case nir_intrinsic_load_barycentric_centroid:
6394             case nir_intrinsic_load_barycentric_sample:
6395             case nir_intrinsic_load_barycentric_at_sample:
6396             case nir_intrinsic_load_barycentric_at_offset:
6397                break;
6398             default:
6399                continue;
6400             }
6401 
6402             /* Ignore WPOS; it doesn't require interpolation. */
6403             if (!is_used_in_not_interp_frag_coord(&intrin->def))
6404                continue;
6405 
6406             nir_intrinsic_op bary_op = intrin->intrinsic;
6407             enum elk_barycentric_mode bary =
6408                elk_barycentric_mode(intrin);
6409 
6410             barycentric_interp_modes |= 1 << bary;
6411 
6412             if (devinfo->needs_unlit_centroid_workaround &&
6413                 bary_op == nir_intrinsic_load_barycentric_centroid)
6414                barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
6415          }
6416       }
6417    }
6418 
6419    return barycentric_interp_modes;
6420 }
6421 
6422 static void
6423 elk_compute_flat_inputs(struct elk_wm_prog_data *prog_data,
6424                         const nir_shader *shader)
6425 {
6426    prog_data->flat_inputs = 0;
6427 
6428    nir_foreach_shader_in_variable(var, shader) {
6429       /* flat shading */
6430       if (var->data.interpolation != INTERP_MODE_FLAT)
6431          continue;
6432 
6433       if (var->data.per_primitive)
6434          continue;
6435 
6436       unsigned slots = glsl_count_attribute_slots(var->type, false);
6437       for (unsigned s = 0; s < slots; s++) {
6438          int input_index = prog_data->urb_setup[var->data.location + s];
6439 
6440          if (input_index >= 0)
6441             prog_data->flat_inputs |= 1 << input_index;
6442       }
6443    }
6444 }
6445 
6446 static uint8_t
6447 computed_depth_mode(const nir_shader *shader)
6448 {
6449    if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
6450       switch (shader->info.fs.depth_layout) {
6451       case FRAG_DEPTH_LAYOUT_NONE:
6452       case FRAG_DEPTH_LAYOUT_ANY:
6453          return ELK_PSCDEPTH_ON;
6454       case FRAG_DEPTH_LAYOUT_GREATER:
6455          return ELK_PSCDEPTH_ON_GE;
6456       case FRAG_DEPTH_LAYOUT_LESS:
6457          return ELK_PSCDEPTH_ON_LE;
6458       case FRAG_DEPTH_LAYOUT_UNCHANGED:
6459          /* We initially set this to OFF, but having the shader write the
6460           * depth means we allocate register space in the SEND message. The
6461           * difference between the SEND register count and the OFF state
6462           * programming makes the HW hang.
6463           *
6464           * Removing the depth writes also leads to test failures. So use
6465           * LesserThanOrEqual, which fits writing the same value
6466           * (unchanged/equal).
6467           *
6468           */
6469          return ELK_PSCDEPTH_ON_LE;
6470       }
6471    }
6472    return ELK_PSCDEPTH_OFF;
6473 }
6474 
6475 /**
6476  * Move load_interpolated_input with simple (payload-based) barycentric modes
6477  * to the top of the program so we don't emit multiple PLNs for the same input.
6478  *
6479  * This works around CSE not being able to handle non-dominating cases
6480  * such as:
6481  *
6482  *    if (...) {
6483  *       interpolate input
6484  *    } else {
6485  *       interpolate the same exact input
6486  *    }
6487  *
6488  * This should be replaced by global value numbering someday.
6489  */
6490 bool
6491 elk_nir_move_interpolation_to_top(nir_shader *nir)
6492 {
6493    bool progress = false;
6494 
6495    nir_foreach_function_impl(impl, nir) {
6496       nir_block *top = nir_start_block(impl);
6497       nir_cursor cursor = nir_before_instr(nir_block_first_instr(top));
6498       bool impl_progress = false;
6499 
6500       for (nir_block *block = nir_block_cf_tree_next(top);
6501            block != NULL;
6502            block = nir_block_cf_tree_next(block)) {
6503 
6504          nir_foreach_instr_safe(instr, block) {
6505             if (instr->type != nir_instr_type_intrinsic)
6506                continue;
6507 
6508             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
6509             if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
6510                continue;
6511             nir_intrinsic_instr *bary_intrinsic =
6512                nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
6513             nir_intrinsic_op op = bary_intrinsic->intrinsic;
6514 
6515             /* Leave interpolateAtSample/Offset() where they are. */
6516             if (op == nir_intrinsic_load_barycentric_at_sample ||
6517                 op == nir_intrinsic_load_barycentric_at_offset)
6518                continue;
6519 
6520             nir_instr *move[3] = {
6521                &bary_intrinsic->instr,
6522                intrin->src[1].ssa->parent_instr,
6523                instr
6524             };
6525 
6526             for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
6527                if (move[i]->block != top) {
6528                   nir_instr_move(cursor, move[i]);
6529                   impl_progress = true;
6530                }
6531             }
6532          }
6533       }
6534 
6535       progress = progress || impl_progress;
6536 
6537       nir_metadata_preserve(impl, impl_progress ? nir_metadata_control_flow
6538                                                 : nir_metadata_all);
6539    }
6540 
6541    return progress;
6542 }
6543 
6544 static void
6545 elk_nir_populate_wm_prog_data(nir_shader *shader,
6546                               const struct intel_device_info *devinfo,
6547                               const struct elk_wm_prog_key *key,
6548                               struct elk_wm_prog_data *prog_data)
6549 {
6550    /* key->emit_alpha_test means we simulate alpha testing via discards,
6551     * so the shader definitely kills pixels.
6552     */
6553    prog_data->uses_kill = shader->info.fs.uses_discard ||
6554                           key->emit_alpha_test;
6555    prog_data->uses_omask = !key->ignore_sample_mask_out &&
6556       (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
6557    prog_data->color_outputs_written = key->color_outputs_valid;
6558    prog_data->computed_depth_mode = computed_depth_mode(shader);
6559    prog_data->computed_stencil =
6560       shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
6561 
6562    prog_data->sample_shading =
6563       shader->info.fs.uses_sample_shading ||
6564       shader->info.outputs_read;
6565 
6566    assert(key->multisample_fbo != ELK_NEVER ||
6567           key->persample_interp == ELK_NEVER);
6568 
6569    prog_data->persample_dispatch = key->persample_interp;
6570    if (prog_data->sample_shading)
6571       prog_data->persample_dispatch = ELK_ALWAYS;
6572 
6573    /* We can only persample dispatch if we have a multisample FBO */
6574    prog_data->persample_dispatch = MIN2(prog_data->persample_dispatch,
6575                                         key->multisample_fbo);
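
   /* These are tristate values; assuming the usual ELK_NEVER < ELK_SOMETIMES
    * < ELK_ALWAYS ordering, the MIN2() above means e.g. a sample-shading
    * shader (ELK_ALWAYS) drawn to a possibly-multisampled FBO (ELK_SOMETIMES)
    * ends up with persample_dispatch == ELK_SOMETIMES.
    */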
6576 
6577    /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If
6578     * persample_dispatch & multisample_fbo are not dynamic, Anv should be able
6579     * to definitively tell whether alpha_to_coverage is on or off.
6580     */
6581    prog_data->alpha_to_coverage = key->alpha_to_coverage;
6582    assert(prog_data->alpha_to_coverage != ELK_SOMETIMES ||
6583           prog_data->persample_dispatch == ELK_SOMETIMES);
6584 
6585    if (devinfo->ver >= 6) {
6586       prog_data->uses_sample_mask =
6587          BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);
6588 
6589       /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
6590        *
6591        *    "MSDISPMODE_PERSAMPLE is required in order to select
6592        *    POSOFFSET_SAMPLE"
6593        *
6594        * So we can only really get sample positions if we are doing real
6595        * per-sample dispatch.  If we need gl_SamplePosition and we don't have
6596        * persample dispatch, we hard-code it to 0.5.
6597        */
6598       prog_data->uses_pos_offset =
6599          prog_data->persample_dispatch != ELK_NEVER &&
6600          (BITSET_TEST(shader->info.system_values_read,
6601                       SYSTEM_VALUE_SAMPLE_POS) ||
6602           BITSET_TEST(shader->info.system_values_read,
6603                       SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));
6604    }
6605 
6606    prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
6607    prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
6608    prog_data->inner_coverage = shader->info.fs.inner_coverage;
6609 
6610    prog_data->barycentric_interp_modes =
6611       elk_compute_barycentric_interp_modes(devinfo, shader);
6612 
6613    /* From the BDW PRM documentation for 3DSTATE_WM:
6614     *
6615     *    "MSDISPMODE_PERSAMPLE is required in order to select Perspective
6616     *     Sample or Non- perspective Sample barycentric coordinates."
6617     *
6618     * So clean up any potentially set sample barycentric mode when not doing
6619     * per-sample dispatch.
6620     */
6621    if (prog_data->persample_dispatch == ELK_NEVER) {
6622       prog_data->barycentric_interp_modes &=
6623          ~BITFIELD_BIT(ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE);
6624    }
6625 
6626    prog_data->uses_nonperspective_interp_modes |=
6627       (prog_data->barycentric_interp_modes &
6628       ELK_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0;
6629 
6630    /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater,
6631     * Message Descriptor :
6632     *
6633     *    "Message Type. Specifies the type of message being sent when
6634     *     pixel-rate evaluation is requested :
6635     *
6636     *     Format = U2
6637     *       0: Per Message Offset (eval_snapped with immediate offset)
6638     *       1: Sample Position Offset (eval_sindex)
6639     *       2: Centroid Position Offset (eval_centroid)
6640     *       3: Per Slot Offset (eval_snapped with register offset)
6641     *
6642     *     Message Type. Specifies the type of message being sent when
6643     *     coarse-rate evaluation is requested :
6644     *
6645     *     Format = U2
6646     *       0: Coarse to Pixel Mapping Message (internal message)
6647     *       1: Reserved
6648     *       2: Coarse Centroid Position (eval_centroid)
6649     *       3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)"
6650     *
6651     * The Sample Position Offset is marked as reserved for coarse rate
6652     * evaluation and leads to hangs if we try to use it. So disable coarse
6653     * pixel shading if we have any intrinsic that will result in a pixel
6654     * interpolater message at sample.
6655     */
6656    intel_nir_pulls_at_sample(shader);
6657 
6658    /* We choose to always enable VMask prior to XeHP, as it would cause
6659     * us to lose out on the eliminate_find_live_channel() optimization.
6660     */
6661    prog_data->uses_vmask = true;
6662 
6663    prog_data->uses_src_w =
6664       BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
6665    prog_data->uses_src_depth =
6666       BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
6667 
6668    calculate_urb_setup(devinfo, key, prog_data, shader);
6669    elk_compute_flat_inputs(prog_data, shader);
6670 }
6671 
6672 /**
6673  * Pre-gfx6, the register file of the EUs was shared between threads,
6674  * and each thread used some subset allocated on a 16-register block
6675  * granularity.  The unit states wanted these block counts.
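 *
 * Illustrative arithmetic only: a shader using 40 GRFs yields
 * ALIGN(40, 16) / 16 - 1 = 2, i.e. three 16-register blocks expressed in the
 * minus-one form the code below produces.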
6676  */
6677 static inline int
6678 elk_register_blocks(int reg_count)
6679 {
6680    return ALIGN(reg_count, 16) / 16 - 1;
6681 }
6682 
6683 const unsigned *
6684 elk_compile_fs(const struct elk_compiler *compiler,
6685                struct elk_compile_fs_params *params)
6686 {
6687    struct nir_shader *nir = params->base.nir;
6688    const struct elk_wm_prog_key *key = params->key;
6689    struct elk_wm_prog_data *prog_data = params->prog_data;
6690    bool allow_spilling = params->allow_spilling;
6691    const bool debug_enabled =
6692       elk_should_print_shader(nir, params->base.debug_flag ?
6693                                    params->base.debug_flag : DEBUG_WM);
6694 
6695    prog_data->base.stage = MESA_SHADER_FRAGMENT;
6696    prog_data->base.total_scratch = 0;
6697 
6698    const struct intel_device_info *devinfo = compiler->devinfo;
6699    const unsigned max_subgroup_size = compiler->devinfo->ver >= 6 ? 32 : 16;
6700 
6701    elk_nir_apply_key(nir, compiler, &key->base, max_subgroup_size);
6702    elk_nir_lower_fs_inputs(nir, devinfo, key);
6703    elk_nir_lower_fs_outputs(nir);
6704 
6705    if (devinfo->ver < 6)
6706       elk_setup_vue_interpolation(params->vue_map, nir, prog_data);
6707 
6708    /* From the SKL PRM, Volume 7, "Alpha Coverage":
6709     *  "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in
6710     *   hardware, regardless of the state setting for this feature."
6711     */
6712    if (devinfo->ver > 6 && key->alpha_to_coverage != ELK_NEVER) {
6713       /* Run constant fold optimization in order to get the correct source
6714        * offset to determine render target 0 store instruction in
6715        * emit_alpha_to_coverage pass.
6716        */
6717       NIR_PASS(_, nir, nir_opt_constant_folding);
6718       NIR_PASS(_, nir, elk_nir_lower_alpha_to_coverage, key, prog_data);
6719    }
6720 
6721    NIR_PASS(_, nir, elk_nir_move_interpolation_to_top);
6722    elk_postprocess_nir(nir, compiler, debug_enabled,
6723                        key->base.robust_flags);
6724 
6725    elk_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data);
6726 
6727    std::unique_ptr<elk_fs_visitor> v8, v16, v32, vmulti;
6728    elk_cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL;
6729    float throughput = 0;
6730    bool has_spilled = false;
6731 
6732    v8 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
6733                                      prog_data, nir, 8,
6734                                      params->base.stats != NULL,
6735                                      debug_enabled);
6736    if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) {
6737       params->base.error_str = ralloc_strdup(params->base.mem_ctx,
6738                                              v8->fail_msg);
6739       return NULL;
6740    } else if (INTEL_SIMD(FS, 8)) {
6741       simd8_cfg = v8->cfg;
6742 
6743       assert(v8->payload().num_regs % reg_unit(devinfo) == 0);
6744       prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs / reg_unit(devinfo);
6745 
6746       prog_data->reg_blocks_8 = elk_register_blocks(v8->grf_used);
6747       const performance &perf = v8->performance_analysis.require();
6748       throughput = MAX2(throughput, perf.throughput);
6749       has_spilled = v8->spilled_any_registers;
6750       allow_spilling = false;
6751    }
6752 
6753    /* Limit dispatch width to simd8 with dual source blending on gfx8.
6754     * See: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1917
6755     */
6756    if (devinfo->ver == 8 && prog_data->dual_src_blend &&
6757        INTEL_SIMD(FS, 8)) {
6758       assert(!params->use_rep_send);
6759       v8->limit_dispatch_width(8, "gfx8 workaround: "
6760                                "using SIMD8 when dual src blending.\n");
6761    }
6762 
6763    if (!has_spilled &&
6764        (!v8 || v8->max_dispatch_width >= 16) &&
6765        (INTEL_SIMD(FS, 16) || params->use_rep_send)) {
6766       /* Try a SIMD16 compile */
6767       v16 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
6768                                          prog_data, nir, 16,
6769                                          params->base.stats != NULL,
6770                                          debug_enabled);
6771       if (v8)
6772          v16->import_uniforms(v8.get());
6773       if (!v16->run_fs(allow_spilling, params->use_rep_send)) {
6774          elk_shader_perf_log(compiler, params->base.log_data,
6775                              "SIMD16 shader failed to compile: %s\n",
6776                              v16->fail_msg);
6777       } else {
6778          simd16_cfg = v16->cfg;
6779 
6780          assert(v16->payload().num_regs % reg_unit(devinfo) == 0);
6781          prog_data->dispatch_grf_start_reg_16 = v16->payload().num_regs / reg_unit(devinfo);
6782 
6783          prog_data->reg_blocks_16 = elk_register_blocks(v16->grf_used);
6784          const performance &perf = v16->performance_analysis.require();
6785          throughput = MAX2(throughput, perf.throughput);
6786          has_spilled = v16->spilled_any_registers;
6787          allow_spilling = false;
6788       }
6789    }
6790 
6791    const bool simd16_failed = v16 && !simd16_cfg;
6792 
6793    /* Currently, the compiler only supports SIMD32 on SNB+ */
6794    if (!has_spilled &&
6795        (!v8 || v8->max_dispatch_width >= 32) &&
6796        (!v16 || v16->max_dispatch_width >= 32) && !params->use_rep_send &&
6797        devinfo->ver >= 6 && !simd16_failed &&
6798        INTEL_SIMD(FS, 32)) {
6799       /* Try a SIMD32 compile */
6800       v32 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
6801                                          prog_data, nir, 32,
6802                                          params->base.stats != NULL,
6803                                          debug_enabled);
6804       if (v8)
6805          v32->import_uniforms(v8.get());
6806       else if (v16)
6807          v32->import_uniforms(v16.get());
6808 
6809       if (!v32->run_fs(allow_spilling, false)) {
6810          elk_shader_perf_log(compiler, params->base.log_data,
6811                              "SIMD32 shader failed to compile: %s\n",
6812                              v32->fail_msg);
6813       } else {
6814          const performance &perf = v32->performance_analysis.require();
6815 
6816          if (!INTEL_DEBUG(DEBUG_DO32) && throughput >= perf.throughput) {
6817             elk_shader_perf_log(compiler, params->base.log_data,
6818                                 "SIMD32 shader inefficient\n");
6819          } else {
6820             simd32_cfg = v32->cfg;
6821 
6822             assert(v32->payload().num_regs % reg_unit(devinfo) == 0);
6823             prog_data->dispatch_grf_start_reg_32 = v32->payload().num_regs / reg_unit(devinfo);
6824 
6825             prog_data->reg_blocks_32 = elk_register_blocks(v32->grf_used);
6826             throughput = MAX2(throughput, perf.throughput);
6827          }
6828       }
6829    }
6830 
6831    /* When the caller requests a repclear shader, they want SIMD16-only */
6832    if (params->use_rep_send)
6833       simd8_cfg = NULL;
6834 
6835    /* Prior to Iron Lake, the PS had a single shader offset with a jump table
6836     * at the top to select the shader.  We've never implemented that.
6837     * Instead, we just give them exactly one shader and we pick the widest one
6838     * available.
6839     */
6840    if (compiler->devinfo->ver < 5) {
6841       if (simd32_cfg || simd16_cfg)
6842          simd8_cfg = NULL;
6843       if (simd32_cfg)
6844          simd16_cfg = NULL;
6845    }
6846 
6847    /* If computed depth is enabled, SNB only allows SIMD8. */
6848    if (compiler->devinfo->ver == 6 &&
6849        prog_data->computed_depth_mode != ELK_PSCDEPTH_OFF)
6850       assert(simd16_cfg == NULL && simd32_cfg == NULL);
6851 
6852    if (compiler->devinfo->ver <= 5 && !simd8_cfg) {
6853       /* Iron Lake and earlier only have one Dispatch GRF start field.  Make
6854        * the data available in the base prog data struct for convenience.
6855        */
6856       if (simd16_cfg) {
6857          prog_data->base.dispatch_grf_start_reg =
6858             prog_data->dispatch_grf_start_reg_16;
6859       } else if (simd32_cfg) {
6860          prog_data->base.dispatch_grf_start_reg =
6861             prog_data->dispatch_grf_start_reg_32;
6862       }
6863    }
6864 
6865    elk_fs_generator g(compiler, &params->base, &prog_data->base,
6866                   v8 && v8->runtime_check_aads_emit, MESA_SHADER_FRAGMENT);
6867 
6868    if (unlikely(debug_enabled)) {
6869       g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
6870                                      "%s fragment shader %s",
6871                                      nir->info.label ?
6872                                         nir->info.label : "unnamed",
6873                                      nir->info.name));
6874    }
6875 
6876    struct elk_compile_stats *stats = params->base.stats;
6877    uint32_t max_dispatch_width = 0;
6878 
6879    if (simd8_cfg) {
6880       prog_data->dispatch_8 = true;
6881       g.generate_code(simd8_cfg, 8, v8->shader_stats,
6882                       v8->performance_analysis.require(), stats);
6883       stats = stats ? stats + 1 : NULL;
6884       max_dispatch_width = 8;
6885    }
6886 
6887    if (simd16_cfg) {
6888       prog_data->dispatch_16 = true;
6889       prog_data->prog_offset_16 = g.generate_code(
6890          simd16_cfg, 16, v16->shader_stats,
6891          v16->performance_analysis.require(), stats);
6892       stats = stats ? stats + 1 : NULL;
6893       max_dispatch_width = 16;
6894    }
6895 
6896    if (simd32_cfg) {
6897       prog_data->dispatch_32 = true;
6898       prog_data->prog_offset_32 = g.generate_code(
6899          simd32_cfg, 32, v32->shader_stats,
6900          v32->performance_analysis.require(), stats);
6901       stats = stats ? stats + 1 : NULL;
6902       max_dispatch_width = 32;
6903    }
6904 
6905    for (struct elk_compile_stats *s = params->base.stats; s != NULL && s != stats; s++)
6906       s->max_dispatch_width = max_dispatch_width;
6907 
6908    g.add_const_data(nir->constant_data, nir->constant_data_size);
6909    return g.get_assembly();
6910 }
6911 
6912 unsigned
6913 elk_cs_push_const_total_size(const struct elk_cs_prog_data *cs_prog_data,
6914                              unsigned threads)
6915 {
6916    assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
6917    assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
6918    return cs_prog_data->push.per_thread.size * threads +
6919           cs_prog_data->push.cross_thread.size;
6920 }
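
/* Worked example (sketch): with a 2-register cross-thread block (64B), a
 * 1-register per-thread block (32B) and 8 threads, the total push constant
 * size is 8 * 32 + 64 = 320 bytes.
 */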
6921 
6922 static void
6923 fill_push_const_block_info(struct elk_push_const_block *block, unsigned dwords)
6924 {
6925    block->dwords = dwords;
6926    block->regs = DIV_ROUND_UP(dwords, 8);
6927    block->size = block->regs * 32;
6928 }
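
/* For example, 20 push-constant dwords become DIV_ROUND_UP(20, 8) = 3
 * registers, i.e. 96 bytes.
 */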
6929 
6930 static void
6931 cs_fill_push_const_info(const struct intel_device_info *devinfo,
6932                         struct elk_cs_prog_data *cs_prog_data)
6933 {
6934    const struct elk_stage_prog_data *prog_data = &cs_prog_data->base;
6935    int subgroup_id_index = elk_get_subgroup_id_param_index(devinfo, prog_data);
6936    bool cross_thread_supported = devinfo->verx10 >= 75;
6937 
6938    /* The thread ID should be stored in the last param dword */
6939    assert(subgroup_id_index == -1 ||
6940           subgroup_id_index == (int)prog_data->nr_params - 1);
6941 
6942    unsigned cross_thread_dwords, per_thread_dwords;
6943    if (!cross_thread_supported) {
6944       cross_thread_dwords = 0u;
6945       per_thread_dwords = prog_data->nr_params;
6946    } else if (subgroup_id_index >= 0) {
6947       /* Fill all but the last register with cross-thread payload */
6948       cross_thread_dwords = 8 * (subgroup_id_index / 8);
6949       per_thread_dwords = prog_data->nr_params - cross_thread_dwords;
6950       assert(per_thread_dwords > 0 && per_thread_dwords <= 8);
6951    } else {
6952       /* Fill all data using cross-thread payload */
6953       cross_thread_dwords = prog_data->nr_params;
6954       per_thread_dwords = 0u;
6955    }
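
   /* Worked example (assuming cross-thread push is supported): with
    * nr_params == 20 and the subgroup ID in the last slot (index 19),
    * cross_thread_dwords is 8 * (19 / 8) = 16 and per_thread_dwords is 4,
    * so only the final, partially filled register is replicated per thread.
    */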
6956 
6957    fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);
6958    fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);
6959 
6960    assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
6961           cs_prog_data->push.per_thread.size == 0);
6962    assert(cs_prog_data->push.cross_thread.dwords +
6963           cs_prog_data->push.per_thread.dwords ==
6964              prog_data->nr_params);
6965 }
6966 
6967 static bool
6968 filter_simd(const nir_instr *instr, const void * /* options */)
6969 {
6970    if (instr->type != nir_instr_type_intrinsic)
6971       return false;
6972 
6973    switch (nir_instr_as_intrinsic(instr)->intrinsic) {
6974    case nir_intrinsic_load_simd_width_intel:
6975    case nir_intrinsic_load_subgroup_id:
6976       return true;
6977 
6978    default:
6979       return false;
6980    }
6981 }
6982 
6983 static nir_def *
6984 lower_simd(nir_builder *b, nir_instr *instr, void *options)
6985 {
6986    uintptr_t simd_width = (uintptr_t)options;
6987 
6988    switch (nir_instr_as_intrinsic(instr)->intrinsic) {
6989    case nir_intrinsic_load_simd_width_intel:
6990       return nir_imm_int(b, simd_width);
6991 
6992    case nir_intrinsic_load_subgroup_id:
6993       /* If the whole workgroup fits in one thread, we can lower subgroup_id
6994        * to a constant zero.
6995        */
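      /* For example, a fixed 8x4x1 workgroup has 32 invocations, which fits
       * in a single SIMD32 thread, so every invocation sees subgroup_id == 0.
       */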
6996       if (!b->shader->info.workgroup_size_variable) {
6997          unsigned local_workgroup_size = b->shader->info.workgroup_size[0] *
6998                                          b->shader->info.workgroup_size[1] *
6999                                          b->shader->info.workgroup_size[2];
7000          if (local_workgroup_size <= simd_width)
7001             return nir_imm_int(b, 0);
7002       }
7003       return NULL;
7004 
7005    default:
7006       return NULL;
7007    }
7008 }
7009 
7010 bool
7011 elk_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)
7012 {
7013    return nir_shader_lower_instructions(nir, filter_simd, lower_simd,
7014                                  (void *)(uintptr_t)dispatch_width);
7015 }
7016 
7017 const unsigned *
7018 elk_compile_cs(const struct elk_compiler *compiler,
7019                struct elk_compile_cs_params *params)
7020 {
7021    const nir_shader *nir = params->base.nir;
7022    const struct elk_cs_prog_key *key = params->key;
7023    struct elk_cs_prog_data *prog_data = params->prog_data;
7024 
7025    const bool debug_enabled =
7026       elk_should_print_shader(nir, params->base.debug_flag ?
7027                                    params->base.debug_flag : DEBUG_CS);
7028 
7029    prog_data->base.stage = MESA_SHADER_COMPUTE;
7030    prog_data->base.total_shared = nir->info.shared_size;
7031    prog_data->base.total_scratch = 0;
7032 
7033    if (!nir->info.workgroup_size_variable) {
7034       prog_data->local_size[0] = nir->info.workgroup_size[0];
7035       prog_data->local_size[1] = nir->info.workgroup_size[1];
7036       prog_data->local_size[2] = nir->info.workgroup_size[2];
7037    }
7038 
7039    elk_simd_selection_state simd_state{
7040       .devinfo = compiler->devinfo,
7041       .prog_data = prog_data,
7042       .required_width = elk_required_dispatch_width(&nir->info),
7043    };
7044 
7045    std::unique_ptr<elk_fs_visitor> v[3];
7046 
7047    for (unsigned simd = 0; simd < 3; simd++) {
7048       if (!elk_simd_should_compile(simd_state, simd))
7049          continue;
7050 
7051       const unsigned dispatch_width = 8u << simd;
7052 
7053       nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
7054       elk_nir_apply_key(shader, compiler, &key->base,
7055                         dispatch_width);
7056 
7057       NIR_PASS(_, shader, elk_nir_lower_simd, dispatch_width);
7058 
7059       /* Clean up after the local index and ID calculations. */
7060       NIR_PASS(_, shader, nir_opt_constant_folding);
7061       NIR_PASS(_, shader, nir_opt_dce);
7062 
7063       elk_postprocess_nir(shader, compiler, debug_enabled,
7064                           key->base.robust_flags);
7065 
7066       v[simd] = std::make_unique<elk_fs_visitor>(compiler, &params->base,
7067                                              &key->base,
7068                                              &prog_data->base,
7069                                              shader, dispatch_width,
7070                                              params->base.stats != NULL,
7071                                              debug_enabled);
7072 
7073       const int first = elk_simd_first_compiled(simd_state);
7074       if (first >= 0)
7075          v[simd]->import_uniforms(v[first].get());
7076 
7077       const bool allow_spilling = first < 0 || nir->info.workgroup_size_variable;
7078 
7079       if (v[simd]->run_cs(allow_spilling)) {
7080          cs_fill_push_const_info(compiler->devinfo, prog_data);
7081 
7082          elk_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
7083       } else {
7084          simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
7085          if (simd > 0) {
7086             elk_shader_perf_log(compiler, params->base.log_data,
7087                                 "SIMD%u shader failed to compile: %s\n",
7088                                 dispatch_width, v[simd]->fail_msg);
7089          }
7090       }
7091    }
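   /* Every candidate width has now been attempted (or skipped by
    * elk_simd_should_compile()).  elk_simd_select() below picks the
    * preferred successfully-compiled variant; for a fixed workgroup size
    * prog_mask is then narrowed to that single width, while variable-size
    * workgroups keep every compiled width so a suitable one can be chosen
    * at dispatch time (see elk_cs_get_dispatch_info()).
    */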
7092 
7093    const int selected_simd = elk_simd_select(simd_state);
7094    if (selected_simd < 0) {
7095       params->base.error_str =
7096          ralloc_asprintf(params->base.mem_ctx,
7097                          "Can't compile shader: "
7098                          "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
7099                          simd_state.error[0], simd_state.error[1],
7100                          simd_state.error[2]);
7101       return NULL;
7102    }
7103 
7104    assert(selected_simd < 3);
7105    elk_fs_visitor *selected = v[selected_simd].get();
7106 
7107    if (!nir->info.workgroup_size_variable)
7108       prog_data->prog_mask = 1 << selected_simd;
7109 
7110    elk_fs_generator g(compiler, &params->base, &prog_data->base,
7111                   selected->runtime_check_aads_emit, MESA_SHADER_COMPUTE);
7112    if (unlikely(debug_enabled)) {
7113       char *name = ralloc_asprintf(params->base.mem_ctx,
7114                                    "%s compute shader %s",
7115                                    nir->info.label ?
7116                                    nir->info.label : "unnamed",
7117                                    nir->info.name);
7118       g.enable_debug(name);
7119    }
7120 
7121    uint32_t max_dispatch_width = 8u << (util_last_bit(prog_data->prog_mask) - 1);
7122 
7123    struct elk_compile_stats *stats = params->base.stats;
7124    for (unsigned simd = 0; simd < 3; simd++) {
7125       if (prog_data->prog_mask & (1u << simd)) {
7126          assert(v[simd]);
7127          prog_data->prog_offset[simd] =
7128             g.generate_code(v[simd]->cfg, 8u << simd, v[simd]->shader_stats,
7129                             v[simd]->performance_analysis.require(), stats);
7130          if (stats)
7131             stats->max_dispatch_width = max_dispatch_width;
7132          stats = stats ? stats + 1 : NULL;
7133          max_dispatch_width = 8u << simd;
7134       }
7135    }
7136 
7137    g.add_const_data(nir->constant_data, nir->constant_data_size);
7138 
7139    return g.get_assembly();
7140 }
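/* Example (hypothetical mask): if only the SIMD32 variant is kept,
 * prog_mask == 0b100, so max_dispatch_width above starts at
 * 8u << (util_last_bit(0b100) - 1) == 32 and the single generate_code()
 * call records its offset in prog_offset[2].
 */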
7141 
7142 struct intel_cs_dispatch_info
7143 elk_cs_get_dispatch_info(const struct intel_device_info *devinfo,
7144                          const struct elk_cs_prog_data *prog_data,
7145                          const unsigned *override_local_size)
7146 {
7147    struct intel_cs_dispatch_info info = {};
7148 
7149    const unsigned *sizes =
7150       override_local_size ? override_local_size :
7151                             prog_data->local_size;
7152 
7153    const int simd = elk_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
7154    assert(simd >= 0 && simd < 3);
7155 
7156    info.group_size = sizes[0] * sizes[1] * sizes[2];
7157    info.simd_size = 8u << simd;
7158    info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);
7159 
7160    const uint32_t remainder = info.group_size & (info.simd_size - 1);
7161    if (remainder > 0)
7162       info.right_mask = ~0u >> (32 - remainder);
7163    else
7164       info.right_mask = ~0u >> (32 - info.simd_size);
7165 
7166    return info;
7167 }
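/* Worked example (hypothetical sizes): a 10x10x1 workgroup for which SIMD16
 * is selected gives group_size = 100, simd_size = 16 and
 * threads = DIV_ROUND_UP(100, 16) = 7.  The remainder is 100 & 15 = 4, so
 * right_mask = ~0u >> (32 - 4) = 0xf, enabling only the four live channels
 * of the final, partially-filled thread.
 */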
7168 
7169 uint64_t
7170 elk_bsr(const struct intel_device_info *devinfo,
7171         uint32_t offset, uint8_t simd_size, uint8_t local_arg_offset)
7172 {
7173    assert(offset % 64 == 0);
7174    assert(simd_size == 8 || simd_size == 16);
7175    assert(local_arg_offset % 8 == 0);
7176 
7177    return offset |
7178           SET_BITS(simd_size == 8, 4, 4) |
7179           SET_BITS(local_arg_offset / 8, 2, 0);
7180 }
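/* Example encoding (hypothetical arguments): elk_bsr(devinfo, 128, 8, 16)
 * returns 128 | (1 << 4) | (16 / 8) == 0x92 -- the 64-byte-aligned offset in
 * the upper bits, bit 4 flagging SIMD8 dispatch, and bits 2:0 holding the
 * local-argument offset in 8-byte units.
 */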
7181 
7182 /**
7183  * Test the dispatch mask packing assumptions of
7184  * elk_stage_has_packed_dispatch().  Call this from e.g. the top of
7185  * elk_fs_visitor::emit_nir_code() to cause a GPU hang if any shader invocation is
7186  * executed with an unexpected dispatch mask.
7187  */
7188 static UNUSED void
7189 elk_fs_test_dispatch_packing(const fs_builder &bld)
7190 {
7191    const elk_fs_visitor *shader = static_cast<const elk_fs_visitor *>(bld.shader);
7192    const gl_shader_stage stage = shader->stage;
7193    const bool uses_vmask =
7194       stage == MESA_SHADER_FRAGMENT &&
7195       elk_wm_prog_data(shader->stage_prog_data)->uses_vmask;
7196 
7197    if (elk_stage_has_packed_dispatch(shader->devinfo, stage,
7198                                      shader->stage_prog_data)) {
7199       const fs_builder ubld = bld.exec_all().group(1, 0);
7200       const elk_fs_reg tmp = component(bld.vgrf(ELK_REGISTER_TYPE_UD), 0);
7201       const elk_fs_reg mask = uses_vmask ? elk_vmask_reg() : elk_dmask_reg();
7202 
7203       ubld.ADD(tmp, mask, elk_imm_ud(1));
7204       ubld.AND(tmp, mask, tmp);
7205 
7206       /* This will loop forever if the dispatch mask doesn't have the expected
7207        * form '2^n-1', in which case tmp will be non-zero.
7208        */
7209       bld.emit(ELK_OPCODE_DO);
7210       bld.CMP(bld.null_reg_ud(), tmp, elk_imm_ud(0), ELK_CONDITIONAL_NZ);
7211       set_predicate(ELK_PREDICATE_NORMAL, bld.emit(ELK_OPCODE_WHILE));
7212    }
7213 }
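/* Example of the check above (hypothetical masks): a packed SIMD8 dispatch
 * mask of 0xff gives tmp = 0xff & (0xff + 1) = 0, so the loop exits
 * immediately, whereas a non-contiguous mask such as 0xf0 gives
 * tmp = 0xf0 & 0xf1 = 0xf0 != 0 and the WHILE loop never terminates, making
 * the broken packing assumption visible as a hang.
 */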
7214 
7215 unsigned
7216 elk_fs_visitor::workgroup_size() const
7217 {
7218    assert(gl_shader_stage_uses_workgroup(stage));
7219    const struct elk_cs_prog_data *cs = elk_cs_prog_data(prog_data);
7220    return cs->local_size[0] * cs->local_size[1] * cs->local_size[2];
7221 }
7222 
7223 bool elk_should_print_shader(const nir_shader *shader, uint64_t debug_flag)
7224 {
7225    return INTEL_DEBUG(debug_flag) && (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
7226 }
7227 
7228 namespace elk {
7229    elk_fs_reg
7230    fetch_payload_reg(const elk::fs_builder &bld, uint8_t regs[2],
7231                      elk_reg_type type, unsigned n)
7232    {
7233       if (!regs[0])
7234          return elk_fs_reg();
7235 
7236       if (bld.dispatch_width() > 16) {
7237          const elk_fs_reg tmp = bld.vgrf(type, n);
7238          const elk::fs_builder hbld = bld.exec_all().group(16, 0);
7239          const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
7240          elk_fs_reg *const components = new elk_fs_reg[m * n];
7241 
7242          for (unsigned c = 0; c < n; c++) {
7243             for (unsigned g = 0; g < m; g++)
7244                components[c * m + g] =
7245                   offset(retype(elk_vec8_grf(regs[g], 0), type), hbld, c);
7246          }
7247 
7248          hbld.LOAD_PAYLOAD(tmp, components, m * n, 0);
7249 
7250          delete[] components;
7251          return tmp;
7252 
7253       } else {
7254          return elk_fs_reg(retype(elk_vec8_grf(regs[0], 0), type));
7255       }
7256    }
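   /* Example (hypothetical payload): for a SIMD32 shader with n == 1 and a
    * dword type, m == 2, so the two 16-channel halves starting at GRFs
    * regs[0] and regs[1] are stitched together with LOAD_PAYLOAD into a
    * single SIMD32 VGRF; at SIMD16 and below the payload register is simply
    * used in place without a copy.
    */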
7257 
7258    elk_fs_reg
7259    fetch_barycentric_reg(const elk::fs_builder &bld, uint8_t regs[2])
7260    {
7261       if (!regs[0])
7262          return elk_fs_reg();
7263 
7264       const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_F, 2);
7265       const elk::fs_builder hbld = bld.exec_all().group(8, 0);
7266       const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
7267       elk_fs_reg *const components = new elk_fs_reg[2 * m];
7268 
7269       for (unsigned c = 0; c < 2; c++) {
7270          for (unsigned g = 0; g < m; g++)
7271             components[c * m + g] = offset(elk_vec8_grf(regs[g / 2], 0),
7272                                            hbld, c + 2 * (g % 2));
7273       }
7274 
7275       hbld.LOAD_PAYLOAD(tmp, components, 2 * m, 0);
7276 
7277       delete[] components;
7278       return tmp;
7279    }
7280 
7281    void
7282    check_dynamic_msaa_flag(const fs_builder &bld,
7283                            const struct elk_wm_prog_data *wm_prog_data,
7284                            enum intel_msaa_flags flag)
7285    {
7286       elk_fs_inst *inst = bld.AND(bld.null_reg_ud(),
7287                               dynamic_msaa_flags(wm_prog_data),
7288                               elk_imm_ud(flag));
7289       inst->conditional_mod = ELK_CONDITIONAL_NZ;
7290    }
7291 }
7292