1 /*
2  * Copyright © 2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "elk_fs.h"
25 #include "elk_fs_builder.h"
26 #include "elk_nir.h"
27 #include "elk_nir_private.h"
28 #include "elk_eu.h"
29 #include "nir.h"
30 #include "nir_intrinsics.h"
31 #include "nir_search_helpers.h"
32 #include "util/u_math.h"
33 #include "util/bitscan.h"
34 
35 #include <vector>
36 
37 using namespace elk;
38 
39 struct elk_fs_bind_info {
40    bool valid;
41    bool bindless;
42    unsigned block;
43    unsigned set;
44    unsigned binding;
45 };
46 
47 struct nir_to_elk_state {
48    elk_fs_visitor &s;
49    const nir_shader *nir;
50    const intel_device_info *devinfo;
51    void *mem_ctx;
52 
53    /* Points to the end of the program.  Annotated with the current NIR
54     * instruction when applicable.
55     */
56    fs_builder bld;
57 
58    elk_fs_reg *ssa_values;
59    elk_fs_inst **resource_insts;
60    struct elk_fs_bind_info *ssa_bind_infos;
61    elk_fs_reg *resource_values;
62    elk_fs_reg *system_values;
63 };
64 
65 static elk_fs_reg get_nir_src(nir_to_elk_state &ntb, const nir_src &src);
66 static elk_fs_reg get_nir_def(nir_to_elk_state &ntb, const nir_def &def);
67 static nir_component_mask_t get_nir_write_mask(const nir_def &def);
68 
69 static void fs_nir_emit_intrinsic(nir_to_elk_state &ntb, const fs_builder &bld, nir_intrinsic_instr *instr);
70 static elk_fs_reg emit_samplepos_setup(nir_to_elk_state &ntb);
71 static elk_fs_reg emit_sampleid_setup(nir_to_elk_state &ntb);
72 static elk_fs_reg emit_samplemaskin_setup(nir_to_elk_state &ntb);
73 
74 static void fs_nir_emit_impl(nir_to_elk_state &ntb, nir_function_impl *impl);
75 static void fs_nir_emit_cf_list(nir_to_elk_state &ntb, exec_list *list);
76 static void fs_nir_emit_if(nir_to_elk_state &ntb, nir_if *if_stmt);
77 static void fs_nir_emit_loop(nir_to_elk_state &ntb, nir_loop *loop);
78 static void fs_nir_emit_block(nir_to_elk_state &ntb, nir_block *block);
79 static void fs_nir_emit_instr(nir_to_elk_state &ntb, nir_instr *instr);
80 
81 static void fs_nir_emit_surface_atomic(nir_to_elk_state &ntb,
82                                        const fs_builder &bld,
83                                        nir_intrinsic_instr *instr,
84                                        elk_fs_reg surface,
85                                        bool bindless);
86 static void fs_nir_emit_global_atomic(nir_to_elk_state &ntb,
87                                       const fs_builder &bld,
88                                       nir_intrinsic_instr *instr);
89 
90 static void
91 fs_nir_setup_outputs(nir_to_elk_state &ntb)
92 {
93    elk_fs_visitor &s = ntb.s;
94 
95    if (s.stage == MESA_SHADER_TESS_CTRL ||
96        s.stage == MESA_SHADER_FRAGMENT)
97       return;
98 
99    unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
100 
101    /* Calculate the size of output registers in a separate pass, before
102     * allocating them.  With ARB_enhanced_layouts, multiple output variables
103     * may occupy the same slot, but have different type sizes.
104     */
105    nir_foreach_shader_out_variable(var, s.nir) {
106       const int loc = var->data.driver_location;
107       const unsigned var_vec4s = nir_variable_count_slots(var, var->type);
108       vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
109    }
110 
111    for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
112       if (vec4s[loc] == 0) {
113          loc++;
114          continue;
115       }
116 
117       unsigned reg_size = vec4s[loc];
118 
119       /* Check if there are any ranges that start within this range and extend
120        * past it. If so, include them in this allocation.
121        */
122       for (unsigned i = 1; i < reg_size; i++) {
123          assert(i + loc < ARRAY_SIZE(vec4s));
124          reg_size = MAX2(vec4s[i + loc] + i, reg_size);
125       }
126 
127       elk_fs_reg reg = ntb.bld.vgrf(ELK_REGISTER_TYPE_F, 4 * reg_size);
128       for (unsigned i = 0; i < reg_size; i++) {
129          assert(loc + i < ARRAY_SIZE(s.outputs));
130          s.outputs[loc + i] = offset(reg, ntb.bld, 4 * i);
131       }
132 
133       loc += reg_size;
134    }
135 }
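/* Worked example of the sizing loop above (illustrative, not part of the
 * pass): with vec4s[] = { 2, 3, 0, ... }, the range starting at loc 1 begins
 * inside [0, 2) but extends past it, so the inner loop grows reg_size to
 * MAX2(vec4s[1] + 1, 2) = 4.  A single 4-slot VGRF is then allocated,
 * s.outputs[0..3] all alias offsets of it, and loc advances to 4.
 */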
136 
137 static void
138 fs_nir_setup_uniforms(elk_fs_visitor &s)
139 {
140    /* Only the first compile gets to set up uniforms. */
141    if (s.push_constant_loc)
142       return;
143 
144    s.uniforms = s.nir->num_uniforms / 4;
145 
146    if (gl_shader_stage_is_compute(s.stage)) {
147       /* Add uniforms for builtins after regular NIR uniforms. */
148       assert(s.uniforms == s.prog_data->nr_params);
149 
150       /* Subgroup ID must be the last uniform on the list.  This will make
151        * it easier later to split between cross thread and per thread
152        * uniforms.
153        */
154       uint32_t *param = elk_stage_prog_data_add_params(s.prog_data, 1);
155       *param = ELK_PARAM_BUILTIN_SUBGROUP_ID;
156       s.uniforms++;
157    }
158 }
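/* Sketch of the resulting push-constant layout for a compute shader with N
 * 32-bit NIR uniforms (illustrative):
 *
 *    param[0 .. N-1]   regular NIR uniforms
 *    param[N]          ELK_PARAM_BUILTIN_SUBGROUP_ID
 *
 * The subgroup ID is kept last so that the later split between cross-thread
 * and per-thread uniforms only has to look at the tail of the list.
 */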
159 
160 static elk_fs_reg
161 emit_work_group_id_setup(nir_to_elk_state &ntb)
162 {
163    elk_fs_visitor &s = ntb.s;
164    const fs_builder &bld = ntb.bld;
165 
166    assert(gl_shader_stage_is_compute(s.stage));
167 
168    elk_fs_reg id = bld.vgrf(ELK_REGISTER_TYPE_UD, 3);
169 
170    struct elk_reg r0_1(retype(elk_vec1_grf(0, 1), ELK_REGISTER_TYPE_UD));
171    bld.MOV(id, r0_1);
172 
173    struct elk_reg r0_6(retype(elk_vec1_grf(0, 6), ELK_REGISTER_TYPE_UD));
174    struct elk_reg r0_7(retype(elk_vec1_grf(0, 7), ELK_REGISTER_TYPE_UD));
175    bld.MOV(offset(id, bld, 1), r0_6);
176    bld.MOV(offset(id, bld, 2), r0_7);
177 
178    return id;
179 }
180 
181 static bool
182 emit_system_values_block(nir_to_elk_state &ntb, nir_block *block)
183 {
184    elk_fs_visitor &s = ntb.s;
185    elk_fs_reg *reg;
186 
187    nir_foreach_instr(instr, block) {
188       if (instr->type != nir_instr_type_intrinsic)
189          continue;
190 
191       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
192       switch (intrin->intrinsic) {
193       case nir_intrinsic_load_vertex_id:
194       case nir_intrinsic_load_base_vertex:
195          unreachable("should be lowered by nir_lower_system_values().");
196 
197       case nir_intrinsic_load_vertex_id_zero_base:
198       case nir_intrinsic_load_is_indexed_draw:
199       case nir_intrinsic_load_first_vertex:
200       case nir_intrinsic_load_instance_id:
201       case nir_intrinsic_load_base_instance:
202          unreachable("should be lowered by elk_nir_lower_vs_inputs().");
203          break;
204 
205       case nir_intrinsic_load_draw_id:
206          unreachable("should be lowered by elk_nir_lower_vs_inputs().");
207          break;
208 
209       case nir_intrinsic_load_invocation_id:
210          if (s.stage == MESA_SHADER_TESS_CTRL)
211             break;
212          assert(s.stage == MESA_SHADER_GEOMETRY);
213          reg = &ntb.system_values[SYSTEM_VALUE_INVOCATION_ID];
214          if (reg->file == BAD_FILE) {
215             *reg = s.gs_payload().instance_id;
216          }
217          break;
218 
219       case nir_intrinsic_load_sample_pos:
220       case nir_intrinsic_load_sample_pos_or_center:
221          assert(s.stage == MESA_SHADER_FRAGMENT);
222          reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_POS];
223          if (reg->file == BAD_FILE)
224             *reg = emit_samplepos_setup(ntb);
225          break;
226 
227       case nir_intrinsic_load_sample_id:
228          assert(s.stage == MESA_SHADER_FRAGMENT);
229          reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_ID];
230          if (reg->file == BAD_FILE)
231             *reg = emit_sampleid_setup(ntb);
232          break;
233 
234       case nir_intrinsic_load_sample_mask_in:
235          assert(s.stage == MESA_SHADER_FRAGMENT);
236          assert(s.devinfo->ver >= 7);
237          reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
238          if (reg->file == BAD_FILE)
239             *reg = emit_samplemaskin_setup(ntb);
240          break;
241 
242       case nir_intrinsic_load_workgroup_id:
243          assert(gl_shader_stage_is_compute(s.stage));
244          reg = &ntb.system_values[SYSTEM_VALUE_WORKGROUP_ID];
245          if (reg->file == BAD_FILE)
246             *reg = emit_work_group_id_setup(ntb);
247          break;
248 
249       case nir_intrinsic_load_helper_invocation:
250          assert(s.stage == MESA_SHADER_FRAGMENT);
251          reg = &ntb.system_values[SYSTEM_VALUE_HELPER_INVOCATION];
252          if (reg->file == BAD_FILE) {
253             const fs_builder abld =
254                ntb.bld.annotate("gl_HelperInvocation", NULL);
255 
256             /* On Gfx6+ (gl_HelperInvocation is only exposed on Gfx7+) the
257              * pixel mask is in g1.7 of the thread payload.
258              *
259              * We move the per-channel pixel enable bit to the low bit of each
260              * channel by shifting the byte containing the pixel mask by the
261              * vector immediate 0x76543210UV.
262              *
263              * The region of <1,8,0> reads only 1 byte (the pixel masks for
264              * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
265              * masks for 2 and 3) in SIMD16.
266              */
267             elk_fs_reg shifted = abld.vgrf(ELK_REGISTER_TYPE_UW, 1);
268 
269             for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
270                const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
271                /* According to the "PS Thread Payload for Normal
272                 * Dispatch" pages on the BSpec, the dispatch mask is
273                 * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on
274                 * gfx6+.
275                 */
276                const struct elk_reg reg = elk_vec1_grf(i + 1, 7);
277                hbld.SHR(offset(shifted, hbld, i),
278                         stride(retype(reg, ELK_REGISTER_TYPE_UB), 1, 8, 0),
279                         elk_imm_v(0x76543210));
280             }
281 
282             /* A set bit in the pixel mask means the channel is enabled, but
283              * that is the opposite of gl_HelperInvocation so we need to invert
284              * the mask.
285              *
286              * The negate source-modifier bit of logical instructions on Gfx8+
287              * performs 1's complement negation, so we can use that instead of
288              * a NOT instruction.
289              */
290             elk_fs_reg inverted = negate(shifted);
291             if (s.devinfo->ver < 8) {
292                inverted = abld.vgrf(ELK_REGISTER_TYPE_UW);
293                abld.NOT(inverted, shifted);
294             }
295 
296             /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
297              * with 1 and negating.
298              */
299             elk_fs_reg anded = abld.vgrf(ELK_REGISTER_TYPE_UD, 1);
300             abld.AND(anded, inverted, elk_imm_uw(1));
301 
302             elk_fs_reg dst = abld.vgrf(ELK_REGISTER_TYPE_D, 1);
303             abld.MOV(dst, negate(retype(anded, ELK_REGISTER_TYPE_D)));
304             *reg = dst;
305          }
306          break;
307 
308       default:
309          break;
310       }
311    }
312 
313    return true;
314 }
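/* Scalar sketch of the gl_HelperInvocation math above (illustrative only,
 * per channel):
 *
 *    uint16_t shifted  = pixel_mask >> lane;   // SHR by the 0x76543210UV ramp
 *    uint16_t inverted = ~shifted;             // set low bit now means helper
 *    uint32_t anded    = inverted & 1;         // isolate that bit
 *    int32_t  value    = -(int32_t)anded;      // 0 (live pixel) or ~0 (helper)
 *
 * e.g. a SIMD8 pixel mask of 0x0f gives 0 for lanes 0-3 and ~0 for lanes 4-7.
 */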
315 
316 static void
317 fs_nir_emit_system_values(nir_to_elk_state &ntb)
318 {
319    const fs_builder &bld = ntb.bld;
320    elk_fs_visitor &s = ntb.s;
321 
322    ntb.system_values = ralloc_array(ntb.mem_ctx, elk_fs_reg, SYSTEM_VALUE_MAX);
323    for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
324       ntb.system_values[i] = elk_fs_reg();
325    }
326 
327    /* Always emit SUBGROUP_INVOCATION.  Dead code will clean it up if we
328     * never end up using it.
329     */
330    {
331       const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
332       elk_fs_reg &reg = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
333       reg = abld.vgrf(ELK_REGISTER_TYPE_UW);
334       abld.UNDEF(reg);
335 
336       const fs_builder allbld8 = abld.group(8, 0).exec_all();
337       allbld8.MOV(reg, elk_imm_v(0x76543210));
338       if (s.dispatch_width > 8)
339          allbld8.ADD(byte_offset(reg, 16), reg, elk_imm_uw(8u));
340       if (s.dispatch_width > 16) {
341          const fs_builder allbld16 = abld.group(16, 0).exec_all();
342          allbld16.ADD(byte_offset(reg, 32), reg, elk_imm_uw(16u));
343       }
344    }
345 
346    nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)s.nir);
347    nir_foreach_block(block, impl)
348       emit_system_values_block(ntb, block);
349 }
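/* What the block above leaves in gl_SubgroupInvocation (illustrative, SIMD32
 * case): the MOV of the vector immediate writes lanes 0-7 as {0..7}, the
 * first ADD fills lanes 8-15 with {8..15}, and the SIMD16 ADD then fills
 * lanes 16-31 with {16..31}, i.e. a UW ramp of 0..dispatch_width-1.
 */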
350 
351 static void
352 fs_nir_emit_impl(nir_to_elk_state &ntb, nir_function_impl *impl)
353 {
354    ntb.ssa_values = rzalloc_array(ntb.mem_ctx, elk_fs_reg, impl->ssa_alloc);
355    ntb.resource_insts = rzalloc_array(ntb.mem_ctx, elk_fs_inst *, impl->ssa_alloc);
356    ntb.ssa_bind_infos = rzalloc_array(ntb.mem_ctx, struct elk_fs_bind_info, impl->ssa_alloc);
357    ntb.resource_values = rzalloc_array(ntb.mem_ctx, elk_fs_reg, impl->ssa_alloc);
358 
359    fs_nir_emit_cf_list(ntb, &impl->body);
360 }
361 
362 static void
363 fs_nir_emit_cf_list(nir_to_elk_state &ntb, exec_list *list)
364 {
365    exec_list_validate(list);
366    foreach_list_typed(nir_cf_node, node, node, list) {
367       switch (node->type) {
368       case nir_cf_node_if:
369          fs_nir_emit_if(ntb, nir_cf_node_as_if(node));
370          break;
371 
372       case nir_cf_node_loop:
373          fs_nir_emit_loop(ntb, nir_cf_node_as_loop(node));
374          break;
375 
376       case nir_cf_node_block:
377          fs_nir_emit_block(ntb, nir_cf_node_as_block(node));
378          break;
379 
380       default:
381          unreachable("Invalid CFG node block");
382       }
383    }
384 }
385 
386 static void
387 fs_nir_emit_if(nir_to_elk_state &ntb, nir_if *if_stmt)
388 {
389    const intel_device_info *devinfo = ntb.devinfo;
390    const fs_builder &bld = ntb.bld;
391 
392    bool invert;
393    elk_fs_reg cond_reg;
394 
395    /* If the condition has the form !other_condition, use other_condition as
396     * the source, but invert the predicate on the if instruction.
397     */
398    nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition);
399    if (cond != NULL && cond->op == nir_op_inot) {
400       invert = true;
401       cond_reg = get_nir_src(ntb, cond->src[0].src);
402       cond_reg = offset(cond_reg, bld, cond->src[0].swizzle[0]);
403 
404       if (devinfo->ver <= 5 &&
405 	  (cond->instr.pass_flags & ELK_NIR_BOOLEAN_MASK) == ELK_NIR_BOOLEAN_NEEDS_RESOLVE) {
406          /* redo boolean resolve on gen5 */
407          elk_fs_reg masked = ntb.s.vgrf(glsl_int_type());
408          bld.AND(masked, cond_reg, elk_imm_d(1));
409          masked.negate = true;
410          elk_fs_reg tmp = bld.vgrf(cond_reg.type);
411          bld.MOV(retype(tmp, ELK_REGISTER_TYPE_D), masked);
412          cond_reg = tmp;
413       }
414    } else {
415       invert = false;
416       cond_reg = get_nir_src(ntb, if_stmt->condition);
417    }
418 
419    /* first, put the condition into f0 */
420    elk_fs_inst *inst = bld.MOV(bld.null_reg_d(),
421                            retype(cond_reg, ELK_REGISTER_TYPE_D));
422    inst->conditional_mod = ELK_CONDITIONAL_NZ;
423 
424    bld.IF(ELK_PREDICATE_NORMAL)->predicate_inverse = invert;
425 
426    fs_nir_emit_cf_list(ntb, &if_stmt->then_list);
427 
428    if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
429       bld.emit(ELK_OPCODE_ELSE);
430       fs_nir_emit_cf_list(ntb, &if_stmt->else_list);
431    }
432 
433    bld.emit(ELK_OPCODE_ENDIF);
434 
435    if (devinfo->ver < 7)
436       ntb.s.limit_dispatch_width(16, "Non-uniform control flow unsupported "
437                                    "in SIMD32 mode.");
438 }
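/* The pre-gfx6 "boolean resolve" above amounts to, per channel
 * (illustrative):
 *
 *    int32_t resolved = -(cond & 1);   // low bit 1 -> ~0, low bit 0 -> 0
 *
 * so a NIR boolean whose upper bits are not guaranteed still reaches flag f0
 * as a clean 0 / ~0 before the IF.
 */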
439 
440 static void
441 fs_nir_emit_loop(nir_to_elk_state &ntb, nir_loop *loop)
442 {
443    const intel_device_info *devinfo = ntb.devinfo;
444    const fs_builder &bld = ntb.bld;
445 
446    assert(!nir_loop_has_continue_construct(loop));
447    bld.emit(ELK_OPCODE_DO);
448 
449    fs_nir_emit_cf_list(ntb, &loop->body);
450 
451    bld.emit(ELK_OPCODE_WHILE);
452 
453    if (devinfo->ver < 7)
454       ntb.s.limit_dispatch_width(16, "Non-uniform control flow unsupported "
455                                    "in SIMD32 mode.");
456 }
457 
458 static void
459 fs_nir_emit_block(nir_to_elk_state &ntb, nir_block *block)
460 {
461    fs_builder bld = ntb.bld;
462 
463    nir_foreach_instr(instr, block) {
464       fs_nir_emit_instr(ntb, instr);
465    }
466 
467    ntb.bld = bld;
468 }
469 
470 /**
471  * Recognizes a parent instruction of nir_op_extract_* and changes the type to
472  * match instr.
473  */
474 static bool
475 optimize_extract_to_float(nir_to_elk_state &ntb, nir_alu_instr *instr,
476                           const elk_fs_reg &result)
477 {
478    const intel_device_info *devinfo = ntb.devinfo;
479    const fs_builder &bld = ntb.bld;
480 
481    /* No fast path for f16 or f64. */
482    assert(instr->op == nir_op_i2f32 || instr->op == nir_op_u2f32);
483 
484    if (!instr->src[0].src.ssa->parent_instr)
485       return false;
486 
487    if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
488       return false;
489 
490    nir_alu_instr *src0 =
491       nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
492 
493    unsigned bytes;
494    bool is_signed;
495 
496    switch (src0->op) {
497    case nir_op_extract_u8:
498    case nir_op_extract_u16:
499       bytes = src0->op == nir_op_extract_u8 ? 1 : 2;
500 
501       /* i2f(extract_u8(a, b)) and u2f(extract_u8(a, b)) produce the same
502        * result. Ditto for extract_u16.
503        */
504       is_signed = false;
505       break;
506 
507    case nir_op_extract_i8:
508    case nir_op_extract_i16:
509       bytes = src0->op == nir_op_extract_i8 ? 1 : 2;
510 
511       /* The fast path can't handle u2f(extract_i8(a, b)) because the implicit
512        * sign extension of the extract_i8 is lost. For example,
513        * u2f(extract_i8(0x0000ff00, 1)) should produce 4294967295.0, but a
514        * fast path could either give 255.0 (by implementing the fast path as
515        * u2f(extract_u8(x))) or -1.0 (by implementing the fast path as
516        * i2f(extract_i8(x))). At one point in time, we incorrectly implemented
517        * the former.
518        */
519       if (instr->op != nir_op_i2f32)
520          return false;
521 
522       is_signed = true;
523       break;
524 
525    default:
526       return false;
527    }
528 
529    unsigned element = nir_src_as_uint(src0->src[1].src);
530 
531    /* Element type to extract. */
532    const elk_reg_type type = elk_int_type(bytes, is_signed);
533 
534    elk_fs_reg op0 = get_nir_src(ntb, src0->src[0].src);
535    op0.type = elk_type_for_nir_type(devinfo,
536       (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
537                      nir_src_bit_size(src0->src[0].src)));
538    op0 = offset(op0, bld, src0->src[0].swizzle[0]);
539 
540    bld.MOV(result, subscript(op0, type, element));
541    return true;
542 }
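/* Example of the fast path above (illustrative): i2f32(extract_u8(x, 2))
 * becomes a single MOV that reads byte 2 of x through subscript(), i.e. the
 * scalar equivalent of
 *
 *    result = (float)((x >> 16) & 0xff);
 *
 * instead of a separate byte extract followed by a convert.
 */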
543 
544 static bool
545 optimize_frontfacing_ternary(nir_to_elk_state &ntb,
546                              nir_alu_instr *instr,
547                              const elk_fs_reg &result)
548 {
549    const intel_device_info *devinfo = ntb.devinfo;
550    elk_fs_visitor &s = ntb.s;
551 
552    nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src);
553    if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face)
554       return false;
555 
556    if (!nir_src_is_const(instr->src[1].src) ||
557        !nir_src_is_const(instr->src[2].src))
558       return false;
559 
560    const float value1 = nir_src_as_float(instr->src[1].src);
561    const float value2 = nir_src_as_float(instr->src[2].src);
562    if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
563       return false;
564 
565    /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
566    assert(value1 == -value2);
567 
568    elk_fs_reg tmp = s.vgrf(glsl_int_type());
569 
570    if (devinfo->ver >= 6) {
571       /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
572       elk_fs_reg g0 = elk_fs_reg(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_W));
573 
574       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
575        *
576        *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
577        *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
578        *
579        * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
580        *
581        * This negation looks like it's safe in practice, because bits 0:4 will
582        * surely be TRIANGLES
583        */
584 
585       if (value1 == -1.0f) {
586          g0.negate = true;
587       }
588 
589       ntb.bld.OR(subscript(tmp, ELK_REGISTER_TYPE_W, 1),
590                   g0, elk_imm_uw(0x3f80));
591    } else {
592       /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
593       elk_fs_reg g1_6 = elk_fs_reg(retype(elk_vec1_grf(1, 6), ELK_REGISTER_TYPE_D));
594 
595       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
596        *
597        *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
598        *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
599        *
600        * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
601        *
602        * This negation looks like it's safe in practice, because bits 0:4 will
603        * surely be TRIANGLES
604        */
605 
606       if (value1 == -1.0f) {
607          g1_6.negate = true;
608       }
609 
610       ntb.bld.OR(tmp, g1_6, elk_imm_d(0x3f800000));
611    }
612    ntb.bld.AND(retype(result, ELK_REGISTER_TYPE_D), tmp, elk_imm_d(0xbf800000));
613 
614    return true;
615 }
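/* Bit-level sketch of the gfx6+ path above (illustrative): tmp's high word is
 * g0.0W | 0x3f80, so after the final AND with 0xbf800000 (which masks off the
 * remaining payload bits):
 *
 *    front facing (bit 15 clear): result = 0x3f800000 =  1.0f
 *    back facing  (bit 15 set)  : result = 0xbf800000 = -1.0f
 *
 * Negating g0.0 beforehand swaps the two cases for the (-1.0 : 1.0) variant.
 */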
616 
617 static elk_rnd_mode
618 elk_rnd_mode_from_nir_op (const nir_op op) {
619    switch (op) {
620    case nir_op_f2f16_rtz:
621       return ELK_RND_MODE_RTZ;
622    case nir_op_f2f16_rtne:
623       return ELK_RND_MODE_RTNE;
624    default:
625       unreachable("Operation doesn't support rounding mode");
626    }
627 }
628 
629 static elk_rnd_mode
630 elk_rnd_mode_from_execution_mode(unsigned execution_mode)
631 {
632    if (nir_has_any_rounding_mode_rtne(execution_mode))
633       return ELK_RND_MODE_RTNE;
634    if (nir_has_any_rounding_mode_rtz(execution_mode))
635       return ELK_RND_MODE_RTZ;
636    return ELK_RND_MODE_UNSPECIFIED;
637 }
638 
639 static elk_fs_reg
640 prepare_alu_destination_and_sources(nir_to_elk_state &ntb,
641                                     const fs_builder &bld,
642                                     nir_alu_instr *instr,
643                                     elk_fs_reg *op,
644                                     bool need_dest)
645 {
646    const intel_device_info *devinfo = ntb.devinfo;
647 
648    elk_fs_reg result =
649       need_dest ? get_nir_def(ntb, instr->def) : bld.null_reg_ud();
650 
651    result.type = elk_type_for_nir_type(devinfo,
652       (nir_alu_type)(nir_op_infos[instr->op].output_type |
653                      instr->def.bit_size));
654 
655    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
656       op[i] = get_nir_src(ntb, instr->src[i].src);
657       op[i].type = elk_type_for_nir_type(devinfo,
658          (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
659                         nir_src_bit_size(instr->src[i].src)));
660    }
661 
662    /* Move and vecN instructions may still be vectored.  Return the raw,
663     * vectored source and destination so that elk_fs_visitor::nir_emit_alu can
664     * handle it.  Other callers should not have to handle these kinds of
665     * instructions.
666     */
667    switch (instr->op) {
668    case nir_op_mov:
669    case nir_op_vec2:
670    case nir_op_vec3:
671    case nir_op_vec4:
672    case nir_op_vec8:
673    case nir_op_vec16:
674       return result;
675    default:
676       break;
677    }
678 
679    /* At this point, we have dealt with any instruction that operates on
680     * more than a single channel.  Therefore, we can just adjust the source
681     * and destination registers for that channel and emit the instruction.
682     */
683    unsigned channel = 0;
684    if (nir_op_infos[instr->op].output_size == 0) {
685       /* Since NIR is doing the scalarizing for us, we should only ever see
686        * vectorized operations with a single channel.
687        */
688       nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
689       assert(util_bitcount(write_mask) == 1);
690       channel = ffs(write_mask) - 1;
691 
692       result = offset(result, bld, channel);
693    }
694 
695    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
696       assert(nir_op_infos[instr->op].input_sizes[i] < 2);
697       op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
698    }
699 
700    return result;
701 }
702 
703 static elk_fs_reg
704 resolve_source_modifiers(const fs_builder &bld, const elk_fs_reg &src)
705 {
706    if (!src.abs && !src.negate)
707       return src;
708 
709    elk_fs_reg temp = bld.vgrf(src.type);
710    bld.MOV(temp, src);
711 
712    return temp;
713 }
714 
715 static void
716 resolve_inot_sources(nir_to_elk_state &ntb, const fs_builder &bld, nir_alu_instr *instr,
717                      elk_fs_reg *op)
718 {
719    for (unsigned i = 0; i < 2; i++) {
720       nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src);
721 
722       if (inot_instr != NULL && inot_instr->op == nir_op_inot) {
723          /* The source of the inot is now the source of instr. */
724          prepare_alu_destination_and_sources(ntb, bld, inot_instr, &op[i], false);
725 
726          assert(!op[i].negate);
727          op[i].negate = true;
728       } else {
729          op[i] = resolve_source_modifiers(bld, op[i]);
730       }
731    }
732 }
733 
734 static bool
735 try_emit_b2fi_of_inot(nir_to_elk_state &ntb, const fs_builder &bld,
736                       elk_fs_reg result,
737                       nir_alu_instr *instr)
738 {
739    const intel_device_info *devinfo = bld.shader->devinfo;
740 
741    if (devinfo->ver < 6)
742       return false;
743 
744    nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src);
745 
746    if (inot_instr == NULL || inot_instr->op != nir_op_inot)
747       return false;
748 
749    /* HF is also possible as a destination on BDW+.  For nir_op_b2i, the set
750     * of valid size-changing combinations is a bit more complex.
751     *
752     * The source restriction is just because I was lazy about generating the
753     * constant below.
754     */
755    if (instr->def.bit_size != 32 ||
756        nir_src_bit_size(inot_instr->src[0].src) != 32)
757       return false;
758 
759    /* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0.  Since a can only be 0 or -1,
760     * this is float(1 + a).
761     */
762    elk_fs_reg op;
763 
764    prepare_alu_destination_and_sources(ntb, bld, inot_instr, &op, false);
765 
766    /* Ignore the saturate modifier, if there is one.  The result of the
767     * arithmetic can only be 0 or 1, so the clamping will do nothing anyway.
768     */
769    bld.ADD(result, op, elk_imm_d(1));
770 
771    return true;
772 }
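/* The identity used above (illustrative): NIR booleans are 0 / -1, so for
 * b2f(inot(a)):
 *
 *    a =  0 (false) -> 1 + a = 1 -> 1.0f
 *    a = -1 (true)  -> 1 + a = 0 -> 0.0f
 *
 * which is why a single ADD with an immediate 1 and a float destination
 * replaces the NOT plus the conversion.
 */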
773 
774 /**
775  * Emit code for nir_op_fsign possibly fused with a nir_op_fmul
776  *
777  * If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of
778  * the source of \c instr that is a \c nir_op_fsign.
779  */
780 static void
781 emit_fsign(nir_to_elk_state &ntb, const fs_builder &bld, const nir_alu_instr *instr,
782            elk_fs_reg result, elk_fs_reg *op, unsigned fsign_src)
783 {
784    const intel_device_info *devinfo = ntb.devinfo;
785 
786    elk_fs_inst *inst;
787 
788    assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul);
789    assert(fsign_src < nir_op_infos[instr->op].num_inputs);
790 
791    if (instr->op != nir_op_fsign) {
792       const nir_alu_instr *const fsign_instr =
793          nir_src_as_alu_instr(instr->src[fsign_src].src);
794 
795       /* op[fsign_src] has the nominal result of the fsign, and op[1 -
796        * fsign_src] has the other multiply source.  This must be rearranged so
797        * that op[0] is the source of the fsign op[1] is the other multiply
798        * source.
799        */
800       if (fsign_src != 0)
801          op[1] = op[0];
802 
803       op[0] = get_nir_src(ntb, fsign_instr->src[0].src);
804 
805       const nir_alu_type t =
806          (nir_alu_type)(nir_op_infos[instr->op].input_types[0] |
807                         nir_src_bit_size(fsign_instr->src[0].src));
808 
809       op[0].type = elk_type_for_nir_type(devinfo, t);
810 
811       unsigned channel = 0;
812       if (nir_op_infos[instr->op].output_size == 0) {
813          /* Since NIR is doing the scalarizing for us, we should only ever see
814           * vectorized operations with a single channel.
815           */
816          nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
817          assert(util_bitcount(write_mask) == 1);
818          channel = ffs(write_mask) - 1;
819       }
820 
821       op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]);
822    }
823 
824    if (type_sz(op[0].type) == 2) {
825       /* AND(val, 0x8000) gives the sign bit.
826        *
827        * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero.
828        */
829       elk_fs_reg zero = retype(elk_imm_uw(0), ELK_REGISTER_TYPE_HF);
830       bld.CMP(bld.null_reg_f(), op[0], zero, ELK_CONDITIONAL_NZ);
831 
832       op[0].type = ELK_REGISTER_TYPE_UW;
833       result.type = ELK_REGISTER_TYPE_UW;
834       bld.AND(result, op[0], elk_imm_uw(0x8000u));
835 
836       if (instr->op == nir_op_fsign)
837          inst = bld.OR(result, result, elk_imm_uw(0x3c00u));
838       else {
839          /* Use XOR here to get the result sign correct. */
840          inst = bld.XOR(result, result, retype(op[1], ELK_REGISTER_TYPE_UW));
841       }
842 
843       inst->predicate = ELK_PREDICATE_NORMAL;
844    } else if (type_sz(op[0].type) == 4) {
845       /* AND(val, 0x80000000) gives the sign bit.
846        *
847        * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
848        * zero.
849        */
850       bld.CMP(bld.null_reg_f(), op[0], elk_imm_f(0.0f), ELK_CONDITIONAL_NZ);
851 
852       op[0].type = ELK_REGISTER_TYPE_UD;
853       result.type = ELK_REGISTER_TYPE_UD;
854       bld.AND(result, op[0], elk_imm_ud(0x80000000u));
855 
856       if (instr->op == nir_op_fsign)
857          inst = bld.OR(result, result, elk_imm_ud(0x3f800000u));
858       else {
859          /* Use XOR here to get the result sign correct. */
860          inst = bld.XOR(result, result, retype(op[1], ELK_REGISTER_TYPE_UD));
861       }
862 
863       inst->predicate = ELK_PREDICATE_NORMAL;
864    } else {
865       unreachable("Should have been lowered by nir_opt_algebraic.");
866    }
867 }
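/* Scalar sketch of the 32-bit path above (illustrative; bits() stands for the
 * IEEE-754 encoding of the float):
 *
 *    uint32_t result = bits(x) & 0x80000000;               // unconditional AND
 *    if (x != 0.0f)                                        // predicate from CMP
 *       result = is_fsign ? result | 0x3f800000            // +/-1.0f
 *                         : result ^ bits(op[1]);          // fsign(x) * op[1]
 *    // x == 0.0f leaves result as the signed zero of x.
 *
 * The 16-bit path is the same with 0x8000 / 0x3c00 in place of the 32-bit
 * constants.
 */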
868 
869 /**
870  * Determine whether sources of a nir_op_fmul can be fused with a nir_op_fsign
871  *
872  * Checks the operands of a \c nir_op_fmul to determine whether or not
873  * \c emit_fsign could fuse the multiplication with the \c sign() calculation.
874  *
875  * \param instr  The multiplication instruction
876  *
877  * \param fsign_src The source of \c instr that may or may not be a
878  *                  \c nir_op_fsign
879  */
880 static bool
881 can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src)
882 {
883    assert(instr->op == nir_op_fmul);
884 
885    nir_alu_instr *const fsign_instr =
886       nir_src_as_alu_instr(instr->src[fsign_src].src);
887 
888    /* Rules:
889     *
890     * 1. instr->src[fsign_src] must be a nir_op_fsign.
891     * 2. The nir_op_fsign can only be used by this multiplication.
892     * 3. The source that is the nir_op_fsign does not have source modifiers.
893     *    \c emit_fsign only examines the source modifiers of the source of the
894     *    \c nir_op_fsign.
895     *
896     * The nir_op_fsign must also not have the saturate modifier, but steps
897     * have already been taken (in nir_opt_algebraic) to ensure that.
898     */
899    return fsign_instr != NULL && fsign_instr->op == nir_op_fsign &&
900           is_used_once(fsign_instr);
901 }
902 
903 static bool
904 is_const_zero(const nir_src &src)
905 {
906    return nir_src_is_const(src) && nir_src_as_int(src) == 0;
907 }
908 
909 static void
910 fs_nir_emit_alu(nir_to_elk_state &ntb, nir_alu_instr *instr,
911                 bool need_dest)
912 {
913    const intel_device_info *devinfo = ntb.devinfo;
914    const fs_builder &bld = ntb.bld;
915    elk_fs_visitor &s = ntb.s;
916 
917    elk_fs_inst *inst;
918    unsigned execution_mode =
919       bld.shader->nir->info.float_controls_execution_mode;
920 
921    elk_fs_reg op[NIR_MAX_VEC_COMPONENTS];
922    elk_fs_reg result = prepare_alu_destination_and_sources(ntb, bld, instr, op, need_dest);
923 
924 #ifndef NDEBUG
925    /* Everything except raw moves, some type conversions, iabs, and ineg
926     * should have 8-bit sources lowered by nir_lower_bit_size in
927     * elk_preprocess_nir or by elk_nir_lower_conversions in
928     * elk_postprocess_nir.
929     */
930    switch (instr->op) {
931    case nir_op_mov:
932    case nir_op_vec2:
933    case nir_op_vec3:
934    case nir_op_vec4:
935    case nir_op_vec8:
936    case nir_op_vec16:
937    case nir_op_i2f16:
938    case nir_op_i2f32:
939    case nir_op_i2i16:
940    case nir_op_i2i32:
941    case nir_op_u2f16:
942    case nir_op_u2f32:
943    case nir_op_u2u16:
944    case nir_op_u2u32:
945    case nir_op_iabs:
946    case nir_op_ineg:
947    case nir_op_pack_32_4x8_split:
948       break;
949 
950    default:
951       for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
952          assert(type_sz(op[i].type) > 1);
953       }
954    }
955 #endif
956 
957    switch (instr->op) {
958    case nir_op_mov:
959    case nir_op_vec2:
960    case nir_op_vec3:
961    case nir_op_vec4:
962    case nir_op_vec8:
963    case nir_op_vec16: {
964       elk_fs_reg temp = result;
965       bool need_extra_copy = false;
966 
967       nir_intrinsic_instr *store_reg =
968          nir_store_reg_for_def(&instr->def);
969       if (store_reg != NULL) {
970          nir_def *dest_reg = store_reg->src[1].ssa;
971          for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
972             nir_intrinsic_instr *load_reg =
973                nir_load_reg_for_def(instr->src[i].src.ssa);
974             if (load_reg == NULL)
975                continue;
976 
977             if (load_reg->src[0].ssa == dest_reg) {
978                need_extra_copy = true;
979                temp = bld.vgrf(result.type, 4);
980                break;
981             }
982          }
983       }
984 
985       nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
986       unsigned last_bit = util_last_bit(write_mask);
987 
988       for (unsigned i = 0; i < last_bit; i++) {
989          if (!(write_mask & (1 << i)))
990             continue;
991 
992          if (instr->op == nir_op_mov) {
993             bld.MOV(offset(temp, bld, i),
994                            offset(op[0], bld, instr->src[0].swizzle[i]));
995          } else {
996             bld.MOV(offset(temp, bld, i),
997                            offset(op[i], bld, instr->src[i].swizzle[0]));
998          }
999       }
1000 
1001       /* In this case the source and destination registers were the same,
1002        * so we need to insert an extra set of moves in order to deal with
1003        * any swizzling.
1004        */
1005       if (need_extra_copy) {
1006          for (unsigned i = 0; i < last_bit; i++) {
1007             if (!(write_mask & (1 << i)))
1008                continue;
1009 
1010             bld.MOV(offset(result, bld, i), offset(temp, bld, i));
1011          }
1012       }
1013       return;
1014    }
1015 
1016    case nir_op_i2f32:
1017    case nir_op_u2f32:
1018       if (optimize_extract_to_float(ntb, instr, result))
1019          return;
1020       inst = bld.MOV(result, op[0]);
1021       break;
1022 
1023    case nir_op_f2f16_rtne:
1024    case nir_op_f2f16_rtz:
1025    case nir_op_f2f16: {
1026       elk_rnd_mode rnd = ELK_RND_MODE_UNSPECIFIED;
1027 
1028       if (nir_op_f2f16 == instr->op)
1029          rnd = elk_rnd_mode_from_execution_mode(execution_mode);
1030       else
1031          rnd = elk_rnd_mode_from_nir_op(instr->op);
1032 
1033       if (ELK_RND_MODE_UNSPECIFIED != rnd)
1034          bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), elk_imm_d(rnd));
1035 
1036       assert(type_sz(op[0].type) < 8); /* elk_nir_lower_conversions */
1037       inst = bld.F32TO16(result, op[0]);
1038       break;
1039    }
1040 
1041    case nir_op_b2i8:
1042    case nir_op_b2i16:
1043    case nir_op_b2i32:
1044    case nir_op_b2i64:
1045    case nir_op_b2f16:
1046    case nir_op_b2f32:
1047    case nir_op_b2f64:
1048       if (try_emit_b2fi_of_inot(ntb, bld, result, instr))
1049          break;
1050       op[0].type = ELK_REGISTER_TYPE_D;
1051       op[0].negate = !op[0].negate;
1052       FALLTHROUGH;
1053    case nir_op_i2f64:
1054    case nir_op_i2i64:
1055    case nir_op_u2f64:
1056    case nir_op_u2u64:
1057    case nir_op_f2f64:
1058    case nir_op_f2i64:
1059    case nir_op_f2u64:
1060    case nir_op_i2i32:
1061    case nir_op_u2u32:
1062    case nir_op_f2i32:
1063    case nir_op_f2u32:
1064    case nir_op_i2f16:
1065    case nir_op_u2f16:
1066    case nir_op_f2i16:
1067    case nir_op_f2u16:
1068    case nir_op_f2i8:
1069    case nir_op_f2u8:
1070       if (result.type == ELK_REGISTER_TYPE_B ||
1071           result.type == ELK_REGISTER_TYPE_UB ||
1072           result.type == ELK_REGISTER_TYPE_HF)
1073          assert(type_sz(op[0].type) < 8); /* elk_nir_lower_conversions */
1074 
1075       if (op[0].type == ELK_REGISTER_TYPE_B ||
1076           op[0].type == ELK_REGISTER_TYPE_UB ||
1077           op[0].type == ELK_REGISTER_TYPE_HF)
1078          assert(type_sz(result.type) < 8); /* elk_nir_lower_conversions */
1079 
1080       inst = bld.MOV(result, op[0]);
1081       break;
1082 
1083    case nir_op_i2i8:
1084    case nir_op_u2u8:
1085       assert(type_sz(op[0].type) < 8); /* elk_nir_lower_conversions */
1086       FALLTHROUGH;
1087    case nir_op_i2i16:
1088    case nir_op_u2u16: {
1089       /* Emit better code for u2u8(extract_u8(a, b)) and similar patterns.
1090        * Emitting the instructions one by one results in two MOV instructions
1091        * that won't be propagated.  By handling both instructions here, a
1092        * single MOV is emitted.
1093        */
1094       nir_alu_instr *extract_instr = nir_src_as_alu_instr(instr->src[0].src);
1095       if (extract_instr != NULL) {
1096          if (extract_instr->op == nir_op_extract_u8 ||
1097              extract_instr->op == nir_op_extract_i8) {
1098             prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false);
1099 
1100             const unsigned byte = nir_src_as_uint(extract_instr->src[1].src);
1101             const elk_reg_type type =
1102                elk_int_type(1, extract_instr->op == nir_op_extract_i8);
1103 
1104             op[0] = subscript(op[0], type, byte);
1105          } else if (extract_instr->op == nir_op_extract_u16 ||
1106                     extract_instr->op == nir_op_extract_i16) {
1107             prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false);
1108 
1109             const unsigned word = nir_src_as_uint(extract_instr->src[1].src);
1110             const elk_reg_type type =
1111                elk_int_type(2, extract_instr->op == nir_op_extract_i16);
1112 
1113             op[0] = subscript(op[0], type, word);
1114          }
1115       }
1116 
1117       inst = bld.MOV(result, op[0]);
1118       break;
1119    }
1120 
1121    case nir_op_fsat:
1122       inst = bld.MOV(result, op[0]);
1123       inst->saturate = true;
1124       break;
1125 
1126    case nir_op_fneg:
1127    case nir_op_ineg:
1128       op[0].negate = true;
1129       inst = bld.MOV(result, op[0]);
1130       break;
1131 
1132    case nir_op_fabs:
1133    case nir_op_iabs:
1134       op[0].negate = false;
1135       op[0].abs = true;
1136       inst = bld.MOV(result, op[0]);
1137       break;
1138 
1139    case nir_op_f2f32:
1140       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1141          elk_rnd_mode rnd =
1142             elk_rnd_mode_from_execution_mode(execution_mode);
1143          bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1144                              elk_imm_d(rnd));
1145       }
1146 
1147       if (op[0].type == ELK_REGISTER_TYPE_HF)
1148          assert(type_sz(result.type) < 8); /* elk_nir_lower_conversions */
1149 
1150       inst = bld.MOV(result, op[0]);
1151       break;
1152 
1153    case nir_op_fsign:
1154       emit_fsign(ntb, bld, instr, result, op, 0);
1155       break;
1156 
1157    case nir_op_frcp:
1158       inst = bld.emit(ELK_SHADER_OPCODE_RCP, result, op[0]);
1159       break;
1160 
1161    case nir_op_fexp2:
1162       inst = bld.emit(ELK_SHADER_OPCODE_EXP2, result, op[0]);
1163       break;
1164 
1165    case nir_op_flog2:
1166       inst = bld.emit(ELK_SHADER_OPCODE_LOG2, result, op[0]);
1167       break;
1168 
1169    case nir_op_fsin:
1170       inst = bld.emit(ELK_SHADER_OPCODE_SIN, result, op[0]);
1171       break;
1172 
1173    case nir_op_fcos:
1174       inst = bld.emit(ELK_SHADER_OPCODE_COS, result, op[0]);
1175       break;
1176 
1177    case nir_op_fadd:
1178       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1179          elk_rnd_mode rnd =
1180             elk_rnd_mode_from_execution_mode(execution_mode);
1181          bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1182                              elk_imm_d(rnd));
1183       }
1184       FALLTHROUGH;
1185    case nir_op_iadd:
1186       inst = bld.ADD(result, op[0], op[1]);
1187       break;
1188 
1189    case nir_op_iadd_sat:
1190    case nir_op_uadd_sat:
1191       inst = bld.ADD(result, op[0], op[1]);
1192       inst->saturate = true;
1193       break;
1194 
1195    case nir_op_isub_sat:
1196       bld.emit(ELK_SHADER_OPCODE_ISUB_SAT, result, op[0], op[1]);
1197       break;
1198 
1199    case nir_op_usub_sat:
1200       bld.emit(ELK_SHADER_OPCODE_USUB_SAT, result, op[0], op[1]);
1201       break;
1202 
1203    case nir_op_irhadd:
1204    case nir_op_urhadd:
1205       assert(instr->def.bit_size < 64);
1206       inst = bld.AVG(result, op[0], op[1]);
1207       break;
1208 
1209    case nir_op_ihadd:
1210    case nir_op_uhadd: {
1211       assert(instr->def.bit_size < 64);
1212       elk_fs_reg tmp = bld.vgrf(result.type);
1213 
1214       if (devinfo->ver >= 8) {
1215          op[0] = resolve_source_modifiers(bld, op[0]);
1216          op[1] = resolve_source_modifiers(bld, op[1]);
1217       }
1218 
1219       /* AVG(x, y) - ((x ^ y) & 1) */
1220       bld.XOR(tmp, op[0], op[1]);
1221       bld.AND(tmp, tmp, retype(elk_imm_ud(1), result.type));
1222       bld.AVG(result, op[0], op[1]);
1223       inst = bld.ADD(result, result, tmp);
1224       inst->src[1].negate = true;
1225       break;
1226    }
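   /* Worked example of the trick above (illustrative, assuming the hardware
    * AVG is the rounding-up average): for x = 5, y = 8, AVG(5, 8) = 7 and
    * (x ^ y) & 1 = 1, so the final ADD yields 7 - 1 = 6 = (5 + 8) >> 1
    * without ever forming the possibly overflowing sum x + y.
    */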
1227 
1228    case nir_op_fmul:
1229       for (unsigned i = 0; i < 2; i++) {
1230          if (can_fuse_fmul_fsign(instr, i)) {
1231             emit_fsign(ntb, bld, instr, result, op, i);
1232             return;
1233          }
1234       }
1235 
1236       /* We emit the rounding mode after the previous fsign optimization since
1237        * it won't result in a MUL, but will try to negate the value by other
1238        * means.
1239        */
1240       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1241          elk_rnd_mode rnd =
1242             elk_rnd_mode_from_execution_mode(execution_mode);
1243          bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1244                              elk_imm_d(rnd));
1245       }
1246 
1247       inst = bld.MUL(result, op[0], op[1]);
1248       break;
1249 
1250    case nir_op_imul_2x32_64:
1251    case nir_op_umul_2x32_64:
1252       bld.MUL(result, op[0], op[1]);
1253       break;
1254 
1255    case nir_op_imul_32x16:
1256    case nir_op_umul_32x16: {
1257       const bool ud = instr->op == nir_op_umul_32x16;
1258       const enum elk_reg_type word_type =
1259          ud ? ELK_REGISTER_TYPE_UW : ELK_REGISTER_TYPE_W;
1260       const enum elk_reg_type dword_type =
1261          ud ? ELK_REGISTER_TYPE_UD : ELK_REGISTER_TYPE_D;
1262 
1263       assert(instr->def.bit_size == 32);
1264 
1265       /* Before copy propagation there are no immediate values. */
1266       assert(op[0].file != IMM && op[1].file != IMM);
1267 
1268       op[1] = subscript(op[1], word_type, 0);
1269 
1270       if (devinfo->ver >= 7)
1271          bld.MUL(result, retype(op[0], dword_type), op[1]);
1272       else
1273          bld.MUL(result, op[1], retype(op[0], dword_type));
1274 
1275       break;
1276    }
1277 
1278    case nir_op_imul:
1279       assert(instr->def.bit_size < 64);
1280       bld.MUL(result, op[0], op[1]);
1281       break;
1282 
1283    case nir_op_imul_high:
1284    case nir_op_umul_high:
1285       assert(instr->def.bit_size < 64);
1286       if (instr->def.bit_size == 32) {
1287          bld.emit(ELK_SHADER_OPCODE_MULH, result, op[0], op[1]);
1288       } else {
1289          elk_fs_reg tmp = bld.vgrf(elk_reg_type_from_bit_size(32, op[0].type));
1290          bld.MUL(tmp, op[0], op[1]);
1291          bld.MOV(result, subscript(tmp, result.type, 1));
1292       }
1293       break;
1294 
1295    case nir_op_idiv:
1296    case nir_op_udiv:
1297       assert(instr->def.bit_size < 64);
1298       bld.emit(ELK_SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
1299       break;
1300 
1301    case nir_op_uadd_carry:
1302       unreachable("Should have been lowered by carry_to_arith().");
1303 
1304    case nir_op_usub_borrow:
1305       unreachable("Should have been lowered by borrow_to_arith().");
1306 
1307    case nir_op_umod:
1308    case nir_op_irem:
1309       /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
1310        * appears that our hardware just does the right thing for signed
1311        * remainder.
1312        */
1313       assert(instr->def.bit_size < 64);
1314       bld.emit(ELK_SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1315       break;
1316 
1317    case nir_op_imod: {
1318       /* Get a regular C-style remainder.  If a % b == 0, set the predicate. */
1319       bld.emit(ELK_SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1320 
1321       /* Math instructions don't support conditional mod */
1322       inst = bld.MOV(bld.null_reg_d(), result);
1323       inst->conditional_mod = ELK_CONDITIONAL_NZ;
1324 
1325       /* Now, we need to determine if signs of the sources are different.
1326        * When we XOR the sources, the top bit is 0 if they are the same and 1
1327        * if they are different.  We can then use a conditional modifier to
1328        * turn that into a predicate.  This leads us to an XOR.l instruction.
1329        *
1330        * Technically, according to the PRM, you're not allowed to use .l on a
1331        * XOR instruction.  However, empirical experiments and Curro's reading
1332        * of the simulator source both indicate that it's safe.
1333        */
1334       elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_D);
1335       inst = bld.XOR(tmp, op[0], op[1]);
1336       inst->predicate = ELK_PREDICATE_NORMAL;
1337       inst->conditional_mod = ELK_CONDITIONAL_L;
1338 
1339       /* If the result of the initial remainder operation is non-zero and the
1340        * two sources have different signs, add in a copy of op[1] to get the
1341        * final integer modulus value.
1342        */
1343       inst = bld.ADD(result, result, op[1]);
1344       inst->predicate = ELK_PREDICATE_NORMAL;
1345       break;
1346    }
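   /* Worked example (illustrative): imod(-7, 3).  INT_REMAINDER gives the
    * C-style remainder -1; it is non-zero and the source signs differ (the
    * XOR is negative), so the predicated ADD folds in op[1]: -1 + 3 = 2,
    * matching the sign-of-divisor semantics of imod.
    */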
1347 
1348    case nir_op_flt32:
1349    case nir_op_fge32:
1350    case nir_op_feq32:
1351    case nir_op_fneu32: {
1352       elk_fs_reg dest = result;
1353 
1354       const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1355       if (bit_size != 32) {
1356          dest = bld.vgrf(op[0].type, 1);
1357          bld.UNDEF(dest);
1358       }
1359 
1360       bld.CMP(dest, op[0], op[1], elk_cmod_for_nir_comparison(instr->op));
1361 
1362       if (bit_size > 32) {
1363          bld.MOV(result, subscript(dest, ELK_REGISTER_TYPE_UD, 0));
1364       } else if (bit_size < 32) {
1365          /* When we convert the result to 32-bit we need to be careful and do
1366           * it as a signed conversion to get sign extension (for 32-bit true)
1367           */
1368          const elk_reg_type src_type =
1369             elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_D);
1370 
1371          bld.MOV(retype(result, ELK_REGISTER_TYPE_D), retype(dest, src_type));
1372       }
1373       break;
1374    }
1375 
1376    case nir_op_ilt32:
1377    case nir_op_ult32:
1378    case nir_op_ige32:
1379    case nir_op_uge32:
1380    case nir_op_ieq32:
1381    case nir_op_ine32: {
1382       elk_fs_reg dest = result;
1383 
1384       const uint32_t bit_size = type_sz(op[0].type) * 8;
1385       if (bit_size != 32) {
1386          dest = bld.vgrf(op[0].type, 1);
1387          bld.UNDEF(dest);
1388       }
1389 
1390       bld.CMP(dest, op[0], op[1],
1391               elk_cmod_for_nir_comparison(instr->op));
1392 
1393       if (bit_size > 32) {
1394          bld.MOV(result, subscript(dest, ELK_REGISTER_TYPE_UD, 0));
1395       } else if (bit_size < 32) {
1396          /* When we convert the result to 32-bit we need to be careful and do
1397           * it as a signed conversion to get sign extension (for 32-bit true)
1398           */
1399          const elk_reg_type src_type =
1400             elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_D);
1401 
1402          bld.MOV(retype(result, ELK_REGISTER_TYPE_D), retype(dest, src_type));
1403       }
1404       break;
1405    }
1406 
1407    case nir_op_inot:
1408       if (devinfo->ver >= 8) {
1409          nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src);
1410 
1411          if (inot_src_instr != NULL &&
1412              (inot_src_instr->op == nir_op_ior ||
1413               inot_src_instr->op == nir_op_ixor ||
1414               inot_src_instr->op == nir_op_iand)) {
1415             /* The sources of the source logical instruction are now the
1416              * sources of the instruction that will be generated.
1417              */
1418             prepare_alu_destination_and_sources(ntb, bld, inot_src_instr, op, false);
1419             resolve_inot_sources(ntb, bld, inot_src_instr, op);
1420 
1421             /* Smash all of the sources and destination to be signed.  This
1422              * doesn't matter for the operation of the instruction, but cmod
1423              * propagation fails on unsigned sources with negation (due to
1424              * elk_fs_inst::can_do_cmod returning false).
1425              */
1426             result.type =
1427                elk_type_for_nir_type(devinfo,
1428                                      (nir_alu_type)(nir_type_int |
1429                                                     instr->def.bit_size));
1430             op[0].type =
1431                elk_type_for_nir_type(devinfo,
1432                                      (nir_alu_type)(nir_type_int |
1433                                                     nir_src_bit_size(inot_src_instr->src[0].src)));
1434             op[1].type =
1435                elk_type_for_nir_type(devinfo,
1436                                      (nir_alu_type)(nir_type_int |
1437                                                     nir_src_bit_size(inot_src_instr->src[1].src)));
1438 
1439             /* For XOR, only invert one of the sources.  Arbitrarily choose
1440              * the first source.
1441              */
1442             op[0].negate = !op[0].negate;
1443             if (inot_src_instr->op != nir_op_ixor)
1444                op[1].negate = !op[1].negate;
1445 
1446             switch (inot_src_instr->op) {
1447             case nir_op_ior:
1448                bld.AND(result, op[0], op[1]);
1449                return;
1450 
1451             case nir_op_iand:
1452                bld.OR(result, op[0], op[1]);
1453                return;
1454 
1455             case nir_op_ixor:
1456                bld.XOR(result, op[0], op[1]);
1457                return;
1458 
1459             default:
1460                unreachable("impossible opcode");
1461             }
1462          }
1463          op[0] = resolve_source_modifiers(bld, op[0]);
1464       }
1465       bld.NOT(result, op[0]);
1466       break;
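   /* The folding above is De Morgan's laws expressed with the gfx8+ negate
    * (1's-complement) source modifier instead of explicit NOTs (illustrative):
    *
    *    inot(iand(a, b)) -> OR (~a, ~b)
    *    inot(ior(a, b))  -> AND(~a, ~b)
    *    inot(ixor(a, b)) -> XOR(~a,  b)
    */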
1467    case nir_op_ixor:
1468       if (devinfo->ver >= 8) {
1469          resolve_inot_sources(ntb, bld, instr, op);
1470       }
1471       bld.XOR(result, op[0], op[1]);
1472       break;
1473    case nir_op_ior:
1474       if (devinfo->ver >= 8) {
1475          resolve_inot_sources(ntb, bld, instr, op);
1476       }
1477       bld.OR(result, op[0], op[1]);
1478       break;
1479    case nir_op_iand:
1480       if (devinfo->ver >= 8) {
1481          resolve_inot_sources(ntb, bld, instr, op);
1482       }
1483       bld.AND(result, op[0], op[1]);
1484       break;
1485 
1486    case nir_op_fdot2:
1487    case nir_op_fdot3:
1488    case nir_op_fdot4:
1489    case nir_op_b32all_fequal2:
1490    case nir_op_b32all_iequal2:
1491    case nir_op_b32all_fequal3:
1492    case nir_op_b32all_iequal3:
1493    case nir_op_b32all_fequal4:
1494    case nir_op_b32all_iequal4:
1495    case nir_op_b32any_fnequal2:
1496    case nir_op_b32any_inequal2:
1497    case nir_op_b32any_fnequal3:
1498    case nir_op_b32any_inequal3:
1499    case nir_op_b32any_fnequal4:
1500    case nir_op_b32any_inequal4:
1501       unreachable("Lowered by nir_lower_alu_reductions");
1502 
1503    case nir_op_ldexp:
1504       unreachable("not reached: should be handled by ldexp_to_arith()");
1505 
1506    case nir_op_fsqrt:
1507       inst = bld.emit(ELK_SHADER_OPCODE_SQRT, result, op[0]);
1508       break;
1509 
1510    case nir_op_frsq:
1511       inst = bld.emit(ELK_SHADER_OPCODE_RSQ, result, op[0]);
1512       break;
1513 
1514    case nir_op_ftrunc:
1515       inst = bld.RNDZ(result, op[0]);
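      /* On Gfx4-5 the RND* instructions don't produce the final result by
       * themselves: the R conditional modifier flags channels whose value
       * still needs a +1.0 adjustment, which the predicated ADD below
       * applies.
       */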
1516       if (devinfo->ver < 6) {
1517          set_condmod(ELK_CONDITIONAL_R, inst);
1518          set_predicate(ELK_PREDICATE_NORMAL,
1519                        bld.ADD(result, result, elk_imm_f(1.0f)));
1520          inst = bld.MOV(result, result); /* for potential saturation */
1521       }
1522       break;
1523 
1524    case nir_op_fceil: {
1525       op[0].negate = !op[0].negate;
1526       elk_fs_reg temp = s.vgrf(glsl_float_type());
1527       bld.RNDD(temp, op[0]);
1528       temp.negate = true;
1529       inst = bld.MOV(result, temp);
1530       break;
1531    }
1532    case nir_op_ffloor:
1533       inst = bld.RNDD(result, op[0]);
1534       break;
1535    case nir_op_ffract:
1536       inst = bld.FRC(result, op[0]);
1537       break;
1538    case nir_op_fround_even:
1539       inst = bld.RNDE(result, op[0]);
1540       if (devinfo->ver < 6) {
1541          set_condmod(ELK_CONDITIONAL_R, inst);
1542          set_predicate(ELK_PREDICATE_NORMAL,
1543                        bld.ADD(result, result, elk_imm_f(1.0f)));
1544          inst = bld.MOV(result, result); /* for potential saturation */
1545       }
1546       break;
1547 
1548    case nir_op_fquantize2f16: {
1549       elk_fs_reg tmp16 = bld.vgrf(ELK_REGISTER_TYPE_D);
1550       elk_fs_reg tmp32 = bld.vgrf(ELK_REGISTER_TYPE_F);
1551       elk_fs_reg zero = bld.vgrf(ELK_REGISTER_TYPE_F);
1552 
1553       /* The destination stride must be at least as big as the source stride. */
1554       tmp16 = subscript(tmp16, ELK_REGISTER_TYPE_HF, 0);
1555 
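      /* Values with a magnitude below 2^-14 (the smallest normal half
       * float) would turn into F16 denormals, so instead of the
       * F32->F16->F32 round trip they are replaced with a zero that keeps
       * the sign of the input.
       */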
1556       /* Check for denormal */
1557       elk_fs_reg abs_src0 = op[0];
1558       abs_src0.abs = true;
1559       bld.CMP(bld.null_reg_f(), abs_src0, elk_imm_f(ldexpf(1.0, -14)),
1560               ELK_CONDITIONAL_L);
1561       /* Get the appropriately signed zero */
1562       bld.AND(retype(zero, ELK_REGISTER_TYPE_UD),
1563               retype(op[0], ELK_REGISTER_TYPE_UD),
1564               elk_imm_ud(0x80000000));
1565       /* Do the actual F32 -> F16 -> F32 conversion */
1566       bld.F32TO16(tmp16, op[0]);
1567       bld.F16TO32(tmp32, tmp16);
1568       /* Select that or zero based on normal status */
1569       inst = bld.SEL(result, zero, tmp32);
1570       inst->predicate = ELK_PREDICATE_NORMAL;
1571       break;
1572    }
1573 
1574    case nir_op_imin:
1575    case nir_op_umin:
1576    case nir_op_fmin:
1577       inst = bld.emit_minmax(result, op[0], op[1], ELK_CONDITIONAL_L);
1578       break;
1579 
1580    case nir_op_imax:
1581    case nir_op_umax:
1582    case nir_op_fmax:
1583       inst = bld.emit_minmax(result, op[0], op[1], ELK_CONDITIONAL_GE);
1584       break;
1585 
1586    case nir_op_pack_snorm_2x16:
1587    case nir_op_pack_snorm_4x8:
1588    case nir_op_pack_unorm_2x16:
1589    case nir_op_pack_unorm_4x8:
1590    case nir_op_unpack_snorm_2x16:
1591    case nir_op_unpack_snorm_4x8:
1592    case nir_op_unpack_unorm_2x16:
1593    case nir_op_unpack_unorm_4x8:
1594    case nir_op_unpack_half_2x16:
1595    case nir_op_pack_half_2x16:
1596       unreachable("not reached: should be handled by lower_packing_builtins");
1597 
1598    case nir_op_unpack_half_2x16_split_x:
1599       inst = bld.F16TO32(result, subscript(op[0], ELK_REGISTER_TYPE_HF, 0));
1600       break;
1601 
1602    case nir_op_unpack_half_2x16_split_y:
1603       inst = bld.F16TO32(result, subscript(op[0], ELK_REGISTER_TYPE_HF, 1));
1604       break;
1605 
1606    case nir_op_pack_64_2x32_split:
1607    case nir_op_pack_32_2x16_split:
1608       bld.emit(ELK_FS_OPCODE_PACK, result, op[0], op[1]);
1609       break;
1610 
1611    case nir_op_pack_32_4x8_split:
1612       bld.emit(ELK_FS_OPCODE_PACK, result, op, 4);
1613       break;
1614 
1615    case nir_op_unpack_64_2x32_split_x:
1616    case nir_op_unpack_64_2x32_split_y: {
1617       if (instr->op == nir_op_unpack_64_2x32_split_x)
1618          bld.MOV(result, subscript(op[0], ELK_REGISTER_TYPE_UD, 0));
1619       else
1620          bld.MOV(result, subscript(op[0], ELK_REGISTER_TYPE_UD, 1));
1621       break;
1622    }
1623 
1624    case nir_op_unpack_32_2x16_split_x:
1625    case nir_op_unpack_32_2x16_split_y: {
1626       if (instr->op == nir_op_unpack_32_2x16_split_x)
1627          bld.MOV(result, subscript(op[0], ELK_REGISTER_TYPE_UW, 0));
1628       else
1629          bld.MOV(result, subscript(op[0], ELK_REGISTER_TYPE_UW, 1));
1630       break;
1631    }
1632 
1633    case nir_op_fpow:
1634       inst = bld.emit(ELK_SHADER_OPCODE_POW, result, op[0], op[1]);
1635       break;
1636 
1637    case nir_op_bitfield_reverse:
1638       assert(instr->def.bit_size == 32);
1639       assert(nir_src_bit_size(instr->src[0].src) == 32);
1640       bld.BFREV(result, op[0]);
1641       break;
1642 
1643    case nir_op_bit_count:
1644       assert(instr->def.bit_size == 32);
1645       assert(nir_src_bit_size(instr->src[0].src) < 64);
1646       bld.CBIT(result, op[0]);
1647       break;
1648 
1649    case nir_op_uclz:
1650       assert(instr->def.bit_size == 32);
1651       assert(nir_src_bit_size(instr->src[0].src) == 32);
1652       bld.LZD(retype(result, ELK_REGISTER_TYPE_UD), op[0]);
1653       break;
1654 
1655    case nir_op_ifind_msb: {
1656       assert(instr->def.bit_size == 32);
1657       assert(nir_src_bit_size(instr->src[0].src) == 32);
1658       assert(devinfo->ver >= 7);
1659 
1660       bld.FBH(retype(result, ELK_REGISTER_TYPE_UD), op[0]);
1661 
1662       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1663        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1664        * subtract the result from 31 to convert the MSB count into an LSB
1665        * count.
1666        */
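      /* For example, src0 == 0x00000100 makes FBH return 23 (counted from
       * bit 31), and 31 - 23 == 8 is the LSB-counted position findMSB()
       * expects.
       */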
1667       bld.CMP(bld.null_reg_d(), result, elk_imm_d(-1), ELK_CONDITIONAL_NZ);
1668 
1669       inst = bld.ADD(result, result, elk_imm_d(31));
1670       inst->predicate = ELK_PREDICATE_NORMAL;
1671       inst->src[0].negate = true;
1672       break;
1673    }
1674 
1675    case nir_op_find_lsb:
1676       assert(instr->def.bit_size == 32);
1677       assert(nir_src_bit_size(instr->src[0].src) == 32);
1678       assert(devinfo->ver >= 7);
1679       bld.FBL(result, op[0]);
1680       break;
1681 
1682    case nir_op_ubitfield_extract:
1683    case nir_op_ibitfield_extract:
1684       unreachable("should have been lowered");
1685    case nir_op_ubfe:
1686    case nir_op_ibfe:
1687       assert(instr->def.bit_size < 64);
1688       bld.BFE(result, op[2], op[1], op[0]);
1689       break;
1690    case nir_op_bfm:
1691       assert(instr->def.bit_size < 64);
1692       bld.BFI1(result, op[0], op[1]);
1693       break;
1694    case nir_op_bfi:
1695       assert(instr->def.bit_size < 64);
1696 
1697       /* bfi is ((...) | (~src0 & src2)). The second part is zero when src2 is
1698        * either 0 or src0. Replacing the 0 with another value can eliminate a
1699        * temporary register.
1700        */
1701       if (is_const_zero(instr->src[2].src))
1702          bld.BFI2(result, op[0], op[1], op[0]);
1703       else
1704          bld.BFI2(result, op[0], op[1], op[2]);
1705 
1706       break;
1707 
1708    case nir_op_bitfield_insert:
1709       unreachable("not reached: should have been lowered");
1710 
1711    /* With regards to implicit masking of the shift counts for 8- and 16-bit
1712     * types, the PRMs are **incorrect**. They falsely state that on Gen9+ only
1713     * the low bits of src1 matching the size of src0 (e.g., 4-bits for W or UW
1714     * src0) are used. The Bspec (backed by data from experimentation) states
1715     * that 0x3f is used for Q and UQ types, and 0x1f is used for **all** other
1716     * types.
1717     *
1718     * To match the behavior expected for the NIR opcodes, explicit masks for
1719     * 8- and 16-bit types must be added.
1720     */
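   /* For example, a 16-bit shift masks the count with 0xf (bit_size - 1)
    * below, so the count is taken modulo 16 as the NIR opcodes expect.
    */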
1721    case nir_op_ishl:
1722       if (instr->def.bit_size < 32) {
1723          bld.AND(result, op[1], elk_imm_ud(instr->def.bit_size - 1));
1724          bld.SHL(result, op[0], result);
1725       } else {
1726          bld.SHL(result, op[0], op[1]);
1727       }
1728 
1729       break;
1730    case nir_op_ishr:
1731       if (instr->def.bit_size < 32) {
1732          bld.AND(result, op[1], elk_imm_ud(instr->def.bit_size - 1));
1733          bld.ASR(result, op[0], result);
1734       } else {
1735          bld.ASR(result, op[0], op[1]);
1736       }
1737 
1738       break;
1739    case nir_op_ushr:
1740       if (instr->def.bit_size < 32) {
1741          bld.AND(result, op[1], elk_imm_ud(instr->def.bit_size - 1));
1742          bld.SHR(result, op[0], result);
1743       } else {
1744          bld.SHR(result, op[0], op[1]);
1745       }
1746 
1747       break;
1748 
1749    case nir_op_pack_half_2x16_split:
1750       bld.emit(ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
1751       break;
1752 
1753    case nir_op_ffma:
1754       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1755          elk_rnd_mode rnd =
1756             elk_rnd_mode_from_execution_mode(execution_mode);
1757          bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1758                              elk_imm_d(rnd));
1759       }
1760 
1761       inst = bld.MAD(result, op[2], op[1], op[0]);
1762       break;
1763 
1764    case nir_op_flrp:
1765       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1766          elk_rnd_mode rnd =
1767             elk_rnd_mode_from_execution_mode(execution_mode);
1768          bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1769                              elk_imm_d(rnd));
1770       }
1771 
1772       inst = bld.LRP(result, op[0], op[1], op[2]);
1773       break;
1774 
1775    case nir_op_b32csel:
1776       if (optimize_frontfacing_ternary(ntb, instr, result))
1777          return;
1778 
1779       bld.CMP(bld.null_reg_d(), op[0], elk_imm_d(0), ELK_CONDITIONAL_NZ);
1780       inst = bld.SEL(result, op[1], op[2]);
1781       inst->predicate = ELK_PREDICATE_NORMAL;
1782       break;
1783 
1784    case nir_op_extract_u8:
1785    case nir_op_extract_i8: {
1786       unsigned byte = nir_src_as_uint(instr->src[1].src);
1787 
1788       /* The PRMs say:
1789        *
1790        *    BDW+
1791        *    There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
1792        *    Use two instructions and a word or DWord intermediate integer type.
1793        */
1794       if (instr->def.bit_size == 64) {
1795          const elk_reg_type type = elk_int_type(1, instr->op == nir_op_extract_i8);
1796 
1797          if (instr->op == nir_op_extract_i8) {
1798             /* If we need to sign extend, extract to a word first */
1799             elk_fs_reg w_temp = bld.vgrf(ELK_REGISTER_TYPE_W);
1800             bld.MOV(w_temp, subscript(op[0], type, byte));
1801             bld.MOV(result, w_temp);
1802          } else if (byte & 1) {
1803             /* Extract the high byte from the word containing the desired byte
1804              * offset.
1805              */
1806             bld.SHR(result,
1807                     subscript(op[0], ELK_REGISTER_TYPE_UW, byte / 2),
1808                     elk_imm_uw(8));
1809          } else {
1810             /* Otherwise use an AND with 0xff and a word type */
1811             bld.AND(result,
1812                     subscript(op[0], ELK_REGISTER_TYPE_UW, byte / 2),
1813                     elk_imm_uw(0xff));
1814          }
1815       } else {
1816          const elk_reg_type type = elk_int_type(1, instr->op == nir_op_extract_i8);
1817          bld.MOV(result, subscript(op[0], type, byte));
1818       }
1819       break;
1820    }
1821 
1822    case nir_op_extract_u16:
1823    case nir_op_extract_i16: {
1824       const elk_reg_type type = elk_int_type(2, instr->op == nir_op_extract_i16);
1825       unsigned word = nir_src_as_uint(instr->src[1].src);
1826       bld.MOV(result, subscript(op[0], type, word));
1827       break;
1828    }
1829 
1830    default:
1831       unreachable("unhandled instruction");
1832    }
1833 
1834    /* If we need to do a boolean resolve, replace the result with -(x & 1)
1835     * to sign extend the low bit to 0/~0
1836     */
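   /* e.g. a boolean whose low bit is 1 becomes -(1) == ~0, and one whose
    * low bit is 0 becomes 0, regardless of the upper bits.
    */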
1837    if (devinfo->ver <= 5 &&
1838        !result.is_null() &&
1839        (instr->instr.pass_flags & ELK_NIR_BOOLEAN_MASK) == ELK_NIR_BOOLEAN_NEEDS_RESOLVE) {
1840       elk_fs_reg masked = s.vgrf(glsl_int_type());
1841       bld.AND(masked, result, elk_imm_d(1));
1842       masked.negate = true;
1843       bld.MOV(retype(result, ELK_REGISTER_TYPE_D), masked);
1844    }
1845 }
1846 
1847 static void
1848 fs_nir_emit_load_const(nir_to_elk_state &ntb,
1849                        nir_load_const_instr *instr)
1850 {
1851    const intel_device_info *devinfo = ntb.devinfo;
1852    const fs_builder &bld = ntb.bld;
1853 
1854    const elk_reg_type reg_type =
1855       elk_reg_type_from_bit_size(instr->def.bit_size, ELK_REGISTER_TYPE_D);
1856    elk_fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
1857 
1858    switch (instr->def.bit_size) {
1859    case 8:
1860       for (unsigned i = 0; i < instr->def.num_components; i++)
1861          bld.MOV(offset(reg, bld, i), elk_setup_imm_b(bld, instr->value[i].i8));
1862       break;
1863 
1864    case 16:
1865       for (unsigned i = 0; i < instr->def.num_components; i++)
1866          bld.MOV(offset(reg, bld, i), elk_imm_w(instr->value[i].i16));
1867       break;
1868 
1869    case 32:
1870       for (unsigned i = 0; i < instr->def.num_components; i++)
1871          bld.MOV(offset(reg, bld, i), elk_imm_d(instr->value[i].i32));
1872       break;
1873 
1874    case 64:
1875       assert(devinfo->ver >= 7);
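      /* Without native 64-bit integer support, each constant is emitted as
       * a DF-typed MOV carrying the same 64-bit pattern; otherwise a plain
       * 64-bit integer immediate MOV is used.
       */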
1876       if (!devinfo->has_64bit_int) {
1877          for (unsigned i = 0; i < instr->def.num_components; i++) {
1878             bld.MOV(retype(offset(reg, bld, i), ELK_REGISTER_TYPE_DF),
1879                     elk_setup_imm_df(bld, instr->value[i].f64));
1880          }
1881       } else {
1882          for (unsigned i = 0; i < instr->def.num_components; i++)
1883             bld.MOV(offset(reg, bld, i), elk_imm_q(instr->value[i].i64));
1884       }
1885       break;
1886 
1887    default:
1888       unreachable("Invalid bit size");
1889    }
1890 
1891    ntb.ssa_values[instr->def.index] = reg;
1892 }
1893 
1894 static bool
1895 get_nir_src_bindless(nir_to_elk_state &ntb, const nir_src &src)
1896 {
1897    return ntb.ssa_bind_infos[src.ssa->index].bindless;
1898 }
1899 
1900 static bool
1901 is_resource_src(nir_src src)
1902 {
1903    return src.ssa->parent_instr->type == nir_instr_type_intrinsic &&
1904           nir_instr_as_intrinsic(src.ssa->parent_instr)->intrinsic == nir_intrinsic_resource_intel;
1905 }
1906 
1907 static elk_fs_reg
1908 get_resource_nir_src(nir_to_elk_state &ntb, const nir_src &src)
1909 {
1910    if (!is_resource_src(src))
1911       return elk_fs_reg();
1912    return ntb.resource_values[src.ssa->index];
1913 }
1914 
1915 static elk_fs_reg
1916 get_nir_src(nir_to_elk_state &ntb, const nir_src &src)
1917 {
1918    const intel_device_info *devinfo = ntb.devinfo;
1919 
1920    nir_intrinsic_instr *load_reg = nir_load_reg_for_def(src.ssa);
1921 
1922    elk_fs_reg reg;
1923    if (!load_reg) {
1924       if (nir_src_is_undef(src)) {
1925          const elk_reg_type reg_type =
1926             elk_reg_type_from_bit_size(src.ssa->bit_size,
1927                                        ELK_REGISTER_TYPE_D);
1928          reg = ntb.bld.vgrf(reg_type, src.ssa->num_components);
1929       } else {
1930          reg = ntb.ssa_values[src.ssa->index];
1931       }
1932    } else {
1933       nir_intrinsic_instr *decl_reg = nir_reg_get_decl(load_reg->src[0].ssa);
1934       /* We don't handle indirects on locals */
1935       assert(nir_intrinsic_base(load_reg) == 0);
1936       assert(load_reg->intrinsic != nir_intrinsic_load_reg_indirect);
1937       reg = ntb.ssa_values[decl_reg->def.index];
1938    }
1939 
1940    if (nir_src_bit_size(src) == 64 && devinfo->ver == 7) {
1941       /* The only 64-bit type available on gfx7 is DF, so use that. */
1942       reg.type = ELK_REGISTER_TYPE_DF;
1943    } else {
1944       /* To avoid floating-point denorm flushing problems, set the type by
1945        * default to an integer type - instructions that need floating point
1946        * semantics will set this to F if they need to
1947        */
1948       reg.type = elk_reg_type_from_bit_size(nir_src_bit_size(src),
1949                                             ELK_REGISTER_TYPE_D);
1950    }
1951 
1952    return reg;
1953 }
1954 
1955 /**
1956  * Return an IMM for constants; otherwise call get_nir_src() as normal.
1957  *
1958  * This function should not be called on any value which may be 64 bits.
1959  * We could theoretically support 64-bit on gfx8+ but we choose not to
1960  * because it wouldn't work in general (no gfx7 support) and there are
1961  * enough restrictions in 64-bit immediates that you can't take the return
1962  * value and treat it the same as the result of get_nir_src().
1963  */
1964 static elk_fs_reg
1965 get_nir_src_imm(nir_to_elk_state &ntb, const nir_src &src)
1966 {
1967    assert(nir_src_bit_size(src) == 32);
1968    return nir_src_is_const(src) ?
1969           elk_fs_reg(elk_imm_d(nir_src_as_int(src))) : get_nir_src(ntb, src);
1970 }
1971 
1972 static elk_fs_reg
1973 get_nir_def(nir_to_elk_state &ntb, const nir_def &def)
1974 {
1975    const fs_builder &bld = ntb.bld;
1976 
1977    nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
1978    if (!store_reg) {
1979       const elk_reg_type reg_type =
1980          elk_reg_type_from_bit_size(def.bit_size,
1981                                     def.bit_size == 8 ?
1982                                     ELK_REGISTER_TYPE_D :
1983                                     ELK_REGISTER_TYPE_F);
1984       ntb.ssa_values[def.index] =
1985          bld.vgrf(reg_type, def.num_components);
1986       bld.UNDEF(ntb.ssa_values[def.index]);
1987       return ntb.ssa_values[def.index];
1988    } else {
1989       nir_intrinsic_instr *decl_reg =
1990          nir_reg_get_decl(store_reg->src[1].ssa);
1991       /* We don't handle indirects on locals */
1992       assert(nir_intrinsic_base(store_reg) == 0);
1993       assert(store_reg->intrinsic != nir_intrinsic_store_reg_indirect);
1994       return ntb.ssa_values[decl_reg->def.index];
1995    }
1996 }
1997 
1998 static nir_component_mask_t
1999 get_nir_write_mask(const nir_def &def)
2000 {
2001    nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
2002    if (!store_reg) {
2003       return nir_component_mask(def.num_components);
2004    } else {
2005       return nir_intrinsic_write_mask(store_reg);
2006    }
2007 }
2008 
2009 static elk_fs_inst *
2010 emit_pixel_interpolater_send(const fs_builder &bld,
2011                              enum elk_opcode opcode,
2012                              const elk_fs_reg &dst,
2013                              const elk_fs_reg &src,
2014                              const elk_fs_reg &desc,
2015                              const elk_fs_reg &flag_reg,
2016                              glsl_interp_mode interpolation)
2017 {
2018    struct elk_wm_prog_data *wm_prog_data =
2019       elk_wm_prog_data(bld.shader->stage_prog_data);
2020 
2021    elk_fs_reg srcs[INTERP_NUM_SRCS];
2022    srcs[INTERP_SRC_OFFSET]       = src;
2023    srcs[INTERP_SRC_MSG_DESC]     = desc;
2024    srcs[INTERP_SRC_DYNAMIC_MODE] = flag_reg;
2025 
2026    elk_fs_inst *inst = bld.emit(opcode, dst, srcs, INTERP_NUM_SRCS);
2027    /* 2 floats per slot returned */
2028    inst->size_written = 2 * dst.component_size(inst->exec_size);
2029    if (interpolation == INTERP_MODE_NOPERSPECTIVE) {
2030       inst->pi_noperspective = true;
2031       /* TGL BSpec says:
2032        *     This field cannot be set to "Linear Interpolation"
2033        *     unless Non-Perspective Barycentric Enable in 3DSTATE_CLIP is enabled"
2034        */
2035       wm_prog_data->uses_nonperspective_interp_modes = true;
2036    }
2037 
2038    wm_prog_data->pulls_bary = true;
2039 
2040    return inst;
2041 }
2042 
2043 /**
2044  * Computes 1 << x, given a D/UD register containing some value x.
2045  */
2046 static elk_fs_reg
2047 intexp2(const fs_builder &bld, const elk_fs_reg &x)
2048 {
2049    assert(x.type == ELK_REGISTER_TYPE_UD || x.type == ELK_REGISTER_TYPE_D);
2050 
2051    elk_fs_reg result = bld.vgrf(x.type, 1);
2052    elk_fs_reg one = bld.vgrf(x.type, 1);
2053 
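   /* The constant 1 has to live in a register: it is src0 of the SHL below,
    * and the EU only accepts an immediate as the last source of an
    * instruction.
    */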
2054    bld.MOV(one, retype(elk_imm_d(1), one.type));
2055    bld.SHL(result, one, x);
2056    return result;
2057 }
2058 
2059 static void
2060 emit_gs_end_primitive(nir_to_elk_state &ntb, const nir_src &vertex_count_nir_src)
2061 {
2062    elk_fs_visitor &s = ntb.s;
2063    assert(s.stage == MESA_SHADER_GEOMETRY);
2064 
2065    struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(s.prog_data);
2066 
2067    if (s.gs_compile->control_data_header_size_bits == 0)
2068       return;
2069 
2070    /* We can only do EndPrimitive() functionality when the control data
2071     * consists of cut bits.  Fortunately, the only time it isn't is when the
2072     * output type is points, in which case EndPrimitive() is a no-op.
2073     */
2074    if (gs_prog_data->control_data_format !=
2075        GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
2076       return;
2077    }
2078 
2079    /* Cut bits use one bit per vertex. */
2080    assert(s.gs_compile->control_data_bits_per_vertex == 1);
2081 
2082    elk_fs_reg vertex_count = get_nir_src(ntb, vertex_count_nir_src);
2083    vertex_count.type = ELK_REGISTER_TYPE_UD;
2084 
2085    /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
2086     * vertex n, 0 otherwise.  So all we need to do here is mark bit
2087     * (vertex_count - 1) % 32 in the cut_bits register to indicate that
2088     * EndPrimitive() was called after emitting vertex (vertex_count - 1);
2089     * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
2090     *
2091     * Note that if EndPrimitive() is called before emitting any vertices, this
2092     * will cause us to set bit 31 of the control_data_bits register to 1.
2093     * That's fine because:
2094     *
2095     * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
2096     *   output, so the hardware will ignore cut bit 31.
2097     *
2098     * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
2099     *   last vertex, so setting cut bit 31 has no effect (since the primitive
2100     *   is automatically ended when the GS terminates).
2101     *
2102     * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
2103     *   control_data_bits register to 0 when the first vertex is emitted.
2104     */
2105 
2106    const fs_builder abld = ntb.bld.annotate("end primitive");
2107 
2108    /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
2109    elk_fs_reg prev_count = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
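   /* Adding 0xffffffff wraps around to vertex_count - 1 without needing a
    * subtract.
    */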
2110    abld.ADD(prev_count, vertex_count, elk_imm_ud(0xffffffffu));
2111    elk_fs_reg mask = intexp2(abld, prev_count);
2112    /* Note: we're relying on the fact that the GEN SHL instruction only pays
2113     * attention to the lower 5 bits of its second source argument, so on this
2114     * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
2115     * ((vertex_count - 1) % 32).
2116     */
2117    abld.OR(s.control_data_bits, s.control_data_bits, mask);
2118 }
2119 
2120 void
2121 elk_fs_visitor::emit_gs_control_data_bits(const elk_fs_reg &vertex_count)
2122 {
2123    assert(stage == MESA_SHADER_GEOMETRY);
2124    assert(gs_compile->control_data_bits_per_vertex != 0);
2125 
2126    struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(prog_data);
2127 
2128    const fs_builder bld = fs_builder(this).at_end();
2129    const fs_builder abld = bld.annotate("emit control data bits");
2130    const fs_builder fwa_bld = bld.exec_all();
2131 
2132    /* We use a single UD register to accumulate control data bits (32 bits
2133     * for each of the SIMD8 channels).  So we need to write a DWord (32 bits)
2134     * at a time.
2135     *
2136     * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
2137     * We have to select a 128-bit group via the Global and Per-Slot Offsets, then
2138     * use the Channel Mask phase to enable/disable which DWord within that
2139     * group to write.  (Remember, different SIMD8 channels may have emitted
2140     * different numbers of vertices, so we may need per-slot offsets.)
2141     *
2142     * Channel masking presents an annoying problem: we may have to replicate
2143     * the data up to 4 times:
2144     *
2145     * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
2146     *
2147     * To avoid penalizing shaders that emit a small number of vertices, we
2148     * can avoid these sometimes: if the size of the control data header is
2149     * <= 128 bits, then there is only 1 OWord.  All SIMD8 channels will
2150     * land in the same 128-bit group, so we can skip per-slot offsets.
2151     *
2152     * Similarly, if the control data header is <= 32 bits, there is only one
2153     * DWord, so we can skip channel masks.
2154     */
2155    elk_fs_reg channel_mask, per_slot_offset;
2156 
2157    if (gs_compile->control_data_header_size_bits > 32)
2158       channel_mask = vgrf(glsl_uint_type());
2159 
2160    if (gs_compile->control_data_header_size_bits > 128)
2161       per_slot_offset = vgrf(glsl_uint_type());
2162 
2163    /* Figure out which DWord we're trying to write to using the formula:
2164     *
2165     *    dword_index = (vertex_count - 1) * bits_per_vertex / 32
2166     *
2167     * Since bits_per_vertex is a power of two, and is known at compile
2168     * time, this can be optimized to:
2169     *
2170     *    dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
2171     */
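   /* For example, with 2 control data bits per vertex, util_last_bit(2) == 2,
    * so the shift below is by 4 and dword_index == (vertex_count - 1) / 16,
    * which matches (vertex_count - 1) * 2 / 32.
    */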
2172    if (channel_mask.file != BAD_FILE || per_slot_offset.file != BAD_FILE) {
2173       elk_fs_reg dword_index = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2174       elk_fs_reg prev_count = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2175       abld.ADD(prev_count, vertex_count, elk_imm_ud(0xffffffffu));
2176       unsigned log2_bits_per_vertex =
2177          util_last_bit(gs_compile->control_data_bits_per_vertex);
2178       abld.SHR(dword_index, prev_count, elk_imm_ud(6u - log2_bits_per_vertex));
2179 
2180       if (per_slot_offset.file != BAD_FILE) {
2181          /* Set the per-slot offset to dword_index / 4, so that we'll write to
2182           * the appropriate OWord within the control data header.
2183           */
2184          abld.SHR(per_slot_offset, dword_index, elk_imm_ud(2u));
2185       }
2186 
2187       /* Set the channel masks to 1 << (dword_index % 4), so that we'll
2188        * write to the appropriate DWORD within the OWORD.
2189        */
2190       elk_fs_reg channel = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2191       fwa_bld.AND(channel, dword_index, elk_imm_ud(3u));
2192       channel_mask = intexp2(fwa_bld, channel);
2193       /* Then the channel masks need to be in bits 23:16. */
2194       fwa_bld.SHL(channel_mask, channel_mask, elk_imm_ud(16u));
2195    }
2196 
2197    /* If there are channel masks, add 3 extra copies of the data. */
2198    const unsigned length = 1 + 3 * unsigned(channel_mask.file != BAD_FILE);
2199    elk_fs_reg sources[4];
2200 
2201    for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
2202       sources[i] = this->control_data_bits;
2203 
2204    elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2205    srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
2206    srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offset;
2207    srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = channel_mask;
2208    srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_F, length);
2209    srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(length);
2210    abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
2211 
2212    elk_fs_inst *inst = abld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
2213                              srcs, ARRAY_SIZE(srcs));
2214 
2215    /* We need to increment Global Offset by 256-bits to make room for
2216     * Broadwell's extra "Vertex Count" payload at the beginning of the
2217     * URB entry.  Since this is an OWord message, Global Offset is counted
2218     * in 128-bit units, so we must set it to 2.
2219     */
2220    if (gs_prog_data->static_vertex_count == -1)
2221       inst->offset = 2;
2222 }
2223 
2224 static void
2225 set_gs_stream_control_data_bits(nir_to_elk_state &ntb, const elk_fs_reg &vertex_count,
2226                                 unsigned stream_id)
2227 {
2228    elk_fs_visitor &s = ntb.s;
2229 
2230    /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
2231 
2232    /* Note: we are called *before* vertex_count is increased, so the value
2233     * we are passed here equals (vertex_count - 1) in the formula above.
2234     */
2235 
2236    /* Stream mode uses 2 bits per vertex */
2237    assert(s.gs_compile->control_data_bits_per_vertex == 2);
2238 
2239    /* Must be a valid stream */
2240    assert(stream_id < 4); /* MAX_VERTEX_STREAMS */
2241 
2242    /* Control data bits are initialized to 0 so we don't have to set any
2243     * bits when sending vertices to stream 0.
2244     */
2245    if (stream_id == 0)
2246       return;
2247 
2248    const fs_builder abld = ntb.bld.annotate("set stream control data bits", NULL);
2249 
2250    /* reg::sid = stream_id */
2251    elk_fs_reg sid = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2252    abld.MOV(sid, elk_imm_ud(stream_id));
2253 
2254    /* reg:shift_count = 2 * (vertex_count - 1) */
2255    elk_fs_reg shift_count = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2256    abld.SHL(shift_count, vertex_count, elk_imm_ud(1u));
2257 
2258    /* Note: we're relying on the fact that the GEN SHL instruction only pays
2259     * attention to the lower 5 bits of its second source argument, so on this
2260     * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
2261     * stream_id << ((2 * (vertex_count - 1)) % 32).
2262     */
2263    elk_fs_reg mask = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2264    abld.SHL(mask, sid, shift_count);
2265    abld.OR(s.control_data_bits, s.control_data_bits, mask);
2266 }
2267 
2268 static void
2269 emit_gs_vertex(nir_to_elk_state &ntb, const nir_src &vertex_count_nir_src,
2270                unsigned stream_id)
2271 {
2272    elk_fs_visitor &s = ntb.s;
2273 
2274    assert(s.stage == MESA_SHADER_GEOMETRY);
2275 
2276    struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(s.prog_data);
2277 
2278    elk_fs_reg vertex_count = get_nir_src(ntb, vertex_count_nir_src);
2279    vertex_count.type = ELK_REGISTER_TYPE_UD;
2280 
2281    /* Haswell and later hardware ignores the "Render Stream Select" bits
2282     * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
2283     * and instead sends all primitives down the pipeline for rasterization.
2284     * If the SOL stage is enabled, "Render Stream Select" is honored and
2285     * primitives bound to non-zero streams are discarded after stream output.
2286     *
2287     * Since the only purpose of primitives sent to non-zero streams is to
2288     * be recorded by transform feedback, we can simply discard all geometry
2289     * bound to these streams when transform feedback is disabled.
2290     */
2291    if (stream_id > 0 && !s.nir->info.has_transform_feedback_varyings)
2292       return;
2293 
2294    /* If we're outputting 32 control data bits or less, then we can wait
2295     * until the shader is over to output them all.  Otherwise we need to
2296     * output them as we go.  Now is the time to do it, since we're about to
2297     * output the vertex_count'th vertex, so it's guaranteed that the
2298     * control data bits associated with the (vertex_count - 1)th vertex are
2299     * correct.
2300     */
2301    if (s.gs_compile->control_data_header_size_bits > 32) {
2302       const fs_builder abld =
2303          ntb.bld.annotate("emit vertex: emit control data bits");
2304 
2305       /* Only emit control data bits if we've finished accumulating a batch
2306        * of 32 bits.  This is the case when:
2307        *
2308        *     (vertex_count * bits_per_vertex) % 32 == 0
2309        *
2310        * (in other words, when the last 5 bits of vertex_count *
2311        * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
2312        * integer n (which is always the case, since bits_per_vertex is
2313        * always 1 or 2), this is equivalent to requiring that the last 5-n
2314        * bits of vertex_count are 0:
2315        *
2316        *     vertex_count & (2^(5-n) - 1) == 0
2317        *
2318        * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
2319        * equivalent to:
2320        *
2321        *     vertex_count & (32 / bits_per_vertex - 1) == 0
2322        *
2323        * TODO: If vertex_count is an immediate, we could do some of this math
2324        *       at compile time...
2325        */
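      /* With 2 bits per vertex, for instance, the mask below is 15, so the
       * accumulated bits are written out once every 16 vertices.
       */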
2326       elk_fs_inst *inst =
2327          abld.AND(ntb.bld.null_reg_d(), vertex_count,
2328                   elk_imm_ud(32u / s.gs_compile->control_data_bits_per_vertex - 1u));
2329       inst->conditional_mod = ELK_CONDITIONAL_Z;
2330 
2331       abld.IF(ELK_PREDICATE_NORMAL);
2332       /* If vertex_count is 0, then no control data bits have been
2333        * accumulated yet, so we can skip emitting them.
2334        */
2335       abld.CMP(ntb.bld.null_reg_d(), vertex_count, elk_imm_ud(0u),
2336                ELK_CONDITIONAL_NEQ);
2337       abld.IF(ELK_PREDICATE_NORMAL);
2338       s.emit_gs_control_data_bits(vertex_count);
2339       abld.emit(ELK_OPCODE_ENDIF);
2340 
2341       /* Reset control_data_bits to 0 so we can start accumulating a new
2342        * batch.
2343        *
2344        * Note: in the case where vertex_count == 0, this neutralizes the
2345        * effect of any call to EndPrimitive() that the shader may have
2346        * made before outputting its first vertex.
2347        */
2348       inst = abld.MOV(s.control_data_bits, elk_imm_ud(0u));
2349       inst->force_writemask_all = true;
2350       abld.emit(ELK_OPCODE_ENDIF);
2351    }
2352 
2353    s.emit_urb_writes(vertex_count);
2354 
2355    /* In stream mode we have to set control data bits for all vertices
2356     * unless we have disabled control data bits completely (which we
2357     * do for MESA_PRIM_POINTS outputs that don't use streams).
2358     */
2359    if (s.gs_compile->control_data_header_size_bits > 0 &&
2360        gs_prog_data->control_data_format ==
2361           GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
2362       set_gs_stream_control_data_bits(ntb, vertex_count, stream_id);
2363    }
2364 }
2365 
2366 static void
2367 emit_gs_input_load(nir_to_elk_state &ntb, const elk_fs_reg &dst,
2368                    const nir_src &vertex_src,
2369                    unsigned base_offset,
2370                    const nir_src &offset_src,
2371                    unsigned num_components,
2372                    unsigned first_component)
2373 {
2374    const fs_builder &bld = ntb.bld;
2375    elk_fs_visitor &s = ntb.s;
2376 
2377    assert(type_sz(dst.type) == 4);
2378    struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(s.prog_data);
2379    const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
2380 
2381    /* TODO: figure out push input layout for invocations == 1 */
2382    if (gs_prog_data->invocations == 1 &&
2383        nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) &&
2384        4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) {
2385       int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 +
2386                        nir_src_as_uint(vertex_src) * push_reg_count;
2387       const elk_fs_reg attr = elk_fs_reg(ATTR, 0, dst.type);
2388       for (unsigned i = 0; i < num_components; i++) {
2389          ntb.bld.MOV(offset(dst, bld, i),
2390                      offset(attr, bld, imm_offset + i + first_component));
2391       }
2392       return;
2393    }
2394 
2395    /* Resort to the pull model.  Ensure the VUE handles are provided. */
2396    assert(gs_prog_data->base.include_vue_handles);
2397 
2398    elk_fs_reg start = s.gs_payload().icp_handle_start;
2399    elk_fs_reg icp_handle = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2400 
2401    if (gs_prog_data->invocations == 1) {
2402       if (nir_src_is_const(vertex_src)) {
2403          /* The vertex index is constant; just select the proper URB handle. */
2404          icp_handle = offset(start, ntb.bld, nir_src_as_uint(vertex_src));
2405       } else {
2406          /* The vertex index is non-constant.  We need to use indirect
2407           * addressing to fetch the proper URB handle.
2408           *
2409           * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
2410           * indicating that channel <n> should read the handle from
2411           * DWord <n>.  We convert that to bytes by multiplying by 4.
2412           *
2413           * Next, we convert the vertex index to bytes by multiplying
2414           * by 32 (shifting by 5), and add the two together.  This is
2415           * the final indirect byte offset.
2416           */
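         /* For instance, channel 3 reading vertex 2 ends up at byte offset
          * 2 * 32 + 3 * 4 == 76 into the ICP handle array.
          */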
2417          elk_fs_reg sequence =
2418             ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
2419          elk_fs_reg channel_offsets = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2420          elk_fs_reg vertex_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2421          elk_fs_reg icp_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2422 
2423          /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
2424          bld.SHL(channel_offsets, sequence, elk_imm_ud(2u));
2425          /* Convert vertex_index to bytes (multiply by 32) */
2426          bld.SHL(vertex_offset_bytes,
2427                  retype(get_nir_src(ntb, vertex_src), ELK_REGISTER_TYPE_UD),
2428                  elk_imm_ud(5u));
2429          bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2430 
2431          /* Use first_icp_handle as the base offset.  There is one register
2432           * of URB handles per vertex, so inform the register allocator that
2433           * we might read up to nir->info.gs.vertices_in registers.
2434           */
2435          bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
2436                   elk_fs_reg(icp_offset_bytes),
2437                   elk_imm_ud(s.nir->info.gs.vertices_in * REG_SIZE));
2438       }
2439    } else {
2440       assert(gs_prog_data->invocations > 1);
2441 
2442       if (nir_src_is_const(vertex_src)) {
2443          unsigned vertex = nir_src_as_uint(vertex_src);
2444          assert(vertex <= 5);
2445          bld.MOV(icp_handle, component(start, vertex));
2446       } else {
2447          /* The vertex index is non-constant.  We need to use indirect
2448           * addressing to fetch the proper URB handle.
2449           *
2450           */
2451          elk_fs_reg icp_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2452 
2453          /* Convert vertex_index to bytes (multiply by 4) */
2454          bld.SHL(icp_offset_bytes,
2455                  retype(get_nir_src(ntb, vertex_src), ELK_REGISTER_TYPE_UD),
2456                  elk_imm_ud(2u));
2457 
2458          /* Use first_icp_handle as the base offset.  There is one DWord
2459           * of URB handles per vertex, so inform the register allocator that
2460           * we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
2461           */
2462          bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
2463                   elk_fs_reg(icp_offset_bytes),
2464                   elk_imm_ud(DIV_ROUND_UP(s.nir->info.gs.vertices_in, 8) *
2465                              REG_SIZE));
2466       }
2467    }
2468 
2469    elk_fs_inst *inst;
2470    elk_fs_reg indirect_offset = get_nir_src(ntb, offset_src);
2471 
2472    if (nir_src_is_const(offset_src)) {
2473       elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2474       srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2475 
2476       /* Constant indexing - use global offset. */
2477       if (first_component != 0) {
2478          unsigned read_components = num_components + first_component;
2479          elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2480          inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs,
2481                          ARRAY_SIZE(srcs));
2482          inst->size_written = read_components *
2483                               tmp.component_size(inst->exec_size);
2484          for (unsigned i = 0; i < num_components; i++) {
2485             bld.MOV(offset(dst, bld, i),
2486                     offset(tmp, bld, i + first_component));
2487          }
2488       } else {
2489          inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs,
2490                          ARRAY_SIZE(srcs));
2491          inst->size_written = num_components *
2492                               dst.component_size(inst->exec_size);
2493       }
2494       inst->offset = base_offset + nir_src_as_uint(offset_src);
2495    } else {
2496       /* Indirect indexing - use per-slot offsets as well. */
2497       unsigned read_components = num_components + first_component;
2498       elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2499 
2500       elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2501       srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2502       srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
2503 
2504       if (first_component != 0) {
2505          inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2506                          srcs, ARRAY_SIZE(srcs));
2507          inst->size_written = read_components *
2508                               tmp.component_size(inst->exec_size);
2509          for (unsigned i = 0; i < num_components; i++) {
2510             bld.MOV(offset(dst, bld, i),
2511                     offset(tmp, bld, i + first_component));
2512          }
2513       } else {
2514          inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst,
2515                          srcs, ARRAY_SIZE(srcs));
2516          inst->size_written = num_components *
2517                               dst.component_size(inst->exec_size);
2518       }
2519       inst->offset = base_offset;
2520    }
2521 }
2522 
2523 static elk_fs_reg
2524 get_indirect_offset(nir_to_elk_state &ntb, nir_intrinsic_instr *instr)
2525 {
2526    nir_src *offset_src = nir_get_io_offset_src(instr);
2527 
2528    if (nir_src_is_const(*offset_src)) {
2529       /* The only constant offset we should find is 0.  elk_nir.c's
2530        * add_const_offset_to_base() will fold other constant offsets
2531        * into the "base" index.
2532        */
2533       assert(nir_src_as_uint(*offset_src) == 0);
2534       return elk_fs_reg();
2535    }
2536 
2537    return get_nir_src(ntb, *offset_src);
2538 }
2539 
2540 static void
2541 fs_nir_emit_vs_intrinsic(nir_to_elk_state &ntb,
2542                          nir_intrinsic_instr *instr)
2543 {
2544    const fs_builder &bld = ntb.bld;
2545    elk_fs_visitor &s = ntb.s;
2546    assert(s.stage == MESA_SHADER_VERTEX);
2547 
2548    elk_fs_reg dest;
2549    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2550       dest = get_nir_def(ntb, instr->def);
2551 
2552    switch (instr->intrinsic) {
2553    case nir_intrinsic_load_vertex_id:
2554    case nir_intrinsic_load_base_vertex:
2555       unreachable("should be lowered by nir_lower_system_values()");
2556 
2557    case nir_intrinsic_load_input: {
2558       assert(instr->def.bit_size == 32);
2559       const elk_fs_reg src = offset(elk_fs_reg(ATTR, 0, dest.type), bld,
2560                                 nir_intrinsic_base(instr) * 4 +
2561                                 nir_intrinsic_component(instr) +
2562                                 nir_src_as_uint(instr->src[0]));
2563 
2564       for (unsigned i = 0; i < instr->num_components; i++)
2565          bld.MOV(offset(dest, bld, i), offset(src, bld, i));
2566       break;
2567    }
2568 
2569    case nir_intrinsic_load_vertex_id_zero_base:
2570    case nir_intrinsic_load_instance_id:
2571    case nir_intrinsic_load_base_instance:
2572    case nir_intrinsic_load_draw_id:
2573    case nir_intrinsic_load_first_vertex:
2574    case nir_intrinsic_load_is_indexed_draw:
2575       unreachable("lowered by elk_nir_lower_vs_inputs");
2576 
2577    default:
2578       fs_nir_emit_intrinsic(ntb, bld, instr);
2579       break;
2580    }
2581 }
2582 
2583 static elk_fs_reg
2584 get_tcs_single_patch_icp_handle(nir_to_elk_state &ntb, const fs_builder &bld,
2585                                 nir_intrinsic_instr *instr)
2586 {
2587    elk_fs_visitor &s = ntb.s;
2588 
2589    struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(s.prog_data);
2590    const nir_src &vertex_src = instr->src[0];
2591    nir_intrinsic_instr *vertex_intrin = nir_src_as_intrinsic(vertex_src);
2592 
2593    const elk_fs_reg start = s.tcs_payload().icp_handle_start;
2594 
2595    elk_fs_reg icp_handle;
2596 
2597    if (nir_src_is_const(vertex_src)) {
2598       /* Emit a MOV to resolve <0,1,0> regioning. */
2599       icp_handle = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2600       unsigned vertex = nir_src_as_uint(vertex_src);
2601       bld.MOV(icp_handle, component(start, vertex));
2602    } else if (tcs_prog_data->instances == 1 && vertex_intrin &&
2603               vertex_intrin->intrinsic == nir_intrinsic_load_invocation_id) {
2604       /* For the common case of only 1 instance, an array index of
2605        * gl_InvocationID means reading the handles from the start.  Skip all
2606        * the indirect work.
2607        */
2608       icp_handle = start;
2609    } else {
2610       /* The vertex index is non-constant.  We need to use indirect
2611        * addressing to fetch the proper URB handle.
2612        */
2613       icp_handle = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2614 
2615       /* Each ICP handle is a single DWord (4 bytes) */
2616       elk_fs_reg vertex_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2617       bld.SHL(vertex_offset_bytes,
2618               retype(get_nir_src(ntb, vertex_src), ELK_REGISTER_TYPE_UD),
2619               elk_imm_ud(2u));
2620 
2621       /* We might read up to 4 registers. */
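      /* (At most 32 patch input vertices with one DWord handle each is
       * 128 bytes, i.e. four 32-byte GRFs.)
       */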
2622       bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2623                start, vertex_offset_bytes,
2624                elk_imm_ud(4 * REG_SIZE));
2625    }
2626 
2627    return icp_handle;
2628 }
2629 
2630 static elk_fs_reg
2631 get_tcs_multi_patch_icp_handle(nir_to_elk_state &ntb, const fs_builder &bld,
2632                                nir_intrinsic_instr *instr)
2633 {
2634    elk_fs_visitor &s = ntb.s;
2635    const intel_device_info *devinfo = s.devinfo;
2636 
2637    struct elk_tcs_prog_key *tcs_key = (struct elk_tcs_prog_key *) s.key;
2638    const nir_src &vertex_src = instr->src[0];
2639    const unsigned grf_size_bytes = REG_SIZE * reg_unit(devinfo);
2640 
2641    const elk_fs_reg start = s.tcs_payload().icp_handle_start;
2642 
2643    if (nir_src_is_const(vertex_src))
2644       return byte_offset(start, nir_src_as_uint(vertex_src) * grf_size_bytes);
2645 
2646    /* The vertex index is non-constant.  We need to use indirect
2647     * addressing to fetch the proper URB handle.
2648     *
2649     * First, we start with the sequence indicating that channel <n>
2650     * should read the handle from DWord <n>.  We convert that to bytes
2651     * by multiplying by 4.
2652     *
2653     * Next, we convert the vertex index to bytes by multiplying
2654     * by the GRF size (by shifting), and add the two together.  This is
2655     * the final indirect byte offset.
2656     */
2657    elk_fs_reg icp_handle = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2658    elk_fs_reg sequence = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
2659    elk_fs_reg channel_offsets = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2660    elk_fs_reg vertex_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2661    elk_fs_reg icp_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2662 
2663    /* Offsets will be 0, 4, 8, ... */
2664    bld.SHL(channel_offsets, sequence, elk_imm_ud(2u));
2665    /* Convert vertex_index to a byte offset (multiply by the GRF size) */
2666    assert(util_is_power_of_two_nonzero(grf_size_bytes)); /* for ffs() */
2667    bld.SHL(vertex_offset_bytes,
2668            retype(get_nir_src(ntb, vertex_src), ELK_REGISTER_TYPE_UD),
2669            elk_imm_ud(ffs(grf_size_bytes) - 1));
2670    bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2671 
2672    /* Use start of ICP handles as the base offset.  There is one register
2673     * of URB handles per vertex, so inform the register allocator that
2674     * we might read up to nir->info.gs.vertices_in registers.
2675     */
2676    bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
2677             icp_offset_bytes,
2678             elk_imm_ud(elk_tcs_prog_key_input_vertices(tcs_key) *
2679                        grf_size_bytes));
2680 
2681    return icp_handle;
2682 }
2683 
2684 static void
2685 emit_barrier(nir_to_elk_state &ntb)
2686 {
2687    const intel_device_info *devinfo = ntb.devinfo;
2688    const fs_builder &bld = ntb.bld;
2689    elk_fs_visitor &s = ntb.s;
2690 
2691    /* We are getting the barrier ID from the compute shader header */
2692    assert(gl_shader_stage_uses_workgroup(s.stage));
2693 
2694    elk_fs_reg payload = elk_fs_reg(VGRF, s.alloc.allocate(1), ELK_REGISTER_TYPE_UD);
2695 
2696    /* Clear the message payload */
2697    bld.exec_all().group(8, 0).MOV(payload, elk_imm_ud(0u));
2698 
2699    assert(gl_shader_stage_is_compute(s.stage));
2700 
2701    uint32_t barrier_id_mask;
2702    switch (devinfo->ver) {
2703    case 7:
2704    case 8:
2705       barrier_id_mask = 0x0f000000u; break;
2706    default:
2707       unreachable("barrier is only available on gen >= 7");
2708    }
2709 
2710    /* Copy the barrier id from r0.2 to the message payload reg.2 */
2711    elk_fs_reg r0_2 = elk_fs_reg(retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD));
2712    bld.exec_all().group(1, 0).AND(component(payload, 2), r0_2,
2713                                   elk_imm_ud(barrier_id_mask));
2714 
2715    /* Emit a gateway "barrier" message using the payload we set up, followed
2716     * by a wait instruction.
2717     */
2718    bld.exec_all().emit(ELK_SHADER_OPCODE_BARRIER, reg_undef, payload);
2719 }
2720 
2721 static void
2722 emit_tcs_barrier(nir_to_elk_state &ntb)
2723 {
2724    const fs_builder &bld = ntb.bld;
2725    elk_fs_visitor &s = ntb.s;
2726 
2727    assert(s.stage == MESA_SHADER_TESS_CTRL);
2728    struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(s.prog_data);
2729 
2730    elk_fs_reg m0 = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2731    elk_fs_reg m0_2 = component(m0, 2);
2732 
2733    const fs_builder chanbld = bld.exec_all().group(1, 0);
2734 
2735    /* Zero the message header */
2736    bld.exec_all().MOV(m0, elk_imm_ud(0u));
2737 
2738    /* Copy "Barrier ID" from r0.2, bits 16:13 */
2739    chanbld.AND(m0_2, retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD),
2740                elk_imm_ud(INTEL_MASK(16, 13)));
2741 
2742    /* Shift it up to bits 27:24. */
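      /* (Bits 16:13 shifted left by 11 land in bits 27:24.) */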
2743    chanbld.SHL(m0_2, m0_2, elk_imm_ud(11));
2744 
2745    /* Set the Barrier Count and the enable bit */
2746    chanbld.OR(m0_2, m0_2,
2747               elk_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
2748 
2749    bld.emit(ELK_SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
2750 }
2751 
2752 static void
2753 fs_nir_emit_tcs_intrinsic(nir_to_elk_state &ntb,
2754                           nir_intrinsic_instr *instr)
2755 {
2756    const intel_device_info *devinfo = ntb.devinfo;
2757    const fs_builder &bld = ntb.bld;
2758    elk_fs_visitor &s = ntb.s;
2759 
2760    assert(s.stage == MESA_SHADER_TESS_CTRL);
2761    struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(s.prog_data);
2762    struct elk_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
2763 
2764    elk_fs_reg dst;
2765    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2766       dst = get_nir_def(ntb, instr->def);
2767 
2768    switch (instr->intrinsic) {
2769    case nir_intrinsic_load_primitive_id:
2770       bld.MOV(dst, s.tcs_payload().primitive_id);
2771       break;
2772    case nir_intrinsic_load_invocation_id:
2773       bld.MOV(retype(dst, s.invocation_id.type), s.invocation_id);
2774       break;
2775 
2776    case nir_intrinsic_barrier:
2777       if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
2778          fs_nir_emit_intrinsic(ntb, bld, instr);
2779       if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
2780          if (tcs_prog_data->instances != 1)
2781             emit_tcs_barrier(ntb);
2782       }
2783       break;
2784 
2785    case nir_intrinsic_load_input:
2786       unreachable("nir_lower_io should never give us these.");
2787       break;
2788 
2789    case nir_intrinsic_load_per_vertex_input: {
2790       assert(instr->def.bit_size == 32);
2791       elk_fs_reg indirect_offset = get_indirect_offset(ntb, instr);
2792       unsigned imm_offset = nir_intrinsic_base(instr);
2793       elk_fs_inst *inst;
2794 
2795       const bool multi_patch =
2796          vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH;
2797 
2798       elk_fs_reg icp_handle = multi_patch ?
2799          get_tcs_multi_patch_icp_handle(ntb, bld, instr) :
2800          get_tcs_single_patch_icp_handle(ntb, bld, instr);
2801 
2802       /* We can only read two double components with each URB read, so
2803        * we send two read messages in that case, each one loading up to
2804        * two double components.
2805        */
2806       unsigned num_components = instr->num_components;
2807       unsigned first_component = nir_intrinsic_component(instr);
2808 
2809       elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2810       srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2811 
2812       if (indirect_offset.file == BAD_FILE) {
2813          /* Constant indexing - use global offset. */
2814          if (first_component != 0) {
2815             unsigned read_components = num_components + first_component;
2816             elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2817             inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs,
2818                             ARRAY_SIZE(srcs));
2819             for (unsigned i = 0; i < num_components; i++) {
2820                bld.MOV(offset(dst, bld, i),
2821                        offset(tmp, bld, i + first_component));
2822             }
2823          } else {
2824             inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs,
2825                             ARRAY_SIZE(srcs));
2826          }
2827          inst->offset = imm_offset;
2828       } else {
2829          /* Indirect indexing - use per-slot offsets as well. */
2830          srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
2831 
2832          if (first_component != 0) {
2833             unsigned read_components = num_components + first_component;
2834             elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2835             inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2836                             srcs, ARRAY_SIZE(srcs));
2837             for (unsigned i = 0; i < num_components; i++) {
2838                bld.MOV(offset(dst, bld, i),
2839                        offset(tmp, bld, i + first_component));
2840             }
2841          } else {
2842             inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst,
2843                             srcs, ARRAY_SIZE(srcs));
2844          }
2845          inst->offset = imm_offset;
2846       }
2847       inst->size_written = (num_components + first_component) *
2848                            inst->dst.component_size(inst->exec_size);
2849 
2850       /* Copy the temporary to the destination to deal with writemasking.
2851        *
2852        * Also attempt to deal with gl_PointSize being in the .w component.
2853        */
2854       if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
2855          assert(type_sz(dst.type) == 4);
2856          inst->dst = bld.vgrf(dst.type, 4);
2857          inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
2858          bld.MOV(dst, offset(inst->dst, bld, 3));
2859       }
2860       break;
2861    }
2862 
2863    case nir_intrinsic_load_output:
2864    case nir_intrinsic_load_per_vertex_output: {
2865       assert(instr->def.bit_size == 32);
2866       elk_fs_reg indirect_offset = get_indirect_offset(ntb, instr);
2867       unsigned imm_offset = nir_intrinsic_base(instr);
2868       unsigned first_component = nir_intrinsic_component(instr);
2869 
2870       elk_fs_inst *inst;
2871       if (indirect_offset.file == BAD_FILE) {
2872          /* This MOV replicates the output handle to all enabled channels
2873           * in SINGLE_PATCH mode.
2874           */
2875          elk_fs_reg patch_handle = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2876          bld.MOV(patch_handle, s.tcs_payload().patch_urb_output);
2877 
2878          {
2879             elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2880             srcs[URB_LOGICAL_SRC_HANDLE] = patch_handle;
2881 
2882             if (first_component != 0) {
2883                unsigned read_components =
2884                   instr->num_components + first_component;
2885                elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2886                inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2887                                srcs, ARRAY_SIZE(srcs));
2888                inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
2889                for (unsigned i = 0; i < instr->num_components; i++) {
2890                   bld.MOV(offset(dst, bld, i),
2891                           offset(tmp, bld, i + first_component));
2892                }
2893             } else {
2894                inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst,
2895                                srcs, ARRAY_SIZE(srcs));
2896                inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
2897             }
2898             inst->offset = imm_offset;
2899          }
2900       } else {
2901          /* Indirect indexing - use per-slot offsets as well. */
2902          elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2903          srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output;
2904          srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
2905 
2906          if (first_component != 0) {
2907             unsigned read_components =
2908                instr->num_components + first_component;
2909             elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2910             inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2911                             srcs, ARRAY_SIZE(srcs));
2912             inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
2913             for (unsigned i = 0; i < instr->num_components; i++) {
2914                bld.MOV(offset(dst, bld, i),
2915                        offset(tmp, bld, i + first_component));
2916             }
2917          } else {
2918             inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst,
2919                             srcs, ARRAY_SIZE(srcs));
2920             inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
2921          }
2922          inst->offset = imm_offset;
2923       }
2924       break;
2925    }
2926 
2927    case nir_intrinsic_store_output:
2928    case nir_intrinsic_store_per_vertex_output: {
2929       assert(nir_src_bit_size(instr->src[0]) == 32);
2930       elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
2931       elk_fs_reg indirect_offset = get_indirect_offset(ntb, instr);
2932       unsigned imm_offset = nir_intrinsic_base(instr);
2933       unsigned mask = nir_intrinsic_write_mask(instr);
2934 
2935       if (mask == 0)
2936          break;
2937 
2938       unsigned num_components = util_last_bit(mask);
2939       unsigned first_component = nir_intrinsic_component(instr);
2940       assert((first_component + num_components) <= 4);
2941 
2942       mask = mask << first_component;
2943 
2944       elk_fs_reg mask_reg;
2945       if (mask != WRITEMASK_XYZW)
2946          mask_reg = elk_imm_ud(mask << 16);
2947 
2948       elk_fs_reg sources[4];
2949 
2950       unsigned m = first_component;
2951       for (unsigned i = 0; i < num_components; i++) {
2952          int c = i + first_component;
2953          if (mask & (1 << c)) {
2954             sources[m++] = offset(value, bld, i);
2955          } else {
2956             m++;
2957          }
2958       }
2959 
2960       assert(m == (first_component + num_components));
2961 
2962       elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2963       srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output;
2964       srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
2965       srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask_reg;
2966       srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_F, m);
2967       srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(m);
2968       bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, m, 0);
2969 
2970       elk_fs_inst *inst = bld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
2971                                srcs, ARRAY_SIZE(srcs));
2972       inst->offset = imm_offset;
2973       break;
2974    }
2975 
2976    default:
2977       fs_nir_emit_intrinsic(ntb, bld, instr);
2978       break;
2979    }
2980 }
2981 
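/* Handle the TES-specific NIR intrinsics: gl_PrimitiveID and gl_TessCoord
 * come from the thread payload, and input loads are either read from the
 * pushed URB data (small constant offsets) or turned into URB read messages
 * through the patch URB handle.  Everything else goes to the generic
 * fs_nir_emit_intrinsic().
 */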
2982 static void
2983 fs_nir_emit_tes_intrinsic(nir_to_elk_state &ntb,
2984                           nir_intrinsic_instr *instr)
2985 {
2986    const intel_device_info *devinfo = ntb.devinfo;
2987    const fs_builder &bld = ntb.bld;
2988    elk_fs_visitor &s = ntb.s;
2989 
2990    assert(s.stage == MESA_SHADER_TESS_EVAL);
2991    struct elk_tes_prog_data *tes_prog_data = elk_tes_prog_data(s.prog_data);
2992 
2993    elk_fs_reg dest;
2994    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2995       dest = get_nir_def(ntb, instr->def);
2996 
2997    switch (instr->intrinsic) {
2998    case nir_intrinsic_load_primitive_id:
2999       bld.MOV(dest, s.tes_payload().primitive_id);
3000       break;
3001 
3002    case nir_intrinsic_load_tess_coord:
3003       for (unsigned i = 0; i < 3; i++)
3004          bld.MOV(offset(dest, bld, i), s.tes_payload().coords[i]);
3005       break;
3006 
3007    case nir_intrinsic_load_input:
3008    case nir_intrinsic_load_per_vertex_input: {
3009       assert(instr->def.bit_size == 32);
3010       elk_fs_reg indirect_offset = get_indirect_offset(ntb, instr);
3011       unsigned imm_offset = nir_intrinsic_base(instr);
3012       unsigned first_component = nir_intrinsic_component(instr);
3013 
3014       elk_fs_inst *inst;
3015       if (indirect_offset.file == BAD_FILE) {
3016          /* Arbitrarily only push up to 32 vec4 slots worth of data,
3017           * which is 16 registers (since each holds 2 vec4 slots).
3018           */
3019          const unsigned max_push_slots = 32;
3020          if (imm_offset < max_push_slots) {
3021             const elk_fs_reg src = horiz_offset(elk_fs_reg(ATTR, 0, dest.type),
3022                                             4 * imm_offset + first_component);
3023             for (int i = 0; i < instr->num_components; i++)
3024                bld.MOV(offset(dest, bld, i), component(src, i));
3025 
3026             tes_prog_data->base.urb_read_length =
3027                MAX2(tes_prog_data->base.urb_read_length,
3028                     (imm_offset / 2) + 1);
3029          } else {
3030             /* Replicate the patch handle to all enabled channels */
3031             elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
3032             srcs[URB_LOGICAL_SRC_HANDLE] = s.tes_payload().patch_urb_input;
3033 
3034             if (first_component != 0) {
3035                unsigned read_components =
3036                   instr->num_components + first_component;
3037                elk_fs_reg tmp = bld.vgrf(dest.type, read_components);
3038                inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3039                                srcs, ARRAY_SIZE(srcs));
3040                inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
3041                for (unsigned i = 0; i < instr->num_components; i++) {
3042                   bld.MOV(offset(dest, bld, i),
3043                           offset(tmp, bld, i + first_component));
3044                }
3045             } else {
3046                inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dest,
3047                                srcs, ARRAY_SIZE(srcs));
3048                inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
3049             }
3050             inst->offset = imm_offset;
3051          }
3052       } else {
3053          /* Indirect indexing - use per-slot offsets as well. */
3054 
3055          /* We can only read two double components with each URB read, so
3056           * we send two read messages in that case, each one loading up to
3057           * two double components.
3058           */
3059          unsigned num_components = instr->num_components;
3060 
3061          elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
3062          srcs[URB_LOGICAL_SRC_HANDLE] = s.tes_payload().patch_urb_input;
3063          srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
3064 
3065          if (first_component != 0) {
3066             unsigned read_components =
3067                 num_components + first_component;
3068             elk_fs_reg tmp = bld.vgrf(dest.type, read_components);
3069             inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3070                             srcs, ARRAY_SIZE(srcs));
3071             for (unsigned i = 0; i < num_components; i++) {
3072                bld.MOV(offset(dest, bld, i),
3073                        offset(tmp, bld, i + first_component));
3074             }
3075          } else {
3076             inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dest,
3077                             srcs, ARRAY_SIZE(srcs));
3078          }
3079          inst->offset = imm_offset;
3080          inst->size_written = (num_components + first_component) *
3081                               inst->dst.component_size(inst->exec_size);
3082       }
3083       break;
3084    }
3085    default:
3086       fs_nir_emit_intrinsic(ntb, bld, instr);
3087       break;
3088    }
3089 }
3090 
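/* Handle the GS-specific NIR intrinsics: primitive and invocation IDs from
 * the payload/system values, per-vertex input loads, and the
 * emit_vertex/end_primitive bookkeeping.  Everything else goes to the
 * generic fs_nir_emit_intrinsic().
 */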
3091 static void
3092 fs_nir_emit_gs_intrinsic(nir_to_elk_state &ntb,
3093                          nir_intrinsic_instr *instr)
3094 {
3095    const fs_builder &bld = ntb.bld;
3096    elk_fs_visitor &s = ntb.s;
3097 
3098    assert(s.stage == MESA_SHADER_GEOMETRY);
3099    elk_fs_reg indirect_offset;
3100 
3101    elk_fs_reg dest;
3102    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3103       dest = get_nir_def(ntb, instr->def);
3104 
3105    switch (instr->intrinsic) {
3106    case nir_intrinsic_load_primitive_id:
3107       assert(s.stage == MESA_SHADER_GEOMETRY);
3108       assert(elk_gs_prog_data(s.prog_data)->include_primitive_id);
3109       bld.MOV(retype(dest, ELK_REGISTER_TYPE_UD), s.gs_payload().primitive_id);
3110       break;
3111 
3112    case nir_intrinsic_load_input:
3113       unreachable("load_input intrinsics are invalid for the GS stage");
3114 
3115    case nir_intrinsic_load_per_vertex_input:
3116       emit_gs_input_load(ntb, dest, instr->src[0], nir_intrinsic_base(instr),
3117                          instr->src[1], instr->num_components,
3118                          nir_intrinsic_component(instr));
3119       break;
3120 
3121    case nir_intrinsic_emit_vertex_with_counter:
3122       emit_gs_vertex(ntb, instr->src[0], nir_intrinsic_stream_id(instr));
3123       break;
3124 
3125    case nir_intrinsic_end_primitive_with_counter:
3126       emit_gs_end_primitive(ntb, instr->src[0]);
3127       break;
3128 
3129    case nir_intrinsic_set_vertex_and_primitive_count:
3130       bld.MOV(s.final_gs_vertex_count, get_nir_src(ntb, instr->src[0]));
3131       break;
3132 
3133    case nir_intrinsic_load_invocation_id: {
3134       elk_fs_reg val = ntb.system_values[SYSTEM_VALUE_INVOCATION_ID];
3135       assert(val.file != BAD_FILE);
3136       dest.type = val.type;
3137       bld.MOV(dest, val);
3138       break;
3139    }
3140 
3141    default:
3142       fs_nir_emit_intrinsic(ntb, bld, instr);
3143       break;
3144    }
3145 }
3146 
3147 /**
3148  * Fetch the current render target layer index.
3149  */
3150 static elk_fs_reg
3151 fetch_render_target_array_index(const fs_builder &bld)
3152 {
3153    if (bld.shader->devinfo->ver >= 6) {
3154       /* The render target array index is provided in the thread payload as
3155        * bits 26:16 of r0.0.
3156        */
3157       const elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_UD);
3158       bld.AND(idx, elk_uw1_reg(ELK_GENERAL_REGISTER_FILE, 0, 1),
3159               elk_imm_uw(0x7ff));
3160       return idx;
3161    } else {
3162       /* Pre-SNB we only ever render into the first layer of the framebuffer
3163        * since layered rendering is not implemented.
3164        */
3165       return elk_imm_ud(0);
3166    }
3167 }
3168 
3169 /* Sample from the MCS surface attached to this multisample texture. */
3170 static elk_fs_reg
3171 emit_mcs_fetch(nir_to_elk_state &ntb, const elk_fs_reg &coordinate, unsigned components,
3172                const elk_fs_reg &texture,
3173                const elk_fs_reg &texture_handle)
3174 {
3175    const fs_builder &bld = ntb.bld;
3176 
3177    const elk_fs_reg dest = ntb.s.vgrf(glsl_uvec4_type());
3178 
3179    elk_fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
3180    srcs[TEX_LOGICAL_SRC_COORDINATE] = coordinate;
3181    srcs[TEX_LOGICAL_SRC_SURFACE] = texture;
3182    srcs[TEX_LOGICAL_SRC_SAMPLER] = elk_imm_ud(0);
3183    srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = texture_handle;
3184    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = elk_imm_d(components);
3185    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = elk_imm_d(0);
3186    srcs[TEX_LOGICAL_SRC_RESIDENCY] = elk_imm_d(0);
3187 
3188    elk_fs_inst *inst = bld.emit(ELK_SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs,
3189                             ARRAY_SIZE(srcs));
3190 
3191    /* We only care about one or two regs of response, but the sampler always
3192     * writes 4/8.
3193     */
3194    inst->size_written = 4 * dest.component_size(inst->exec_size);
3195 
3196    return dest;
3197 }
3198 
3199 /**
3200  * Fake non-coherent framebuffer read implemented using TXF to fetch from the
3201  * framebuffer at the current fragment coordinates and sample index.
3202  */
3203 static elk_fs_inst *
3204 emit_non_coherent_fb_read(nir_to_elk_state &ntb, const fs_builder &bld, const elk_fs_reg &dst,
3205                           unsigned target)
3206 {
3207    elk_fs_visitor &s = ntb.s;
3208 
3209    assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
3210    const elk_wm_prog_key *wm_key =
3211       reinterpret_cast<const elk_wm_prog_key *>(s.key);
3212    assert(!wm_key->coherent_fb_fetch);
3213 
3214    /* Calculate the fragment coordinates. */
3215    const elk_fs_reg coords = bld.vgrf(ELK_REGISTER_TYPE_UD, 3);
3216    bld.MOV(offset(coords, bld, 0), s.pixel_x);
3217    bld.MOV(offset(coords, bld, 1), s.pixel_y);
3218    bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
3219 
3220    /* Calculate the sample index and MCS payload when multisampling.  Luckily
3221     * the MCS fetch message behaves deterministically for UMS surfaces, so it
3222     * shouldn't be necessary to recompile based on whether the framebuffer is
3223     * CMS or UMS.
3224     */
3225    assert(wm_key->multisample_fbo == ELK_ALWAYS ||
3226           wm_key->multisample_fbo == ELK_NEVER);
3227    if (wm_key->multisample_fbo &&
3228        ntb.system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
3229       ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(ntb);
3230 
3231    const elk_fs_reg sample = ntb.system_values[SYSTEM_VALUE_SAMPLE_ID];
3232    const elk_fs_reg mcs = wm_key->multisample_fbo ?
3233       emit_mcs_fetch(ntb, coords, 3, elk_imm_ud(target), elk_fs_reg()) : elk_fs_reg();
3234 
3235    /* Use either a normal or a CMS texel fetch message depending on whether
3236     * the framebuffer is single or multisample.  On SKL+ use the wide CMS
3237     * message just in case the framebuffer uses 16x multisampling, it should
3238     * be equivalent to the normal CMS fetch for lower multisampling modes.
3239     */
3240    elk_opcode op;
3241    if (wm_key->multisample_fbo) {
3242       op = ELK_SHADER_OPCODE_TXF_CMS_LOGICAL;
3243    } else {
3244       op = ELK_SHADER_OPCODE_TXF_LOGICAL;
3245    }
3246 
3247    /* Emit the instruction. */
3248    elk_fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
3249    srcs[TEX_LOGICAL_SRC_COORDINATE]       = coords;
3250    srcs[TEX_LOGICAL_SRC_LOD]              = elk_imm_ud(0);
3251    srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX]     = sample;
3252    srcs[TEX_LOGICAL_SRC_MCS]              = mcs;
3253    srcs[TEX_LOGICAL_SRC_SURFACE]          = elk_imm_ud(target);
3254    srcs[TEX_LOGICAL_SRC_SAMPLER]          = elk_imm_ud(0);
3255    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = elk_imm_ud(3);
3256    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS]  = elk_imm_ud(0);
3257    srcs[TEX_LOGICAL_SRC_RESIDENCY]        = elk_imm_ud(0);
3258 
3259    elk_fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
3260    inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3261 
3262    return inst;
3263 }
3264 
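/* Return regs[0] if a temporary has already been allocated for it; otherwise
 * allocate a new float VGRF of the given size and make all n entries of regs
 * alias that temporary.  This lets several logical outputs (e.g. gl_FragColor
 * replicated to every color region) share one register.
 */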
3265 static elk_fs_reg
3266 alloc_temporary(const fs_builder &bld, unsigned size, elk_fs_reg *regs, unsigned n)
3267 {
3268    if (n && regs[0].file != BAD_FILE) {
3269       return regs[0];
3270 
3271    } else {
3272       const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_F, size);
3273 
3274       for (unsigned i = 0; i < n; i++)
3275          regs[i] = tmp;
3276 
3277       return tmp;
3278    }
3279 }
3280 
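/* Look up (allocating on first use) the temporary that backs a fragment
 * output.  The location argument packs the FRAG_RESULT_* slot and the
 * dual-source index using the ELK_NIR_FRAG_OUTPUT_* fields.
 */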
3281 static elk_fs_reg
3282 alloc_frag_output(nir_to_elk_state &ntb, unsigned location)
3283 {
3284    elk_fs_visitor &s = ntb.s;
3285 
3286    assert(s.stage == MESA_SHADER_FRAGMENT);
3287    const elk_wm_prog_key *const key =
3288       reinterpret_cast<const elk_wm_prog_key *>(s.key);
3289    const unsigned l = GET_FIELD(location, ELK_NIR_FRAG_OUTPUT_LOCATION);
3290    const unsigned i = GET_FIELD(location, ELK_NIR_FRAG_OUTPUT_INDEX);
3291 
3292    if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
3293       return alloc_temporary(ntb.bld, 4, &s.dual_src_output, 1);
3294 
3295    else if (l == FRAG_RESULT_COLOR)
3296       return alloc_temporary(ntb.bld, 4, s.outputs,
3297                              MAX2(key->nr_color_regions, 1));
3298 
3299    else if (l == FRAG_RESULT_DEPTH)
3300       return alloc_temporary(ntb.bld, 1, &s.frag_depth, 1);
3301 
3302    else if (l == FRAG_RESULT_STENCIL)
3303       return alloc_temporary(ntb.bld, 1, &s.frag_stencil, 1);
3304 
3305    else if (l == FRAG_RESULT_SAMPLE_MASK)
3306       return alloc_temporary(ntb.bld, 1, &s.sample_mask, 1);
3307 
3308    else if (l >= FRAG_RESULT_DATA0 &&
3309             l < FRAG_RESULT_DATA0 + ELK_MAX_DRAW_BUFFERS)
3310       return alloc_temporary(ntb.bld, 4,
3311                              &s.outputs[l - FRAG_RESULT_DATA0], 1);
3312 
3313    else
3314       unreachable("Invalid location");
3315 }
3316 
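/* Implement helperInvocationEXT() / OpIsHelperInvocationEXT by testing the
 * live sample mask, so invocations demoted after dispatch are reported as
 * helpers too.
 */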
3317 static void
3318 emit_is_helper_invocation(nir_to_elk_state &ntb, elk_fs_reg result)
3319 {
3320    const fs_builder &bld = ntb.bld;
3321 
3322    /* Unlike the regular gl_HelperInvocation, which is defined at dispatch,
3323     * helperInvocationEXT() (aka SpvOpIsHelperInvocationEXT) also takes
3324     * demoted invocations into consideration.
3325     */
3326    result.type = ELK_REGISTER_TYPE_UD;
3327 
3328    bld.MOV(result, elk_imm_ud(0));
3329 
3330    /* See elk_sample_mask_reg() for why we split SIMD32 into SIMD16 here. */
3331    unsigned width = bld.dispatch_width();
3332    for (unsigned i = 0; i < DIV_ROUND_UP(width, 16); i++) {
3333       const fs_builder b = bld.group(MIN2(width, 16), i);
3334 
3335       elk_fs_inst *mov = b.MOV(offset(result, b, i), elk_imm_ud(~0));
3336 
3337       /* The at() ensures that any code emitted to get the predicate happens
3338        * before the mov right above.  This is not an issue elsewhere because
3339        * lowering code already set up the builder this way.
3340        */
3341       elk_emit_predicate_on_sample_mask(b.at(NULL, mov), mov);
3342       mov->predicate_inverse = true;
3343    }
3344 }
3345 
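/* Write the four components of gl_FragCoord into consecutive registers
 * starting at wpos.  X and Y come straight from the pixel positions in the
 * payload, Z is the payload depth on Gfx6+ (interpolated from the position
 * varying on older parts), and W reuses the value set up during
 * interpolation setup.
 */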
3346 static void
3347 emit_fragcoord_interpolation(nir_to_elk_state &ntb, elk_fs_reg wpos)
3348 {
3349    const intel_device_info *devinfo = ntb.devinfo;
3350    const fs_builder &bld = ntb.bld;
3351    elk_fs_visitor &s = ntb.s;
3352 
3353    assert(s.stage == MESA_SHADER_FRAGMENT);
3354 
3355    /* gl_FragCoord.x */
3356    bld.MOV(wpos, s.pixel_x);
3357    wpos = offset(wpos, bld, 1);
3358 
3359    /* gl_FragCoord.y */
3360    bld.MOV(wpos, s.pixel_y);
3361    wpos = offset(wpos, bld, 1);
3362 
3363    /* gl_FragCoord.z */
3364    if (devinfo->ver >= 6) {
3365       bld.MOV(wpos, s.pixel_z);
3366    } else {
3367       bld.emit(ELK_FS_OPCODE_LINTERP, wpos,
3368                s.delta_xy[ELK_BARYCENTRIC_PERSPECTIVE_PIXEL],
3369                s.interp_reg(bld, VARYING_SLOT_POS, 2, 0));
3370    }
3371    wpos = offset(wpos, bld, 1);
3372 
3373    /* gl_FragCoord.w: Already set up in emit_interpolation */
3374    bld.MOV(wpos, s.wpos_w);
3375 }
3376 
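/* Produce a ~0/0 boolean for gl_FrontFacing from the front-facing bit in the
 * thread payload; the per-generation details are described below.
 */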
3377 static elk_fs_reg
3378 emit_frontfacing_interpolation(nir_to_elk_state &ntb)
3379 {
3380    const intel_device_info *devinfo = ntb.devinfo;
3381    const fs_builder &bld = ntb.bld;
3382 
3383    elk_fs_reg ff = bld.vgrf(ELK_REGISTER_TYPE_D);
3384 
3385    if (devinfo->ver >= 6) {
3386       /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
3387        * a boolean result from this (~0/true or 0/false).
3388        *
3389        * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
3390        * this task in only one instruction:
3391        *    - a negation source modifier will flip the bit; and
3392        *    - a W -> D type conversion will sign extend the bit into the high
3393        *      word of the destination.
3394        *
3395        * An ASR 15 fills the low word of the destination.
3396        */
3397       elk_fs_reg g0 = elk_fs_reg(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_W));
3398       g0.negate = true;
3399 
3400       bld.ASR(ff, g0, elk_imm_d(15));
3401    } else {
3402       /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
3403        * a boolean result from this (1/true or 0/false).
3404        *
3405        * Like in the above case, since the bit is the MSB of g1.6:UD we can use
3406        * the negation source modifier to flip it. Unfortunately the SHR
3407        * instruction only operates on UD (or D with an abs source modifier)
3408        * sources without negation.
3409        *
3410        * Instead, use ASR (which will give ~0/true or 0/false).
3411        */
3412       elk_fs_reg g1_6 = elk_fs_reg(retype(elk_vec1_grf(1, 6), ELK_REGISTER_TYPE_D));
3413       g1_6.negate = true;
3414 
3415       bld.ASR(ff, g1_6, elk_imm_d(31));
3416    }
3417 
3418    return ff;
3419 }
3420 
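/* Compute gl_SamplePosition as a pair of floats in [0, 1].  When the shader
 * is not run per-sample this degenerates to the pixel center (0.5, 0.5).
 */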
3421 static elk_fs_reg
3422 emit_samplepos_setup(nir_to_elk_state &ntb)
3423 {
3424    const intel_device_info *devinfo = ntb.devinfo;
3425    const fs_builder &bld = ntb.bld;
3426    elk_fs_visitor &s = ntb.s;
3427 
3428    assert(s.stage == MESA_SHADER_FRAGMENT);
3429    struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(s.prog_data);
3430    assert(devinfo->ver >= 6);
3431 
3432    const fs_builder abld = bld.annotate("compute sample position");
3433    elk_fs_reg pos = abld.vgrf(ELK_REGISTER_TYPE_F, 2);
3434 
3435    if (wm_prog_data->persample_dispatch == ELK_NEVER) {
3436       /* From ARB_sample_shading specification:
3437        * "When rendering to a non-multisample buffer, or if multisample
3438        *  rasterization is disabled, gl_SamplePosition will always be
3439        *  (0.5, 0.5).
3440        */
3441       bld.MOV(offset(pos, bld, 0), elk_imm_f(0.5f));
3442       bld.MOV(offset(pos, bld, 1), elk_imm_f(0.5f));
3443       return pos;
3444    }
3445 
3446    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
3447     * mode will be enabled.
3448     *
3449     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
3450     * R31.1:0         Position Offset X/Y for Slot[3:0]
3451     * R31.3:2         Position Offset X/Y for Slot[7:4]
3452     * .....
3453     *
3454     * The X, Y sample positions come in as bytes in the thread payload. So, read
3455     * the positions using vstride=16, width=8, hstride=2.
3456     */
3457    const elk_fs_reg sample_pos_reg =
3458       fetch_payload_reg(abld, s.fs_payload().sample_pos_reg, ELK_REGISTER_TYPE_W);
3459 
3460    for (unsigned i = 0; i < 2; i++) {
3461       elk_fs_reg tmp_d = bld.vgrf(ELK_REGISTER_TYPE_D);
3462       abld.MOV(tmp_d, subscript(sample_pos_reg, ELK_REGISTER_TYPE_B, i));
3463       /* Convert int_sample_pos to floating point */
3464       elk_fs_reg tmp_f = bld.vgrf(ELK_REGISTER_TYPE_F);
3465       abld.MOV(tmp_f, tmp_d);
3466       /* Scale to the range [0, 1] */
3467       abld.MUL(offset(pos, abld, i), tmp_f, elk_imm_f(1 / 16.0f));
3468    }
3469 
3470    if (wm_prog_data->persample_dispatch == ELK_SOMETIMES) {
3471       check_dynamic_msaa_flag(abld, wm_prog_data,
3472                               INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
3473       for (unsigned i = 0; i < 2; i++) {
3474          set_predicate(ELK_PREDICATE_NORMAL,
3475                        bld.SEL(offset(pos, abld, i), offset(pos, abld, i),
3476                                elk_imm_f(0.5f)));
3477       }
3478    }
3479 
3480    return pos;
3481 }
3482 
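/* Compute gl_SampleID from the thread payload.  Gfx8+ carries the per-slot
 * sample IDs directly in the payload; older parts reconstruct them from the
 * starting sample pair index as described below.
 */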
3483 static elk_fs_reg
3484 emit_sampleid_setup(nir_to_elk_state &ntb)
3485 {
3486    const intel_device_info *devinfo = ntb.devinfo;
3487    const fs_builder &bld = ntb.bld;
3488    elk_fs_visitor &s = ntb.s;
3489 
3490    assert(s.stage == MESA_SHADER_FRAGMENT);
3491    ASSERTED elk_wm_prog_key *key = (elk_wm_prog_key*) s.key;
3492    struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(s.prog_data);
3493    assert(devinfo->ver >= 6);
3494 
3495    const fs_builder abld = bld.annotate("compute sample id");
3496    elk_fs_reg sample_id = abld.vgrf(ELK_REGISTER_TYPE_UD);
3497 
3498    assert(key->multisample_fbo != ELK_NEVER);
3499 
3500    if (devinfo->ver >= 8) {
3501       /* Sample ID comes in as 4-bit numbers in g1.0:
3502        *
3503        *    15:12 Slot 3 SampleID (only used in SIMD16)
3504        *     11:8 Slot 2 SampleID (only used in SIMD16)
3505        *      7:4 Slot 1 SampleID
3506        *      3:0 Slot 0 SampleID
3507        *
3508        * Each slot corresponds to four channels, so we want to replicate each
3509        * half-byte value to 4 channels in a row:
3510        *
3511        *    dst+0:    .7    .6    .5    .4    .3    .2    .1    .0
3512        *             7:4   7:4   7:4   7:4   3:0   3:0   3:0   3:0
3513        *
3514        *    dst+1:    .7    .6    .5    .4    .3    .2    .1    .0  (if SIMD16)
3515        *           15:12 15:12 15:12 15:12  11:8  11:8  11:8  11:8
3516        *
3517        * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
3518        * channels to read the first byte (7:0), and the second group of 8
3519        * channels to read the second byte (15:8).  Then, we shift right by
3520        * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
3521        * values into place.  Finally, we AND with 0xf to keep the low nibble.
3522        *
3523        *    shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
3524        *    and(16) dst<1>D tmp<8,8,1>W  0xf:W
3525        *
3526        * TODO: These payload bits exist on Gfx7 too, but they appear to always
3527        *       be zero, so this code fails to work.  We should find out why.
3528        */
3529       const elk_fs_reg tmp = abld.vgrf(ELK_REGISTER_TYPE_UW);
3530 
3531       for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
3532          const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
3533          /* According to the "PS Thread Payload for Normal Dispatch"
3534           * pages on the BSpec, the sample ids are stored in R1.0/R2.0 on gfx8+.
3535           */
3536          const struct elk_reg id_reg = elk_vec1_grf(i + 1, 0);
3537          hbld.SHR(offset(tmp, hbld, i),
3538                   stride(retype(id_reg, ELK_REGISTER_TYPE_UB), 1, 8, 0),
3539                   elk_imm_v(0x44440000));
3540       }
3541 
3542       abld.AND(sample_id, tmp, elk_imm_w(0xf));
3543    } else {
3544       const elk_fs_reg t1 = component(abld.vgrf(ELK_REGISTER_TYPE_UD), 0);
3545       const elk_fs_reg t2 = abld.vgrf(ELK_REGISTER_TYPE_UW);
3546 
3547       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
3548        * 8x multisampling, subspan 0 will represent sample N (where N
3549        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
3550        * 7. We can find the value of N by looking at R0.0 bits 7:6
3551        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
3552        * (since samples are always delivered in pairs). That is, we
3553        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
3554        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
3555        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
3556        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
3557        * populating a temporary variable with the sequence (0, 1, 2, 3),
3558        * and then reading from it using vstride=1, width=4, hstride=0.
3559        * These computations hold good for 4x multisampling as well.
3560        *
3561        * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
3562        * the first four slots are sample 0 of subspan 0; the next four
3563        * are sample 1 of subspan 0; the third group is sample 0 of
3564        * subspan 1, and finally sample 1 of subspan 1.
3565        */
3566 
3567       /* SKL+ has an extra bit for the Starting Sample Pair Index to
3568        * accommodate 16x MSAA.
3569        */
3570       abld.exec_all().group(1, 0)
3571           .AND(t1, elk_fs_reg(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UD)),
3572                elk_imm_ud(0xc0));
3573       abld.exec_all().group(1, 0).SHR(t1, t1, elk_imm_d(5));
3574 
3575       /* This works for SIMD8-SIMD16.  It also works for SIMD32 but only if we
3576        * can assume 4x MSAA.  Disallow it on IVB+
3577        *
3578        * FINISHME: One day, we could come up with a way to do this that
3579        * actually works on gfx7.
3580        */
3581       if (devinfo->ver >= 7)
3582          s.limit_dispatch_width(16, "gl_SampleId is unsupported in SIMD32 on gfx7");
3583       abld.exec_all().group(8, 0).MOV(t2, elk_imm_v(0x32103210));
3584 
3585       /* This special instruction takes care of setting vstride=1,
3586        * width=4, hstride=0 of t2 during an ADD instruction.
3587        */
3588       abld.emit(ELK_FS_OPCODE_SET_SAMPLE_ID, sample_id, t1, t2);
3589    }
3590 
3591    if (key->multisample_fbo == ELK_SOMETIMES) {
3592       check_dynamic_msaa_flag(abld, wm_prog_data,
3593                               INTEL_MSAA_FLAG_MULTISAMPLE_FBO);
3594       set_predicate(ELK_PREDICATE_NORMAL,
3595                     abld.SEL(sample_id, sample_id, elk_imm_ud(0)));
3596    }
3597 
3598    return sample_id;
3599 }
3600 
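/* Compute gl_SampleMaskIn.  Without per-sample dispatch this is just the
 * input coverage mask; with per-sample dispatch it is restricted to the bit
 * of the sample currently being shaded.
 */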
3601 static elk_fs_reg
3602 emit_samplemaskin_setup(nir_to_elk_state &ntb)
3603 {
3604    const intel_device_info *devinfo = ntb.devinfo;
3605    const fs_builder &bld = ntb.bld;
3606    elk_fs_visitor &s = ntb.s;
3607 
3608    assert(s.stage == MESA_SHADER_FRAGMENT);
3609    struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(s.prog_data);
3610    assert(devinfo->ver >= 6);
3611 
3612    elk_fs_reg coverage_mask =
3613       fetch_payload_reg(bld, s.fs_payload().sample_mask_in_reg, ELK_REGISTER_TYPE_D);
3614 
3615    if (wm_prog_data->persample_dispatch == ELK_NEVER)
3616       return coverage_mask;
3617 
3618    /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
3619     * and a mask representing which sample is being processed by the
3620     * current shader invocation.
3621     *
3622     * From the OES_sample_variables specification:
3623     * "When per-sample shading is active due to the use of a fragment input
3624     *  qualified by "sample" or due to the use of the gl_SampleID or
3625     *  gl_SamplePosition variables, only the bit for the current sample is
3626     *  set in gl_SampleMaskIn."
3627     */
3628    const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
3629 
3630    if (ntb.system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
3631       ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(ntb);
3632 
3633    elk_fs_reg one = s.vgrf(glsl_int_type());
3634    elk_fs_reg enabled_mask = s.vgrf(glsl_int_type());
3635    abld.MOV(one, elk_imm_d(1));
3636    abld.SHL(enabled_mask, one, ntb.system_values[SYSTEM_VALUE_SAMPLE_ID]);
3637    elk_fs_reg mask = bld.vgrf(ELK_REGISTER_TYPE_D);
3638    abld.AND(mask, enabled_mask, coverage_mask);
3639 
3640    if (wm_prog_data->persample_dispatch == ELK_ALWAYS)
3641       return mask;
3642 
3643    check_dynamic_msaa_flag(abld, wm_prog_data,
3644                            INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
3645    set_predicate(ELK_PREDICATE_NORMAL, abld.SEL(mask, mask, coverage_mask));
3646 
3647    return mask;
3648 }
3649 
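/* Handle the fragment-shader specific NIR intrinsics: system values,
 * flat/per-primitive and interpolated input loads, framebuffer fetch, output
 * stores, and discard/demote.  Anything not handled here goes to the generic
 * fs_nir_emit_intrinsic().
 */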
3650 static void
3651 fs_nir_emit_fs_intrinsic(nir_to_elk_state &ntb,
3652                          nir_intrinsic_instr *instr)
3653 {
3654    const intel_device_info *devinfo = ntb.devinfo;
3655    const fs_builder &bld = ntb.bld;
3656    elk_fs_visitor &s = ntb.s;
3657 
3658    assert(s.stage == MESA_SHADER_FRAGMENT);
3659 
3660    elk_fs_reg dest;
3661    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3662       dest = get_nir_def(ntb, instr->def);
3663 
3664    switch (instr->intrinsic) {
3665    case nir_intrinsic_load_front_face:
3666       bld.MOV(retype(dest, ELK_REGISTER_TYPE_D),
3667               emit_frontfacing_interpolation(ntb));
3668       break;
3669 
3670    case nir_intrinsic_load_sample_pos:
3671    case nir_intrinsic_load_sample_pos_or_center: {
3672       elk_fs_reg sample_pos = ntb.system_values[SYSTEM_VALUE_SAMPLE_POS];
3673       assert(sample_pos.file != BAD_FILE);
3674       dest.type = sample_pos.type;
3675       bld.MOV(dest, sample_pos);
3676       bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
3677       break;
3678    }
3679 
3680    case nir_intrinsic_load_layer_id:
3681       dest.type = ELK_REGISTER_TYPE_UD;
3682       bld.MOV(dest, fetch_render_target_array_index(bld));
3683       break;
3684 
3685    case nir_intrinsic_is_helper_invocation:
3686       emit_is_helper_invocation(ntb, dest);
3687       break;
3688 
3689    case nir_intrinsic_load_helper_invocation:
3690    case nir_intrinsic_load_sample_mask_in:
3691    case nir_intrinsic_load_sample_id:
3692    case nir_intrinsic_load_frag_shading_rate: {
3693       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3694       elk_fs_reg val = ntb.system_values[sv];
3695       assert(val.file != BAD_FILE);
3696       dest.type = val.type;
3697       bld.MOV(dest, val);
3698       break;
3699    }
3700 
3701    case nir_intrinsic_store_output: {
3702       const elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
3703       const unsigned store_offset = nir_src_as_uint(instr->src[1]);
3704       const unsigned location = nir_intrinsic_base(instr) +
3705          SET_FIELD(store_offset, ELK_NIR_FRAG_OUTPUT_LOCATION);
3706       const elk_fs_reg new_dest = retype(alloc_frag_output(ntb, location),
3707                                      src.type);
3708 
3709       for (unsigned j = 0; j < instr->num_components; j++)
3710          bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
3711                  offset(src, bld, j));
3712 
3713       break;
3714    }
3715 
3716    case nir_intrinsic_load_output: {
3717       const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
3718                                    ELK_NIR_FRAG_OUTPUT_LOCATION);
3719       assert(l >= FRAG_RESULT_DATA0);
3720       const unsigned load_offset = nir_src_as_uint(instr->src[0]);
3721       const unsigned target = l - FRAG_RESULT_DATA0 + load_offset;
3722       const elk_fs_reg tmp = bld.vgrf(dest.type, 4);
3723 
3724       assert(!reinterpret_cast<const elk_wm_prog_key *>(s.key)->coherent_fb_fetch);
3725       emit_non_coherent_fb_read(ntb, bld, tmp, target);
3726 
3727       for (unsigned j = 0; j < instr->num_components; j++) {
3728          bld.MOV(offset(dest, bld, j),
3729                  offset(tmp, bld, nir_intrinsic_component(instr) + j));
3730       }
3731 
3732       break;
3733    }
3734 
3735    case nir_intrinsic_demote:
3736    case nir_intrinsic_terminate:
3737    case nir_intrinsic_demote_if:
3738    case nir_intrinsic_terminate_if: {
3739       /* We track our discarded pixels in f0.1/f1.0.  By predicating on it, we
3740        * can update just the flag bits that aren't yet discarded.  If there's
3741        * no condition, we emit a CMP of g0 != g0, so all currently executing
3742        * channels will get turned off.
3743        */
3744       elk_fs_inst *cmp = NULL;
3745       if (instr->intrinsic == nir_intrinsic_demote_if ||
3746           instr->intrinsic == nir_intrinsic_terminate_if) {
3747          nir_alu_instr *alu = nir_src_as_alu_instr(instr->src[0]);
3748 
3749          if (alu != NULL &&
3750              alu->op != nir_op_bcsel &&
3751              (devinfo->ver > 5 ||
3752               (alu->instr.pass_flags & ELK_NIR_BOOLEAN_MASK) != ELK_NIR_BOOLEAN_NEEDS_RESOLVE ||
3753               alu->op == nir_op_fneu32 || alu->op == nir_op_feq32 ||
3754               alu->op == nir_op_flt32 || alu->op == nir_op_fge32 ||
3755               alu->op == nir_op_ine32 || alu->op == nir_op_ieq32 ||
3756               alu->op == nir_op_ilt32 || alu->op == nir_op_ige32 ||
3757               alu->op == nir_op_ult32 || alu->op == nir_op_uge32)) {
3758             /* Re-emit the instruction that generated the Boolean value, but
3759              * do not store it.  Since this instruction will be conditional,
3760              * other instructions that want to use the real Boolean value may
3761              * get garbage.  This was a problem for piglit's fs-discard-exit-2
3762              * test.
3763              *
3764              * Ideally we'd detect that the instruction cannot have a
3765              * conditional modifier before emitting the instructions.  Alas,
3766              * that is nigh impossible.  Instead, we're going to assume the
3767              * instruction (or last instruction) generated can have a
3768              * conditional modifier.  If it cannot, fallback to the old-style
3769              * compare, and hope dead code elimination will clean up the
3770              * extra instructions generated.
3771              */
3772             fs_nir_emit_alu(ntb, alu, false);
3773 
3774             cmp = (elk_fs_inst *) s.instructions.get_tail();
3775             if (cmp->conditional_mod == ELK_CONDITIONAL_NONE) {
3776                if (cmp->can_do_cmod())
3777                   cmp->conditional_mod = ELK_CONDITIONAL_Z;
3778                else
3779                   cmp = NULL;
3780             } else {
3781                /* The old sequence that would have been generated is,
3782                 * basically, bool_result == false.  This is equivalent to
3783                 * !bool_result, so negate the old modifier.
3784                 *
3785                 * Unfortunately, we can't do this to most float comparisons
3786                 * because of NaN, so we'll have to fallback to the old-style
3787                 * compare.
3788                 *
3789                 * For example, this code (after negation):
3790                 *    (+f1.0) cmp.ge.f1.0(8) null<1>F g30<8,8,1>F     0x0F
3791                 * will provide different results from this:
3792                 *    cmp.l.f0.0(8)   g31<1>F         g30<1,1,0>F     0x0F
3793                 *    (+f1.0) cmp.z.f1.0(8) null<1>D  g31<8,8,1>D     0D
3794                 * because both (NaN >= 0) == false and (NaN < 0) == false.
3795                 *
3796                 * It will still work for == and != though, because
3797                 * (NaN == x) == false and (NaN != x) == true.
3798                 */
3799                if (elk_type_is_float(cmp->src[0].type) &&
3800                    cmp->conditional_mod != ELK_CONDITIONAL_EQ &&
3801                    cmp->conditional_mod != ELK_CONDITIONAL_NEQ) {
3802                   cmp = NULL;
3803                } else {
3804                   cmp->conditional_mod = elk_negate_cmod(cmp->conditional_mod);
3805                }
3806             }
3807          }
3808 
3809          if (cmp == NULL) {
3810             cmp = bld.CMP(bld.null_reg_f(), get_nir_src(ntb, instr->src[0]),
3811                           elk_imm_d(0), ELK_CONDITIONAL_Z);
3812          }
3813       } else {
3814          elk_fs_reg some_reg = elk_fs_reg(retype(elk_vec8_grf(0, 0),
3815                                        ELK_REGISTER_TYPE_UW));
3816          cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, ELK_CONDITIONAL_NZ);
3817       }
3818 
3819       cmp->predicate = ELK_PREDICATE_NORMAL;
3820       cmp->flag_subreg = sample_mask_flag_subreg(s);
3821 
3822       elk_fs_inst *jump = bld.emit(ELK_OPCODE_HALT);
3823       jump->flag_subreg = sample_mask_flag_subreg(s);
3824       jump->predicate_inverse = true;
3825 
3826       if (instr->intrinsic == nir_intrinsic_terminate ||
3827           instr->intrinsic == nir_intrinsic_terminate_if) {
3828          jump->predicate = ELK_PREDICATE_NORMAL;
3829       } else {
3830          /* Only jump when the whole quad is demoted.  For historical
3831           * reasons this is also used for discard.
3832           */
3833          jump->predicate = ELK_PREDICATE_ALIGN1_ANY4H;
3834       }
3835 
3836       if (devinfo->ver < 7)
3837          s.limit_dispatch_width(
3838             16, "Fragment discard/demote not implemented in SIMD32 mode.\n");
3839       break;
3840    }
3841 
3842    case nir_intrinsic_load_input:
3843    case nir_intrinsic_load_per_primitive_input: {
3844       /* In Fragment Shaders load_input is used either for flat inputs or
3845        * per-primitive inputs.
3846        */
3847       assert(instr->def.bit_size == 32);
3848       unsigned base = nir_intrinsic_base(instr);
3849       unsigned comp = nir_intrinsic_component(instr);
3850       unsigned num_components = instr->num_components;
3851 
3852       /* Special case fields in the VUE header */
3853       if (base == VARYING_SLOT_LAYER)
3854          comp = 1;
3855       else if (base == VARYING_SLOT_VIEWPORT)
3856          comp = 2;
3857 
3858       if (BITFIELD64_BIT(base) & s.nir->info.per_primitive_inputs) {
3859          assert(base != VARYING_SLOT_PRIMITIVE_INDICES);
3860          for (unsigned int i = 0; i < num_components; i++) {
3861             bld.MOV(offset(dest, bld, i),
3862                     retype(s.per_primitive_reg(bld, base, comp + i), dest.type));
3863          }
3864       } else {
3865          const unsigned k = 3;
3866          for (unsigned int i = 0; i < num_components; i++) {
3867             bld.MOV(offset(dest, bld, i),
3868                     retype(s.interp_reg(bld, base, comp + i, k), dest.type));
3869          }
3870       }
3871       break;
3872    }
3873 
3874    case nir_intrinsic_load_fs_input_interp_deltas: {
3875       assert(s.stage == MESA_SHADER_FRAGMENT);
3876       assert(nir_src_as_uint(instr->src[0]) == 0);
3877       const unsigned base = nir_intrinsic_base(instr);
3878       const unsigned comp = nir_intrinsic_component(instr);
3879       dest.type = ELK_REGISTER_TYPE_F;
3880 
3881       bld.MOV(offset(dest, bld, 0), s.interp_reg(bld, base, comp, 3));
3882       bld.MOV(offset(dest, bld, 1), s.interp_reg(bld, base, comp, 1));
3883       bld.MOV(offset(dest, bld, 2), s.interp_reg(bld, base, comp, 0));
3884 
3885       break;
3886    }
3887 
3888    case nir_intrinsic_load_barycentric_pixel:
3889    case nir_intrinsic_load_barycentric_centroid:
3890    case nir_intrinsic_load_barycentric_sample: {
3891       /* Use the delta_xy values computed from the payload */
3892       enum elk_barycentric_mode bary = elk_barycentric_mode(instr);
3893       const elk_fs_reg srcs[] = { offset(s.delta_xy[bary], bld, 0),
3894                               offset(s.delta_xy[bary], bld, 1) };
3895       bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
3896       break;
3897    }
3898 
3899    case nir_intrinsic_load_barycentric_at_sample: {
3900       const glsl_interp_mode interpolation =
3901          (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3902 
3903       elk_fs_reg msg_data;
3904       if (nir_src_is_const(instr->src[0])) {
3905          msg_data = elk_imm_ud(nir_src_as_uint(instr->src[0]) << 4);
3906       } else {
3907          const elk_fs_reg sample_src = retype(get_nir_src(ntb, instr->src[0]),
3908                                           ELK_REGISTER_TYPE_UD);
3909          const elk_fs_reg sample_id = bld.emit_uniformize(sample_src);
3910          msg_data = component(bld.group(8, 0).vgrf(ELK_REGISTER_TYPE_UD), 0);
3911          bld.exec_all().group(1, 0).SHL(msg_data, sample_id, elk_imm_ud(4u));
3912       }
3913 
3914       elk_fs_reg flag_reg;
3915       struct elk_wm_prog_key *wm_prog_key = (struct elk_wm_prog_key *) s.key;
3916       if (wm_prog_key->multisample_fbo == ELK_SOMETIMES) {
3917          struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(s.prog_data);
3918 
3919          check_dynamic_msaa_flag(bld.exec_all().group(8, 0),
3920                                  wm_prog_data,
3921                                  INTEL_MSAA_FLAG_MULTISAMPLE_FBO);
3922          flag_reg = elk_flag_reg(0, 0);
3923       }
3924 
3925       emit_pixel_interpolater_send(bld,
3926                                    ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3927                                    dest,
3928                                    elk_fs_reg(), /* src */
3929                                    msg_data,
3930                                    flag_reg,
3931                                    interpolation);
3932       break;
3933    }
3934 
3935    case nir_intrinsic_load_barycentric_at_offset: {
3936       const glsl_interp_mode interpolation =
3937          (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3938 
3939       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3940 
3941       if (const_offset) {
3942          assert(nir_src_bit_size(instr->src[0]) == 32);
3943          unsigned off_x = const_offset[0].u32 & 0xf;
3944          unsigned off_y = const_offset[1].u32 & 0xf;
3945 
3946          emit_pixel_interpolater_send(bld,
3947                                       ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
3948                                       dest,
3949                                       elk_fs_reg(), /* src */
3950                                       elk_imm_ud(off_x | (off_y << 4)),
3951                                       elk_fs_reg(), /* flag_reg */
3952                                       interpolation);
3953       } else {
3954          elk_fs_reg src = retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_D);
3955          const enum elk_opcode opcode = ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
3956          emit_pixel_interpolater_send(bld,
3957                                       opcode,
3958                                       dest,
3959                                       src,
3960                                       elk_imm_ud(0u),
3961                                       elk_fs_reg(), /* flag_reg */
3962                                       interpolation);
3963       }
3964       break;
3965    }
3966 
3967    case nir_intrinsic_load_frag_coord:
3968       emit_fragcoord_interpolation(ntb, dest);
3969       break;
3970 
3971    case nir_intrinsic_load_interpolated_input: {
3972       assert(instr->src[0].ssa &&
3973              instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
3974       nir_intrinsic_instr *bary_intrinsic =
3975          nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3976       nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
3977       enum glsl_interp_mode interp_mode =
3978          (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);
3979       elk_fs_reg dst_xy;
3980 
3981       if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
3982           bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
3983          /* Use the result of the PI message. */
3984          dst_xy = retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_F);
3985       } else {
3986          /* Use the delta_xy values computed from the payload */
3987          enum elk_barycentric_mode bary = elk_barycentric_mode(bary_intrinsic);
3988          dst_xy = s.delta_xy[bary];
3989       }
3990 
3991       for (unsigned int i = 0; i < instr->num_components; i++) {
3992          elk_fs_reg interp =
3993             s.interp_reg(bld, nir_intrinsic_base(instr),
3994                          nir_intrinsic_component(instr) + i, 0);
3995          interp.type = ELK_REGISTER_TYPE_F;
3996          dest.type = ELK_REGISTER_TYPE_F;
3997 
3998          if (devinfo->ver < 6 && interp_mode == INTERP_MODE_SMOOTH) {
3999             elk_fs_reg tmp = s.vgrf(glsl_float_type());
4000             bld.emit(ELK_FS_OPCODE_LINTERP, tmp, dst_xy, interp);
4001             bld.MUL(offset(dest, bld, i), tmp, s.pixel_w);
4002          } else {
4003             bld.emit(ELK_FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
4004          }
4005       }
4006       break;
4007    }
4008 
4009    default:
4010       fs_nir_emit_intrinsic(ntb, bld, instr);
4011       break;
4012    }
4013 }
4014 
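/* Handle the compute-shader specific NIR intrinsics: workgroup barriers,
 * workgroup/subgroup IDs, and shared (SLM) loads, stores and atomics.
 * Anything not handled here goes to the generic fs_nir_emit_intrinsic().
 */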
4015 static void
4016 fs_nir_emit_cs_intrinsic(nir_to_elk_state &ntb,
4017                          nir_intrinsic_instr *instr)
4018 {
4019    const intel_device_info *devinfo = ntb.devinfo;
4020    const fs_builder &bld = ntb.bld;
4021    elk_fs_visitor &s = ntb.s;
4022 
4023    assert(gl_shader_stage_uses_workgroup(s.stage));
4024    struct elk_cs_prog_data *cs_prog_data = elk_cs_prog_data(s.prog_data);
4025 
4026    elk_fs_reg dest;
4027    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4028       dest = get_nir_def(ntb, instr->def);
4029 
4030    switch (instr->intrinsic) {
4031    case nir_intrinsic_barrier:
4032       if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
4033          fs_nir_emit_intrinsic(ntb, bld, instr);
4034       if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
4035          /* The whole workgroup fits in a single HW thread, so all the
4036           * invocations are already executed lock-step.  Instead of an actual
4037           * barrier just emit a scheduling fence, that will generate no code.
4038           */
4039          if (!s.nir->info.workgroup_size_variable &&
4040              s.workgroup_size() <= s.dispatch_width) {
4041             bld.exec_all().group(1, 0).emit(ELK_FS_OPCODE_SCHEDULING_FENCE);
4042             break;
4043          }
4044 
4045          emit_barrier(ntb);
4046          cs_prog_data->uses_barrier = true;
4047       }
4048       break;
4049 
4050    case nir_intrinsic_load_subgroup_id:
4051       s.cs_payload().load_subgroup_id(bld, dest);
4052       break;
4053 
4054    case nir_intrinsic_load_workgroup_id: {
4055       elk_fs_reg val = ntb.system_values[SYSTEM_VALUE_WORKGROUP_ID];
4056       assert(val.file != BAD_FILE);
4057       dest.type = val.type;
4058       for (unsigned i = 0; i < 3; i++)
4059          bld.MOV(offset(dest, bld, i), offset(val, bld, i));
4060       break;
4061    }
4062 
4063    case nir_intrinsic_load_num_workgroups: {
4064       assert(instr->def.bit_size == 32);
4065 
4066       cs_prog_data->uses_num_work_groups = true;
4067 
4068       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4069       srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(0);
4070       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
4071       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(3); /* num components */
4072       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = elk_imm_ud(0);
4073       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
4074       elk_fs_inst *inst =
4075          bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4076                   dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4077       inst->size_written = 3 * s.dispatch_width * 4;
4078       break;
4079    }
4080 
4081    case nir_intrinsic_shared_atomic:
4082    case nir_intrinsic_shared_atomic_swap:
4083       fs_nir_emit_surface_atomic(ntb, bld, instr, elk_imm_ud(GFX7_BTI_SLM),
4084                                  false /* bindless */);
4085       break;
4086 
4087    case nir_intrinsic_load_shared: {
4088       assert(devinfo->ver >= 7);
4089 
4090       const unsigned bit_size = instr->def.bit_size;
4091       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4092       srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(GFX7_BTI_SLM);
4093 
4094       elk_fs_reg addr = get_nir_src(ntb, instr->src[0]);
4095       int base = nir_intrinsic_base(instr);
4096       if (base) {
4097          elk_fs_reg addr_off = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
4098          bld.ADD(addr_off, addr, elk_imm_d(base));
4099          srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr_off;
4100       } else {
4101          srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr;
4102       }
4103 
4104       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
4105       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
4106 
4107       /* Make dest unsigned because that's what the temporary will be */
4108       dest.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
4109 
4110       /* Read the vector */
4111       assert(bit_size <= 32);
4112       assert(nir_intrinsic_align(instr) > 0);
4113       if (bit_size == 32 &&
4114           nir_intrinsic_align(instr) >= 4) {
4115          assert(instr->def.num_components <= 4);
4116          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4117          elk_fs_inst *inst =
4118             bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4119                      dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4120          inst->size_written = instr->num_components * s.dispatch_width * 4;
4121       } else {
4122          assert(instr->def.num_components == 1);
4123          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
4124 
4125          elk_fs_reg read_result = bld.vgrf(ELK_REGISTER_TYPE_UD);
4126          bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
4127                   read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
4128          bld.MOV(dest, subscript(read_result, dest.type, 0));
4129       }
4130       break;
4131    }
4132 
4133    case nir_intrinsic_store_shared: {
4134       assert(devinfo->ver >= 7);
4135 
4136       const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4137       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4138       srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(GFX7_BTI_SLM);
4139 
4140       elk_fs_reg addr = get_nir_src(ntb, instr->src[1]);
4141       int base = nir_intrinsic_base(instr);
4142       if (base) {
4143          elk_fs_reg addr_off = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
4144          bld.ADD(addr_off, addr, elk_imm_d(base));
4145          srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr_off;
4146       } else {
4147          srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr;
4148       }
4149 
4150       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
4151       /* No point in masking with the sample mask; here we're handling
4152        * compute intrinsics.
4153        */
4154       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
4155 
4156       elk_fs_reg data = get_nir_src(ntb, instr->src[0]);
4157       data.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
4158 
4159       assert(bit_size <= 32);
4160       assert(nir_intrinsic_write_mask(instr) ==
4161              (1u << instr->num_components) - 1);
4162       assert(nir_intrinsic_align(instr) > 0);
4163       if (bit_size == 32 &&
4164           nir_intrinsic_align(instr) >= 4) {
4165          assert(nir_src_num_components(instr->src[0]) <= 4);
4166          srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4167          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4168          bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
4169                   elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4170       } else {
4171          assert(nir_src_num_components(instr->src[0]) == 1);
4172          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
4173 
4174          srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_UD);
4175          bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
4176 
4177          bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
4178                   elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4179       }
4180       break;
4181    }
4182 
4183    case nir_intrinsic_load_workgroup_size: {
4184       /* Should have been lowered by elk_nir_lower_cs_intrinsics() or
4185        * crocus/iris_setup_uniforms() for the variable group size case.
4186        */
4187       unreachable("Should have been lowered");
4188       break;
4189    }
4190 
4191    default:
4192       fs_nir_emit_intrinsic(ntb, bld, instr);
4193       break;
4194    }
4195 }
4196 
4197 static elk_fs_reg
4198 elk_nir_reduction_op_identity(const fs_builder &bld,
4199                               nir_op op, elk_reg_type type)
4200 {
4201    nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);
4202    switch (type_sz(type)) {
4203    case 1:
4204       if (type == ELK_REGISTER_TYPE_UB) {
4205          return elk_imm_uw(value.u8);
4206       } else {
4207          assert(type == ELK_REGISTER_TYPE_B);
4208          return elk_imm_w(value.i8);
4209       }
4210    case 2:
4211       return retype(elk_imm_uw(value.u16), type);
4212    case 4:
4213       return retype(elk_imm_ud(value.u32), type);
4214    case 8:
4215       if (type == ELK_REGISTER_TYPE_DF)
4216          return elk_setup_imm_df(bld, value.f64);
4217       else
4218          return retype(elk_imm_u64(value.u64), type);
4219    default:
4220       unreachable("Invalid type size");
4221    }
4222 }
4223 
4224 static elk_opcode
4225 elk_op_for_nir_reduction_op(nir_op op)
4226 {
4227    switch (op) {
4228    case nir_op_iadd: return ELK_OPCODE_ADD;
4229    case nir_op_fadd: return ELK_OPCODE_ADD;
4230    case nir_op_imul: return ELK_OPCODE_MUL;
4231    case nir_op_fmul: return ELK_OPCODE_MUL;
4232    case nir_op_imin: return ELK_OPCODE_SEL;
4233    case nir_op_umin: return ELK_OPCODE_SEL;
4234    case nir_op_fmin: return ELK_OPCODE_SEL;
4235    case nir_op_imax: return ELK_OPCODE_SEL;
4236    case nir_op_umax: return ELK_OPCODE_SEL;
4237    case nir_op_fmax: return ELK_OPCODE_SEL;
4238    case nir_op_iand: return ELK_OPCODE_AND;
4239    case nir_op_ior:  return ELK_OPCODE_OR;
4240    case nir_op_ixor: return ELK_OPCODE_XOR;
4241    default:
4242       unreachable("Invalid reduction operation");
4243    }
4244 }
4245 
4246 static elk_conditional_mod
4247 elk_cond_mod_for_nir_reduction_op(nir_op op)
4248 {
4249    switch (op) {
4250    case nir_op_iadd: return ELK_CONDITIONAL_NONE;
4251    case nir_op_fadd: return ELK_CONDITIONAL_NONE;
4252    case nir_op_imul: return ELK_CONDITIONAL_NONE;
4253    case nir_op_fmul: return ELK_CONDITIONAL_NONE;
4254    case nir_op_imin: return ELK_CONDITIONAL_L;
4255    case nir_op_umin: return ELK_CONDITIONAL_L;
4256    case nir_op_fmin: return ELK_CONDITIONAL_L;
4257    case nir_op_imax: return ELK_CONDITIONAL_GE;
4258    case nir_op_umax: return ELK_CONDITIONAL_GE;
4259    case nir_op_fmax: return ELK_CONDITIONAL_GE;
4260    case nir_op_iand: return ELK_CONDITIONAL_NONE;
4261    case nir_op_ior:  return ELK_CONDITIONAL_NONE;
4262    case nir_op_ixor: return ELK_CONDITIONAL_NONE;
4263    default:
4264       unreachable("Invalid reduction operation");
4265    }
4266 }
4267 
4268 struct rebuild_resource {
4269    unsigned idx;
4270    std::vector<nir_def *> array;
4271 };
4272 
4273 static bool
4274 add_rebuild_src(nir_src *src, void *state)
4275 {
4276    struct rebuild_resource *res = (struct rebuild_resource *) state;
4277 
4278    for (nir_def *def : res->array) {
4279       if (def == src->ssa)
4280          return true;
4281    }
4282 
4283    nir_foreach_src(src->ssa->parent_instr, add_rebuild_src, state);
4284    res->array.push_back(src->ssa);
4285    return true;
4286 }
4287 
4288 static elk_fs_reg
4289 try_rebuild_resource(nir_to_elk_state &ntb, const elk::fs_builder &bld, nir_def *resource_def)
4290 {
4291    /* Create a builder at the location of the resource_intel intrinsic */
4292    fs_builder ubld8 = bld.exec_all().group(8, 0);
4293 
4294    struct rebuild_resource resources = {};
4295    resources.idx = 0;
4296 
4297    if (!nir_foreach_src(resource_def->parent_instr,
4298                         add_rebuild_src, &resources))
4299       return elk_fs_reg();
4300    resources.array.push_back(resource_def);
4301 
4302    if (resources.array.size() == 1) {
4303       nir_def *def = resources.array[0];
4304 
4305       if (def->parent_instr->type == nir_instr_type_load_const) {
4306          nir_load_const_instr *load_const =
4307             nir_instr_as_load_const(def->parent_instr);
4308          return elk_imm_ud(load_const->value[0].i32);
4309       } else {
4310          assert(def->parent_instr->type == nir_instr_type_intrinsic &&
4311                 (nir_instr_as_intrinsic(def->parent_instr)->intrinsic ==
4312                  nir_intrinsic_load_uniform));
4313          nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(def->parent_instr);
4314          unsigned base_offset = nir_intrinsic_base(intrin);
4315          unsigned load_offset = nir_src_as_uint(intrin->src[0]);
4316          elk_fs_reg src(UNIFORM, base_offset / 4, ELK_REGISTER_TYPE_UD);
4317          src.offset = load_offset + base_offset % 4;
4318          return src;
4319       }
4320    }
4321 
4322    for (unsigned i = 0; i < resources.array.size(); i++) {
4323       nir_def *def = resources.array[i];
4324 
4325       nir_instr *instr = def->parent_instr;
4326       switch (instr->type) {
4327       case nir_instr_type_load_const: {
4328          nir_load_const_instr *load_const =
4329             nir_instr_as_load_const(instr);
4330          elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4331          ntb.resource_insts[def->index] =
4332             ubld8.MOV(dst, elk_imm_ud(load_const->value[0].i32));
4333          break;
4334       }
4335 
4336       case nir_instr_type_alu: {
4337          nir_alu_instr *alu = nir_instr_as_alu(instr);
4338 
4339          if (nir_op_infos[alu->op].num_inputs == 2) {
4340             if (alu->src[0].swizzle[0] != 0 ||
4341                 alu->src[1].swizzle[0] != 0)
4342                break;
4343          } else if (nir_op_infos[alu->op].num_inputs == 3) {
4344             if (alu->src[0].swizzle[0] != 0 ||
4345                 alu->src[1].swizzle[0] != 0 ||
4346                 alu->src[2].swizzle[0] != 0)
4347                break;
4348          } else {
4349             /* Not supported ALU input count */
4350             break;
4351          }
4352 
4353          switch (alu->op) {
4354          case nir_op_iadd: {
4355             elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4356             elk_fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst;
4357             elk_fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst;
4358             assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
4359             assert(src0.type == ELK_REGISTER_TYPE_UD);
4360             ntb.resource_insts[def->index] =
4361                ubld8.ADD(dst,
4362                          src0.file != IMM ? src0 : src1,
4363                          src0.file != IMM ? src1 : src0);
4364             break;
4365          }
4366          case nir_op_ushr: {
4367             elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4368             elk_fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst;
4369             elk_fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst;
4370             assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
4371             assert(src0.type == ELK_REGISTER_TYPE_UD);
4372             ntb.resource_insts[def->index] = ubld8.SHR(dst, src0, src1);
4373             break;
4374          }
4375          case nir_op_ishl: {
4376             elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4377             elk_fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst;
4378             elk_fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst;
4379             assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
4380             assert(src0.type == ELK_REGISTER_TYPE_UD);
4381             ntb.resource_insts[def->index] = ubld8.SHL(dst, src0, src1);
4382             break;
4383          }
4384          case nir_op_mov: {
4385             break;
4386          }
4387          default:
4388             break;
4389          }
4390          break;
4391       }
4392 
4393       case nir_instr_type_intrinsic: {
4394          nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
4395          switch (intrin->intrinsic) {
4396          case nir_intrinsic_resource_intel:
4397             ntb.resource_insts[def->index] =
4398                ntb.resource_insts[intrin->src[1].ssa->index];
4399             break;
4400 
4401          case nir_intrinsic_load_uniform: {
4402             if (!nir_src_is_const(intrin->src[0]))
4403                break;
4404 
4405             unsigned base_offset = nir_intrinsic_base(intrin);
4406             unsigned load_offset = nir_src_as_uint(intrin->src[0]);
4407             elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4408             elk_fs_reg src(UNIFORM, base_offset / 4, ELK_REGISTER_TYPE_UD);
4409             src.offset = load_offset + base_offset % 4;
4410             ntb.resource_insts[def->index] = ubld8.MOV(dst, src);
4411             break;
4412          }
4413 
4414          default:
4415             break;
4416          }
4417          break;
4418       }
4419 
4420       default:
4421          break;
4422       }
4423 
4424       if (ntb.resource_insts[def->index] == NULL)
4425          return elk_fs_reg();
4426    }
4427 
4428    assert(ntb.resource_insts[resource_def->index] != NULL);
4429    return component(ntb.resource_insts[resource_def->index]->dst, 0);
4430 }
4431 
4432 static elk_fs_reg
4433 get_nir_image_intrinsic_image(nir_to_elk_state &ntb, const elk::fs_builder &bld,
4434                               nir_intrinsic_instr *instr)
4435 {
4436    if (is_resource_src(instr->src[0])) {
4437       elk_fs_reg surf_index = get_resource_nir_src(ntb, instr->src[0]);
4438       if (surf_index.file != BAD_FILE)
4439          return surf_index;
4440    }
4441 
4442    elk_fs_reg image = retype(get_nir_src_imm(ntb, instr->src[0]), ELK_REGISTER_TYPE_UD);
4443    elk_fs_reg surf_index = image;
4444 
4445    return bld.emit_uniformize(surf_index);
4446 }
4447 
4448 static elk_fs_reg
4449 get_nir_buffer_intrinsic_index(nir_to_elk_state &ntb, const elk::fs_builder &bld,
4450                                nir_intrinsic_instr *instr)
4451 {
4452    /* SSBO stores are weird in that their index is in src[1] */
4453    const bool is_store =
4454       instr->intrinsic == nir_intrinsic_store_ssbo ||
4455       instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
4456    nir_src src = is_store ? instr->src[1] : instr->src[0];
4457 
4458    if (nir_src_is_const(src)) {
4459       return elk_imm_ud(nir_src_as_uint(src));
4460    } else if (is_resource_src(src)) {
4461       elk_fs_reg surf_index = get_resource_nir_src(ntb, src);
4462       if (surf_index.file != BAD_FILE)
4463          return surf_index;
4464    }
4465    return bld.emit_uniformize(get_nir_src(ntb, src));
4466 }
4467 
4468 /**
4469  * The offsets we get from NIR act as if each SIMD channel has its own blob
4470  * of contiguous space.  However, if we actually place each SIMD channel in
4471  * its own space, we end up with terrible cache performance because each SIMD
4472  * channel accesses a different cache line even when they're all accessing the
4473  * same byte offset.  To deal with this problem, we swizzle the address using
4474  * a simple algorithm which ensures that any time a SIMD message reads or
4475  * writes the same address, it's all in the same cache line.  We have to keep
4476  * the bottom two bits fixed so that we can read/write up to a dword at a time
4477  * and the individual element is contiguous.  We do this by splitting the
4478  * address as follows:
4479  *
4480  *    31                             4-6           2          0
4481  *    +-------------------------------+------------+----------+
4482  *    |        Hi address bits        | chan index | addr low |
4483  *    +-------------------------------+------------+----------+
4484  *
4485  * In other words, the bottom two address bits stay, and the top 30 get
4486  * shifted up so that we can stick the SIMD channel index in the middle.  This
4487  * way, we can access 8, 16, or 32-bit elements and, when accessing a 32-bit
4488  * element at the same logical offset, the scratch read/write instruction acts
4489  * on contiguous elements and we get good cache locality.
4490  */
4491 static elk_fs_reg
4492 swizzle_nir_scratch_addr(nir_to_elk_state &ntb,
4493                          const elk::fs_builder &bld,
4494                          const elk_fs_reg &nir_addr,
4495                          bool in_dwords)
4496 {
4497    elk_fs_visitor &s = ntb.s;
4498 
4499    const elk_fs_reg &chan_index =
4500       ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
4501    const unsigned chan_index_bits = ffs(s.dispatch_width) - 1;
4502 
4503    elk_fs_reg addr = bld.vgrf(ELK_REGISTER_TYPE_UD);
4504    if (in_dwords) {
4505       /* In this case, we know the address is aligned to a DWORD and we want
4506        * the final address in DWORDs.
4507        */
4508       bld.SHL(addr, nir_addr, elk_imm_ud(chan_index_bits - 2));
4509       bld.OR(addr, addr, chan_index);
4510    } else {
4511       /* This case is substantially more annoying because we have to pay
4512        * attention to those pesky two bottom bits.
4513        */
4514       elk_fs_reg addr_hi = bld.vgrf(ELK_REGISTER_TYPE_UD);
4515       bld.AND(addr_hi, nir_addr, elk_imm_ud(~0x3u));
4516       bld.SHL(addr_hi, addr_hi, elk_imm_ud(chan_index_bits));
4517       elk_fs_reg chan_addr = bld.vgrf(ELK_REGISTER_TYPE_UD);
4518       bld.SHL(chan_addr, chan_index, elk_imm_ud(2));
4519       bld.AND(addr, nir_addr, elk_imm_ud(0x3u));
4520       bld.OR(addr, addr, addr_hi);
4521       bld.OR(addr, addr, chan_addr);
4522    }
4523    return addr;
4524 }
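/* Editor's sketch (illustrative only, not called by the backend): the
 * non-dword path above, restated as plain integer arithmetic.  The helper
 * name and parameters below are hypothetical; they simply mirror the
 * AND/SHL/OR sequence emitted by swizzle_nir_scratch_addr().
 */
static inline uint32_t
example_swizzle_scratch_addr(uint32_t nir_addr, uint32_t chan_index,
                             unsigned chan_index_bits)
{
   const uint32_t addr_low = nir_addr & 0x3u;                       /* bottom two bits stay put */
   const uint32_t addr_hi  = (nir_addr & ~0x3u) << chan_index_bits; /* shift the rest up */
   const uint32_t chan     = chan_index << 2;                       /* channel index in the gap */
   return addr_hi | chan | addr_low;
}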
4525 
4526 static unsigned
4527 choose_oword_block_size_dwords(const struct intel_device_info *devinfo,
4528                                unsigned dwords)
4529 {
4530    unsigned block;
4531    if (devinfo->has_lsc && dwords >= 64) {
4532       block = 64;
4533    } else if (dwords >= 32) {
4534       block = 32;
4535    } else if (dwords >= 16) {
4536       block = 16;
4537    } else {
4538       block = 8;
4539    }
4540    assert(block <= dwords);
4541    return block;
4542 }
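/* Editor's sketch (illustrative only, not called by the backend): callers
 * loop over choose_oword_block_size_dwords(), loading the largest legal
 * OWord block each iteration.  For example, 40 block-aligned dwords on a
 * non-LSC part decompose into a 32-dword block followed by an 8-dword
 * block.  The helper name below is hypothetical.
 */
static inline unsigned
example_count_oword_blocks(const struct intel_device_info *devinfo,
                           unsigned total_dwords)
{
   unsigned loaded = 0, num_blocks = 0;
   while (loaded < total_dwords) {
      loaded += choose_oword_block_size_dwords(devinfo, total_dwords - loaded);
      num_blocks++;
   }
   return num_blocks;
}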
4543 
4544 static void
4545 increment_a64_address(const fs_builder &bld, elk_fs_reg address, uint32_t v)
4546 {
4547    if (bld.shader->devinfo->has_64bit_int) {
4548       bld.ADD(address, address, elk_imm_ud(v));
4549    } else {
4550       elk_fs_reg low = retype(address, ELK_REGISTER_TYPE_UD);
4551       elk_fs_reg high = offset(low, bld, 1);
4552 
4553       /* Add low and, if that overflows, add the carry to high. */
4554       bld.ADD(low, low, elk_imm_ud(v))->conditional_mod = ELK_CONDITIONAL_O;
4555       bld.ADD(high, high, elk_imm_ud(0x1))->predicate = ELK_PREDICATE_NORMAL;
4556    }
4557 }
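/* Editor's sketch (illustrative only, not called by the backend): a scalar
 * model of the no-64-bit-int path above.  The overflow conditional modifier
 * on the low ADD plus the predicated ADD on the high dword correspond to
 * the unsigned wrap-around check below; the helper name is hypothetical.
 */
static inline void
example_increment_a64_address(uint32_t *low, uint32_t *high, uint32_t v)
{
   const uint32_t old_low = *low;
   *low = old_low + v;     /* add to the low 32 bits of the address */
   if (*low < old_low)     /* unsigned wrap means a carry was produced */
      *high += 1;          /* propagate the carry into the high dword */
}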
4558 
4559 static elk_fs_reg
4560 emit_fence(const fs_builder &bld, enum elk_opcode opcode,
4561            uint8_t sfid, uint32_t desc,
4562            bool commit_enable, uint8_t bti)
4563 {
4564    assert(opcode == ELK_SHADER_OPCODE_INTERLOCK ||
4565           opcode == ELK_SHADER_OPCODE_MEMORY_FENCE);
4566 
4567    elk_fs_reg dst = bld.vgrf(ELK_REGISTER_TYPE_UD);
4568    elk_fs_inst *fence = bld.emit(opcode, dst, elk_vec8_grf(0, 0),
4569                              elk_imm_ud(commit_enable),
4570                              elk_imm_ud(bti));
4571    fence->sfid = sfid;
4572    fence->desc = desc;
4573 
4574    return dst;
4575 }
4576 
4577 /**
4578  * Create a MOV to read the timestamp register.
4579  */
4580 static elk_fs_reg
4581 get_timestamp(const fs_builder &bld)
4582 {
4583    elk_fs_visitor &s = *bld.shader;
4584    const intel_device_info *devinfo = s.devinfo;
4585 
4586    assert(devinfo->ver >= 7);
4587 
4588    elk_fs_reg ts = elk_fs_reg(retype(elk_vec4_reg(ELK_ARCHITECTURE_REGISTER_FILE,
4589                                           ELK_ARF_TIMESTAMP,
4590                                           0),
4591                              ELK_REGISTER_TYPE_UD));
4592 
4593    elk_fs_reg dst = elk_fs_reg(VGRF, s.alloc.allocate(1), ELK_REGISTER_TYPE_UD);
4594 
4595    /* We want to read the 3 fields we care about even if they're not enabled in
4596     * the dispatch.
4597     */
4598    bld.group(4, 0).exec_all().MOV(dst, ts);
4599 
4600    return dst;
4601 }
4602 
4603 static void
4604 fs_nir_emit_intrinsic(nir_to_elk_state &ntb,
4605                       const fs_builder &bld, nir_intrinsic_instr *instr)
4606 {
4607    const intel_device_info *devinfo = ntb.devinfo;
4608    elk_fs_visitor &s = ntb.s;
4609 
4610    /* We handle this as a special case */
4611    if (instr->intrinsic == nir_intrinsic_decl_reg) {
4612       assert(nir_intrinsic_num_array_elems(instr) == 0);
4613       unsigned bit_size = nir_intrinsic_bit_size(instr);
4614       unsigned num_components = nir_intrinsic_num_components(instr);
4615       const elk_reg_type reg_type =
4616          elk_reg_type_from_bit_size(bit_size, bit_size == 8 ?
4617                                               ELK_REGISTER_TYPE_D :
4618                                               ELK_REGISTER_TYPE_F);
4619 
4620       /* Re-use the destination's slot in the table for the register */
4621       ntb.ssa_values[instr->def.index] =
4622          bld.vgrf(reg_type, num_components);
4623       return;
4624    }
4625 
4626    elk_fs_reg dest;
4627    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4628       dest = get_nir_def(ntb, instr->def);
4629 
4630    switch (instr->intrinsic) {
4631    case nir_intrinsic_resource_intel:
4632       ntb.ssa_bind_infos[instr->def.index].valid = true;
4633       ntb.ssa_bind_infos[instr->def.index].bindless =
4634          (nir_intrinsic_resource_access_intel(instr) &
4635           nir_resource_intel_bindless) != 0;
4636       ntb.ssa_bind_infos[instr->def.index].block =
4637          nir_intrinsic_resource_block_intel(instr);
4638       ntb.ssa_bind_infos[instr->def.index].set =
4639          nir_intrinsic_desc_set(instr);
4640       ntb.ssa_bind_infos[instr->def.index].binding =
4641          nir_intrinsic_binding(instr);
4642 
4643       if (nir_intrinsic_resource_access_intel(instr) &
4644            nir_resource_intel_non_uniform) {
4645          ntb.resource_values[instr->def.index] = elk_fs_reg();
4646       } else {
4647          ntb.resource_values[instr->def.index] =
4648             try_rebuild_resource(ntb, bld, instr->src[1].ssa);
4649       }
4650       ntb.ssa_values[instr->def.index] =
4651          ntb.ssa_values[instr->src[1].ssa->index];
4652       break;
4653 
4654    case nir_intrinsic_load_reg:
4655    case nir_intrinsic_store_reg:
4656       /* Nothing to do for these. */
4657       break;
4658 
4659    case nir_intrinsic_image_load:
4660    case nir_intrinsic_image_store:
4661    case nir_intrinsic_image_atomic:
4662    case nir_intrinsic_image_atomic_swap:
4663    case nir_intrinsic_bindless_image_load:
4664    case nir_intrinsic_bindless_image_store:
4665    case nir_intrinsic_bindless_image_atomic:
4666    case nir_intrinsic_bindless_image_atomic_swap: {
4667       /* Get some metadata from the image intrinsic. */
4668       const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
4669 
4670       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4671 
4672       switch (instr->intrinsic) {
4673       case nir_intrinsic_image_load:
4674       case nir_intrinsic_image_store:
4675       case nir_intrinsic_image_atomic:
4676       case nir_intrinsic_image_atomic_swap:
4677          srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4678             get_nir_image_intrinsic_image(ntb, bld, instr);
4679          break;
4680 
4681       default:
4682          /* Bindless */
4683          srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] =
4684             get_nir_image_intrinsic_image(ntb, bld, instr);
4685          break;
4686       }
4687 
4688       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
4689       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] =
4690          elk_imm_ud(nir_image_intrinsic_coord_components(instr));
4691 
4692       /* Emit an image load, store or atomic op. */
4693       if (instr->intrinsic == nir_intrinsic_image_load ||
4694           instr->intrinsic == nir_intrinsic_bindless_image_load) {
4695          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4696          srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
4697          elk_fs_inst *inst =
4698             bld.emit(ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
4699                      dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4700          inst->size_written = instr->num_components * s.dispatch_width * 4;
4701       } else if (instr->intrinsic == nir_intrinsic_image_store ||
4702                  instr->intrinsic == nir_intrinsic_bindless_image_store) {
4703          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4704          srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(ntb, instr->src[3]);
4705          srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
4706          bld.emit(ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
4707                   elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4708       } else {
4709          unsigned num_srcs = info->num_srcs;
4710          enum elk_lsc_opcode op = elk_lsc_aop_for_nir_intrinsic(instr);
4711          if (op == LSC_OP_ATOMIC_INC || op == LSC_OP_ATOMIC_DEC) {
4712             assert(num_srcs == 4);
4713             num_srcs = 3;
4714          }
4715 
4716          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(op);
4717 
4718          elk_fs_reg data;
4719          if (num_srcs >= 4)
4720             data = get_nir_src(ntb, instr->src[3]);
4721          if (num_srcs >= 5) {
4722             elk_fs_reg tmp = bld.vgrf(data.type, 2);
4723             elk_fs_reg sources[2] = { data, get_nir_src(ntb, instr->src[4]) };
4724             bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
4725             data = tmp;
4726          }
4727          srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4728          srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
4729 
4730          bld.emit(ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
4731                   dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4732       }
4733       break;
4734    }
4735 
4736    case nir_intrinsic_image_size:
4737    case nir_intrinsic_bindless_image_size: {
4738       /* Cube image sizes should have previously been lowered to a 2D array */
4739       assert(nir_intrinsic_image_dim(instr) != GLSL_SAMPLER_DIM_CUBE);
4740 
4741       /* Unlike the [un]typed load and store opcodes, the TXS that this turns
4742        * into will handle the binding table index for us in the generator.
4743        * Incidentally, this means that we can handle bindless with exactly the
4744        * same code.
4745        */
4746       elk_fs_reg image = retype(get_nir_src_imm(ntb, instr->src[0]),
4747                             ELK_REGISTER_TYPE_UD);
4748       image = bld.emit_uniformize(image);
4749 
4750       assert(nir_src_as_uint(instr->src[1]) == 0);
4751 
4752       elk_fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
4753       if (instr->intrinsic == nir_intrinsic_image_size)
4754          srcs[TEX_LOGICAL_SRC_SURFACE] = image;
4755       else
4756          srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image;
4757       srcs[TEX_LOGICAL_SRC_SAMPLER] = elk_imm_d(0);
4758       srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = elk_imm_d(0);
4759       srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = elk_imm_d(0);
4760       srcs[TEX_LOGICAL_SRC_RESIDENCY] = elk_imm_d(0);
4761 
4762       /* Since the image size is always uniform, we can just emit a SIMD8
4763        * query instruction and splat the result out.
4764        */
4765       const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
4766 
4767       elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD, 4);
4768       elk_fs_inst *inst = ubld.emit(ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL,
4769                                 tmp, srcs, ARRAY_SIZE(srcs));
4770       inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
4771 
4772       for (unsigned c = 0; c < instr->def.num_components; ++c) {
4773          bld.MOV(offset(retype(dest, tmp.type), bld, c),
4774                  component(offset(tmp, ubld, c), 0));
4775       }
4776       break;
4777    }
4778 
4779    case nir_intrinsic_image_load_raw_intel: {
4780       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4781       srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4782          get_nir_image_intrinsic_image(ntb, bld, instr);
4783       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
4784       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
4785       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4786       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
4787 
4788       elk_fs_inst *inst =
4789          bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4790                   dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4791       inst->size_written = instr->num_components * s.dispatch_width * 4;
4792       break;
4793    }
4794 
4795    case nir_intrinsic_image_store_raw_intel: {
4796       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4797       srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4798          get_nir_image_intrinsic_image(ntb, bld, instr);
4799       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
4800       srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(ntb, instr->src[2]);
4801       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
4802       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4803       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
4804 
4805       bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
4806                elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4807       break;
4808    }
4809 
4810    case nir_intrinsic_barrier:
4811    case nir_intrinsic_begin_invocation_interlock:
4812    case nir_intrinsic_end_invocation_interlock: {
4813       bool ugm_fence, slm_fence, tgm_fence, urb_fence;
4814       enum elk_opcode opcode = ELK_OPCODE_NOP;
4815 
4816       /* Handling interlock intrinsics here will allow the logic for IVB
4817        * render cache (see below) to be reused.
4818        */
4819 
4820       switch (instr->intrinsic) {
4821       case nir_intrinsic_barrier: {
4822          /* Note we only care about the memory part of the
4823           * barrier.  The execution part will be taken care
4824           * of by the stage specific intrinsic handler functions.
4825           */
4826          nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
4827          ugm_fence = modes & (nir_var_mem_ssbo | nir_var_mem_global);
4828          slm_fence = modes & nir_var_mem_shared;
4829          tgm_fence = modes & nir_var_image;
4830          urb_fence = modes & (nir_var_shader_out | nir_var_mem_task_payload);
4831          if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
4832             opcode = ELK_SHADER_OPCODE_MEMORY_FENCE;
4833          break;
4834       }
4835 
4836       case nir_intrinsic_begin_invocation_interlock:
4837          /* For beginInvocationInterlockARB(), we will generate a memory fence
4838           * but with a different opcode so that generator can pick SENDC
4839           * instead of SEND.
4840           */
4841          assert(s.stage == MESA_SHADER_FRAGMENT);
4842          ugm_fence = tgm_fence = true;
4843          slm_fence = urb_fence = false;
4844          opcode = ELK_SHADER_OPCODE_INTERLOCK;
4845          break;
4846 
4847       case nir_intrinsic_end_invocation_interlock:
4848          /* For endInvocationInterlockARB(), we need to insert a memory fence which
4849           * stalls in the shader until the memory transactions prior to that
4850           * fence are complete.  This ensures that the shader does not end before
4851           * any writes from its critical section have landed.  Otherwise, you can
4852           * end up with a case where the next invocation on that pixel properly
4853           * stalls for previous FS invocation on its pixel to complete but
4854           * doesn't actually wait for the dataport memory transactions from that
4855           * thread to land before submitting its own.
4856           */
4857          assert(s.stage == MESA_SHADER_FRAGMENT);
4858          ugm_fence = tgm_fence = true;
4859          slm_fence = urb_fence = false;
4860          opcode = ELK_SHADER_OPCODE_MEMORY_FENCE;
4861          break;
4862 
4863       default:
4864          unreachable("invalid intrinsic");
4865       }
4866 
4867       if (opcode == ELK_OPCODE_NOP)
4868          break;
4869 
4870       if (s.nir->info.shared_size > 0) {
4871          assert(gl_shader_stage_uses_workgroup(s.stage));
4872       } else {
4873          slm_fence = false;
4874       }
4875 
4876       /* If the workgroup fits in a single HW thread, the messages for SLM are
4877        * processed in-order and the shader itself is already synchronized so
4878        * the memory fence is not necessary.
4879        *
4880        * TODO: Check whether this applies to many HW threads sharing the same Data Port.
4881        */
4882       if (!s.nir->info.workgroup_size_variable &&
4883           slm_fence && s.workgroup_size() <= s.dispatch_width)
4884          slm_fence = false;
4885 
4886       switch (s.stage) {
4887          case MESA_SHADER_TESS_CTRL:
4888             break;
4889          default:
4890             urb_fence = false;
4891             break;
4892       }
4893 
4894       unsigned fence_regs_count = 0;
4895       elk_fs_reg fence_regs[4] = {};
4896 
4897       const fs_builder ubld = bld.group(8, 0);
4898 
4899       /* Prior to Icelake, all these memory accesses are lumped into a single
4900        * cache, except on Ivy Bridge and Bay Trail, where typed messages actually
4901        * go through the render cache.  There, we need both fences because we may
4902        * access storage images as either typed or untyped.
4903        */
4904       const bool render_fence = tgm_fence && devinfo->verx10 == 70;
4905 
4906       const bool commit_enable = render_fence ||
4907          instr->intrinsic == nir_intrinsic_end_invocation_interlock;
4908 
4909       if (tgm_fence || ugm_fence || slm_fence || urb_fence) {
4910          fence_regs[fence_regs_count++] =
4911             emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
4912                        commit_enable, 0 /* BTI */);
4913       }
4914 
4915       if (render_fence) {
4916          fence_regs[fence_regs_count++] =
4917             emit_fence(ubld, opcode, GFX6_SFID_DATAPORT_RENDER_CACHE, 0,
4918                        commit_enable, /* bti */ 0);
4919       }
4920 
4921       assert(fence_regs_count <= ARRAY_SIZE(fence_regs));
4922 
4923       /* There are three cases where we want to insert a stall:
4924        *
4925        *  1. If we're a nir_intrinsic_end_invocation_interlock.  This is
4926        *     required to ensure that the shader EOT doesn't happen until
4927        *     after the fence returns.  Otherwise, we might end up with the
4928        *     next shader invocation for that pixel not respecting our fence
4929        *     because it may happen on a different HW thread.
4930        *
4931        *  2. If we have multiple fences.  This is required to ensure that
4932        *     they all complete and nothing gets weirdly out-of-order.
4933        *
4934        *  3. If we have no fences.  In this case, we need at least a
4935        *     scheduling barrier to keep the compiler from moving things
4936        *     around in an invalid way.
4937        */
4938       if (instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
4939           fence_regs_count != 1) {
4940          ubld.exec_all().group(1, 0).emit(
4941             ELK_FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
4942             fence_regs, fence_regs_count);
4943       }
4944 
4945       break;
4946    }
4947 
4948    case nir_intrinsic_shader_clock: {
4949       /* We cannot do anything if there is an event, so ignore it for now */
4950       const elk_fs_reg shader_clock = get_timestamp(bld);
4951       const elk_fs_reg srcs[] = { component(shader_clock, 0),
4952                               component(shader_clock, 1) };
4953       bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
4954       break;
4955    }
4956 
4957    case nir_intrinsic_load_reloc_const_intel: {
4958       uint32_t id = nir_intrinsic_param_idx(instr);
4959 
4960       /* Emit the reloc in the smallest SIMD size to limit register usage. */
4961       const fs_builder ubld = bld.exec_all().group(1, 0);
4962       elk_fs_reg small_dest = ubld.vgrf(dest.type);
4963       ubld.UNDEF(small_dest);
4964       ubld.exec_all().group(1, 0).emit(ELK_SHADER_OPCODE_MOV_RELOC_IMM,
4965                                        small_dest, elk_imm_ud(id));
4966 
4967       /* Copy propagation will get rid of this MOV. */
4968       bld.MOV(dest, component(small_dest, 0));
4969       break;
4970    }
4971 
4972    case nir_intrinsic_load_uniform: {
4973       /* Offsets are in bytes but they should always be aligned to
4974        * the type size.
4975        */
4976       unsigned base_offset = nir_intrinsic_base(instr);
4977       assert(base_offset % 4 == 0 || base_offset % type_sz(dest.type) == 0);
4978 
4979       elk_fs_reg src(UNIFORM, base_offset / 4, dest.type);
4980 
4981       if (nir_src_is_const(instr->src[0])) {
4982          unsigned load_offset = nir_src_as_uint(instr->src[0]);
4983          assert(load_offset % type_sz(dest.type) == 0);
4984          /* The base offset can only handle 32-bit units, so for 16-bit
4985           * data take the modulo of the offset with 4 bytes and add it to
4986           * the offset to read from within the source register.
4987           */
4988          src.offset = load_offset + base_offset % 4;
4989 
4990          for (unsigned j = 0; j < instr->num_components; j++) {
4991             bld.MOV(offset(dest, bld, j), offset(src, bld, j));
4992          }
4993       } else {
4994          elk_fs_reg indirect = retype(get_nir_src(ntb, instr->src[0]),
4995                                   ELK_REGISTER_TYPE_UD);
4996 
4997          /* We need to pass a size to the MOV_INDIRECT but we don't want it to
4998           * go past the end of the uniform.  In order to keep the n'th
4999           * component from running past, we subtract off the size of all but
5000           * one component of the vector.
5001           */
5002          assert(nir_intrinsic_range(instr) >=
5003                 instr->num_components * type_sz(dest.type));
5004          unsigned read_size = nir_intrinsic_range(instr) -
5005             (instr->num_components - 1) * type_sz(dest.type);
5006 
5007          bool supports_64bit_indirects = devinfo->platform != INTEL_PLATFORM_CHV;
5008 
5009          if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
5010             for (unsigned j = 0; j < instr->num_components; j++) {
5011                bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT,
5012                         offset(dest, bld, j), offset(src, bld, j),
5013                         indirect, elk_imm_ud(read_size));
5014             }
5015          } else {
5016             const unsigned num_mov_indirects =
5017                type_sz(dest.type) / type_sz(ELK_REGISTER_TYPE_UD);
5018             /* We read a little bit less per MOV INDIRECT, as they are now
5019              * 32-bit ones instead of 64-bit.  Adjust read_size accordingly.
5020              */
5021             const unsigned read_size_32bit = read_size -
5022                 (num_mov_indirects - 1) * type_sz(ELK_REGISTER_TYPE_UD);
5023             for (unsigned j = 0; j < instr->num_components; j++) {
5024                for (unsigned i = 0; i < num_mov_indirects; i++) {
5025                   bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT,
5026                            subscript(offset(dest, bld, j), ELK_REGISTER_TYPE_UD, i),
5027                            subscript(offset(src, bld, j), ELK_REGISTER_TYPE_UD, i),
5028                            indirect, elk_imm_ud(read_size_32bit));
5029                }
5030             }
5031          }
5032       }
5033       break;
5034    }
5035 
5036    case nir_intrinsic_load_ubo:
5037    case nir_intrinsic_load_ubo_uniform_block_intel: {
5038       elk_fs_reg surface, surface_handle;
5039 
5040       if (get_nir_src_bindless(ntb, instr->src[0]))
5041          surface_handle = get_nir_buffer_intrinsic_index(ntb, bld, instr);
5042       else
5043          surface = get_nir_buffer_intrinsic_index(ntb, bld, instr);
5044 
5045       if (!nir_src_is_const(instr->src[1])) {
5046          if (instr->intrinsic == nir_intrinsic_load_ubo) {
5047             /* load_ubo with non-uniform offset */
5048             elk_fs_reg base_offset = retype(get_nir_src(ntb, instr->src[1]),
5049                                         ELK_REGISTER_TYPE_UD);
5050 
5051             const unsigned comps_per_load = type_sz(dest.type) == 8 ? 2 : 4;
5052 
5053             for (int i = 0; i < instr->num_components; i += comps_per_load) {
5054                const unsigned remaining = instr->num_components - i;
5055                s.VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i),
5056                                             surface, surface_handle,
5057                                             base_offset,
5058                                             i * type_sz(dest.type),
5059                                             instr->def.bit_size / 8,
5060                                             MIN2(remaining, comps_per_load));
5061             }
5062 
5063             s.prog_data->has_ubo_pull = true;
5064          } else {
5065             /* load_ubo with uniform offset */
5066             const fs_builder ubld1 = bld.exec_all().group(1, 0);
5067             const fs_builder ubld8 = bld.exec_all().group(8, 0);
5068             const fs_builder ubld16 = bld.exec_all().group(16, 0);
5069 
5070             elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5071 
5072             srcs[SURFACE_LOGICAL_SRC_SURFACE]        = surface;
5073             srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = surface_handle;
5074 
5075             const nir_src load_offset = instr->src[1];
5076             if (nir_src_is_const(load_offset)) {
5077                elk_fs_reg addr = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
5078                ubld8.MOV(addr, elk_imm_ud(nir_src_as_uint(load_offset)));
5079                srcs[SURFACE_LOGICAL_SRC_ADDRESS] = component(addr, 0);
5080             } else {
5081                srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5082                   bld.emit_uniformize(get_nir_src(ntb, load_offset));
5083             }
5084 
5085             const unsigned total_dwords =
5086                ALIGN(instr->num_components, REG_SIZE * reg_unit(devinfo) / 4);
5087             unsigned loaded_dwords = 0;
5088 
5089             const elk_fs_reg packed_consts =
5090                ubld1.vgrf(ELK_REGISTER_TYPE_UD, total_dwords);
5091 
5092             while (loaded_dwords < total_dwords) {
5093                const unsigned block =
5094                   choose_oword_block_size_dwords(devinfo,
5095                                                  total_dwords - loaded_dwords);
5096                const unsigned block_bytes = block * 4;
5097 
5098                srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(block);
5099 
5100                const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
5101                ubld.emit(ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
5102                          retype(byte_offset(packed_consts, loaded_dwords * 4), ELK_REGISTER_TYPE_UD),
5103                          srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written =
5104                   align(block_bytes, REG_SIZE * reg_unit(devinfo));
5105 
5106                loaded_dwords += block;
5107 
5108                ubld1.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5109                          srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5110                          elk_imm_ud(block_bytes));
5111             }
5112 
5113             for (unsigned c = 0; c < instr->num_components; c++) {
5114                bld.MOV(retype(offset(dest, bld, c), ELK_REGISTER_TYPE_UD),
5115                        component(packed_consts, c));
5116             }
5117 
5118             s.prog_data->has_ubo_pull = true;
5119          }
5120       } else {
5121          /* Even if we are loading doubles, a pull constant load will load
5122           * a 32-bit vec4, so we should only reserve vgrf space for that. If we
5123           * need to load a full dvec4 we will have to emit 2 loads. This is
5124           * similar to demote_pull_constants(), except that in that case we
5125           * see individual accesses to each component of the vector and then
5126           * we let CSE deal with duplicate loads. Here we see a vector access
5127           * and we have to split it if necessary.
5128           */
5129          const unsigned type_size = type_sz(dest.type);
5130          const unsigned load_offset = nir_src_as_uint(instr->src[1]);
5131          const unsigned ubo_block =
5132             elk_nir_ubo_surface_index_get_push_block(instr->src[0]);
5133          const unsigned offset_256b = load_offset / 32;
5134          const unsigned end_256b =
5135             DIV_ROUND_UP(load_offset + type_size * instr->num_components, 32);
5136 
5137          /* See if we've selected this as a push constant candidate */
5138          elk_fs_reg push_reg;
5139          for (int i = 0; i < 4; i++) {
5140             const struct elk_ubo_range *range = &s.prog_data->ubo_ranges[i];
5141             if (range->block == ubo_block &&
5142                 offset_256b >= range->start &&
5143                 end_256b <= range->start + range->length) {
5144 
5145                push_reg = elk_fs_reg(UNIFORM, UBO_START + i, dest.type);
5146                push_reg.offset = load_offset - 32 * range->start;
5147                break;
5148             }
5149          }
5150 
5151          if (push_reg.file != BAD_FILE) {
5152             for (unsigned i = 0; i < instr->num_components; i++) {
5153                bld.MOV(offset(dest, bld, i),
5154                        byte_offset(push_reg, i * type_size));
5155             }
5156             break;
5157          }
5158 
5159          s.prog_data->has_ubo_pull = true;
5160 
5161          const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
5162          const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
5163 
5164          for (unsigned c = 0; c < instr->num_components;) {
5165             const unsigned base = load_offset + c * type_size;
5166             /* Number of usable components in the next block-aligned load. */
5167             const unsigned count = MIN2(instr->num_components - c,
5168                                         (block_sz - base % block_sz) / type_size);
5169 
5170             const elk_fs_reg packed_consts = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5171             elk_fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
5172             srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE]        = surface;
5173             srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
5174             srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET]         = elk_imm_ud(base & ~(block_sz - 1));
5175             srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE]           = elk_imm_ud(block_sz);
5176 
5177             ubld.emit(ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
5178                       srcs, PULL_UNIFORM_CONSTANT_SRCS);
5179 
5180             const elk_fs_reg consts =
5181                retype(byte_offset(packed_consts, base & (block_sz - 1)),
5182                       dest.type);
5183 
5184             for (unsigned d = 0; d < count; d++)
5185                bld.MOV(offset(dest, bld, c + d), component(consts, d));
5186 
5187             c += count;
5188          }
5189       }
5190       break;
5191    }
5192 
5193    case nir_intrinsic_load_global:
5194    case nir_intrinsic_load_global_constant: {
5195       assert(devinfo->ver >= 8);
5196 
5197       assert(instr->def.bit_size <= 32);
5198       assert(nir_intrinsic_align(instr) > 0);
5199       elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
5200       srcs[A64_LOGICAL_ADDRESS] = get_nir_src(ntb, instr->src[0]);
5201       srcs[A64_LOGICAL_SRC] = elk_fs_reg(); /* No source data */
5202       srcs[A64_LOGICAL_ENABLE_HELPERS] =
5203          elk_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS);
5204 
5205       if (instr->def.bit_size == 32 &&
5206           nir_intrinsic_align(instr) >= 4) {
5207          assert(instr->def.num_components <= 4);
5208 
5209          srcs[A64_LOGICAL_ARG] = elk_imm_ud(instr->num_components);
5210 
5211          elk_fs_inst *inst =
5212             bld.emit(ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL, dest,
5213                      srcs, A64_LOGICAL_NUM_SRCS);
5214          inst->size_written = instr->num_components *
5215                               inst->dst.component_size(inst->exec_size);
5216       } else {
5217          const unsigned bit_size = instr->def.bit_size;
5218          assert(instr->def.num_components == 1);
5219          elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
5220 
5221          srcs[A64_LOGICAL_ARG] = elk_imm_ud(bit_size);
5222 
5223          bld.emit(ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL, tmp,
5224                   srcs, A64_LOGICAL_NUM_SRCS);
5225          bld.MOV(dest, subscript(tmp, dest.type, 0));
5226       }
5227       break;
5228    }
5229 
5230    case nir_intrinsic_store_global: {
5231       assert(devinfo->ver >= 8);
5232 
5233       assert(nir_src_bit_size(instr->src[0]) <= 32);
5234       assert(nir_intrinsic_write_mask(instr) ==
5235              (1u << instr->num_components) - 1);
5236       assert(nir_intrinsic_align(instr) > 0);
5237 
5238       elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
5239       srcs[A64_LOGICAL_ADDRESS] = get_nir_src(ntb, instr->src[1]);
5240       srcs[A64_LOGICAL_ENABLE_HELPERS] =
5241          elk_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS);
5242 
5243       if (nir_src_bit_size(instr->src[0]) == 32 &&
5244           nir_intrinsic_align(instr) >= 4) {
5245          assert(nir_src_num_components(instr->src[0]) <= 4);
5246 
5247          srcs[A64_LOGICAL_SRC] = get_nir_src(ntb, instr->src[0]); /* Data */
5248          srcs[A64_LOGICAL_ARG] = elk_imm_ud(instr->num_components);
5249 
5250          bld.emit(ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL, elk_fs_reg(),
5251                   srcs, A64_LOGICAL_NUM_SRCS);
5252       } else {
5253          assert(nir_src_num_components(instr->src[0]) == 1);
5254          const unsigned bit_size = nir_src_bit_size(instr->src[0]);
5255          elk_reg_type data_type =
5256             elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
5257          elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
5258          bld.MOV(tmp, retype(get_nir_src(ntb, instr->src[0]), data_type));
5259 
5260          srcs[A64_LOGICAL_SRC] = tmp;
5261          srcs[A64_LOGICAL_ARG] = elk_imm_ud(bit_size);
5262 
5263          bld.emit(ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL, elk_fs_reg(),
5264                   srcs, A64_LOGICAL_NUM_SRCS);
5265       }
5266       break;
5267    }
5268 
5269    case nir_intrinsic_global_atomic:
5270    case nir_intrinsic_global_atomic_swap:
5271       fs_nir_emit_global_atomic(ntb, bld, instr);
5272       break;
5273 
5274    case nir_intrinsic_load_global_constant_uniform_block_intel: {
5275       const unsigned total_dwords = ALIGN(instr->num_components,
5276                                           REG_SIZE * reg_unit(devinfo) / 4);
5277       unsigned loaded_dwords = 0;
5278 
5279       const fs_builder ubld1 = bld.exec_all().group(1, 0);
5280       const fs_builder ubld8 = bld.exec_all().group(8, 0);
5281       const fs_builder ubld16 = bld.exec_all().group(16, 0);
5282 
5283       const elk_fs_reg packed_consts =
5284          ubld1.vgrf(ELK_REGISTER_TYPE_UD, total_dwords);
5285       elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[0]));
5286 
5287       while (loaded_dwords < total_dwords) {
5288          const unsigned block =
5289             choose_oword_block_size_dwords(devinfo,
5290                                            total_dwords - loaded_dwords);
5291          const unsigned block_bytes = block * 4;
5292 
5293          const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
5294 
5295          elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
5296          srcs[A64_LOGICAL_ADDRESS] = address;
5297          srcs[A64_LOGICAL_SRC] = elk_fs_reg(); /* No source data */
5298          srcs[A64_LOGICAL_ARG] = elk_imm_ud(block);
5299          srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(0);
5300          ubld.emit(ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
5301                    retype(byte_offset(packed_consts, loaded_dwords * 4), ELK_REGISTER_TYPE_UD),
5302                    srcs, A64_LOGICAL_NUM_SRCS)->size_written =
5303             align(block_bytes, REG_SIZE * reg_unit(devinfo));
5304 
5305          increment_a64_address(ubld1, address, block_bytes);
5306          loaded_dwords += block;
5307       }
5308 
5309       for (unsigned c = 0; c < instr->num_components; c++)
5310          bld.MOV(retype(offset(dest, bld, c), ELK_REGISTER_TYPE_UD),
5311                  component(packed_consts, c));
5312 
5313       break;
5314    }
5315 
5316    case nir_intrinsic_load_ssbo: {
5317       assert(devinfo->ver >= 7);
5318 
5319       const unsigned bit_size = instr->def.bit_size;
5320       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5321       srcs[get_nir_src_bindless(ntb, instr->src[0]) ?
5322            SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
5323            SURFACE_LOGICAL_SRC_SURFACE] =
5324          get_nir_buffer_intrinsic_index(ntb, bld, instr);
5325       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
5326       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
5327       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
5328 
5329       /* Make dest unsigned because that's what the temporary will be */
5330       dest.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
5331 
5332       /* Read the vector */
5333       assert(bit_size <= 32);
5334       assert(nir_intrinsic_align(instr) > 0);
5335       if (bit_size == 32 &&
5336           nir_intrinsic_align(instr) >= 4) {
5337          assert(instr->def.num_components <= 4);
5338          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
5339          elk_fs_inst *inst =
5340             bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
5341                      dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5342          inst->size_written = instr->num_components * s.dispatch_width * 4;
5343       } else {
5344          assert(instr->def.num_components == 1);
5345          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
5346 
5347          elk_fs_reg read_result = bld.vgrf(ELK_REGISTER_TYPE_UD);
5348          bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
5349                   read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
5350          bld.MOV(dest, subscript(read_result, dest.type, 0));
5351       }
5352       break;
5353    }
5354 
5355    case nir_intrinsic_store_ssbo: {
5356       assert(devinfo->ver >= 7);
5357 
5358       const unsigned bit_size = nir_src_bit_size(instr->src[0]);
5359       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5360       srcs[get_nir_src_bindless(ntb, instr->src[1]) ?
5361            SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
5362            SURFACE_LOGICAL_SRC_SURFACE] =
5363          get_nir_buffer_intrinsic_index(ntb, bld, instr);
5364       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[2]);
5365       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
5366       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
5367 
5368       elk_fs_reg data = get_nir_src(ntb, instr->src[0]);
5369       data.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
5370 
5371       assert(bit_size <= 32);
5372       assert(nir_intrinsic_write_mask(instr) ==
5373              (1u << instr->num_components) - 1);
5374       assert(nir_intrinsic_align(instr) > 0);
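      /* Mirrors the load path above: full-dword, aligned stores use the
       * vector untyped surface write; everything else is widened into a
       * 32-bit temporary and written with a scalar byte-scattered message.
       */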
5375       if (bit_size == 32 &&
5376           nir_intrinsic_align(instr) >= 4) {
5377          assert(nir_src_num_components(instr->src[0]) <= 4);
5378          srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5379          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
5380          bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
5381                   elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5382       } else {
5383          assert(nir_src_num_components(instr->src[0]) == 1);
5384          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
5385 
5386          srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_UD);
5387          bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
5388 
5389          bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
5390                   elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5391       }
5392       break;
5393    }
5394 
5395    case nir_intrinsic_load_ssbo_uniform_block_intel:
5396    case nir_intrinsic_load_shared_uniform_block_intel: {
5397       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5398 
5399       const bool is_ssbo =
5400          instr->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel;
5401       if (is_ssbo) {
5402          srcs[get_nir_src_bindless(ntb, instr->src[0]) ?
5403               SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
5404               SURFACE_LOGICAL_SRC_SURFACE] =
5405             get_nir_buffer_intrinsic_index(ntb, bld, instr);
5406       } else {
5407          srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_fs_reg(elk_imm_ud(GFX7_BTI_SLM));
5408       }
5409 
5410       const unsigned total_dwords = ALIGN(instr->num_components,
5411                                           REG_SIZE * reg_unit(devinfo) / 4);
5412       unsigned loaded_dwords = 0;
5413 
5414       const fs_builder ubld1 = bld.exec_all().group(1, 0);
5415       const fs_builder ubld8 = bld.exec_all().group(8, 0);
5416       const fs_builder ubld16 = bld.exec_all().group(16, 0);
5417 
5418       const elk_fs_reg packed_consts =
5419          ubld1.vgrf(ELK_REGISTER_TYPE_UD, total_dwords);
5420 
5421       const nir_src load_offset = is_ssbo ? instr->src[1] : instr->src[0];
5422       if (nir_src_is_const(load_offset)) {
5423          elk_fs_reg addr = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
5424          ubld8.MOV(addr, elk_imm_ud(nir_src_as_uint(load_offset)));
5425          srcs[SURFACE_LOGICAL_SRC_ADDRESS] = component(addr, 0);
5426       } else {
5427          srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5428             bld.emit_uniformize(get_nir_src(ntb, load_offset));
5429       }
5430 
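      /* Read the data in the largest OWord blocks available, advancing the
       * uniform address by the bytes consumed until the whole GRF-aligned
       * amount has been fetched.
       */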
5431       while (loaded_dwords < total_dwords) {
5432          const unsigned block =
5433             choose_oword_block_size_dwords(devinfo,
5434                                            total_dwords - loaded_dwords);
5435          const unsigned block_bytes = block * 4;
5436 
5437          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(block);
5438 
5439          const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
5440          ubld.emit(ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
5441                    retype(byte_offset(packed_consts, loaded_dwords * 4), ELK_REGISTER_TYPE_UD),
5442                    srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written =
5443             align(block_bytes, REG_SIZE * reg_unit(devinfo));
5444 
5445          loaded_dwords += block;
5446 
5447          ubld1.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5448                    srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5449                    elk_imm_ud(block_bytes));
5450       }
5451 
5452       for (unsigned c = 0; c < instr->num_components; c++)
5453          bld.MOV(retype(offset(dest, bld, c), ELK_REGISTER_TYPE_UD),
5454                  component(packed_consts, c));
5455 
5456       break;
5457    }
5458 
5459    case nir_intrinsic_store_output: {
5460       assert(nir_src_bit_size(instr->src[0]) == 32);
5461       elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
5462 
5463       unsigned store_offset = nir_src_as_uint(instr->src[1]);
5464       unsigned num_components = instr->num_components;
5465       unsigned first_component = nir_intrinsic_component(instr);
5466 
5467       elk_fs_reg new_dest = retype(offset(s.outputs[instr->const_index[0]], bld,
5468                                       4 * store_offset), src.type);
5469       for (unsigned j = 0; j < num_components; j++) {
5470          bld.MOV(offset(new_dest, bld, j + first_component),
5471                  offset(src, bld, j));
5472       }
5473       break;
5474    }
5475 
5476    case nir_intrinsic_ssbo_atomic:
5477    case nir_intrinsic_ssbo_atomic_swap:
5478       fs_nir_emit_surface_atomic(ntb, bld, instr,
5479                                  get_nir_buffer_intrinsic_index(ntb, bld, instr),
5480                                  get_nir_src_bindless(ntb, instr->src[0]));
5481       break;
5482 
5483    case nir_intrinsic_get_ssbo_size: {
5484       assert(nir_src_num_components(instr->src[0]) == 1);
5485 
5486       /* A resinfo's sampler message is used to get the buffer size.  The
5487        * SIMD8 writeback message consists of four registers and the SIMD16
5488        * writeback message consists of eight destination registers (two per
5489        * component).  Because we are only interested in the first channel of
5490        * the first returned component, where resinfo returns the buffer size
5491        * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
5492        * the dispatch width.
5493        */
5494       const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
5495       elk_fs_reg src_payload = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5496       elk_fs_reg ret_payload = ubld.vgrf(ELK_REGISTER_TYPE_UD, 4);
5497 
5498       /* Set LOD = 0 */
5499       ubld.MOV(src_payload, elk_imm_d(0));
5500 
5501       elk_fs_reg srcs[GET_BUFFER_SIZE_SRCS];
5502       srcs[get_nir_src_bindless(ntb, instr->src[0]) ?
5503            GET_BUFFER_SIZE_SRC_SURFACE_HANDLE :
5504            GET_BUFFER_SIZE_SRC_SURFACE] =
5505          get_nir_buffer_intrinsic_index(ntb, bld, instr);
5506       srcs[GET_BUFFER_SIZE_SRC_LOD] = src_payload;
5507       elk_fs_inst *inst = ubld.emit(ELK_SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
5508                                 srcs, GET_BUFFER_SIZE_SRCS);
5509       inst->header_size = 0;
5510       inst->mlen = reg_unit(devinfo);
5511       inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
5512 
5513       /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
5514        *
5515        * "Out-of-bounds checking is always performed at a DWord granularity. If
5516        * any part of the DWord is out-of-bounds then the whole DWord is
5517        * considered out-of-bounds."
5518        *
5519        * This implies that types with size smaller than 4-bytes need to be
5520        * padded if they don't complete the last dword of the buffer. But as we
5521        * need to maintain the original size we need to reverse the padding
5522        * calculation to return the correct size to know the number of elements
5523        * of an unsized array. Since we stored the needed padding in the last
5524        * two bits of the surface size, we recover the original buffer_size
5525        * here by reversing the surface_size calculation:
5526        *
5527        * surface_size = isl_align(buffer_size, 4) +
5528        *                (isl_align(buffer_size, 4) - buffer_size)
5529        *
5530        * buffer_size = (surface_size & ~3) - (surface_size & 3)
5531        */
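      /* Worked example: for buffer_size = 7, isl_align(7, 4) = 8 and the
       * stored padding is 1, so surface_size = 9.  Reversing it,
       * (9 & ~3) - (9 & 3) = 8 - 1 = 7 recovers the original size.
       */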
5532 
5533       elk_fs_reg size_aligned4 = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5534       elk_fs_reg size_padding = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5535       elk_fs_reg buffer_size = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5536 
5537       ubld.AND(size_padding, ret_payload, elk_imm_ud(3));
5538       ubld.AND(size_aligned4, ret_payload, elk_imm_ud(~3));
5539       ubld.ADD(buffer_size, size_aligned4, negate(size_padding));
5540 
5541       bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
5542       break;
5543    }
5544 
5545    case nir_intrinsic_load_scratch: {
5546       assert(devinfo->ver >= 7);
5547 
5548       assert(instr->def.num_components == 1);
5549       const unsigned bit_size = instr->def.bit_size;
5550       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5551 
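      /* Scratch lives in stateless memory.  Gfx8+ uses the non-coherent
       * stateless binding table index, which is sufficient because scratch is
       * private to the thread; older gens only expose the ordinary stateless
       * BTI.
       */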
5552       if (devinfo->ver >= 8) {
5553          srcs[SURFACE_LOGICAL_SRC_SURFACE] =
5554             elk_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
5555       } else {
5556          srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(ELK_BTI_STATELESS);
5557       }
5558 
5559       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
5560       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
5561       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
5562       const elk_fs_reg nir_addr = get_nir_src(ntb, instr->src[0]);
5563 
5564       /* Make dest unsigned because that's what the temporary will be */
5565       dest.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
5566 
5567       /* Read the vector */
5568       assert(instr->def.num_components == 1);
5569       assert(bit_size <= 32);
5570       assert(nir_intrinsic_align(instr) > 0);
5571       if (bit_size == 32 &&
5572           nir_intrinsic_align(instr) >= 4) {
5573          /* The offset for a DWORD scattered message is in dwords. */
5574          srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5575             swizzle_nir_scratch_addr(ntb, bld, nir_addr, true);
5576 
5577          bld.emit(ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL,
5578                   dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5579       } else {
5580          srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5581             swizzle_nir_scratch_addr(ntb, bld, nir_addr, false);
5582 
5583          elk_fs_reg read_result = bld.vgrf(ELK_REGISTER_TYPE_UD);
5584          bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
5585                   read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
5586          bld.MOV(dest, read_result);
5587       }
5588 
5589       s.shader_stats.fill_count += DIV_ROUND_UP(s.dispatch_width, 16);
5590       break;
5591    }
5592 
5593    case nir_intrinsic_store_scratch: {
5594       assert(devinfo->ver >= 7);
5595 
5596       assert(nir_src_num_components(instr->src[0]) == 1);
5597       const unsigned bit_size = nir_src_bit_size(instr->src[0]);
5598       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5599 
5600       if (devinfo->ver >= 8) {
5601          srcs[SURFACE_LOGICAL_SRC_SURFACE] =
5602             elk_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
5603       } else {
5604          srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(ELK_BTI_STATELESS);
5605       }
5606 
5607       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
5608       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
5609       /**
5610        * While this instruction has side-effects, it should not be predicated
5611        * on sample mask, because otherwise fs helper invocations would
5612        * load undefined values from scratch memory. And scratch memory
5613        * load-stores are produced from operations without side-effects, thus
5614        * they should not have different behaviour in the helper invocations.
5615        */
5616       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
5617       const elk_fs_reg nir_addr = get_nir_src(ntb, instr->src[1]);
5618 
5619       elk_fs_reg data = get_nir_src(ntb, instr->src[0]);
5620       data.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
5621 
5622       assert(nir_src_num_components(instr->src[0]) == 1);
5623       assert(bit_size <= 32);
5624       assert(nir_intrinsic_write_mask(instr) == 1);
5625       assert(nir_intrinsic_align(instr) > 0);
5626       if (bit_size == 32 &&
5627           nir_intrinsic_align(instr) >= 4) {
5628          srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5629 
5630          /* The offset for a DWORD scattered message is in dwords. */
5631          srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5632             swizzle_nir_scratch_addr(ntb, bld, nir_addr, true);
5633 
5634          bld.emit(ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL,
5635                   elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5636       } else {
5637          srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_UD);
5638          bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
5639 
5640          srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5641             swizzle_nir_scratch_addr(ntb, bld, nir_addr, false);
5642 
5643          bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
5644                   elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5645       }
5646       s.shader_stats.spill_count += DIV_ROUND_UP(s.dispatch_width, 16);
5647       break;
5648    }
5649 
5650    case nir_intrinsic_load_subgroup_size:
5651       /* This should only happen for fragment shaders because every other case
5652        * is lowered in NIR so we can optimize on it.
5653        */
5654       assert(s.stage == MESA_SHADER_FRAGMENT);
5655       bld.MOV(retype(dest, ELK_REGISTER_TYPE_D), elk_imm_d(s.dispatch_width));
5656       break;
5657 
5658    case nir_intrinsic_load_subgroup_invocation:
5659       bld.MOV(retype(dest, ELK_REGISTER_TYPE_D),
5660               ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
5661       break;
5662 
5663    case nir_intrinsic_load_subgroup_eq_mask:
5664    case nir_intrinsic_load_subgroup_ge_mask:
5665    case nir_intrinsic_load_subgroup_gt_mask:
5666    case nir_intrinsic_load_subgroup_le_mask:
5667    case nir_intrinsic_load_subgroup_lt_mask:
5668       unreachable("not reached");
5669 
5670    case nir_intrinsic_vote_any: {
5671       const fs_builder ubld1 = bld.exec_all().group(1, 0);
5672 
5673       /* The any/all predicates do not consider channel enables. To prevent
5674        * dead channels from affecting the result, we initialize the flag with
5675        * the identity value for the logical operation.
5676        */
5677       if (s.dispatch_width == 32) {
5678          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
5679          ubld1.MOV(retype(elk_flag_reg(0, 0), ELK_REGISTER_TYPE_UD),
5680                    elk_imm_ud(0));
5681       } else {
5682          ubld1.MOV(elk_flag_reg(0, 0), elk_imm_uw(0));
5683       }
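      /* The CMP sets a flag bit for every enabled channel whose source is
       * non-zero; the predicated MOV below then reduces those bits to a
       * single boolean with the ANY predicate.
       */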
5684       bld.CMP(bld.null_reg_d(), get_nir_src(ntb, instr->src[0]), elk_imm_d(0), ELK_CONDITIONAL_NZ);
5685 
5686       /* For some reason, the any/all predicates don't work properly with
5687        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
5688        * doesn't read the correct subset of the flag register and you end up
5689        * getting garbage in the second half.  Work around this by using a pair
5690        * of 1-wide MOVs and scattering the result.
5691        */
5692       const fs_builder ubld = ubld1;
5693       elk_fs_reg res1 = ubld.vgrf(ELK_REGISTER_TYPE_D);
5694       ubld.MOV(res1, elk_imm_d(0));
5695       set_predicate(s.dispatch_width == 8  ? ELK_PREDICATE_ALIGN1_ANY8H :
5696                     s.dispatch_width == 16 ? ELK_PREDICATE_ALIGN1_ANY16H :
5697                                              ELK_PREDICATE_ALIGN1_ANY32H,
5698                     ubld.MOV(res1, elk_imm_d(-1)));
5699 
5700       bld.MOV(retype(dest, ELK_REGISTER_TYPE_D), component(res1, 0));
5701       break;
5702    }
5703    case nir_intrinsic_vote_all: {
5704       const fs_builder ubld1 = bld.exec_all().group(1, 0);
5705 
5706       /* The any/all predicates do not consider channel enables. To prevent
5707        * dead channels from affecting the result, we initialize the flag with
5708        * the identity value for the logical operation.
5709        */
5710       if (s.dispatch_width == 32) {
5711          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
5712          ubld1.MOV(retype(elk_flag_reg(0, 0), ELK_REGISTER_TYPE_UD),
5713                    elk_imm_ud(0xffffffff));
5714       } else {
5715          ubld1.MOV(elk_flag_reg(0, 0), elk_imm_uw(0xffff));
5716       }
5717       bld.CMP(bld.null_reg_d(), get_nir_src(ntb, instr->src[0]), elk_imm_d(0), ELK_CONDITIONAL_NZ);
5718 
5719       /* For some reason, the any/all predicates don't work properly with
5720        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
5721        * doesn't read the correct subset of the flag register and you end up
5722        * getting garbage in the second half.  Work around this by using a pair
5723        * of 1-wide MOVs and scattering the result.
5724        */
5725       const fs_builder ubld = ubld1;
5726       elk_fs_reg res1 = ubld.vgrf(ELK_REGISTER_TYPE_D);
5727       ubld.MOV(res1, elk_imm_d(0));
5728       set_predicate(s.dispatch_width == 8  ? ELK_PREDICATE_ALIGN1_ALL8H :
5729                     s.dispatch_width == 16 ? ELK_PREDICATE_ALIGN1_ALL16H :
5730                                              ELK_PREDICATE_ALIGN1_ALL32H,
5731                     ubld.MOV(res1, elk_imm_d(-1)));
5732 
5733       bld.MOV(retype(dest, ELK_REGISTER_TYPE_D), component(res1, 0));
5734       break;
5735    }
5736    case nir_intrinsic_vote_feq:
5737    case nir_intrinsic_vote_ieq: {
5738       elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5739       if (instr->intrinsic == nir_intrinsic_vote_feq) {
5740          const unsigned bit_size = nir_src_bit_size(instr->src[0]);
5741          value.type = bit_size == 8 ? ELK_REGISTER_TYPE_B :
5742             elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_F);
5743       }
5744 
5745       elk_fs_reg uniformized = bld.emit_uniformize(value);
5746       const fs_builder ubld1 = bld.exec_all().group(1, 0);
5747 
5748       /* The any/all predicates do not consider channel enables. To prevent
5749        * dead channels from affecting the result, we initialize the flag with
5750        * the identity value for the logical operation.
5751        */
5752       if (s.dispatch_width == 32) {
5753          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
5754          ubld1.MOV(retype(elk_flag_reg(0, 0), ELK_REGISTER_TYPE_UD),
5755                          elk_imm_ud(0xffffffff));
5756       } else {
5757          ubld1.MOV(elk_flag_reg(0, 0), elk_imm_uw(0xffff));
5758       }
5759       bld.CMP(bld.null_reg_d(), value, uniformized, ELK_CONDITIONAL_Z);
5760 
5761       /* For some reason, the any/all predicates don't work properly with
5762        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
5763        * doesn't read the correct subset of the flag register and you end up
5764        * getting garbage in the second half.  Work around this by using a pair
5765        * of 1-wide MOVs and scattering the result.
5766        */
5767       const fs_builder ubld = ubld1;
5768       elk_fs_reg res1 = ubld.vgrf(ELK_REGISTER_TYPE_D);
5769       ubld.MOV(res1, elk_imm_d(0));
5770       set_predicate(s.dispatch_width == 8  ? ELK_PREDICATE_ALIGN1_ALL8H :
5771                     s.dispatch_width == 16 ? ELK_PREDICATE_ALIGN1_ALL16H :
5772                                              ELK_PREDICATE_ALIGN1_ALL32H,
5773                     ubld.MOV(res1, elk_imm_d(-1)));
5774 
5775       bld.MOV(retype(dest, ELK_REGISTER_TYPE_D), component(res1, 0));
5776       break;
5777    }
5778 
5779    case nir_intrinsic_ballot: {
5780       const elk_fs_reg value = retype(get_nir_src(ntb, instr->src[0]),
5781                                   ELK_REGISTER_TYPE_UD);
5782       struct elk_reg flag = elk_flag_reg(0, 0);
5783       /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well
5784        * as f0.0.  This is a problem for fragment programs as we currently use
5785        * f0.1 for discards.  Fortunately, we don't support SIMD32 fragment
5786        * programs yet so this isn't a problem.  When we do, something will
5787        * have to change.
5788        */
5789       if (s.dispatch_width == 32)
5790          flag.type = ELK_REGISTER_TYPE_UD;
5791 
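      /* Clear the flag register, let the CMP set one bit per live non-zero
       * channel, and then read the flag back as a scalar to form the ballot
       * mask.
       */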
5792       bld.exec_all().group(1, 0).MOV(flag, elk_imm_ud(0u));
5793       bld.CMP(bld.null_reg_ud(), value, elk_imm_ud(0u), ELK_CONDITIONAL_NZ);
5794 
5795       if (instr->def.bit_size > 32) {
5796          dest.type = ELK_REGISTER_TYPE_UQ;
5797       } else {
5798          dest.type = ELK_REGISTER_TYPE_UD;
5799       }
5800       bld.MOV(dest, flag);
5801       break;
5802    }
5803 
5804    case nir_intrinsic_read_invocation: {
5805       const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5806       const elk_fs_reg invocation = get_nir_src(ntb, instr->src[1]);
5807 
5808       elk_fs_reg tmp = bld.vgrf(value.type);
5809 
5810       /* When the subgroup_size picked by NIR is larger than the dispatch
5811        * size picked by the backend (this can happen in RT and FS), bound
5812        * the invocation to the dispatch size.
5813        */
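      /* E.g. with dispatch_width == 16 and an API subgroup size of 32,
       * invocation 20 is wrapped to 20 & 15 == 4, so the BROADCAST below
       * always reads a channel that actually exists.
       */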
5814       elk_fs_reg bound_invocation;
5815       if (s.api_subgroup_size == 0 ||
5816           bld.dispatch_width() < s.api_subgroup_size) {
5817          bound_invocation = bld.vgrf(ELK_REGISTER_TYPE_UD);
5818          bld.AND(bound_invocation, invocation, elk_imm_ud(s.dispatch_width - 1));
5819       } else {
5820          bound_invocation = invocation;
5821       }
5822       bld.exec_all().emit(ELK_SHADER_OPCODE_BROADCAST, tmp, value,
5823                           bld.emit_uniformize(bound_invocation));
5824 
5825       bld.MOV(retype(dest, value.type), elk_fs_reg(component(tmp, 0)));
5826       break;
5827    }
5828 
5829    case nir_intrinsic_read_first_invocation: {
5830       const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5831       bld.MOV(retype(dest, value.type), bld.emit_uniformize(value));
5832       break;
5833    }
5834 
5835    case nir_intrinsic_shuffle: {
5836       const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5837       const elk_fs_reg index = get_nir_src(ntb, instr->src[1]);
5838 
5839       bld.emit(ELK_SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
5840       break;
5841    }
5842 
5843    case nir_intrinsic_first_invocation: {
5844       elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
5845       bld.exec_all().emit(ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
5846       bld.MOV(retype(dest, ELK_REGISTER_TYPE_UD),
5847               elk_fs_reg(component(tmp, 0)));
5848       break;
5849    }
5850 
5851    case nir_intrinsic_last_invocation: {
5852       elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
5853       bld.exec_all().emit(ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL, tmp);
5854       bld.MOV(retype(dest, ELK_REGISTER_TYPE_UD),
5855               elk_fs_reg(component(tmp, 0)));
5856       break;
5857    }
5858 
5859    case nir_intrinsic_quad_broadcast: {
5860       const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5861       const unsigned index = nir_src_as_uint(instr->src[1]);
5862 
5863       bld.emit(ELK_SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),
5864                value, elk_imm_ud(index), elk_imm_ud(4));
5865       break;
5866    }
5867 
5868    case nir_intrinsic_quad_swap_horizontal: {
5869       const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5870       const elk_fs_reg tmp = bld.vgrf(value.type);
5871       if (devinfo->ver <= 7) {
5872          /* The hardware doesn't seem to support these crazy regions with
5873           * compressed instructions on gfx7 and earlier so we fall back to
5874           * using quad swizzles.  Fortunately, we don't support 64-bit
5875           * anything in Vulkan on gfx7.
5876           */
5877          assert(nir_src_bit_size(instr->src[0]) == 32);
5878          const fs_builder ubld = bld.exec_all();
5879          ubld.emit(ELK_SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
5880                    elk_imm_ud(ELK_SWIZZLE4(1,0,3,2)));
5881          bld.MOV(retype(dest, value.type), tmp);
5882       } else {
5883          const fs_builder ubld = bld.exec_all().group(s.dispatch_width / 2, 0);
5884 
5885          const elk_fs_reg src_left = horiz_stride(value, 2);
5886          const elk_fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
5887          const elk_fs_reg tmp_left = horiz_stride(tmp, 2);
5888          const elk_fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);
5889 
5890          ubld.MOV(tmp_left, src_right);
5891          ubld.MOV(tmp_right, src_left);
5892 
5893          bld.MOV(retype(dest, value.type), tmp);
5894       }
5895       break;
5896    }
5897 
5898    case nir_intrinsic_quad_swap_vertical: {
5899       const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5900       if (nir_src_bit_size(instr->src[0]) == 32) {
5901          /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
5902          const elk_fs_reg tmp = bld.vgrf(value.type);
5903          const fs_builder ubld = bld.exec_all();
5904          ubld.emit(ELK_SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
5905                    elk_imm_ud(ELK_SWIZZLE4(2,3,0,1)));
5906          bld.MOV(retype(dest, value.type), tmp);
5907       } else {
5908          /* For larger data types, we have to either emit dispatch_width many
5909           * MOVs or else fall back to doing indirects.
5910           */
5911          elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_W);
5912          bld.XOR(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
5913                       elk_imm_w(0x2));
5914          bld.emit(ELK_SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
5915       }
5916       break;
5917    }
5918 
5919    case nir_intrinsic_quad_swap_diagonal: {
5920       const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5921       if (nir_src_bit_size(instr->src[0]) == 32) {
5922          /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
5923          const elk_fs_reg tmp = bld.vgrf(value.type);
5924          const fs_builder ubld = bld.exec_all();
5925          ubld.emit(ELK_SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
5926                    elk_imm_ud(ELK_SWIZZLE4(3,2,1,0)));
5927          bld.MOV(retype(dest, value.type), tmp);
5928       } else {
5929          /* For larger data types, we have to either emit dispatch_width many
5930           * MOVs or else fall back to doing indirects.
5931           */
5932          elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_W);
5933          bld.XOR(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
5934                       elk_imm_w(0x3));
5935          bld.emit(ELK_SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
5936       }
5937       break;
5938    }
5939 
5940    case nir_intrinsic_ddx_fine:
5941       bld.emit(ELK_FS_OPCODE_DDX_FINE, retype(dest, ELK_REGISTER_TYPE_F),
5942                retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_F));
5943       break;
5944    case nir_intrinsic_ddx:
5945    case nir_intrinsic_ddx_coarse:
5946       bld.emit(ELK_FS_OPCODE_DDX_COARSE, retype(dest, ELK_REGISTER_TYPE_F),
5947                retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_F));
5948       break;
5949    case nir_intrinsic_ddy_fine:
5950       bld.emit(ELK_FS_OPCODE_DDY_FINE, retype(dest, ELK_REGISTER_TYPE_F),
5951                retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_F));
5952       break;
5953    case nir_intrinsic_ddy:
5954    case nir_intrinsic_ddy_coarse:
5955       bld.emit(ELK_FS_OPCODE_DDY_COARSE, retype(dest, ELK_REGISTER_TYPE_F),
5956                retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_F));
5957       break;
5958 
5959    case nir_intrinsic_reduce: {
5960       elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
5961       nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
5962       unsigned cluster_size = nir_intrinsic_cluster_size(instr);
5963       if (cluster_size == 0 || cluster_size > s.dispatch_width)
5964          cluster_size = s.dispatch_width;
5965 
5966       /* Figure out the source type */
5967       src.type = elk_type_for_nir_type(devinfo,
5968          (nir_alu_type)(nir_op_infos[redop].input_types[0] |
5969                         nir_src_bit_size(instr->src[0])));
5970 
5971       elk_fs_reg identity = elk_nir_reduction_op_identity(bld, redop, src.type);
5972       elk_opcode elk_op = elk_op_for_nir_reduction_op(redop);
5973       elk_conditional_mod cond_mod = elk_cond_mod_for_nir_reduction_op(redop);
5974 
5975       /* Set up a register for all of our scratching around and initialize it
5976        * to the reduction operation's identity value.
5977        */
5978       elk_fs_reg scan = bld.vgrf(src.type);
5979       bld.exec_all().emit(ELK_SHADER_OPCODE_SEL_EXEC, scan, src, identity);
5980 
5981       bld.emit_scan(elk_op, scan, cluster_size, cond_mod);
5982 
5983       dest.type = src.type;
5984       if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {
5985          /* In this case, the CLUSTER_BROADCAST instruction isn't needed
5986           * because the distance between clusters is at least 2 GRFs, so we
5987           * don't need its weird striding and can just do regular MOVs.
5988           */
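         /* E.g. SIMD16 with a 32-bit type and cluster_size == 16: each
          * cluster spans exactly two GRFs, so groups == 1 and a single
          * grouped MOV broadcasts component 15 (the cluster's last channel)
          * to the whole destination.
          */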
5990          assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);
5991          const unsigned groups =
5992             (s.dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);
5993          const unsigned group_size = s.dispatch_width / groups;
5994          for (unsigned i = 0; i < groups; i++) {
5995             const unsigned cluster = (i * group_size) / cluster_size;
5996             const unsigned comp = cluster * cluster_size + (cluster_size - 1);
5997             bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),
5998                                          component(scan, comp));
5999          }
6000       } else {
6001          bld.emit(ELK_SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,
6002                   elk_imm_ud(cluster_size - 1), elk_imm_ud(cluster_size));
6003       }
6004       break;
6005    }
6006 
6007    case nir_intrinsic_inclusive_scan:
6008    case nir_intrinsic_exclusive_scan: {
6009       elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
6010       nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
6011 
6012       /* Figure out the source type */
6013       src.type = elk_type_for_nir_type(devinfo,
6014          (nir_alu_type)(nir_op_infos[redop].input_types[0] |
6015                         nir_src_bit_size(instr->src[0])));
6016 
6017       elk_fs_reg identity = elk_nir_reduction_op_identity(bld, redop, src.type);
6018       elk_opcode elk_op = elk_op_for_nir_reduction_op(redop);
6019       elk_conditional_mod cond_mod = elk_cond_mod_for_nir_reduction_op(redop);
6020 
6021       /* Set up a register for all of our scratching around and initialize it
6022        * to the reduction operation's identity value.
6023        */
6024       elk_fs_reg scan = bld.vgrf(src.type);
6025       const fs_builder allbld = bld.exec_all();
6026       allbld.emit(ELK_SHADER_OPCODE_SEL_EXEC, scan, src, identity);
6027 
6028       if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
6029          /* Exclusive scan is a bit harder because we have to do an annoying
6030           * shift of the contents before we can begin.  To make things worse,
6031           * we can't do this with a normal stride; we have to use indirects.
6032           */
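         /* The SHUFFLE makes channel i read scan[i - 1]; channel 0's value is
          * then overwritten with the identity, turning the inclusive scan
          * input into an exclusive one.
          */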
6033          elk_fs_reg shifted = bld.vgrf(src.type);
6034          elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_W);
6035          allbld.ADD(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
6036                          elk_imm_w(-1));
6037          allbld.emit(ELK_SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
6038          allbld.group(1, 0).MOV(component(shifted, 0), identity);
6039          scan = shifted;
6040       }
6041 
6042       bld.emit_scan(elk_op, scan, s.dispatch_width, cond_mod);
6043 
6044       bld.MOV(retype(dest, src.type), scan);
6045       break;
6046    }
6047 
6048    case nir_intrinsic_load_global_block_intel: {
6049       assert(instr->def.bit_size == 32);
6050 
6051       elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[0]));
6052 
6053       const fs_builder ubld1 = bld.exec_all().group(1, 0);
6054       const fs_builder ubld8 = bld.exec_all().group(8, 0);
6055       const fs_builder ubld16 = bld.exec_all().group(16, 0);
6056 
6057       const unsigned total = instr->num_components * s.dispatch_width;
6058       unsigned loaded = 0;
6059 
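      /* Loop over the data in the largest OWord block size available,
       * bumping the A64 address as each block is read.
       */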
6060       while (loaded < total) {
6061          const unsigned block =
6062             choose_oword_block_size_dwords(devinfo, total - loaded);
6063          const unsigned block_bytes = block * 4;
6064 
6065          const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
6066 
6067          elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
6068          srcs[A64_LOGICAL_ADDRESS] = address;
6069          srcs[A64_LOGICAL_SRC] = elk_fs_reg(); /* No source data */
6070          srcs[A64_LOGICAL_ARG] = elk_imm_ud(block);
6071          srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(1);
6072          ubld.emit(ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
6073                    retype(byte_offset(dest, loaded * 4), ELK_REGISTER_TYPE_UD),
6074                    srcs, A64_LOGICAL_NUM_SRCS)->size_written = block_bytes;
6075 
6076          increment_a64_address(ubld1, address, block_bytes);
6077          loaded += block;
6078       }
6079 
6080       assert(loaded == total);
6081       break;
6082    }
6083 
6084    case nir_intrinsic_store_global_block_intel: {
6085       assert(nir_src_bit_size(instr->src[0]) == 32);
6086 
6087       elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[1]));
6088       elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
6089 
6090       const fs_builder ubld1 = bld.exec_all().group(1, 0);
6091       const fs_builder ubld8 = bld.exec_all().group(8, 0);
6092       const fs_builder ubld16 = bld.exec_all().group(16, 0);
6093 
6094       const unsigned total = instr->num_components * s.dispatch_width;
6095       unsigned written = 0;
6096 
6097       while (written < total) {
6098          const unsigned block =
6099             choose_oword_block_size_dwords(devinfo, total - written);
6100 
6101          elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
6102          srcs[A64_LOGICAL_ADDRESS] = address;
6103          srcs[A64_LOGICAL_SRC] = retype(byte_offset(src, written * 4),
6104                                         ELK_REGISTER_TYPE_UD);
6105          srcs[A64_LOGICAL_ARG] = elk_imm_ud(block);
6106          srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(0);
6107 
6108          const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
6109          ubld.emit(ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL, elk_fs_reg(),
6110                    srcs, A64_LOGICAL_NUM_SRCS);
6111 
6112          const unsigned block_bytes = block * 4;
6113          increment_a64_address(ubld1, address, block_bytes);
6114          written += block;
6115       }
6116 
6117       assert(written == total);
6118       break;
6119    }
6120 
6121    case nir_intrinsic_load_shared_block_intel:
6122    case nir_intrinsic_load_ssbo_block_intel: {
6123       assert(instr->def.bit_size == 32);
6124 
6125       const bool is_ssbo =
6126          instr->intrinsic == nir_intrinsic_load_ssbo_block_intel;
6127       elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[is_ssbo ? 1 : 0]));
6128 
6129       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6130       srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ?
6131          get_nir_buffer_intrinsic_index(ntb, bld, instr) :
6132          elk_fs_reg(elk_imm_ud(GFX7_BTI_SLM));
6133       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address;
6134 
6135       const fs_builder ubld1 = bld.exec_all().group(1, 0);
6136       const fs_builder ubld8 = bld.exec_all().group(8, 0);
6137       const fs_builder ubld16 = bld.exec_all().group(16, 0);
6138 
6139       const unsigned total = instr->num_components * s.dispatch_width;
6140       unsigned loaded = 0;
6141 
6142       while (loaded < total) {
6143          const unsigned block =
6144             choose_oword_block_size_dwords(devinfo, total - loaded);
6145          const unsigned block_bytes = block * 4;
6146 
6147          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(block);
6148 
6149          const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
6150          ubld.emit(ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
6151                    retype(byte_offset(dest, loaded * 4), ELK_REGISTER_TYPE_UD),
6152                    srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = block_bytes;
6153 
6154          ubld1.ADD(address, address, elk_imm_ud(block_bytes));
6155          loaded += block;
6156       }
6157 
6158       assert(loaded == total);
6159       break;
6160    }
6161 
6162    case nir_intrinsic_store_shared_block_intel:
6163    case nir_intrinsic_store_ssbo_block_intel: {
6164       assert(nir_src_bit_size(instr->src[0]) == 32);
6165 
6166       const bool is_ssbo =
6167          instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
6168 
6169       elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[is_ssbo ? 2 : 1]));
6170       elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
6171 
6172       elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6173       srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ?
6174          get_nir_buffer_intrinsic_index(ntb, bld, instr) :
6175          elk_fs_reg(elk_imm_ud(GFX7_BTI_SLM));
6176       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address;
6177 
6178       const fs_builder ubld1 = bld.exec_all().group(1, 0);
6179       const fs_builder ubld8 = bld.exec_all().group(8, 0);
6180       const fs_builder ubld16 = bld.exec_all().group(16, 0);
6181 
6182       const unsigned total = instr->num_components * s.dispatch_width;
6183       unsigned written = 0;
6184 
6185       while (written < total) {
6186          const unsigned block =
6187             choose_oword_block_size_dwords(devinfo, total - written);
6188 
6189          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(block);
6190          srcs[SURFACE_LOGICAL_SRC_DATA] =
6191             retype(byte_offset(src, written * 4), ELK_REGISTER_TYPE_UD);
6192 
6193          const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
6194          ubld.emit(ELK_SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL,
6195                    elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
6196 
6197          const unsigned block_bytes = block * 4;
6198          ubld1.ADD(address, address, elk_imm_ud(block_bytes));
6199          written += block;
6200       }
6201 
6202       assert(written == total);
6203       break;
6204    }
6205 
6206    default:
6207 #ifndef NDEBUG
6208       assert(instr->intrinsic < nir_num_intrinsics);
6209       fprintf(stderr, "intrinsic: %s\n", nir_intrinsic_infos[instr->intrinsic].name);
6210 #endif
6211       unreachable("unknown intrinsic");
6212    }
6213 }
6214 
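/* Atomic payloads are built from 32-bit slots, so 16-bit operands are
 * zero-extended into a 32-bit temporary before being used.
 */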
6215 static elk_fs_reg
6216 expand_to_32bit(const fs_builder &bld, const elk_fs_reg &src)
6217 {
6218    if (type_sz(src.type) == 2) {
6219       elk_fs_reg src32 = bld.vgrf(ELK_REGISTER_TYPE_UD);
6220       bld.MOV(src32, retype(src, ELK_REGISTER_TYPE_UW));
6221       return src32;
6222    } else {
6223       return src;
6224    }
6225 }
6226 
6227 static void
6228 fs_nir_emit_surface_atomic(nir_to_elk_state &ntb, const fs_builder &bld,
6229                            nir_intrinsic_instr *instr,
6230                            elk_fs_reg surface,
6231                            bool bindless)
6232 {
6233    const intel_device_info *devinfo = ntb.devinfo;
6234    elk_fs_visitor &s = ntb.s;
6235 
6236    enum elk_lsc_opcode op = elk_lsc_aop_for_nir_intrinsic(instr);
6237    int num_data = lsc_op_num_data_values(op);
6238 
6239    bool shared = surface.file == IMM && surface.ud == GFX7_BTI_SLM;
6240 
6241    /* The BTI untyped atomic messages only support 32-bit atomics.  If you
6242     * just look at the big table of messages in the Vol 7 of the SKL PRM, they
6243     * appear to exist.  However, if you look at Vol 2a, there are no message
6244     * descriptors provided for Qword atomic ops except for A64 messages.
6245     *
6246     * 16-bit float atomics are supported, however.
6247     */
6248    assert(instr->def.bit_size == 32 ||
6249           (instr->def.bit_size == 64 && devinfo->has_lsc) ||
6250           (instr->def.bit_size == 16 &&
6251            (devinfo->has_lsc || elk_lsc_opcode_is_atomic_float(op))));
6252 
6253    elk_fs_reg dest = get_nir_def(ntb, instr->def);
6254 
6255    elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6256    srcs[bindless ?
6257         SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
6258         SURFACE_LOGICAL_SRC_SURFACE] = surface;
6259    srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
6260    srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(op);
6261    srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
6262 
6263    if (shared) {
6264       /* SLM - Get the offset */
6265       if (nir_src_is_const(instr->src[0])) {
6266          srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
6267             elk_imm_ud(nir_intrinsic_base(instr) +
6268                        nir_src_as_uint(instr->src[0]));
6269       } else {
6270          srcs[SURFACE_LOGICAL_SRC_ADDRESS] = s.vgrf(glsl_uint_type());
6271          bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
6272                  retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_UD),
6273                  elk_imm_ud(nir_intrinsic_base(instr)));
6274       }
6275    } else {
6276       /* SSBOs */
6277       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
6278    }
6279 
6280    elk_fs_reg data;
6281    if (num_data >= 1)
6282       data = expand_to_32bit(bld, get_nir_src(ntb, instr->src[shared ? 1 : 2]));
6283 
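   /* Two-operand atomics (e.g. compare-and-swap) pack both values into one
    * contiguous payload temporary with LOAD_PAYLOAD.
    */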
6284    if (num_data >= 2) {
6285       elk_fs_reg tmp = bld.vgrf(data.type, 2);
6286       elk_fs_reg sources[2] = {
6287          data,
6288          expand_to_32bit(bld, get_nir_src(ntb, instr->src[shared ? 2 : 3]))
6289       };
6290       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
6291       data = tmp;
6292    }
6293    srcs[SURFACE_LOGICAL_SRC_DATA] = data;
6294 
6295    /* Emit the actual atomic operation */
6296 
6297    switch (instr->def.bit_size) {
6298       case 16: {
6299          elk_fs_reg dest32 = bld.vgrf(ELK_REGISTER_TYPE_UD);
6300          bld.emit(ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
6301                   retype(dest32, dest.type),
6302                   srcs, SURFACE_LOGICAL_NUM_SRCS);
6303          bld.MOV(retype(dest, ELK_REGISTER_TYPE_UW),
6304                  retype(dest32, ELK_REGISTER_TYPE_UD));
6305          break;
6306       }
6307 
6308       case 32:
6309       case 64:
6310          bld.emit(ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
6311                   dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
6312          break;
6313       default:
6314          unreachable("Unsupported bit size");
6315    }
6316 }
6317 
6318 static void
6319 fs_nir_emit_global_atomic(nir_to_elk_state &ntb, const fs_builder &bld,
6320                           nir_intrinsic_instr *instr)
6321 {
6322    enum elk_lsc_opcode op = elk_lsc_aop_for_nir_intrinsic(instr);
6323    int num_data = lsc_op_num_data_values(op);
6324 
6325    elk_fs_reg dest = get_nir_def(ntb, instr->def);
6326 
6327    elk_fs_reg addr = get_nir_src(ntb, instr->src[0]);
6328 
6329    elk_fs_reg data;
6330    if (num_data >= 1)
6331       data = expand_to_32bit(bld, get_nir_src(ntb, instr->src[1]));
6332 
6333    if (num_data >= 2) {
6334       elk_fs_reg tmp = bld.vgrf(data.type, 2);
6335       elk_fs_reg sources[2] = {
6336          data,
6337          expand_to_32bit(bld, get_nir_src(ntb, instr->src[2]))
6338       };
6339       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
6340       data = tmp;
6341    }
6342 
6343    elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
6344    srcs[A64_LOGICAL_ADDRESS] = addr;
6345    srcs[A64_LOGICAL_SRC] = data;
6346    srcs[A64_LOGICAL_ARG] = elk_imm_ud(op);
6347    srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(0);
6348 
6349    switch (instr->def.bit_size) {
6350    case 16: {
6351       elk_fs_reg dest32 = bld.vgrf(ELK_REGISTER_TYPE_UD);
6352       bld.emit(ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
6353                retype(dest32, dest.type),
6354                srcs, A64_LOGICAL_NUM_SRCS);
6355       bld.MOV(retype(dest, ELK_REGISTER_TYPE_UW), dest32);
6356       break;
6357    }
6358    case 32:
6359    case 64:
6360       bld.emit(ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, dest,
6361                srcs, A64_LOGICAL_NUM_SRCS);
6362       break;
6363    default:
6364       unreachable("Unsupported bit size");
6365    }
6366 }
6367 
6368 static void
6369 fs_nir_emit_texture(nir_to_elk_state &ntb,
6370                     nir_tex_instr *instr)
6371 {
6372    const intel_device_info *devinfo = ntb.devinfo;
6373    const fs_builder &bld = ntb.bld;
6374    elk_fs_visitor &s = ntb.s;
6375 
6376    elk_fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
6377 
6378    /* SKL PRMs: Volume 7: 3D-Media-GPGPU:
6379     *
6380     *    "The Pixel Null Mask field, when enabled via the Pixel Null Mask
6381     *     Enable will be incorrect for sample_c when applied to a surface with
6382     *     64-bit per texel format such as R16G16BA16_UNORM. Pixel Null mask
6383     *     Enable may incorrectly report pixels as referencing a Null surface."
6384     *
6385     * We'll take care of this in NIR.
6386     */
6387    assert(!instr->is_sparse || srcs[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE);
6388 
6389    srcs[TEX_LOGICAL_SRC_RESIDENCY] = elk_imm_ud(instr->is_sparse);
6390 
6391    int lod_components = 0;
6392 
6393    /* The hardware requires a LOD for buffer textures */
6394    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
6395       srcs[TEX_LOGICAL_SRC_LOD] = elk_imm_d(0);
6396 
6397    ASSERTED bool got_lod = false;
6398    ASSERTED bool got_bias = false;
6399    uint32_t header_bits = 0;
6400    for (unsigned i = 0; i < instr->num_srcs; i++) {
6401       nir_src nir_src = instr->src[i].src;
6402       elk_fs_reg src = get_nir_src(ntb, nir_src);
6403       switch (instr->src[i].src_type) {
6404       case nir_tex_src_bias:
6405          assert(!got_lod);
6406          got_bias = true;
6407 
6408          srcs[TEX_LOGICAL_SRC_LOD] =
6409             retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_F);
6410          break;
6411       case nir_tex_src_comparator:
6412          srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, ELK_REGISTER_TYPE_F);
6413          break;
6414       case nir_tex_src_coord:
6415          switch (instr->op) {
6416          case nir_texop_txf:
6417          case nir_texop_txf_ms:
6418          case nir_texop_txf_ms_mcs_intel:
6419          case nir_texop_samples_identical:
6420             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, ELK_REGISTER_TYPE_D);
6421             break;
6422          default:
6423             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, ELK_REGISTER_TYPE_F);
6424             break;
6425          }
6426          break;
6427       case nir_tex_src_ddx:
6428          srcs[TEX_LOGICAL_SRC_LOD] = retype(src, ELK_REGISTER_TYPE_F);
6429          lod_components = nir_tex_instr_src_size(instr, i);
6430          break;
6431       case nir_tex_src_ddy:
6432          srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, ELK_REGISTER_TYPE_F);
6433          break;
6434       case nir_tex_src_lod:
6435          assert(!got_bias);
6436          got_lod = true;
6437 
6438          switch (instr->op) {
6439          case nir_texop_txs:
6440             srcs[TEX_LOGICAL_SRC_LOD] =
6441                retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_UD);
6442             break;
6443          case nir_texop_txf:
6444             srcs[TEX_LOGICAL_SRC_LOD] =
6445                retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_D);
6446             break;
6447          default:
6448             srcs[TEX_LOGICAL_SRC_LOD] =
6449                retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_F);
6450             break;
6451          }
6452          break;
6453       case nir_tex_src_min_lod:
6454          srcs[TEX_LOGICAL_SRC_MIN_LOD] =
6455             retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_F);
6456          break;
6457       case nir_tex_src_ms_index:
6458          srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, ELK_REGISTER_TYPE_UD);
6459          break;
6460 
6461       case nir_tex_src_offset: {
6462          uint32_t offset_bits = 0;
6463          if (elk_texture_offset(instr, i, &offset_bits)) {
6464             header_bits |= offset_bits;
6465          } else {
6466             srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
6467                retype(src, ELK_REGISTER_TYPE_D);
6468          }
6469          break;
6470       }
6471 
6472       case nir_tex_src_projector:
6473          unreachable("should be lowered");
6474 
6475       case nir_tex_src_texture_offset: {
6476          assert(srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE);
6477          /* Emit code to evaluate the actual indexing expression */
6478          if (instr->texture_index == 0 && is_resource_src(nir_src))
6479             srcs[TEX_LOGICAL_SRC_SURFACE] = get_resource_nir_src(ntb, nir_src);
6480          if (srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE) {
6481             elk_fs_reg tmp = s.vgrf(glsl_uint_type());
6482             bld.ADD(tmp, src, elk_imm_ud(instr->texture_index));
6483             srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
6484          }
6485          assert(srcs[TEX_LOGICAL_SRC_SURFACE].file != BAD_FILE);
6486          break;
6487       }
6488 
6489       case nir_tex_src_sampler_offset: {
6490          /* Emit code to evaluate the actual indexing expression */
6491          if (instr->sampler_index == 0 && is_resource_src(nir_src))
6492             srcs[TEX_LOGICAL_SRC_SAMPLER] = get_resource_nir_src(ntb, nir_src);
6493          if (srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE) {
6494             elk_fs_reg tmp = s.vgrf(glsl_uint_type());
6495             bld.ADD(tmp, src, elk_imm_ud(instr->sampler_index));
6496             srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
6497          }
6498          break;
6499       }
6500 
6501       case nir_tex_src_texture_handle:
6502          assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1);
6503          srcs[TEX_LOGICAL_SRC_SURFACE] = elk_fs_reg();
6504          if (is_resource_src(nir_src))
6505             srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = get_resource_nir_src(ntb, nir_src);
6506          if (srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE].file == BAD_FILE)
6507             srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src);
6508          break;
6509 
6510       case nir_tex_src_sampler_handle:
6511          assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
6512          srcs[TEX_LOGICAL_SRC_SAMPLER] = elk_fs_reg();
6513          if (is_resource_src(nir_src))
6514             srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = get_resource_nir_src(ntb, nir_src);
6515          if (srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE].file == BAD_FILE)
6516             srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src);
6517          break;
6518 
6519       case nir_tex_src_ms_mcs_intel:
6520          assert(instr->op == nir_texop_txf_ms);
6521          srcs[TEX_LOGICAL_SRC_MCS] = retype(src, ELK_REGISTER_TYPE_D);
6522          break;
6523 
6524       /* If this parameter is present, we are packing either the explicit LOD
6525        * or LOD bias and the array index into a single (32-bit) value when
6526        * 32-bit texture coordinates are used.
6527        */
6528       case nir_tex_src_backend1:
6529          assert(!got_lod && !got_bias);
6530          got_lod = true;
6531 
6532          assert(instr->op == nir_texop_txl || instr->op == nir_texop_txb);
6533          srcs[TEX_LOGICAL_SRC_LOD] =
6534             retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_F);
6535          break;
6536 
6537       default:
6538          unreachable("unknown texture source");
6539       }
6540    }
6541 
6542    /* If the surface or sampler were not specified through sources, use the
6543     * instruction index.
6544     */
6545    if (srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE &&
6546        srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE].file == BAD_FILE)
6547       srcs[TEX_LOGICAL_SRC_SURFACE] = elk_imm_ud(instr->texture_index);
6548    if (srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE &&
6549        srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE].file == BAD_FILE)
6550       srcs[TEX_LOGICAL_SRC_SAMPLER] = elk_imm_ud(instr->sampler_index);
6551 
6552    if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
6553        (instr->op == nir_texop_txf_ms ||
6554         instr->op == nir_texop_samples_identical)) {
6555       if (devinfo->ver >= 7) {
6556          srcs[TEX_LOGICAL_SRC_MCS] =
6557             emit_mcs_fetch(ntb, srcs[TEX_LOGICAL_SRC_COORDINATE],
6558                            instr->coord_components,
6559                            srcs[TEX_LOGICAL_SRC_SURFACE],
6560                            srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]);
6561       } else {
6562          srcs[TEX_LOGICAL_SRC_MCS] = elk_imm_ud(0u);
6563       }
6564    }
6565 
6566    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = elk_imm_d(instr->coord_components);
6567    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = elk_imm_d(lod_components);
6568 
6569    enum elk_opcode opcode;
6570    switch (instr->op) {
6571    case nir_texop_tex:
6572       opcode = ELK_SHADER_OPCODE_TEX_LOGICAL;
6573       break;
6574    case nir_texop_txb:
6575       opcode = ELK_FS_OPCODE_TXB_LOGICAL;
6576       break;
6577    case nir_texop_txl:
6578       opcode = ELK_SHADER_OPCODE_TXL_LOGICAL;
6579       break;
6580    case nir_texop_txd:
6581       opcode = ELK_SHADER_OPCODE_TXD_LOGICAL;
6582       break;
6583    case nir_texop_txf:
6584       opcode = ELK_SHADER_OPCODE_TXF_LOGICAL;
6585       break;
6586    case nir_texop_txf_ms:
6587       opcode = ELK_SHADER_OPCODE_TXF_CMS_LOGICAL;
6588       break;
6589    case nir_texop_txf_ms_mcs_intel:
6590       opcode = ELK_SHADER_OPCODE_TXF_MCS_LOGICAL;
6591       break;
6592    case nir_texop_query_levels:
6593    case nir_texop_txs:
6594       opcode = ELK_SHADER_OPCODE_TXS_LOGICAL;
6595       break;
6596    case nir_texop_lod:
6597       opcode = ELK_SHADER_OPCODE_LOD_LOGICAL;
6598       break;
6599    case nir_texop_tg4:
6600       if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
6601          opcode = ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL;
6602       else
6603          opcode = ELK_SHADER_OPCODE_TG4_LOGICAL;
6604       break;
6605    case nir_texop_texture_samples:
6606       opcode = ELK_SHADER_OPCODE_SAMPLEINFO_LOGICAL;
6607       break;
6608    case nir_texop_samples_identical: {
6609       elk_fs_reg dst = retype(get_nir_def(ntb, instr->def), ELK_REGISTER_TYPE_D);
6610 
6611       /* If mcs is an immediate value, it means there is no MCS.  In that case
6612        * just return false.
6613        */
6614       if (srcs[TEX_LOGICAL_SRC_MCS].file == ELK_IMMEDIATE_VALUE) {
6615          bld.MOV(dst, elk_imm_ud(0u));
6616       } else {
6617          bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], elk_imm_ud(0u),
6618                  ELK_CONDITIONAL_EQ);
6619       }
6620       return;
6621    }
6622    default:
6623       unreachable("unknown texture opcode");
6624    }
6625 
6626    if (instr->op == nir_texop_tg4) {
6627       if (instr->component == 1 &&
6628           s.key_tex->gather_channel_quirk_mask & (1 << instr->texture_index)) {
6629          /* gather4 sampler is broken for green channel on RG32F --
6630           * we must ask for blue instead.
6631           */
6632          header_bits |= 2 << 16;
6633       } else {
6634          header_bits |= instr->component << 16;
6635       }
6636    }
6637 
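
   /* header_bits collects the immediate texel offsets and, for gather4, the
    * channel select; it is passed along through inst->offset so it can end up
    * in the sampler message header.
    */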
6638    elk_fs_reg dst = bld.vgrf(elk_type_for_nir_type(devinfo, instr->dest_type), 4 + instr->is_sparse);
6639    elk_fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
6640    inst->offset = header_bits;
6641 
6642    const unsigned dest_size = nir_tex_instr_dest_size(instr);
6643    inst->size_written = 4 * inst->dst.component_size(inst->exec_size) +
6644                         (instr->is_sparse ? (reg_unit(devinfo) * REG_SIZE) : 0);
6645 
6646    if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
6647       inst->shadow_compare = true;
6648 
6649    /* Wa_14012688258:
6650     *
6651     * Don't trim zeros at the end of the payload for sample operations
6652     * in cube and cube arrays.
6653     */
6654    if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
6655        intel_needs_workaround(devinfo, 14012688258)) {
6656 
6657       /* The compiler should send the U, V, R parameters even if V and R are 0. */
6658       if (srcs[TEX_LOGICAL_SRC_COORDINATE].file != BAD_FILE)
6659          assert(instr->coord_components >= 3u);
6660 
6661       /* See opt_zero_samples(). */
6662       inst->keep_payload_trailing_zeros = true;
6663    }
6664 
6665    elk_fs_reg nir_dest[5];
6666    for (unsigned i = 0; i < dest_size; i++)
6667       nir_dest[i] = offset(dst, bld, i);
6668 
6669    if (instr->op == nir_texop_query_levels) {
6670       /* # levels is in .w */
6671       /**
6672        * Wa_1940217:
6673        *
6674        * When a surface of type SURFTYPE_NULL is accessed by resinfo, the
6675        * MIPCount returned is undefined instead of 0.
6676        */
6677       elk_fs_inst *mov = bld.MOV(bld.null_reg_d(), dst);
6678       mov->conditional_mod = ELK_CONDITIONAL_NZ;
6679       nir_dest[0] = bld.vgrf(ELK_REGISTER_TYPE_D);
6680       elk_fs_inst *sel = bld.SEL(nir_dest[0], offset(dst, bld, 3), elk_imm_d(0));
6681       sel->predicate = ELK_PREDICATE_NORMAL;
6682    } else if (instr->op == nir_texop_txs &&
6683               dest_size >= 3 && devinfo->ver < 7) {
6684       /* Gfx4-6 return 0 instead of 1 for single layer surfaces. */
6685       elk_fs_reg depth = offset(dst, bld, 2);
6686       nir_dest[2] = s.vgrf(glsl_int_type());
6687       bld.emit_minmax(nir_dest[2], depth, elk_imm_d(1), ELK_CONDITIONAL_GE);
6688    }
6689 
6690    /* The residency bits are only in the first component. */
6691    if (instr->is_sparse)
6692       nir_dest[dest_size - 1] = component(offset(dst, bld, dest_size - 1), 0);
6693 
6694    bld.LOAD_PAYLOAD(get_nir_def(ntb, instr->def), nir_dest, dest_size, 0);
6695 }
6696 
6697 static void
6698 fs_nir_emit_jump(nir_to_elk_state &ntb, nir_jump_instr *instr)
6699 {
6700    switch (instr->type) {
6701    case nir_jump_break:
6702       ntb.bld.emit(ELK_OPCODE_BREAK);
6703       break;
6704    case nir_jump_continue:
6705       ntb.bld.emit(ELK_OPCODE_CONTINUE);
6706       break;
6707    case nir_jump_halt:
6708       ntb.bld.emit(ELK_OPCODE_HALT);
6709       break;
6710    case nir_jump_return:
6711    default:
6712       unreachable("unknown jump");
6713    }
6714 }
6715 
6716 /*
6717  * This helper takes a source register and un/shuffles it into the destination
6718  * register.
6719  *
6720  * If the source type size is smaller than the destination type size, the
6721  * operation needed is a component shuffle; the opposite case is an unshuffle.
6722  * If the source and destination type sizes are equal, the shuffle reduces to
6723  * a simple per-component MOV.
6724  *
6725  * For example, suppose the source is a 16-bit type and the destination is
6726  * 32-bit.  A 3-component .xyz 16-bit vector on SIMD8 would be laid out as:
6727  *
6728  *    |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
6729  *    |z1|z2|z3|z4|z5|z6|z7|z8|  |  |  |  |  |  |  |  |
6730  *
6731  * This helper will return the following 2 32-bit components with the 16-bit
6732  * values shuffled:
6733  *
6734  *    |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
6735  *    |z1   |z2   |z3   |z4   |z5   |z6   |z7   |z8   |
6736  *
6737  * For unshuffle, consider the opposite case: a 64-bit source type and a
6738  * 32-bit destination.  A 2-component .xy 64-bit vector on SIMD8
6739  * would be:
6740  *
6741  *    | x1l   x1h | x2l   x2h | x3l   x3h | x4l   x4h |
6742  *    | x5l   x5h | x6l   x6h | x7l   x7h | x8l   x8h |
6743  *    | y1l   y1h | y2l   y2h | y3l   y3h | y4l   y4h |
6744  *    | y5l   y5h | y6l   y6h | y7l   y7h | y8l   y8h |
6745  *
6746  * The returned result would be the following 4 32-bit components unshuffled:
6747  *
6748  *    | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
6749  *    | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
6750  *    | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
6751  *    | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
6752  *
6753  * - The source and destination registers must not overlap.
6754  * - Component units are measured in terms of the smaller of the source and
6755  *   destination types, because we are un/shuffling the smaller components
6756  *   from/into the bigger ones.
6757  * - The first_component parameter allows skipping leading source components.
6758  */
6759 void
6760 elk_shuffle_src_to_dst(const fs_builder &bld,
6761                    const elk_fs_reg &dst,
6762                    const elk_fs_reg &src,
6763                    uint32_t first_component,
6764                    uint32_t components)
6765 {
6766    if (type_sz(src.type) == type_sz(dst.type)) {
6767       assert(!regions_overlap(dst,
6768          type_sz(dst.type) * bld.dispatch_width() * components,
6769          offset(src, bld, first_component),
6770          type_sz(src.type) * bld.dispatch_width() * components));
6771       for (unsigned i = 0; i < components; i++) {
6772          bld.MOV(retype(offset(dst, bld, i), src.type),
6773                  offset(src, bld, i + first_component));
6774       }
6775    } else if (type_sz(src.type) < type_sz(dst.type)) {
6776       /* Source is shuffled into destination */
6777       unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
6778       assert(!regions_overlap(dst,
6779          type_sz(dst.type) * bld.dispatch_width() *
6780          DIV_ROUND_UP(components, size_ratio),
6781          offset(src, bld, first_component),
6782          type_sz(src.type) * bld.dispatch_width() * components));
6783 
6784       elk_reg_type shuffle_type =
6785          elk_reg_type_from_bit_size(8 * type_sz(src.type),
6786                                     ELK_REGISTER_TYPE_D);
6787       for (unsigned i = 0; i < components; i++) {
6788          elk_fs_reg shuffle_component_i =
6789             subscript(offset(dst, bld, i / size_ratio),
6790                       shuffle_type, i % size_ratio);
6791          bld.MOV(shuffle_component_i,
6792                  retype(offset(src, bld, i + first_component), shuffle_type));
6793       }
6794    } else {
6795       /* Source is unshuffled into destination */
6796       unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
6797       assert(!regions_overlap(dst,
6798          type_sz(dst.type) * bld.dispatch_width() * components,
6799          offset(src, bld, first_component / size_ratio),
6800          type_sz(src.type) * bld.dispatch_width() *
6801          DIV_ROUND_UP(components + (first_component % size_ratio),
6802                       size_ratio)));
6803 
6804       elk_reg_type shuffle_type =
6805          elk_reg_type_from_bit_size(8 * type_sz(dst.type),
6806                                     ELK_REGISTER_TYPE_D);
6807       for (unsigned i = 0; i < components; i++) {
6808          elk_fs_reg shuffle_component_i =
6809             subscript(offset(src, bld, (first_component + i) / size_ratio),
6810                       shuffle_type, (first_component + i) % size_ratio);
6811          bld.MOV(retype(offset(dst, bld, i), shuffle_type),
6812                  shuffle_component_i);
6813       }
6814    }
6815 }
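
/* Illustrative sketch, not part of the original file: a hypothetical call that
 * realizes the 16-bit -> 32-bit example from the comment above.  The register
 * names and types here (src16, dst32, ELK_REGISTER_TYPE_W/D) are assumptions
 * chosen for the example, not taken from any particular caller.
 *
 *    // 3 16-bit components (.xyz); the SIMD width comes from bld.
 *    elk_fs_reg src16 = bld.vgrf(ELK_REGISTER_TYPE_W, 3);
 *    // DIV_ROUND_UP(3, 2) = 2 32-bit components are enough to hold them.
 *    elk_fs_reg dst32 = bld.vgrf(ELK_REGISTER_TYPE_D, 2);
 *
 *    // x/y land in the low/high 16-bit halves of dst32 component 0,
 *    // z in the low half of component 1, as in the diagram above.
 *    elk_shuffle_src_to_dst(bld, dst32, src16, 0, 3);
 */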
6816 
6817 void
6818 elk_shuffle_from_32bit_read(const fs_builder &bld,
6819                         const elk_fs_reg &dst,
6820                         const elk_fs_reg &src,
6821                         uint32_t first_component,
6822                         uint32_t components)
6823 {
6824    assert(type_sz(src.type) == 4);
6825 
6826    /* This function takes components in units of the destination type, while
6827     * elk_shuffle_src_to_dst() takes components in units of the smaller type.
6828     */
6829    if (type_sz(dst.type) > 4) {
6830       assert(type_sz(dst.type) == 8);
6831       first_component *= 2;
6832       components *= 2;
6833    }
6834 
6835    elk_shuffle_src_to_dst(bld, dst, src, first_component, components);
6836 }
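
/* Illustrative sketch, not part of the original file: unpacking two 64-bit
 * components out of 32-bit read results.  The names read_result and dst64 are
 * assumptions for the example; DF is used as the 64-bit destination type.
 *
 *    // Four dwords returned by some 32-bit read.
 *    elk_fs_reg read_result = bld.vgrf(ELK_REGISTER_TYPE_UD, 4);
 *    elk_fs_reg dst64 = bld.vgrf(ELK_REGISTER_TYPE_DF, 2);
 *
 *    // components (2) is in units of the 64-bit destination type; the helper
 *    // doubles it internally before calling elk_shuffle_src_to_dst().
 *    elk_shuffle_from_32bit_read(bld, dst64, read_result, 0, 2);
 */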
6837 
6838 elk_fs_reg
6839 elk_setup_imm_df(const fs_builder &bld, double v)
6840 {
6841    const struct intel_device_info *devinfo = bld.shader->devinfo;
6842    assert(devinfo->ver >= 7);
6843 
6844    if (devinfo->ver >= 8)
6845       return elk_imm_df(v);
6846 
6847    /* gfx7.5 does not support DF immediates directly, but the DIM
6848     * instruction allows setting a 64-bit immediate value.
6849     */
6850    if (devinfo->platform == INTEL_PLATFORM_HSW) {
6851       const fs_builder ubld = bld.exec_all().group(1, 0);
6852       elk_fs_reg dst = ubld.vgrf(ELK_REGISTER_TYPE_DF, 1);
6853       ubld.DIM(dst, elk_imm_df(v));
6854       return component(dst, 0);
6855    }
6856 
6857    /* gfx7 does not support DF immediates, so we generate a 64-bit constant by
6858     * writing the low 32 bits of the constant to suboffset 0 of a VGRF, the
6859     * high 32 bits to suboffset 4, and then applying a stride of 0.
6860     *
6861     * Alternatively, we could produce a normal VGRF (without stride 0) by
6862     * writing to all the channels in the VGRF.  However, that would hit the
6863     * gfx7 bug where writes that span more than one register must be split
6864     * into instructions with a width of 4 (otherwise the write to the second
6865     * register runs into an execmask hardware bug), which isn't very
6866     * nice.
6867     */
6868    union {
6869       double d;
6870       struct {
6871          uint32_t i1;
6872          uint32_t i2;
6873       };
6874    } di;
6875 
6876    di.d = v;
6877 
6878    const fs_builder ubld = bld.exec_all().group(1, 0);
6879    const elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD, 2);
6880    ubld.MOV(tmp, elk_imm_ud(di.i1));
6881    ubld.MOV(horiz_offset(tmp, 1), elk_imm_ud(di.i2));
6882 
6883    return component(retype(tmp, ELK_REGISTER_TYPE_DF), 0);
6884 }
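
/* Worked example for the gfx7 path above (illustrative, not from the original
 * file): for v == 1.0 the IEEE-754 encoding is 0x3ff0000000000000, so on a
 * little-endian host di.i1 == 0x00000000 (low dword) and di.i2 == 0x3ff00000
 * (high dword).  The two MOVs store those dwords at suboffsets 0 and 4 of the
 * UD VGRF, and the final component(retype(...), 0) reads them back as a
 * single stride-0 DF value.
 */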
6885 
6886 elk_fs_reg
6887 elk_setup_imm_b(const fs_builder &bld, int8_t v)
6888 {
6889    const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_B);
6890    bld.MOV(tmp, elk_imm_w(v));
6891    return tmp;
6892 }
6893 
6894 elk_fs_reg
6895 elk_setup_imm_ub(const fs_builder &bld, uint8_t v)
6896 {
6897    const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UB);
6898    bld.MOV(tmp, elk_imm_uw(v));
6899    return tmp;
6900 }
6901 
6902 static void
6903 fs_nir_emit_instr(nir_to_elk_state &ntb, nir_instr *instr)
6904 {
6905    ntb.bld = ntb.bld.annotate(NULL, instr);
6906 
6907    switch (instr->type) {
6908    case nir_instr_type_alu:
6909       fs_nir_emit_alu(ntb, nir_instr_as_alu(instr), true);
6910       break;
6911 
6912    case nir_instr_type_deref:
6913       unreachable("All derefs should've been lowered");
6914       break;
6915 
6916    case nir_instr_type_intrinsic:
6917       switch (ntb.s.stage) {
6918       case MESA_SHADER_VERTEX:
6919          fs_nir_emit_vs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6920          break;
6921       case MESA_SHADER_TESS_CTRL:
6922          fs_nir_emit_tcs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6923          break;
6924       case MESA_SHADER_TESS_EVAL:
6925          fs_nir_emit_tes_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6926          break;
6927       case MESA_SHADER_GEOMETRY:
6928          fs_nir_emit_gs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6929          break;
6930       case MESA_SHADER_FRAGMENT:
6931          fs_nir_emit_fs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6932          break;
6933       case MESA_SHADER_COMPUTE:
6934          fs_nir_emit_cs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6935          break;
6936       default:
6937          unreachable("unsupported shader stage");
6938       }
6939       break;
6940 
6941    case nir_instr_type_tex:
6942       fs_nir_emit_texture(ntb, nir_instr_as_tex(instr));
6943       break;
6944 
6945    case nir_instr_type_load_const:
6946       fs_nir_emit_load_const(ntb, nir_instr_as_load_const(instr));
6947       break;
6948 
6949    case nir_instr_type_undef:
6950       /* We create a new VGRF for undefs on every use (by handling
6951        * them in get_nir_src()), rather than for each definition.
6952        * This helps register coalescing eliminate MOVs from undef.
6953        */
6954       break;
6955 
6956    case nir_instr_type_jump:
6957       fs_nir_emit_jump(ntb, nir_instr_as_jump(instr));
6958       break;
6959 
6960    default:
6961       unreachable("unknown instruction type");
6962    }
6963 }
6964 
6965 static unsigned
6966 elk_rnd_mode_from_nir(unsigned mode, unsigned *mask)
6967 {
6968    unsigned elk_mode = 0;
6969    *mask = 0;
6970 
6971    if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
6972         FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
6973         FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
6974        mode) {
6975       elk_mode |= ELK_RND_MODE_RTZ << ELK_CR0_RND_MODE_SHIFT;
6976       *mask |= ELK_CR0_RND_MODE_MASK;
6977    }
6978    if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
6979         FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
6980         FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
6981        mode) {
6982       elk_mode |= ELK_RND_MODE_RTNE << ELK_CR0_RND_MODE_SHIFT;
6983       *mask |= ELK_CR0_RND_MODE_MASK;
6984    }
6985    if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) {
6986       elk_mode |= ELK_CR0_FP16_DENORM_PRESERVE;
6987       *mask |= ELK_CR0_FP16_DENORM_PRESERVE;
6988    }
6989    if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) {
6990       elk_mode |= ELK_CR0_FP32_DENORM_PRESERVE;
6991       *mask |= ELK_CR0_FP32_DENORM_PRESERVE;
6992    }
6993    if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) {
6994       elk_mode |= ELK_CR0_FP64_DENORM_PRESERVE;
6995       *mask |= ELK_CR0_FP64_DENORM_PRESERVE;
6996    }
6997    if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16)
6998       *mask |= ELK_CR0_FP16_DENORM_PRESERVE;
6999    if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32)
7000       *mask |= ELK_CR0_FP32_DENORM_PRESERVE;
7001    if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64)
7002       *mask |= ELK_CR0_FP64_DENORM_PRESERVE;
7003    if (mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE)
7004       *mask |= ELK_CR0_FP_MODE_MASK;
7005 
7006    if (*mask != 0)
7007       assert((*mask & elk_mode) == elk_mode);
7008 
7009    return elk_mode;
7010 }
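
/* Illustrative example, not part of the original file: for
 *
 *    mode = FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
 *           FLOAT_CONTROLS_DENORM_PRESERVE_FP16;
 *
 * the helper returns
 *
 *    (ELK_RND_MODE_RTZ << ELK_CR0_RND_MODE_SHIFT) | ELK_CR0_FP16_DENORM_PRESERVE
 *
 * and sets *mask to ELK_CR0_RND_MODE_MASK | ELK_CR0_FP16_DENORM_PRESERVE.
 * emit_shader_float_controls_execution_mode() below passes that value/mask
 * pair to ELK_SHADER_OPCODE_FLOAT_CONTROL_MODE to update cr0.
 */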
7011 
7012 static void
7013 emit_shader_float_controls_execution_mode(nir_to_elk_state &ntb)
7014 {
7015    const fs_builder &bld = ntb.bld;
7016    elk_fs_visitor &s = ntb.s;
7017 
7018    unsigned execution_mode = s.nir->info.float_controls_execution_mode;
7019    if (execution_mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE)
7020       return;
7021 
7022    fs_builder ubld = bld.exec_all().group(1, 0);
7023    fs_builder abld = ubld.annotate("shader floats control execution mode");
7024    unsigned mask, mode = elk_rnd_mode_from_nir(execution_mode, &mask);
7025 
7026    if (mask == 0)
7027       return;
7028 
7029    abld.emit(ELK_SHADER_OPCODE_FLOAT_CONTROL_MODE, bld.null_reg_ud(),
7030              elk_imm_d(mode), elk_imm_d(mask));
7031 }
7032 
7033 void
7034 nir_to_elk(elk_fs_visitor *s)
7035 {
7036    nir_to_elk_state ntb = {
7037       .s       = *s,
7038       .nir     = s->nir,
7039       .devinfo = s->devinfo,
7040       .mem_ctx = ralloc_context(NULL),
7041       .bld     = fs_builder(s).at_end(),
7042    };
7043 
7044    emit_shader_float_controls_execution_mode(ntb);
7045 
7046    /* Emit the arrays used for inputs and outputs; load/store intrinsics will
7047     * be converted to reads/writes of these arrays.
7048     */
7049    fs_nir_setup_outputs(ntb);
7050    fs_nir_setup_uniforms(ntb.s);
7051    fs_nir_emit_system_values(ntb);
7052    ntb.s.last_scratch = ALIGN(ntb.nir->scratch_size, 4) * ntb.s.dispatch_width;
7053 
7054    fs_nir_emit_impl(ntb, nir_shader_get_entrypoint((nir_shader *)ntb.nir));
7055 
7056    ntb.bld.emit(ELK_SHADER_OPCODE_HALT_TARGET);
7057 
7058    ralloc_free(ntb.mem_ctx);
7059 }
7060 
7061