1 /*
2  * Copyright © 2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "brw_fs.h"
25 #include "brw_fs_builder.h"
26 #include "brw_nir.h"
27 #include "brw_eu.h"
28 #include "nir.h"
29 #include "nir_intrinsics.h"
30 #include "nir_search_helpers.h"
31 #include "dev/intel_debug.h"
32 #include "util/u_math.h"
33 #include "util/bitscan.h"
34 
35 #include <vector>
36 
37 using namespace brw;
38 
39 struct brw_fs_bind_info {
40    bool valid;
41    bool bindless;
42    unsigned block;
43    unsigned set;
44    unsigned binding;
45 };
46 
47 struct nir_to_brw_state {
48    fs_visitor &s;
49    const nir_shader *nir;
50    const intel_device_info *devinfo;
51    void *mem_ctx;
52 
53    /* Points to the end of the program.  Annotated with the current NIR
54     * instruction when applicable.
55     */
56    fs_builder bld;
57 
58    brw_reg *ssa_values;
59    struct brw_fs_bind_info *ssa_bind_infos;
60    brw_reg *system_values;
61 
62    bool annotate;
63 };
64 
65 static brw_reg get_nir_src(nir_to_brw_state &ntb, const nir_src &src, int channel = 0);
66 static brw_reg get_nir_def(nir_to_brw_state &ntb, const nir_def &def, bool all_sources_uniform = false);
67 static nir_component_mask_t get_nir_write_mask(const nir_def &def);
68 
69 static void fs_nir_emit_intrinsic(nir_to_brw_state &ntb, const fs_builder &bld, nir_intrinsic_instr *instr);
70 static brw_reg emit_samplepos_setup(nir_to_brw_state &ntb);
71 static brw_reg emit_sampleid_setup(nir_to_brw_state &ntb);
72 static brw_reg emit_samplemaskin_setup(nir_to_brw_state &ntb);
73 static brw_reg emit_shading_rate_setup(nir_to_brw_state &ntb);
74 
75 static void fs_nir_emit_impl(nir_to_brw_state &ntb, nir_function_impl *impl);
76 static void fs_nir_emit_cf_list(nir_to_brw_state &ntb, exec_list *list);
77 static void fs_nir_emit_if(nir_to_brw_state &ntb, nir_if *if_stmt);
78 static void fs_nir_emit_loop(nir_to_brw_state &ntb, nir_loop *loop);
79 static void fs_nir_emit_block(nir_to_brw_state &ntb, nir_block *block);
80 static void fs_nir_emit_instr(nir_to_brw_state &ntb, nir_instr *instr);
81 
82 static void fs_nir_emit_memory_access(nir_to_brw_state &ntb,
83                                       const fs_builder &bld,
84                                       const fs_builder &xbld,
85                                       nir_intrinsic_instr *instr);
86 
87 static void brw_combine_with_vec(const fs_builder &bld, const brw_reg &dst,
88                                  const brw_reg &src, unsigned n);
89 
90 static bool
91 brw_texture_offset(const nir_tex_instr *tex, unsigned src,
92                    uint32_t *offset_bits_out)
93 {
94    if (!nir_src_is_const(tex->src[src].src))
95       return false;
96 
97    const unsigned num_components = nir_tex_instr_src_size(tex, src);
98 
99    /* Combine all three offsets into a single unsigned dword:
100     *
101     *    bits 11:8 - U Offset (X component)
102     *    bits  7:4 - V Offset (Y component)
103     *    bits  3:0 - R Offset (Z component)
104     */
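   /* Illustrative example (not from the original source): offsets
    * (U, V, R) = (1, -2, 3) pack as (1 & 0xF) << 8 | (-2 & 0xF) << 4 |
    * (3 & 0xF) << 0 == 0x100 | 0x0E0 | 0x003 == 0x1E3.
    */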
105    uint32_t offset_bits = 0;
106    for (unsigned i = 0; i < num_components; i++) {
107       int offset = nir_src_comp_as_int(tex->src[src].src, i);
108 
109       /* offset out of bounds; caller will handle it. */
110       if (offset > 7 || offset < -8)
111          return false;
112 
113       const unsigned shift = 4 * (2 - i);
114       offset_bits |= (offset & 0xF) << shift;
115    }
116 
117    *offset_bits_out = offset_bits;
118 
119    return true;
120 }
121 
122 static brw_reg
123 setup_imm_b(const fs_builder &bld, int8_t v)
124 {
125    const brw_reg tmp = bld.vgrf(BRW_TYPE_B);
126    bld.MOV(tmp, brw_imm_w(v));
127    return tmp;
128 }
129 
130 static void
131 fs_nir_setup_outputs(nir_to_brw_state &ntb)
132 {
133    fs_visitor &s = ntb.s;
134 
135    if (s.stage == MESA_SHADER_TESS_CTRL ||
136        s.stage == MESA_SHADER_TASK ||
137        s.stage == MESA_SHADER_MESH ||
138        s.stage == MESA_SHADER_FRAGMENT ||
139        s.stage == MESA_SHADER_COMPUTE)
140       return;
141 
142    unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
143 
144    /* Calculate the size of output registers in a separate pass, before
145     * allocating them.  With ARB_enhanced_layouts, multiple output variables
146     * may occupy the same slot, but have different type sizes.
147     */
148    nir_foreach_shader_out_variable(var, s.nir) {
149       const int loc = var->data.driver_location;
150       const unsigned var_vec4s = nir_variable_count_slots(var, var->type);
151       vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
152    }
153 
154    for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
155       if (vec4s[loc] == 0) {
156          loc++;
157          continue;
158       }
159 
160       unsigned reg_size = vec4s[loc];
161 
162       /* Check if there are any ranges that start within this range and extend
163        * past it. If so, include them in this allocation.
164        */
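      /* Illustrative example (not from the original source): if vec4s[loc]
       * is 2 and vec4s[loc + 1] is 3, the second range extends one slot past
       * the first, so reg_size below becomes MAX2(3 + 1, 2) == 4.
       */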
165       for (unsigned i = 1; i < reg_size; i++) {
166          assert(i + loc < ARRAY_SIZE(vec4s));
167          reg_size = MAX2(vec4s[i + loc] + i, reg_size);
168       }
169 
170       brw_reg reg = ntb.bld.vgrf(BRW_TYPE_F, 4 * reg_size);
171       for (unsigned i = 0; i < reg_size; i++) {
172          assert(loc + i < ARRAY_SIZE(s.outputs));
173          s.outputs[loc + i] = offset(reg, ntb.bld, 4 * i);
174       }
175 
176       loc += reg_size;
177    }
178 }
179 
180 static void
181 fs_nir_setup_uniforms(fs_visitor &s)
182 {
183    const intel_device_info *devinfo = s.devinfo;
184 
185    /* Only the first compile gets to set up uniforms. */
186    if (s.uniforms)
187       return;
188 
189    s.uniforms = s.nir->num_uniforms / 4;
190 
191    if (gl_shader_stage_is_compute(s.stage) && devinfo->verx10 < 125) {
192       /* Add uniforms for builtins after regular NIR uniforms. */
193       assert(s.uniforms == s.prog_data->nr_params);
194 
195       /* Subgroup ID must be the last uniform on the list.  This will make
196  * it easier later to split between cross thread and per thread
197        * uniforms.
198        */
199       uint32_t *param = brw_stage_prog_data_add_params(s.prog_data, 1);
200       *param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
201       s.uniforms++;
202    }
203 }
204 
205 static brw_reg
206 emit_work_group_id_setup(nir_to_brw_state &ntb)
207 {
208    fs_visitor &s = ntb.s;
209    const fs_builder &bld = ntb.bld.scalar_group();
210 
211    assert(gl_shader_stage_is_compute(s.stage));
212 
213    brw_reg id = bld.vgrf(BRW_TYPE_UD, 3);
214 
215    id.is_scalar = true;
216 
217    struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_TYPE_UD));
218    bld.MOV(id, r0_1);
219 
220    struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_TYPE_UD));
221    struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_TYPE_UD));
222    bld.MOV(offset(id, bld, 1), r0_6);
223    bld.MOV(offset(id, bld, 2), r0_7);
224 
225    return id;
226 }
227 
228 static bool
229 emit_system_values_block(nir_to_brw_state &ntb, nir_block *block)
230 {
231    fs_visitor &s = ntb.s;
232    brw_reg *reg;
233 
234    nir_foreach_instr(instr, block) {
235       if (instr->type != nir_instr_type_intrinsic)
236          continue;
237 
238       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
239       switch (intrin->intrinsic) {
240       case nir_intrinsic_load_vertex_id:
241       case nir_intrinsic_load_base_vertex:
242          unreachable("should be lowered by nir_lower_system_values().");
243 
244       case nir_intrinsic_load_vertex_id_zero_base:
245       case nir_intrinsic_load_is_indexed_draw:
246       case nir_intrinsic_load_first_vertex:
247       case nir_intrinsic_load_instance_id:
248       case nir_intrinsic_load_base_instance:
249          unreachable("should be lowered by brw_nir_lower_vs_inputs().");
250          break;
251 
252       case nir_intrinsic_load_draw_id:
253          /* For Task/Mesh, draw_id will be handled later in
254           * nir_emit_mesh_task_intrinsic().
255           */
256          if (!gl_shader_stage_is_mesh(s.stage))
257             unreachable("should be lowered by brw_nir_lower_vs_inputs().");
258          break;
259 
260       case nir_intrinsic_load_invocation_id:
261          if (s.stage == MESA_SHADER_TESS_CTRL)
262             break;
263          assert(s.stage == MESA_SHADER_GEOMETRY);
264          reg = &ntb.system_values[SYSTEM_VALUE_INVOCATION_ID];
265          if (reg->file == BAD_FILE) {
266             *reg = s.gs_payload().instance_id;
267          }
268          break;
269 
270       case nir_intrinsic_load_sample_pos:
271       case nir_intrinsic_load_sample_pos_or_center:
272          assert(s.stage == MESA_SHADER_FRAGMENT);
273          reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_POS];
274          if (reg->file == BAD_FILE)
275             *reg = emit_samplepos_setup(ntb);
276          break;
277 
278       case nir_intrinsic_load_sample_id:
279          assert(s.stage == MESA_SHADER_FRAGMENT);
280          reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_ID];
281          if (reg->file == BAD_FILE)
282             *reg = emit_sampleid_setup(ntb);
283          break;
284 
285       case nir_intrinsic_load_sample_mask_in:
286          assert(s.stage == MESA_SHADER_FRAGMENT);
287          reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
288          if (reg->file == BAD_FILE)
289             *reg = emit_samplemaskin_setup(ntb);
290          break;
291 
292       case nir_intrinsic_load_workgroup_id:
293          if (gl_shader_stage_is_mesh(s.stage))
294             unreachable("should be lowered by nir_lower_compute_system_values().");
295          assert(gl_shader_stage_is_compute(s.stage));
296          reg = &ntb.system_values[SYSTEM_VALUE_WORKGROUP_ID];
297          if (reg->file == BAD_FILE)
298             *reg = emit_work_group_id_setup(ntb);
299          break;
300 
301       case nir_intrinsic_load_helper_invocation:
302          assert(s.stage == MESA_SHADER_FRAGMENT);
303          reg = &ntb.system_values[SYSTEM_VALUE_HELPER_INVOCATION];
304          if (reg->file == BAD_FILE) {
305             const fs_builder abld =
306                ntb.bld.annotate("gl_HelperInvocation");
307 
308             /* On Gfx6+ (gl_HelperInvocation is only exposed on Gfx7+) the
309              * pixel mask is in g1.7 of the thread payload.
310              *
311              * We move the per-channel pixel enable bit to the low bit of each
312              * channel by shifting the byte containing the pixel mask by the
313              * vector immediate 0x76543210UV.
314              *
315              * The region of <1,8,0> reads only 1 byte (the pixel masks for
316              * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
317              * masks for 2 and 3) in SIMD16.
318              */
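            /* Illustrative (not from the original source): for a pixel-mask
             * byte of 0b00001101, the per-channel shifts 0..7 from the
             * 0x76543210 vector immediate leave bit 0 of the shifted words
             * equal to 1,0,1,1,0,0,0,0 for channels 0..7; the leftover high
             * bits are dealt with by the negate/AND below.
             */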
319             brw_reg shifted = abld.vgrf(BRW_TYPE_UW);
320 
321             for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
322                const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
323                /* According to the "PS Thread Payload for Normal
324                 * Dispatch" pages on the BSpec, the dispatch mask is
325                 * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on
326                 * gfx6+.
327                 */
328                const struct brw_reg reg = s.devinfo->ver >= 20 ?
329                   xe2_vec1_grf(i, 15) : brw_vec1_grf(i + 1, 7);
330                hbld.SHR(offset(shifted, hbld, i),
331                         stride(retype(reg, BRW_TYPE_UB), 1, 8, 0),
332                         brw_imm_v(0x76543210));
333             }
334 
335             /* A set bit in the pixel mask means the channel is enabled, but
336              * that is the opposite of gl_HelperInvocation so we need to invert
337              * the mask.
338              *
339              * The negate source-modifier bit of logical instructions on Gfx8+
340              * performs 1's complement negation, so we can use that instead of
341              * a NOT instruction.
342              */
343             brw_reg inverted = negate(shifted);
344 
345             /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
346              * with 1 and negating.
347              */
348             brw_reg anded = abld.vgrf(BRW_TYPE_UD);
349             abld.AND(anded, inverted, brw_imm_uw(1));
350 
351             *reg = abld.MOV(negate(retype(anded, BRW_TYPE_D)));
352          }
353          break;
354 
355       case nir_intrinsic_load_frag_shading_rate:
356          reg = &ntb.system_values[SYSTEM_VALUE_FRAG_SHADING_RATE];
357          if (reg->file == BAD_FILE)
358             *reg = emit_shading_rate_setup(ntb);
359          break;
360 
361       default:
362          break;
363       }
364    }
365 
366    return true;
367 }
368 
369 static void
370 fs_nir_emit_system_values(nir_to_brw_state &ntb)
371 {
372    fs_visitor &s = ntb.s;
373 
374    ntb.system_values = ralloc_array(ntb.mem_ctx, brw_reg, SYSTEM_VALUE_MAX);
375    for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
376       ntb.system_values[i] = brw_reg();
377    }
378 
379    nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)s.nir);
380    nir_foreach_block(block, impl)
381       emit_system_values_block(ntb, block);
382 }
383 
384 static void
385 fs_nir_emit_impl(nir_to_brw_state &ntb, nir_function_impl *impl)
386 {
387    ntb.ssa_values = rzalloc_array(ntb.mem_ctx, brw_reg, impl->ssa_alloc);
388    ntb.ssa_bind_infos = rzalloc_array(ntb.mem_ctx, struct brw_fs_bind_info, impl->ssa_alloc);
389 
390    fs_nir_emit_cf_list(ntb, &impl->body);
391 }
392 
393 static void
394 fs_nir_emit_cf_list(nir_to_brw_state &ntb, exec_list *list)
395 {
396    exec_list_validate(list);
397    foreach_list_typed(nir_cf_node, node, node, list) {
398       switch (node->type) {
399       case nir_cf_node_if:
400          fs_nir_emit_if(ntb, nir_cf_node_as_if(node));
401          break;
402 
403       case nir_cf_node_loop:
404          fs_nir_emit_loop(ntb, nir_cf_node_as_loop(node));
405          break;
406 
407       case nir_cf_node_block:
408          fs_nir_emit_block(ntb, nir_cf_node_as_block(node));
409          break;
410 
411       default:
412          unreachable("Invalid CFG node block");
413       }
414    }
415 }
416 
417 static void
418 fs_nir_emit_if(nir_to_brw_state &ntb, nir_if *if_stmt)
419 {
420    const fs_builder &bld = ntb.bld;
421 
422    bool invert;
423    brw_reg cond_reg;
424 
425    /* If the condition has the form !other_condition, use other_condition as
426     * the source, but invert the predicate on the if instruction.
427     */
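   /* Illustrative (not from the original source): for a condition written as
    * !(a < b) in NIR, the flt result is used directly as the predicate source
    * and predicate_inverse is set on the IF, avoiding a separate NOT.
    */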
428    nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition);
429    if (cond != NULL && cond->op == nir_op_inot) {
430       invert = true;
431       cond_reg = get_nir_src(ntb, cond->src[0].src, cond->src[0].swizzle[0]);
432    } else {
433       invert = false;
434       cond_reg = get_nir_src(ntb, if_stmt->condition);
435    }
436 
437    /* first, put the condition into f0 */
438    fs_inst *inst = bld.MOV(bld.null_reg_d(),
439                            retype(cond_reg, BRW_TYPE_D));
440    inst->conditional_mod = BRW_CONDITIONAL_NZ;
441 
442    fs_inst *iff = bld.IF(BRW_PREDICATE_NORMAL);
443    iff->predicate_inverse = invert;
444 
445    fs_nir_emit_cf_list(ntb, &if_stmt->then_list);
446 
447    if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
448       bld.emit(BRW_OPCODE_ELSE);
449       fs_nir_emit_cf_list(ntb, &if_stmt->else_list);
450    }
451 
452    fs_inst *endif = bld.emit(BRW_OPCODE_ENDIF);
453 
454    /* Peephole: replace IF-JUMP-ENDIF with predicated jump */
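   /* Illustrative (not from the original source): an unpredicated BREAK or
    * CONTINUE that is the only instruction between the IF and ENDIF becomes a
    * predicated jump ((+f0) or (-f0), matching the IF), and the IF/ENDIF pair
    * is removed.
    */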
455    if (endif->prev->prev == iff) {
456       fs_inst *jump = (fs_inst *) endif->prev;
457       if (jump->predicate == BRW_PREDICATE_NONE &&
458           (jump->opcode == BRW_OPCODE_BREAK ||
459            jump->opcode == BRW_OPCODE_CONTINUE)) {
460          jump->predicate = iff->predicate;
461          jump->predicate_inverse = iff->predicate_inverse;
462          iff->exec_node::remove();
463          endif->exec_node::remove();
464       }
465    }
466 }
467 
468 static void
469 fs_nir_emit_loop(nir_to_brw_state &ntb, nir_loop *loop)
470 {
471    const fs_builder &bld = ntb.bld;
472 
473    assert(!nir_loop_has_continue_construct(loop));
474    bld.emit(BRW_OPCODE_DO);
475 
476    fs_nir_emit_cf_list(ntb, &loop->body);
477 
478    fs_inst *peep_while = bld.emit(BRW_OPCODE_WHILE);
479 
480    /* Peephole: replace (+f0) break; while with (-f0) while */
481    fs_inst *peep_break = (fs_inst *) peep_while->prev;
482 
483    if (peep_break->opcode == BRW_OPCODE_BREAK &&
484        peep_break->predicate != BRW_PREDICATE_NONE) {
485       peep_while->predicate = peep_break->predicate;
486       peep_while->predicate_inverse = !peep_break->predicate_inverse;
487       peep_break->exec_node::remove();
488    }
489 }
490 
491 static void
492 fs_nir_emit_block(nir_to_brw_state &ntb, nir_block *block)
493 {
494    fs_builder bld = ntb.bld;
495 
496    nir_foreach_instr(instr, block) {
497       fs_nir_emit_instr(ntb, instr);
498    }
499 
500    ntb.bld = bld;
501 }
502 
503 /**
504  * Recognizes a parent instruction of nir_op_extract_* and changes the type to
505  * match instr.
506  */
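/* Illustrative example (not from the original source): i2f32(extract_u8(a, 2))
 * can become a single MOV that converts byte 2 of a directly to float, rather
 * than a separate byte extract followed by a convert.
 */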
507 static bool
508 optimize_extract_to_float(nir_to_brw_state &ntb, const fs_builder &bld,
509                           nir_alu_instr *instr, const brw_reg &result)
510 {
511    const intel_device_info *devinfo = ntb.devinfo;
512 
513    /* No fast path for f16 (yet) or f64. */
514    assert(instr->op == nir_op_i2f32 || instr->op == nir_op_u2f32);
515 
516    if (!instr->src[0].src.ssa->parent_instr)
517       return false;
518 
519    if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
520       return false;
521 
522    nir_alu_instr *src0 =
523       nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
524 
525    unsigned bytes;
526    bool is_signed;
527 
528    switch (src0->op) {
529    case nir_op_extract_u8:
530    case nir_op_extract_u16:
531       bytes = src0->op == nir_op_extract_u8 ? 1 : 2;
532 
533       /* i2f(extract_u8(a, b)) and u2f(extract_u8(a, b)) produce the same
534        * result. Ditto for extract_u16.
535        */
536       is_signed = false;
537       break;
538 
539    case nir_op_extract_i8:
540    case nir_op_extract_i16:
541       bytes = src0->op == nir_op_extract_i8 ? 1 : 2;
542 
543       /* The fast path can't handle u2f(extract_i8(a, b)) because the implicit
544        * sign extension of the extract_i8 is lost. For example,
545        * u2f(extract_i8(0x0000ff00, 1)) should produce 4294967295.0, but a
546        * fast path could either give 255.0 (by implementing the fast path as
547        * u2f(extract_u8(x))) or -1.0 (by implementing the fast path as
548        * i2f(extract_i8(x))). At one point in time, we incorrectly implemented
549        * the former.
550        */
551       if (instr->op != nir_op_i2f32)
552          return false;
553 
554       is_signed = true;
555       break;
556 
557    default:
558       return false;
559    }
560 
561    unsigned element = nir_src_as_uint(src0->src[1].src);
562 
563    /* Element type to extract. */
564    const brw_reg_type type = brw_int_type(bytes, is_signed);
565 
566    brw_reg op0 = get_nir_src(ntb, src0->src[0].src, -1);
567    op0.type = brw_type_for_nir_type(devinfo,
568       (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
569                      nir_src_bit_size(src0->src[0].src)));
570 
571    /* It is not documented in the Bspec, but DG2 and newer platforms cannot do
572     * direct byte-to-float conversions from scalars. MR !30140 has more
573     * details. If the optimization is applied in cases that would require
574     * lower_regioning to do some lowering, the code generated will be much,
575     * much worse.
576     */
577    if (devinfo->verx10 >= 125 && bytes == 1) {
578    /* If the source is truly scalar, for example from the UNIFORM file, skip
579        * the optimize_extract_to_float optimization.
580        *
581        * Note: is_scalar values won't have zero stride until after the call to
582        * offset() below that applies the swizzle.
583        */
584       if (is_uniform(op0))
585          return false;
586 
587       /* If the dispatch width matches the scalar allocation width, then
588        * is_scalar can be demoted to non-is_scalar. This prevents offset() and
589        * component() (both called below) from setting the stride to zero, and
590        * that avoids the awful code generated by lower_regioning.
591        */
592       if (op0.is_scalar) {
593          const unsigned allocation_width = 8 * reg_unit(ntb.devinfo);
594          if (ntb.bld.dispatch_width() != allocation_width)
595             return false;
596 
597          assert(bld.dispatch_width() == allocation_width);
598          op0.is_scalar = false;
599       }
600    }
601 
602    op0 = offset(op0, bld, src0->src[0].swizzle[0]);
603 
604    /* If the dispatch width matches the scalar allocation width, offset() will
605     * not modify the stride, but having source stride <0;1,0> is advantageous.
606     */
607    if (op0.is_scalar)
608       op0 = component(op0, 0);
609 
610    /* Bspec "Register Region Restrictions" for Xe says:
611     *
612     *    "In case of all float point data types used in destination
613     *
614     *    1. Register Regioning patterns where register data bit location of
615     *       the LSB of the channels are changed between source and destination
616     *       are not supported on Src0 and Src1 except for broadcast of a
617     *       scalar."
618     *
619     * This restriction is enforced in brw_lower_regioning.  There is no
620     * reason to generate an optimized instruction that brw_lower_regioning
621     * will have to break up later.
622     */
623    if (devinfo->verx10 >= 125 && element != 0 && !is_uniform(op0))
624       return false;
625 
626    bld.MOV(result, subscript(op0, type, element));
627    return true;
628 }
629 
630 static bool
631 optimize_frontfacing_ternary(nir_to_brw_state &ntb,
632                              nir_alu_instr *instr,
633                              const brw_reg &result)
634 {
635    const intel_device_info *devinfo = ntb.devinfo;
636    fs_visitor &s = ntb.s;
637 
638    nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src);
639    if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face)
640       return false;
641 
642    if (!nir_src_is_const(instr->src[1].src) ||
643        !nir_src_is_const(instr->src[2].src))
644       return false;
645 
646    const float value1 = nir_src_as_float(instr->src[1].src);
647    const float value2 = nir_src_as_float(instr->src[2].src);
648    if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
649       return false;
650 
651    /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
652    assert(value1 == -value2);
653 
654    brw_reg tmp = ntb.bld.vgrf(BRW_TYPE_D);
655 
656    if (devinfo->ver >= 20) {
657       /* Gfx20+ has separate back-facing bits for each pair of
658        * subspans in order to support multiple polygons, so we need to
659        * use a <1;8,0> region in order to select the correct word for
660        * each channel.  Unfortunately they're no longer aligned to the
661        * sign bit of a 16-bit word, so a left shift is necessary.
662        */
663       brw_reg ff = ntb.bld.vgrf(BRW_TYPE_UW);
664 
665       for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
666          const fs_builder hbld = ntb.bld.group(16, i);
667          const struct brw_reg gi_uw = retype(xe2_vec1_grf(i, 9),
668                                              BRW_TYPE_UW);
669          hbld.SHL(offset(ff, hbld, i), stride(gi_uw, 1, 8, 0), brw_imm_ud(4));
670       }
671 
672       if (value1 == -1.0f)
673          ff.negate = true;
674 
675       ntb.bld.OR(subscript(tmp, BRW_TYPE_UW, 1), ff,
676                   brw_imm_uw(0x3f80));
677 
678    } else if (devinfo->ver >= 12 && s.max_polygons == 2) {
679       /* According to the BSpec "PS Thread Payload for Normal
680        * Dispatch", the front/back facing interpolation bit is stored
681        * as bit 15 of either the R1.1 or R1.6 poly info field, for the
682        * first and second polygons respectively in multipolygon PS
683        * dispatch mode.
684        */
685       assert(s.dispatch_width == 16);
686 
687       for (unsigned i = 0; i < s.max_polygons; i++) {
688          const fs_builder hbld = ntb.bld.group(8, i);
689          struct brw_reg g1 = retype(brw_vec1_grf(1, 1 + 5 * i),
690                                     BRW_TYPE_UW);
691 
692          if (value1 == -1.0f)
693             g1.negate = true;
694 
695          hbld.OR(subscript(offset(tmp, hbld, i), BRW_TYPE_UW, 1),
696                  g1, brw_imm_uw(0x3f80));
697       }
698 
699    } else if (devinfo->ver >= 12) {
700       /* Bit 15 of g1.1 is 0 if the polygon is front facing. */
701       brw_reg g1 = brw_reg(retype(brw_vec1_grf(1, 1), BRW_TYPE_W));
702 
703       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
704        *
705        *    or(8)  tmp.1<2>W  g1.1<0,1,0>W  0x00003f80W
706        *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
707        *
708        * and negate g1.1<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
709        */
710       if (value1 == -1.0f)
711          g1.negate = true;
712 
713       ntb.bld.OR(subscript(tmp, BRW_TYPE_W, 1),
714                   g1, brw_imm_uw(0x3f80));
715    } else {
716       /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
717       brw_reg g0 = brw_reg(retype(brw_vec1_grf(0, 0), BRW_TYPE_W));
718 
719       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
720        *
721        *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
722        *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
723        *
724        * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
725        *
726        * This negation looks like it's safe in practice, because bits 0:4 will
727        * surely be TRIANGLES
728        */
729 
730       if (value1 == -1.0f) {
731          g0.negate = true;
732       }
733 
734       ntb.bld.OR(subscript(tmp, BRW_TYPE_W, 1),
735                   g0, brw_imm_uw(0x3f80));
736    }
737    ntb.bld.AND(retype(result, BRW_TYPE_D), tmp, brw_imm_d(0xbf800000));
738 
739    return true;
740 }
741 
742 static brw_rnd_mode
743 brw_rnd_mode_from_nir_op (const nir_op op) {
744    switch (op) {
745    case nir_op_f2f16_rtz:
746       return BRW_RND_MODE_RTZ;
747    case nir_op_f2f16_rtne:
748       return BRW_RND_MODE_RTNE;
749    default:
750       unreachable("Operation doesn't support rounding mode");
751    }
752 }
753 
754 static brw_rnd_mode
755 brw_rnd_mode_from_execution_mode(unsigned execution_mode)
756 {
757    if (nir_has_any_rounding_mode_rtne(execution_mode))
758       return BRW_RND_MODE_RTNE;
759    if (nir_has_any_rounding_mode_rtz(execution_mode))
760       return BRW_RND_MODE_RTZ;
761    return BRW_RND_MODE_UNSPECIFIED;
762 }
763 
764 static brw_reg
765 prepare_alu_destination_and_sources(nir_to_brw_state &ntb,
766                                     const fs_builder &bld,
767                                     nir_alu_instr *instr,
768                                     brw_reg *op,
769                                     bool need_dest)
770 {
771    const intel_device_info *devinfo = ntb.devinfo;
772 
773    bool all_sources_uniform = true;
774    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
775       op[i] = get_nir_src(ntb, instr->src[i].src, -1);
776       op[i].type = brw_type_for_nir_type(devinfo,
777          (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
778                         nir_src_bit_size(instr->src[i].src)));
779 
780       /* is_scalar sources won't be is_uniform because get_nir_src was passed
781        * -1 as the channel.
782        */
783       if (!is_uniform(op[i]) && !op[i].is_scalar)
784          all_sources_uniform = false;
785    }
786 
787    brw_reg result =
788       need_dest ? get_nir_def(ntb, instr->def, all_sources_uniform) : bld.null_reg_ud();
789 
790    result.type = brw_type_for_nir_type(devinfo,
791       (nir_alu_type)(nir_op_infos[instr->op].output_type |
792                      instr->def.bit_size));
793 
794    /* Move and vecN instructions may still be vectorized.  Return the raw,
795     * vectorized source and destination so that fs_visitor::nir_emit_alu can
796     * handle it.  Other callers should not have to handle these kinds of
797     * instructions.
798     */
799    switch (instr->op) {
800    case nir_op_mov:
801    case nir_op_vec2:
802    case nir_op_vec3:
803    case nir_op_vec4:
804    case nir_op_vec8:
805    case nir_op_vec16:
806       return result;
807    default:
808       break;
809    }
810 
811    const bool is_scalar = result.is_scalar || (!need_dest && all_sources_uniform);
812    const fs_builder xbld = is_scalar ? bld.scalar_group() : bld;
813 
814    /* At this point, we have dealt with any instruction that operates on
815     * more than a single channel.  Therefore, we can just adjust the source
816     * and destination registers for that channel and emit the instruction.
817     */
818    unsigned channel = 0;
819    if (nir_op_infos[instr->op].output_size == 0) {
820       /* Since NIR is doing the scalarizing for us, we should only ever see
821        * vectorized operations with a single channel.
822        */
823       nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
824       assert(util_bitcount(write_mask) == 1);
825       channel = ffs(write_mask) - 1;
826 
827       result = offset(result, xbld, channel);
828    }
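   /* Illustrative (not from the original source): for a scalarized ALU op
    * whose write mask is 0x4, channel == 2, so the destination above and each
    * source below are offset to component 2 (via src[i].swizzle[2]).
    */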
829 
830    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
831       assert(nir_op_infos[instr->op].input_sizes[i] < 2);
832       op[i] = offset(op[i], xbld, instr->src[i].swizzle[channel]);
833 
834       /* If the dispatch width matches the scalar allocation width, offset()
835        * won't set the stride to zero. Force that here.
836        */
837       if (op[i].is_scalar)
838          op[i] = component(op[i], 0);
839    }
840 
841    return result;
842 }
843 
844 static brw_reg
845 resolve_source_modifiers(const fs_builder &bld, const brw_reg &src)
846 {
847    return (src.abs || src.negate) ? bld.MOV(src) : src;
848 }
849 
850 static void
851 resolve_inot_sources(nir_to_brw_state &ntb, const fs_builder &bld, nir_alu_instr *instr,
852                      brw_reg *op)
853 {
854    for (unsigned i = 0; i < 2; i++) {
855       nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src);
856 
857       if (inot_instr != NULL && inot_instr->op == nir_op_inot) {
858          /* The source of the inot is now the source of instr. */
859          prepare_alu_destination_and_sources(ntb, bld, inot_instr, &op[i], false);
860 
861          assert(!op[i].negate);
862          op[i].negate = true;
863       } else {
864          op[i] = resolve_source_modifiers(bld, op[i]);
865       }
866    }
867 }
868 
869 static bool
870 try_emit_b2fi_of_inot(nir_to_brw_state &ntb, const fs_builder &bld,
871                       brw_reg result,
872                       nir_alu_instr *instr)
873 {
874    const intel_device_info *devinfo = bld.shader->devinfo;
875 
876    if (devinfo->verx10 >= 125)
877       return false;
878 
879    nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src);
880 
881    if (inot_instr == NULL || inot_instr->op != nir_op_inot)
882       return false;
883 
884    /* HF is also possible as a destination on BDW+.  For nir_op_b2i, the set
885     * of valid size-changing combinations is a bit more complex.
886     *
887     * The source restriction is just because I was lazy about generating the
888     * constant below.
889     */
890    if (instr->def.bit_size != 32 ||
891        nir_src_bit_size(inot_instr->src[0].src) != 32)
892       return false;
893 
894    /* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0.  Since a can only be 0 or -1,
895     * this is float(1 + a).
896     */
897    brw_reg op;
898 
899    prepare_alu_destination_and_sources(ntb, bld, inot_instr, &op, false);
900 
901    /* Ignore the saturate modifier, if there is one.  The result of the
902     * arithmetic can only be 0 or 1, so the clamping will do nothing anyway.
903     */
904    bld.ADD(result, op, brw_imm_d(1));
905 
906    return true;
907 }
908 
909 static bool
910 is_const_zero(const nir_src &src)
911 {
912    return nir_src_is_const(src) && nir_src_as_int(src) == 0;
913 }
914 
915 static void
916 fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
917                 bool need_dest)
918 {
919    const intel_device_info *devinfo = ntb.devinfo;
920 
921    fs_inst *inst;
922    unsigned execution_mode =
923       ntb.bld.shader->nir->info.float_controls_execution_mode;
924 
925    brw_reg op[NIR_MAX_VEC_COMPONENTS];
926    brw_reg result = prepare_alu_destination_and_sources(ntb, ntb.bld, instr, op, need_dest);
927 
928 #ifndef NDEBUG
929    /* Everything except raw moves, some type conversions, iabs, and ineg
930     * should have 8-bit sources lowered by nir_lower_bit_size in
931     * brw_preprocess_nir or by brw_nir_lower_conversions in
932     * brw_postprocess_nir.
933     */
934    switch (instr->op) {
935    case nir_op_mov:
936    case nir_op_vec2:
937    case nir_op_vec3:
938    case nir_op_vec4:
939    case nir_op_vec8:
940    case nir_op_vec16:
941    case nir_op_i2f16:
942    case nir_op_i2f32:
943    case nir_op_i2i16:
944    case nir_op_i2i32:
945    case nir_op_u2f16:
946    case nir_op_u2f32:
947    case nir_op_u2u16:
948    case nir_op_u2u32:
949    case nir_op_iabs:
950    case nir_op_ineg:
951    case nir_op_pack_32_4x8_split:
952       break;
953 
954    default:
955       for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
956          assert(brw_type_size_bytes(op[i].type) > 1);
957       }
958    }
959 #endif
960 
961    const fs_builder &bld = result.is_scalar ? ntb.bld.scalar_group() : ntb.bld;
962 
963    switch (instr->op) {
964    case nir_op_mov:
965    case nir_op_vec2:
966    case nir_op_vec3:
967    case nir_op_vec4:
968    case nir_op_vec8:
969    case nir_op_vec16: {
970       brw_reg temp = result;
971       bool need_extra_copy = false;
972 
973       nir_intrinsic_instr *store_reg =
974          nir_store_reg_for_def(&instr->def);
975       if (store_reg != NULL) {
976          nir_def *dest_reg = store_reg->src[1].ssa;
977          for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
978             nir_intrinsic_instr *load_reg =
979                nir_load_reg_for_def(instr->src[i].src.ssa);
980             if (load_reg == NULL)
981                continue;
982 
983             if (load_reg->src[0].ssa == dest_reg) {
984                need_extra_copy = true;
985                temp = bld.vgrf(result.type, 4);
986                break;
987             }
988          }
989       }
990 
991       nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
992       unsigned last_bit = util_last_bit(write_mask);
993 
994       assert(last_bit <= NIR_MAX_VEC_COMPONENTS);
995       brw_reg comps[NIR_MAX_VEC_COMPONENTS];
996 
997       for (unsigned i = 0; i < last_bit; i++) {
998          if (instr->op == nir_op_mov)
999             comps[i] = offset(op[0], bld, instr->src[0].swizzle[i]);
1000          else
1001             comps[i] = offset(op[i], bld, instr->src[i].swizzle[0]);
1002       }
1003 
1004       if (write_mask == (1u << last_bit) - 1) {
1005          bld.VEC(temp, comps, last_bit);
1006       } else {
1007          for (unsigned i = 0; i < last_bit; i++) {
1008             if (write_mask & (1 << i))
1009                bld.MOV(offset(temp, bld, i), comps[i]);
1010          }
1011       }
1012 
1013       /* In this case the source and destination registers were the same,
1014        * so we need to insert an extra set of moves in order to deal with
1015        * any swizzling.
1016        */
1017       if (need_extra_copy) {
1018          for (unsigned i = 0; i < last_bit; i++) {
1019             if (!(write_mask & (1 << i)))
1020                continue;
1021 
1022             bld.MOV(offset(result, bld, i), offset(temp, bld, i));
1023          }
1024       }
1025       return;
1026    }
1027 
1028    case nir_op_i2f32:
1029    case nir_op_u2f32:
1030       if (optimize_extract_to_float(ntb, bld, instr, result))
1031          return;
1032       bld.MOV(result, op[0]);
1033       break;
1034 
1035    case nir_op_f2f16_rtne:
1036    case nir_op_f2f16_rtz:
1037    case nir_op_f2f16: {
1038       brw_rnd_mode rnd = BRW_RND_MODE_UNSPECIFIED;
1039 
1040       if (nir_op_f2f16 == instr->op)
1041          rnd = brw_rnd_mode_from_execution_mode(execution_mode);
1042       else
1043          rnd = brw_rnd_mode_from_nir_op(instr->op);
1044 
1045       if (BRW_RND_MODE_UNSPECIFIED != rnd)
1046          bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), brw_imm_d(rnd));
1047 
1048       assert(brw_type_size_bytes(op[0].type) < 8); /* brw_nir_lower_conversions */
1049       bld.MOV(result, op[0]);
1050       break;
1051    }
1052 
1053    case nir_op_b2i8:
1054    case nir_op_b2i16:
1055    case nir_op_b2i32:
1056    case nir_op_b2i64:
1057    case nir_op_b2f16:
1058    case nir_op_b2f32:
1059    case nir_op_b2f64:
1060       if (try_emit_b2fi_of_inot(ntb, bld, result, instr))
1061          break;
1062       op[0].type = BRW_TYPE_D;
1063       op[0].negate = !op[0].negate;
1064       FALLTHROUGH;
1065    case nir_op_i2f64:
1066    case nir_op_i2i64:
1067    case nir_op_u2f64:
1068    case nir_op_u2u64:
1069    case nir_op_f2f64:
1070    case nir_op_f2i64:
1071    case nir_op_f2u64:
1072    case nir_op_i2i32:
1073    case nir_op_u2u32:
1074    case nir_op_f2i32:
1075    case nir_op_f2u32:
1076    case nir_op_i2f16:
1077    case nir_op_u2f16:
1078    case nir_op_f2i16:
1079    case nir_op_f2u16:
1080    case nir_op_f2i8:
1081    case nir_op_f2u8:
1082       if (result.type == BRW_TYPE_B ||
1083           result.type == BRW_TYPE_UB ||
1084           result.type == BRW_TYPE_HF)
1085          assert(brw_type_size_bytes(op[0].type) < 8); /* brw_nir_lower_conversions */
1086 
1087       if (op[0].type == BRW_TYPE_B ||
1088           op[0].type == BRW_TYPE_UB ||
1089           op[0].type == BRW_TYPE_HF)
1090          assert(brw_type_size_bytes(result.type) < 8); /* brw_nir_lower_conversions */
1091 
1092       bld.MOV(result, op[0]);
1093       break;
1094 
1095    case nir_op_i2i8:
1096    case nir_op_u2u8:
1097       assert(brw_type_size_bytes(op[0].type) < 8); /* brw_nir_lower_conversions */
1098       FALLTHROUGH;
1099    case nir_op_i2i16:
1100    case nir_op_u2u16: {
1101       /* Emit better code for u2u8(extract_u8(a, b)) and similar patterns.
1102        * Emitting the instructions one by one results in two MOV instructions
1103        * that won't be propagated.  By handling both instructions here, a
1104        * single MOV is emitted.
1105        */
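      /* Illustrative (not from the original source): u2u16(extract_u8(a, 3))
       * becomes a single MOV from byte 3 of a (selected via subscript) to the
       * 16-bit destination, instead of an extract MOV followed by a convert
       * MOV.
       */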
1106       nir_alu_instr *extract_instr = nir_src_as_alu_instr(instr->src[0].src);
1107       if (extract_instr != NULL) {
1108          if (extract_instr->op == nir_op_extract_u8 ||
1109              extract_instr->op == nir_op_extract_i8) {
1110             prepare_alu_destination_and_sources(ntb, ntb.bld, extract_instr, op, false);
1111 
1112             const unsigned byte = nir_src_as_uint(extract_instr->src[1].src);
1113             const brw_reg_type type =
1114                brw_int_type(1, extract_instr->op == nir_op_extract_i8);
1115 
1116             op[0] = subscript(op[0], type, byte);
1117          } else if (extract_instr->op == nir_op_extract_u16 ||
1118                     extract_instr->op == nir_op_extract_i16) {
1119             prepare_alu_destination_and_sources(ntb, ntb.bld, extract_instr, op, false);
1120 
1121             const unsigned word = nir_src_as_uint(extract_instr->src[1].src);
1122             const brw_reg_type type =
1123                brw_int_type(2, extract_instr->op == nir_op_extract_i16);
1124 
1125             op[0] = subscript(op[0], type, word);
1126          }
1127       }
1128 
1129       bld.MOV(result, op[0]);
1130       break;
1131    }
1132 
1133    case nir_op_fsat:
1134       inst = bld.MOV(result, op[0]);
1135       inst->saturate = true;
1136       break;
1137 
1138    case nir_op_fneg:
1139    case nir_op_ineg:
1140       op[0].negate = true;
1141       bld.MOV(result, op[0]);
1142       break;
1143 
1144    case nir_op_fabs:
1145    case nir_op_iabs:
1146       op[0].negate = false;
1147       op[0].abs = true;
1148       bld.MOV(result, op[0]);
1149       break;
1150 
1151    case nir_op_f2f32:
1152       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1153          brw_rnd_mode rnd =
1154             brw_rnd_mode_from_execution_mode(execution_mode);
1155          bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1156                              brw_imm_d(rnd));
1157       }
1158 
1159       if (op[0].type == BRW_TYPE_HF)
1160          assert(brw_type_size_bytes(result.type) < 8); /* brw_nir_lower_conversions */
1161 
1162       bld.MOV(result, op[0]);
1163       break;
1164 
1165    case nir_op_fsign:
1166       unreachable("Should have been lowered by brw_nir_lower_fsign.");
1167 
1168    case nir_op_frcp:
1169       bld.RCP(result, op[0]);
1170       break;
1171 
1172    case nir_op_fexp2:
1173       bld.EXP2(result, op[0]);
1174       break;
1175 
1176    case nir_op_flog2:
1177       bld.LOG2(result, op[0]);
1178       break;
1179 
1180    case nir_op_fsin:
1181       bld.SIN(result, op[0]);
1182       break;
1183 
1184    case nir_op_fcos:
1185       bld.COS(result, op[0]);
1186       break;
1187 
1188    case nir_op_fadd:
1189       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1190          brw_rnd_mode rnd =
1191             brw_rnd_mode_from_execution_mode(execution_mode);
1192          bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1193                              brw_imm_d(rnd));
1194       }
1195       FALLTHROUGH;
1196    case nir_op_iadd:
1197       bld.ADD(result, op[0], op[1]);
1198       break;
1199 
1200    case nir_op_iadd3:
1201       assert(instr->def.bit_size < 64);
1202       bld.ADD3(result, op[0], op[1], op[2]);
1203       break;
1204 
1205    case nir_op_iadd_sat:
1206    case nir_op_uadd_sat:
1207       inst = bld.ADD(result, op[0], op[1]);
1208       inst->saturate = true;
1209       break;
1210 
1211    case nir_op_isub_sat:
1212       bld.emit(SHADER_OPCODE_ISUB_SAT, result, op[0], op[1]);
1213       break;
1214 
1215    case nir_op_usub_sat:
1216       bld.emit(SHADER_OPCODE_USUB_SAT, result, op[0], op[1]);
1217       break;
1218 
1219    case nir_op_irhadd:
1220    case nir_op_urhadd:
1221       assert(instr->def.bit_size < 64);
1222       bld.AVG(result, op[0], op[1]);
1223       break;
1224 
1225    case nir_op_ihadd:
1226    case nir_op_uhadd: {
1227       assert(instr->def.bit_size < 64);
1228 
1229       op[0] = resolve_source_modifiers(bld, op[0]);
1230       op[1] = resolve_source_modifiers(bld, op[1]);
1231 
1232       /* AVG(x, y) - ((x ^ y) & 1) */
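      /* Illustrative (not from the original source), assuming AVG rounds the
       * average up: uhadd(3, 4) -> AVG(3, 4) == 4, (3 ^ 4) & 1 == 1, so the
       * result is 4 - 1 == 3 == (3 + 4) >> 1.
       */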
1233       brw_reg one = retype(brw_imm_ud(1), result.type);
1234       bld.ADD(result, bld.AVG(op[0], op[1]),
1235               negate(bld.AND(bld.XOR(op[0], op[1]), one)));
1236       break;
1237    }
1238 
1239    case nir_op_fmul:
1240       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1241          brw_rnd_mode rnd =
1242             brw_rnd_mode_from_execution_mode(execution_mode);
1243          bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1244                              brw_imm_d(rnd));
1245       }
1246 
1247       bld.MUL(result, op[0], op[1]);
1248       break;
1249 
1250    case nir_op_imul_2x32_64:
1251    case nir_op_umul_2x32_64:
1252       bld.MUL(result, op[0], op[1]);
1253       break;
1254 
1255    case nir_op_imul_32x16:
1256    case nir_op_umul_32x16: {
1257       const bool ud = instr->op == nir_op_umul_32x16;
1258       const enum brw_reg_type word_type = ud ? BRW_TYPE_UW : BRW_TYPE_W;
1259       const enum brw_reg_type dword_type = ud ? BRW_TYPE_UD : BRW_TYPE_D;
1260 
1261       assert(instr->def.bit_size == 32);
1262 
1263       /* Before copy propagation there are no immediate values. */
1264       assert(op[0].file != IMM && op[1].file != IMM);
1265 
1266       op[1] = subscript(op[1], word_type, 0);
1267 
1268       bld.MUL(result, retype(op[0], dword_type), op[1]);
1269 
1270       break;
1271    }
1272 
1273    case nir_op_imul:
1274       assert(instr->def.bit_size < 64);
1275       bld.MUL(result, op[0], op[1]);
1276       break;
1277 
1278    case nir_op_imul_high:
1279    case nir_op_umul_high:
1280       assert(instr->def.bit_size < 64);
1281       if (instr->def.bit_size == 32) {
1282          bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
1283       } else {
1284          brw_reg tmp = bld.vgrf(brw_type_with_size(op[0].type, 32));
1285          bld.MUL(tmp, op[0], op[1]);
1286          bld.MOV(result, subscript(tmp, result.type, 1));
1287       }
1288       break;
1289 
1290    case nir_op_idiv:
1291    case nir_op_udiv:
1292       assert(instr->def.bit_size < 64);
1293       bld.INT_QUOTIENT(result, op[0], op[1]);
1294       break;
1295 
1296    case nir_op_uadd_carry:
1297       unreachable("Should have been lowered by carry_to_arith().");
1298 
1299    case nir_op_usub_borrow:
1300       unreachable("Should have been lowered by borrow_to_arith().");
1301 
1302    case nir_op_umod:
1303    case nir_op_irem:
1304       /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
1305        * appears that our hardware just does the right thing for signed
1306        * remainder.
1307        */
1308       assert(instr->def.bit_size < 64);
1309       bld.INT_REMAINDER(result, op[0], op[1]);
1310       break;
1311 
1312    case nir_op_imod: {
1313       /* Get a regular C-style remainder.  If a % b == 0, set the predicate. */
1314       bld.INT_REMAINDER(result, op[0], op[1]);
1315 
1316       /* Math instructions don't support conditional mod */
1317       inst = bld.MOV(bld.null_reg_d(), result);
1318       inst->conditional_mod = BRW_CONDITIONAL_NZ;
1319 
1320       /* Now, we need to determine if signs of the sources are different.
1321        * When we XOR the sources, the top bit is 0 if they are the same and 1
1322        * if they are different.  We can then use a conditional modifier to
1323        * turn that into a predicate.  This leads us to an XOR.l instruction.
1324        *
1325        * Technically, according to the PRM, you're not allowed to use .l on an
1326        * XOR instruction.  However, empirical experiments and Curro's reading
1327        * of the simulator source both indicate that it's safe.
1328        */
1329       bld.XOR(op[0], op[1], &inst);
1330       inst->predicate = BRW_PREDICATE_NORMAL;
1331       inst->conditional_mod = BRW_CONDITIONAL_L;
1332 
1333       /* If the result of the initial remainder operation is non-zero and the
1334        * two sources have different signs, add in a copy of op[1] to get the
1335        * final integer modulus value.
1336        */
1337       inst = bld.ADD(result, result, op[1]);
1338       inst->predicate = BRW_PREDICATE_NORMAL;
1339       break;
1340    }
1341 
1342    case nir_op_flt32:
1343    case nir_op_fge32:
1344    case nir_op_feq32:
1345    case nir_op_fneu32: {
1346       brw_reg dest = result;
1347 
1348       const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1349       if (bit_size != 32) {
1350          dest = bld.vgrf(op[0].type);
1351          bld.UNDEF(dest);
1352       }
1353 
1354       bld.CMP(dest, op[0], op[1], brw_cmod_for_nir_comparison(instr->op));
1355 
1356       /* The destination will now be used as a source, so select component 0
1357        * if it's is_scalar (as is done in get_nir_src).
1358        */
1359       if (bit_size != 32 && result.is_scalar)
1360          dest = component(dest, 0);
1361 
1362       if (bit_size > 32) {
1363          bld.MOV(result, subscript(dest, BRW_TYPE_UD, 0));
1364       } else if (bit_size < 32) {
1365          /* When we convert the result to 32-bit we need to be careful and do
1366           * it as a signed conversion to get sign extension (for 32-bit true)
1367           */
1368          const brw_reg_type src_type =
1369             brw_type_with_size(BRW_TYPE_D, bit_size);
1370 
1371          bld.MOV(retype(result, BRW_TYPE_D), retype(dest, src_type));
1372       }
1373       break;
1374    }
1375 
1376    case nir_op_ilt32:
1377    case nir_op_ult32:
1378    case nir_op_ige32:
1379    case nir_op_uge32:
1380    case nir_op_ieq32:
1381    case nir_op_ine32: {
1382       brw_reg dest = result;
1383 
1384       const uint32_t bit_size = brw_type_size_bits(op[0].type);
1385       if (bit_size != 32) {
1386          dest = bld.vgrf(op[0].type);
1387          bld.UNDEF(dest);
1388       }
1389 
1390       bld.CMP(dest, op[0], op[1],
1391               brw_cmod_for_nir_comparison(instr->op));
1392 
1393       /* The destination will now be used as a source, so select component 0
1394        * if it's is_scalar (as is done in get_nir_src).
1395        */
1396       if (bit_size != 32 && result.is_scalar)
1397          dest = component(dest, 0);
1398 
1399       if (bit_size > 32) {
1400          bld.MOV(result, subscript(dest, BRW_TYPE_UD, 0));
1401       } else if (bit_size < 32) {
1402          /* When we convert the result to 32-bit we need to be careful and do
1403           * it as a signed conversion to get sign extension (for 32-bit true)
1404           */
1405          const brw_reg_type src_type =
1406             brw_type_with_size(BRW_TYPE_D, bit_size);
1407 
1408          bld.MOV(retype(result, BRW_TYPE_D), retype(dest, src_type));
1409       }
1410       break;
1411    }
1412 
1413    case nir_op_inot: {
1414       nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src);
1415 
1416       if (inot_src_instr != NULL &&
1417           (inot_src_instr->op == nir_op_ior ||
1418            inot_src_instr->op == nir_op_ixor ||
1419            inot_src_instr->op == nir_op_iand)) {
1420          /* The sources of the source logical instruction are now the
1421           * sources of the instruction that will be generated.
1422           */
1423          prepare_alu_destination_and_sources(ntb, ntb.bld, inot_src_instr, op, false);
1424          resolve_inot_sources(ntb, bld, inot_src_instr, op);
1425 
1426          /* Smash all of the sources and destination to be signed.  This
1427           * doesn't matter for the operation of the instruction, but cmod
1428           * propagation fails on unsigned sources with negation (due to
1429           * fs_inst::can_do_cmod returning false).
1430           */
1431          result.type =
1432             brw_type_for_nir_type(devinfo,
1433                                   (nir_alu_type)(nir_type_int |
1434                                                  instr->def.bit_size));
1435          op[0].type =
1436             brw_type_for_nir_type(devinfo,
1437                                   (nir_alu_type)(nir_type_int |
1438                                                  nir_src_bit_size(inot_src_instr->src[0].src)));
1439          op[1].type =
1440             brw_type_for_nir_type(devinfo,
1441                                   (nir_alu_type)(nir_type_int |
1442                                                  nir_src_bit_size(inot_src_instr->src[1].src)));
1443 
1444          /* For XOR, only invert one of the sources.  Arbitrarily choose
1445           * the first source.
1446           */
1447          op[0].negate = !op[0].negate;
1448          if (inot_src_instr->op != nir_op_ixor)
1449             op[1].negate = !op[1].negate;
1450 
1451          switch (inot_src_instr->op) {
1452          case nir_op_ior:
1453             bld.AND(result, op[0], op[1]);
1454             return;
1455 
1456          case nir_op_iand:
1457             bld.OR(result, op[0], op[1]);
1458             return;
1459 
1460          case nir_op_ixor:
1461             bld.XOR(result, op[0], op[1]);
1462             return;
1463 
1464          default:
1465             unreachable("impossible opcode");
1466          }
1467       }
1468       op[0] = resolve_source_modifiers(bld, op[0]);
1469       bld.NOT(result, op[0]);
1470       break;
1471    }
1472 
1473    case nir_op_ixor:
1474       resolve_inot_sources(ntb, bld, instr, op);
1475       bld.XOR(result, op[0], op[1]);
1476       break;
1477    case nir_op_ior:
1478       resolve_inot_sources(ntb, bld, instr, op);
1479       bld.OR(result, op[0], op[1]);
1480       break;
1481    case nir_op_iand:
1482       resolve_inot_sources(ntb, bld, instr, op);
1483       bld.AND(result, op[0], op[1]);
1484       break;
1485 
1486    case nir_op_fdot2:
1487    case nir_op_fdot3:
1488    case nir_op_fdot4:
1489    case nir_op_b32all_fequal2:
1490    case nir_op_b32all_iequal2:
1491    case nir_op_b32all_fequal3:
1492    case nir_op_b32all_iequal3:
1493    case nir_op_b32all_fequal4:
1494    case nir_op_b32all_iequal4:
1495    case nir_op_b32any_fnequal2:
1496    case nir_op_b32any_inequal2:
1497    case nir_op_b32any_fnequal3:
1498    case nir_op_b32any_inequal3:
1499    case nir_op_b32any_fnequal4:
1500    case nir_op_b32any_inequal4:
1501       unreachable("Lowered by nir_lower_alu_reductions");
1502 
1503    case nir_op_ldexp:
1504       unreachable("not reached: should be handled by ldexp_to_arith()");
1505 
1506    case nir_op_fsqrt:
1507       bld.SQRT(result, op[0]);
1508       break;
1509 
1510    case nir_op_frsq:
1511       bld.RSQ(result, op[0]);
1512       break;
1513 
1514    case nir_op_ftrunc:
1515       bld.RNDZ(result, op[0]);
1516       break;
1517 
1518    case nir_op_fceil:
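      /* ceil(x) == -floor(-x); RNDD rounds toward negative infinity, so
       * negating the operand and the rounded result gives ceil.  (Descriptive
       * note, not from the original source.)
       */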
1519       bld.MOV(result, negate(bld.RNDD(negate(op[0]))));
1520       break;
1521    case nir_op_ffloor:
1522       bld.RNDD(result, op[0]);
1523       break;
1524    case nir_op_ffract:
1525       bld.FRC(result, op[0]);
1526       break;
1527    case nir_op_fround_even:
1528       bld.RNDE(result, op[0]);
1529       break;
1530 
1531    case nir_op_fquantize2f16: {
1532       brw_reg tmp16 = bld.vgrf(BRW_TYPE_D);
1533       brw_reg tmp32 = bld.vgrf(BRW_TYPE_F);
1534 
1535       /* The destination stride must be at least as big as the source stride. */
1536       tmp16 = subscript(tmp16, BRW_TYPE_HF, 0);
1537 
1538       /* Check for denormal */
1539       brw_reg abs_src0 = op[0];
1540       abs_src0.abs = true;
1541       bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
1542               BRW_CONDITIONAL_L);
1543       /* Get the appropriately signed zero */
1544       brw_reg zero = retype(bld.AND(retype(op[0], BRW_TYPE_UD),
1545                                    brw_imm_ud(0x80000000)), BRW_TYPE_F);
1546       /* Do the actual F32 -> F16 -> F32 conversion */
1547       bld.MOV(tmp16, op[0]);
1548       bld.MOV(tmp32, tmp16);
1549       /* Select that or zero based on normal status */
1550       inst = bld.SEL(result, zero, tmp32);
1551       inst->predicate = BRW_PREDICATE_NORMAL;
1552       break;
1553    }
1554 
1555    case nir_op_imin:
1556    case nir_op_umin:
1557    case nir_op_fmin:
1558       bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
1559       break;
1560 
1561    case nir_op_imax:
1562    case nir_op_umax:
1563    case nir_op_fmax:
1564       bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
1565       break;
1566 
1567    case nir_op_pack_snorm_2x16:
1568    case nir_op_pack_snorm_4x8:
1569    case nir_op_pack_unorm_2x16:
1570    case nir_op_pack_unorm_4x8:
1571    case nir_op_unpack_snorm_2x16:
1572    case nir_op_unpack_snorm_4x8:
1573    case nir_op_unpack_unorm_2x16:
1574    case nir_op_unpack_unorm_4x8:
1575    case nir_op_unpack_half_2x16:
1576    case nir_op_pack_half_2x16:
1577       unreachable("not reached: should be handled by lower_packing_builtins");
1578 
1579    case nir_op_unpack_half_2x16_split_x:
1580       bld.MOV(result, subscript(op[0], BRW_TYPE_HF, 0));
1581       break;
1582 
1583    case nir_op_unpack_half_2x16_split_y:
1584       bld.MOV(result, subscript(op[0], BRW_TYPE_HF, 1));
1585       break;
1586 
1587    case nir_op_pack_64_2x32_split:
1588    case nir_op_pack_32_2x16_split:
1589       bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
1590       break;
1591 
1592    case nir_op_pack_32_4x8_split:
1593       bld.emit(FS_OPCODE_PACK, result, op, 4);
1594       break;
1595 
1596    case nir_op_unpack_64_2x32_split_x:
1597    case nir_op_unpack_64_2x32_split_y: {
1598       if (instr->op == nir_op_unpack_64_2x32_split_x)
1599          bld.MOV(result, subscript(op[0], BRW_TYPE_UD, 0));
1600       else
1601          bld.MOV(result, subscript(op[0], BRW_TYPE_UD, 1));
1602       break;
1603    }
1604 
1605    case nir_op_unpack_32_2x16_split_x:
1606    case nir_op_unpack_32_2x16_split_y: {
1607       if (instr->op == nir_op_unpack_32_2x16_split_x)
1608          bld.MOV(result, subscript(op[0], BRW_TYPE_UW, 0));
1609       else
1610          bld.MOV(result, subscript(op[0], BRW_TYPE_UW, 1));
1611       break;
1612    }
1613 
1614    case nir_op_fpow:
1615       bld.POW(result, op[0], op[1]);
1616       break;
1617 
1618    case nir_op_bitfield_reverse:
1619       assert(instr->def.bit_size == 32);
1620       assert(nir_src_bit_size(instr->src[0].src) == 32);
1621       bld.BFREV(result, op[0]);
1622       break;
1623 
1624    case nir_op_bit_count:
1625       assert(instr->def.bit_size == 32);
1626       assert(nir_src_bit_size(instr->src[0].src) < 64);
1627       bld.CBIT(result, op[0]);
1628       break;
1629 
1630    case nir_op_uclz:
1631       assert(instr->def.bit_size == 32);
1632       assert(nir_src_bit_size(instr->src[0].src) == 32);
1633       bld.LZD(retype(result, BRW_TYPE_UD), op[0]);
1634       break;
1635 
1636    case nir_op_ifind_msb: {
1637       assert(instr->def.bit_size == 32);
1638       assert(nir_src_bit_size(instr->src[0].src) == 32);
1639 
1640       brw_reg tmp = bld.FBH(retype(op[0], BRW_TYPE_D));
1641 
1642       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1643        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1644        * subtract the result from 31 to convert the MSB count into an LSB
1645        * count.
1646        */
1647       brw_reg count_from_lsb = bld.ADD(negate(tmp), brw_imm_w(31));
1648 
1649       /* The high word of the FBH result will be 0xffff or 0x0000. After
1650        * calculating 31 - fbh, we can obtain the correct result for
1651        * ifind_msb(0) by ORing the (sign extended) upper word of the
1652        * intermediate result.
1653        */
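      /* Worked example: for an input of 0x00000100, FBH returns 23 and
       * count_from_lsb is 8.  For an input of 0, FBH returns 0xffffffff, so
       * the OR below forces the result to -1, matching GLSL's findMSB(0).
       */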
1654       bld.OR(result, count_from_lsb, subscript(tmp, BRW_TYPE_W, 1));
1655       break;
1656    }
1657 
1658    case nir_op_find_lsb:
1659       assert(instr->def.bit_size == 32);
1660       assert(nir_src_bit_size(instr->src[0].src) == 32);
1661       bld.FBL(result, op[0]);
1662       break;
1663 
1664    case nir_op_ubitfield_extract:
1665    case nir_op_ibitfield_extract:
1666       unreachable("should have been lowered");
1667    case nir_op_ubfe:
1668    case nir_op_ibfe:
1669       assert(instr->def.bit_size < 64);
1670       bld.BFE(result, op[2], op[1], op[0]);
1671       break;
1672    case nir_op_bfm:
1673       assert(instr->def.bit_size < 64);
1674       bld.BFI1(result, op[0], op[1]);
1675       break;
1676    case nir_op_bfi:
1677       assert(instr->def.bit_size < 64);
1678 
1679       /* bfi is ((...) | (~src0 & src2)). The second part is zero when src2 is
1680        * either 0 or src0. Replacing the 0 with another value can eliminate a
1681        * temporary register.
1682        */
1683       if (is_const_zero(instr->src[2].src))
1684          bld.BFI2(result, op[0], op[1], op[0]);
1685       else
1686          bld.BFI2(result, op[0], op[1], op[2]);
1687 
1688       break;
1689 
1690    case nir_op_bitfield_insert:
1691       unreachable("not reached: should have been lowered");
1692 
1693    /* With regard to implicit masking of the shift counts for 8- and 16-bit
1694     * types, the PRMs are **incorrect**. They falsely state that on Gen9+ only
1695     * the low bits of src1 matching the size of src0 (e.g., 4 bits for W or UW
1696     * src0) are used. The Bspec (backed by data from experimentation) states
1697     * that 0x3f is used for Q and UQ types, and 0x1f is used for **all** other
1698     * types.
1699     *
1700     * To match the behavior expected for the NIR opcodes, explicit masks for
1701     * 8- and 16-bit types must be added.
1702     */
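   /* For example, a 16-bit ishl by 17 must shift by 17 & 0xf == 1 under
    * NIR's definition, but the hardware would use 17 & 0x1f == 17 and
    * produce 0; hence the explicit AND with (bit_size - 1) below.
    */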
1703    case nir_op_ishl:
1704       if (instr->def.bit_size < 32) {
1705          bld.SHL(result,
1706                  op[0],
1707                  bld.AND(op[1], brw_imm_ud(instr->def.bit_size - 1)));
1708       } else {
1709          bld.SHL(result, op[0], op[1]);
1710       }
1711 
1712       break;
1713    case nir_op_ishr:
1714       if (instr->def.bit_size < 32) {
1715          bld.ASR(result,
1716                  op[0],
1717                  bld.AND(op[1], brw_imm_ud(instr->def.bit_size - 1)));
1718       } else {
1719          bld.ASR(result, op[0], op[1]);
1720       }
1721 
1722       break;
1723    case nir_op_ushr:
1724       if (instr->def.bit_size < 32) {
1725          bld.SHR(result,
1726                  op[0],
1727                  bld.AND(op[1], brw_imm_ud(instr->def.bit_size - 1)));
1728       } else {
1729          bld.SHR(result, op[0], op[1]);
1730       }
1731 
1732       break;
1733 
1734    case nir_op_urol:
1735       bld.ROL(result, op[0], op[1]);
1736       break;
1737    case nir_op_uror:
1738       bld.ROR(result, op[0], op[1]);
1739       break;
1740 
1741    case nir_op_pack_half_2x16_split:
1742       bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
1743       break;
1744 
1745    case nir_op_sdot_4x8_iadd:
1746    case nir_op_sdot_4x8_iadd_sat:
1747       inst = bld.DP4A(retype(result, BRW_TYPE_D),
1748                       retype(op[2], BRW_TYPE_D),
1749                       retype(op[0], BRW_TYPE_D),
1750                       retype(op[1], BRW_TYPE_D));
1751 
1752       if (instr->op == nir_op_sdot_4x8_iadd_sat)
1753          inst->saturate = true;
1754       break;
1755 
1756    case nir_op_udot_4x8_uadd:
1757    case nir_op_udot_4x8_uadd_sat:
1758       inst = bld.DP4A(retype(result, BRW_TYPE_UD),
1759                       retype(op[2], BRW_TYPE_UD),
1760                       retype(op[0], BRW_TYPE_UD),
1761                       retype(op[1], BRW_TYPE_UD));
1762 
1763       if (instr->op == nir_op_udot_4x8_uadd_sat)
1764          inst->saturate = true;
1765       break;
1766 
1767    case nir_op_sudot_4x8_iadd:
1768    case nir_op_sudot_4x8_iadd_sat:
1769       inst = bld.DP4A(retype(result, BRW_TYPE_D),
1770                       retype(op[2], BRW_TYPE_D),
1771                       retype(op[0], BRW_TYPE_D),
1772                       retype(op[1], BRW_TYPE_UD));
1773 
1774       if (instr->op == nir_op_sudot_4x8_iadd_sat)
1775          inst->saturate = true;
1776       break;
1777 
1778    case nir_op_ffma:
1779       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1780          brw_rnd_mode rnd =
1781             brw_rnd_mode_from_execution_mode(execution_mode);
1782          bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1783                              brw_imm_d(rnd));
1784       }
1785 
1786       bld.MAD(result, op[2], op[1], op[0]);
1787       break;
1788 
1789    case nir_op_flrp:
1790       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1791          brw_rnd_mode rnd =
1792             brw_rnd_mode_from_execution_mode(execution_mode);
1793          bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1794                              brw_imm_d(rnd));
1795       }
1796 
1797       bld.LRP(result, op[0], op[1], op[2]);
1798       break;
1799 
1800    case nir_op_b32csel:
1801       if (optimize_frontfacing_ternary(ntb, instr, result))
1802          return;
1803 
1804       bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
1805       inst = bld.SEL(result, op[1], op[2]);
1806       inst->predicate = BRW_PREDICATE_NORMAL;
1807       break;
1808 
1809    case nir_op_fcsel:
1810       bld.CSEL(result, op[1], op[2], op[0], BRW_CONDITIONAL_NZ);
1811       break;
1812 
1813    case nir_op_fcsel_gt:
1814       bld.CSEL(result, op[1], op[2], op[0], BRW_CONDITIONAL_G);
1815       break;
1816 
1817    case nir_op_fcsel_ge:
1818       bld.CSEL(result, op[1], op[2], op[0], BRW_CONDITIONAL_GE);
1819       break;
1820 
1821    case nir_op_extract_u8:
1822    case nir_op_extract_i8: {
1823       const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
1824       unsigned byte = nir_src_as_uint(instr->src[1].src);
1825 
1826       /* The PRMs say:
1827        *
1828        *    BDW+
1829        *    There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
1830        *    Use two instructions and a word or DWord intermediate integer type.
1831        */
1832       if (instr->def.bit_size == 64) {
1833          if (instr->op == nir_op_extract_i8) {
1834             /* If we need to sign extend, extract to a word first */
1835             brw_reg w_temp = bld.vgrf(BRW_TYPE_W);
1836             bld.MOV(w_temp, subscript(op[0], type, byte));
1837             bld.MOV(result, w_temp);
1838          } else if (byte & 1) {
1839             /* Extract the high byte from the word containing the desired byte
1840              * offset.
1841              */
1842             bld.SHR(result,
1843                     subscript(op[0], BRW_TYPE_UW, byte / 2),
1844                     brw_imm_uw(8));
1845          } else {
1846             /* Otherwise use an AND with 0xff and a word type */
1847             bld.AND(result,
1848                     subscript(op[0], BRW_TYPE_UW, byte / 2),
1849                     brw_imm_uw(0xff));
1850          }
1851       } else {
1852          bld.MOV(result, subscript(op[0], type, byte));
1853       }
1854       break;
1855    }
1856 
1857    case nir_op_extract_u16:
1858    case nir_op_extract_i16: {
1859       const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
1860       unsigned word = nir_src_as_uint(instr->src[1].src);
1861       bld.MOV(result, subscript(op[0], type, word));
1862       break;
1863    }
1864 
1865    default:
1866       unreachable("unhandled instruction");
1867    }
1868 }
1869 
1870 static void
1871 fs_nir_emit_load_const(nir_to_brw_state &ntb,
1872                        nir_load_const_instr *instr)
1873 {
1874    const intel_device_info *devinfo = ntb.devinfo;
1875    const fs_builder &bld = ntb.bld.scalar_group();
1876 
1877    const brw_reg_type reg_type =
1878       brw_type_with_size(BRW_TYPE_D, instr->def.bit_size);
1879    brw_reg reg = bld.vgrf(reg_type, instr->def.num_components);
1880 
1881    reg.is_scalar = true;
1882 
1883    brw_reg comps[NIR_MAX_VEC_COMPONENTS];
1884 
1885    switch (instr->def.bit_size) {
1886    case 8:
1887       for (unsigned i = 0; i < instr->def.num_components; i++)
1888          comps[i] = setup_imm_b(bld, instr->value[i].i8);
1889       break;
1890 
1891    case 16:
1892       for (unsigned i = 0; i < instr->def.num_components; i++)
1893          comps[i] = brw_imm_w(instr->value[i].i16);
1894       break;
1895 
1896    case 32:
1897       for (unsigned i = 0; i < instr->def.num_components; i++)
1898          comps[i] = brw_imm_d(instr->value[i].i32);
1899       break;
1900 
1901    case 64:
1902       if (!devinfo->has_64bit_int) {
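         /* Without native 64-bit integers the constant is emitted as a
          * double-float immediate instead; only the 64-bit pattern matters,
          * so reading the union member as f64 is equivalent to i64 here.
          */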
1903          reg.type = BRW_TYPE_DF;
1904          for (unsigned i = 0; i < instr->def.num_components; i++)
1905             comps[i] = brw_imm_df(instr->value[i].f64);
1906       } else {
1907          for (unsigned i = 0; i < instr->def.num_components; i++)
1908             comps[i] = brw_imm_q(instr->value[i].i64);
1909       }
1910       break;
1911 
1912    default:
1913       unreachable("Invalid bit size");
1914    }
1915 
1916    bld.VEC(reg, comps, instr->def.num_components);
1917 
1918    ntb.ssa_values[instr->def.index] = reg;
1919 }
1920 
1921 static bool
1922 get_nir_src_bindless(nir_to_brw_state &ntb, const nir_src &src)
1923 {
1924    return ntb.ssa_bind_infos[src.ssa->index].bindless;
1925 }
1926 
1927 /**
1928  * Specifying -1 for channel indicates that no channel selection should be applied.
1929  */
1930 static brw_reg
1931 get_nir_src(nir_to_brw_state &ntb, const nir_src &src, int channel)
1932 {
1933    nir_intrinsic_instr *load_reg = nir_load_reg_for_def(src.ssa);
1934 
1935    brw_reg reg;
1936    if (!load_reg) {
1937       if (nir_src_is_undef(src)) {
1938          const brw_reg_type reg_type =
1939             brw_type_with_size(BRW_TYPE_D, src.ssa->bit_size);
1940          reg = ntb.bld.vgrf(reg_type, src.ssa->num_components);
1941       } else {
1942          reg = ntb.ssa_values[src.ssa->index];
1943       }
1944    } else {
1945       nir_intrinsic_instr *decl_reg = nir_reg_get_decl(load_reg->src[0].ssa);
1946       /* We don't handle indirects on locals */
1947       assert(nir_intrinsic_base(load_reg) == 0);
1948       assert(load_reg->intrinsic != nir_intrinsic_load_reg_indirect);
1949       reg = ntb.ssa_values[decl_reg->def.index];
1950    }
1951 
1952    /* To avoid floating-point denorm flushing problems, set the type by
1953     * default to an integer type - instructions that need floating point
1954     * semantics will set this to F if they need to.
1955     */
1956    reg.type = brw_type_with_size(BRW_TYPE_D, nir_src_bit_size(src));
1957 
1958    if (channel >= 0) {
1959       reg = offset(reg, ntb.bld, channel);
1960 
1961       /* If the dispatch width matches the scalar allocation width, offset()
1962        * won't set the stride to zero. Force that here.
1963        */
1964       if (reg.is_scalar)
1965          reg = component(reg, 0);
1966    }
1967 
1968    return reg;
1969 }
1970 
1971 /**
1972  * Return an IMM for 32-bit constants; otherwise call get_nir_src() as normal.
1973  */
1974 static brw_reg
1975 get_nir_src_imm(nir_to_brw_state &ntb, const nir_src &src)
1976 {
1977    return nir_src_is_const(src) && nir_src_bit_size(src) == 32 ?
1978           brw_reg(brw_imm_d(nir_src_as_int(src))) : get_nir_src(ntb, src);
1979 }
1980 
1981 static brw_reg
1982 get_nir_def(nir_to_brw_state &ntb, const nir_def &def, bool all_sources_uniform)
1983 {
1984    nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
1985    bool is_scalar = false;
1986 
1987    if (def.parent_instr->type == nir_instr_type_intrinsic &&
1988        store_reg == NULL) {
1989       const nir_intrinsic_instr *instr =
1990          nir_instr_as_intrinsic(def.parent_instr);
1991 
1992       switch (instr->intrinsic) {
1993       case nir_intrinsic_load_btd_global_arg_addr_intel:
1994       case nir_intrinsic_load_btd_local_arg_addr_intel:
1995       case nir_intrinsic_load_btd_shader_type_intel:
1996       case nir_intrinsic_load_global_constant_uniform_block_intel:
1997       case nir_intrinsic_load_inline_data_intel:
1998       case nir_intrinsic_load_reloc_const_intel:
1999       case nir_intrinsic_load_ssbo_uniform_block_intel:
2000       case nir_intrinsic_load_ubo_uniform_block_intel:
2001       case nir_intrinsic_load_workgroup_id:
2002          is_scalar = true;
2003          break;
2004 
2005       case nir_intrinsic_load_ubo:
2006          is_scalar = get_nir_src(ntb, instr->src[1]).is_scalar;
2007          break;
2008 
2009       case nir_intrinsic_load_uniform:
2010          is_scalar = get_nir_src(ntb, instr->src[0]).is_scalar;
2011          break;
2012 
2013       case nir_intrinsic_ballot:
2014       case nir_intrinsic_resource_intel:
2015          is_scalar = !def.divergent;
2016          break;
2017 
2018       default:
2019          break;
2020       }
2021 
2022       /* This cannot be is_scalar if NIR thought it was divergent. */
2023       assert(!(is_scalar && def.divergent));
2024    } else if (def.parent_instr->type == nir_instr_type_alu) {
2025       is_scalar = store_reg == NULL && all_sources_uniform && !def.divergent;
2026    }
2027 
2028    const fs_builder &bld = is_scalar ? ntb.bld.scalar_group() : ntb.bld;
2029 
2030    if (!store_reg) {
2031       const brw_reg_type reg_type =
2032          brw_type_with_size(def.bit_size == 8 ? BRW_TYPE_D : BRW_TYPE_F,
2033                             def.bit_size);
2034       ntb.ssa_values[def.index] =
2035          bld.vgrf(reg_type, def.num_components);
2036 
2037       ntb.ssa_values[def.index].is_scalar = is_scalar;
2038 
2039       if (def.bit_size * bld.dispatch_width() < 8 * REG_SIZE)
2040          bld.UNDEF(ntb.ssa_values[def.index]);
2041 
2042       return ntb.ssa_values[def.index];
2043    } else {
2044       nir_intrinsic_instr *decl_reg =
2045          nir_reg_get_decl(store_reg->src[1].ssa);
2046       /* We don't handle indirects on locals */
2047       assert(nir_intrinsic_base(store_reg) == 0);
2048       assert(store_reg->intrinsic != nir_intrinsic_store_reg_indirect);
2049       assert(!is_scalar);
2050       return ntb.ssa_values[decl_reg->def.index];
2051    }
2052 }
2053 
2054 static nir_component_mask_t
2055 get_nir_write_mask(const nir_def &def)
2056 {
2057    nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
2058    if (!store_reg) {
2059       return nir_component_mask(def.num_components);
2060    } else {
2061       return nir_intrinsic_write_mask(store_reg);
2062    }
2063 }
2064 
2065 static fs_inst *
2066 emit_pixel_interpolater_send(const fs_builder &bld,
2067                              enum opcode opcode,
2068                              const brw_reg &dst,
2069                              const brw_reg &src,
2070                              const brw_reg &desc,
2071                              const brw_reg &flag_reg,
2072                              glsl_interp_mode interpolation)
2073 {
2074    struct brw_wm_prog_data *wm_prog_data =
2075       brw_wm_prog_data(bld.shader->prog_data);
2076 
2077    brw_reg srcs[INTERP_NUM_SRCS];
2078 
2079    if (src.is_scalar) {
2080       srcs[INTERP_SRC_OFFSET] = bld.vgrf(src.type, 2);
2081       brw_combine_with_vec(bld, srcs[INTERP_SRC_OFFSET], src, 2);
2082    } else {
2083       srcs[INTERP_SRC_OFFSET] = src;
2084    }
2085 
2086    srcs[INTERP_SRC_MSG_DESC]     = desc;
2087    srcs[INTERP_SRC_DYNAMIC_MODE] = flag_reg;
2088 
2089    fs_inst *inst = bld.emit(opcode, dst, srcs, INTERP_NUM_SRCS);
2090    /* 2 floats per slot returned */
2091    inst->size_written = 2 * dst.component_size(inst->exec_size);
2092    if (interpolation == INTERP_MODE_NOPERSPECTIVE) {
2093       inst->pi_noperspective = true;
2094       /* TGL BSpec says:
2095        *     This field cannot be set to "Linear Interpolation"
2096        *     unless Non-Perspective Barycentric Enable in 3DSTATE_CLIP is enabled.
2097        */
2098       wm_prog_data->uses_nonperspective_interp_modes = true;
2099    }
2100 
2101    wm_prog_data->pulls_bary = true;
2102 
2103    return inst;
2104 }
2105 
2106 /**
2107  * Return the specified component \p subreg of a per-polygon PS
2108  * payload register for the polygon corresponding to each channel
2109  * specified in the provided \p bld.
2110  *
2111  * \p reg specifies the payload register in REG_SIZE units for the
2112  * first polygon dispatched to the thread.  This function requires
2113  * that subsequent registers on the payload contain the corresponding
2114  * register for subsequent polygons, one GRF register per polygon, if
2115  * multiple polygons are being processed by the same PS thread.
2116  *
2117  * This can be used to access the value of a "Source Depth and/or W
2118  * Attribute Vertex Deltas", "Perspective Bary Planes" or
2119  * "Non-Perspective Bary Planes" payload field conveniently for
2120  * multiple polygons as a single brw_reg.
2121  */
2122 static brw_reg
2123 fetch_polygon_reg(const fs_builder &bld, unsigned reg, unsigned subreg)
2124 {
2125    const fs_visitor *shader = bld.shader;
2126    assert(shader->stage == MESA_SHADER_FRAGMENT);
2127 
2128    const struct intel_device_info *devinfo = shader->devinfo;
2129    const unsigned poly_width = shader->dispatch_width / shader->max_polygons;
2130    const unsigned poly_idx = bld.group() / poly_width;
2131    assert(bld.group() % poly_width == 0);
2132 
2133    if (bld.dispatch_width() > poly_width) {
2134       assert(bld.dispatch_width() <= 2 * poly_width);
2135       const unsigned reg_size = reg_unit(devinfo) * REG_SIZE;
2136       const unsigned vstride = reg_size / brw_type_size_bytes(BRW_TYPE_F);
2137       return stride(brw_vec1_grf(reg + reg_unit(devinfo) * poly_idx, subreg),
2138                     vstride, poly_width, 0);
2139    } else {
2140       return brw_vec1_grf(reg + reg_unit(devinfo) * poly_idx, subreg);
2141    }
2142 }
2143 
2144 /**
2145  * Interpolate per-polygon barycentrics at a specific offset relative
2146  * to each channel fragment coordinates, optionally using
2147  * perspective-correct interpolation if requested.  This is mostly
2148  * useful as replacement for the PI shared function that existed on
2149  * platforms prior to Xe2, but is expected to work on earlier
2150  * platforms since we can get the required polygon setup information
2151  * from the thread payload as far back as ICL.
2152  */
2153 static void
2154 emit_pixel_interpolater_alu_at_offset(const fs_builder &bld,
2155                                       const brw_reg &dst,
2156                                       const brw_reg &offs,
2157                                       glsl_interp_mode interpolation)
2158 {
2159    const fs_visitor *shader = bld.shader;
2160    assert(shader->stage == MESA_SHADER_FRAGMENT);
2161 
2162    const intel_device_info *devinfo = shader->devinfo;
2163    assert(devinfo->ver >= 11);
2164 
2165    const fs_thread_payload &payload = shader->fs_payload();
2166    const struct brw_wm_prog_data *wm_prog_data =
2167       brw_wm_prog_data(shader->prog_data);
2168 
2169    if (interpolation == INTERP_MODE_NOPERSPECTIVE) {
2170       assert(wm_prog_data->uses_npc_bary_coefficients &&
2171              wm_prog_data->uses_nonperspective_interp_modes);
2172    } else {
2173       assert(interpolation == INTERP_MODE_SMOOTH);
2174       assert(wm_prog_data->uses_pc_bary_coefficients &&
2175              wm_prog_data->uses_depth_w_coefficients);
2176    }
2177 
2178    /* Account for half-pixel X/Y coordinate offset. */
2179    const brw_reg off_x = bld.vgrf(BRW_TYPE_F);
2180    bld.ADD(off_x, offs, brw_imm_f(0.5));
2181 
2182    const brw_reg off_y = bld.vgrf(BRW_TYPE_F);
2183    bld.ADD(off_y, offset(offs, bld, 1), brw_imm_f(0.5));
2184 
2185    /* Process no more than two polygons at a time to avoid hitting
2186     * regioning restrictions.
2187     */
2188    const unsigned poly_width = shader->dispatch_width / shader->max_polygons;
2189 
2190    for (unsigned i = 0; i < DIV_ROUND_UP(shader->max_polygons, 2); i++) {
2191       const fs_builder ibld = bld.group(MIN2(bld.dispatch_width(), 2 * poly_width), i);
2192 
2193       /* Fetch needed parameters from the thread payload. */
2194       const unsigned bary_coef_reg = interpolation == INTERP_MODE_NOPERSPECTIVE ?
2195          payload.npc_bary_coef_reg : payload.pc_bary_coef_reg;
2196       const brw_reg start_x = devinfo->ver < 12 ? fetch_polygon_reg(ibld, 1, 1) :
2197          fetch_polygon_reg(ibld, bary_coef_reg,
2198                            devinfo->ver >= 20 ? 6 : 2);
2199       const brw_reg start_y = devinfo->ver < 12 ? fetch_polygon_reg(ibld, 1, 6) :
2200          fetch_polygon_reg(ibld, bary_coef_reg,
2201                            devinfo->ver >= 20 ? 7 : 6);
2202 
2203       const brw_reg bary1_c0 = fetch_polygon_reg(ibld, bary_coef_reg,
2204                                                 devinfo->ver >= 20 ? 2 : 3);
2205       const brw_reg bary1_cx = fetch_polygon_reg(ibld, bary_coef_reg, 1);
2206       const brw_reg bary1_cy = fetch_polygon_reg(ibld, bary_coef_reg, 0);
2207 
2208       const brw_reg bary2_c0 = fetch_polygon_reg(ibld, bary_coef_reg,
2209                                                 devinfo->ver >= 20 ? 5 : 7);
2210       const brw_reg bary2_cx = fetch_polygon_reg(ibld, bary_coef_reg,
2211                                                 devinfo->ver >= 20 ? 4 : 5);
2212       const brw_reg bary2_cy = fetch_polygon_reg(ibld, bary_coef_reg,
2213                                                 devinfo->ver >= 20 ? 3 : 4);
2214 
2215       const brw_reg rhw_c0 = devinfo->ver >= 20 ?
2216          fetch_polygon_reg(ibld, payload.depth_w_coef_reg + 1, 5) :
2217          fetch_polygon_reg(ibld, payload.depth_w_coef_reg, 7);
2218       const brw_reg rhw_cx = devinfo->ver >= 20 ?
2219          fetch_polygon_reg(ibld, payload.depth_w_coef_reg + 1, 4) :
2220          fetch_polygon_reg(ibld, payload.depth_w_coef_reg, 5);
2221       const brw_reg rhw_cy = devinfo->ver >= 20 ?
2222          fetch_polygon_reg(ibld, payload.depth_w_coef_reg + 1, 3) :
2223          fetch_polygon_reg(ibld, payload.depth_w_coef_reg, 4);
2224 
2225       /* Compute X/Y coordinate deltas relative to the origin of the polygon. */
2226       const brw_reg delta_x = ibld.vgrf(BRW_TYPE_F);
2227       ibld.ADD(delta_x, offset(shader->pixel_x, ibld, i), negate(start_x));
2228       ibld.ADD(delta_x, delta_x, offset(off_x, ibld, i));
2229 
2230       const brw_reg delta_y = ibld.vgrf(BRW_TYPE_F);
2231       ibld.ADD(delta_y, offset(shader->pixel_y, ibld, i), negate(start_y));
2232       ibld.ADD(delta_y, delta_y, offset(off_y, ibld, i));
2233 
2234       /* Evaluate the plane equations obtained above for the
2235        * barycentrics and RHW coordinate at the offset specified for
2236        * each channel.  Limit arithmetic to acc_width in order to
2237        * allow the accumulator to be used for linear interpolation.
2238        */
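      /* Each plane below is evaluated as c0 + cx * delta_x + cy * delta_y:
       * the MAD leaves c0 + cx * delta_x in the accumulator and the MAC
       * adds the cy * delta_y term.
       */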
2239       const unsigned acc_width = 16 * reg_unit(devinfo);
2240       const brw_reg rhw = ibld.vgrf(BRW_TYPE_F);
2241       const brw_reg bary1 = ibld.vgrf(BRW_TYPE_F);
2242       const brw_reg bary2 = ibld.vgrf(BRW_TYPE_F);
2243 
2244       for (unsigned j = 0; j < DIV_ROUND_UP(ibld.dispatch_width(), acc_width); j++) {
2245          const fs_builder jbld = ibld.group(MIN2(ibld.dispatch_width(), acc_width), j);
2246          const brw_reg acc = suboffset(brw_acc_reg(16), jbld.group() % acc_width);
2247 
2248          if (interpolation != INTERP_MODE_NOPERSPECTIVE) {
2249             jbld.MAD(acc, horiz_offset(rhw_c0, acc_width * j),
2250                      horiz_offset(rhw_cx, acc_width * j), offset(delta_x, jbld, j));
2251             jbld.MAC(offset(rhw, jbld, j),
2252                      horiz_offset(rhw_cy, acc_width * j), offset(delta_y, jbld, j));
2253          }
2254 
2255          jbld.MAD(acc, horiz_offset(bary1_c0, acc_width * j),
2256                   horiz_offset(bary1_cx, acc_width * j), offset(delta_x, jbld, j));
2257          jbld.MAC(offset(bary1, jbld, j),
2258                   horiz_offset(bary1_cy, acc_width * j), offset(delta_y, jbld, j));
2259 
2260          jbld.MAD(acc, horiz_offset(bary2_c0, acc_width * j),
2261                   horiz_offset(bary2_cx, acc_width * j), offset(delta_x, jbld, j));
2262          jbld.MAC(offset(bary2, jbld, j),
2263                   horiz_offset(bary2_cy, acc_width * j), offset(delta_y, jbld, j));
2264       }
2265 
2266       /* Scale the results by dividing by the interpolated RHW coordinate
2267        * if the interpolation is required to be perspective-correct.
2268        */
2269       if (interpolation == INTERP_MODE_NOPERSPECTIVE) {
2270          ibld.MOV(offset(dst, ibld, i), bary1);
2271          ibld.MOV(offset(offset(dst, bld, 1), ibld, i), bary2);
2272       } else {
2273          const brw_reg w = ibld.vgrf(BRW_TYPE_F);
2274          ibld.emit(SHADER_OPCODE_RCP, w, rhw);
2275          ibld.MUL(offset(dst, ibld, i), bary1, w);
2276          ibld.MUL(offset(offset(dst, bld, 1), ibld, i), bary2, w);
2277       }
2278    }
2279 }
2280 
2281 /**
2282  * Interpolate per-polygon barycentrics at a specified sample index,
2283  * optionally using perspective-correct interpolation if requested.
2284  * This is mostly useful as replacement for the PI shared function
2285  * that existed on platforms prior to Xe2, but is expected to work on
2286  * earlier platforms since we can get the required polygon setup
2287  * information from the thread payload as far back as ICL.
2288  */
2289 static void
2290 emit_pixel_interpolater_alu_at_sample(const fs_builder &bld,
2291                                       const brw_reg &dst,
2292                                       const brw_reg &idx,
2293                                       glsl_interp_mode interpolation)
2294 {
2295    const fs_thread_payload &payload = bld.shader->fs_payload();
2296    const struct brw_wm_prog_data *wm_prog_data =
2297       brw_wm_prog_data(bld.shader->prog_data);
2298    const fs_builder ubld = bld.exec_all().group(16, 0);
2299    const brw_reg sample_offs_xy = ubld.vgrf(BRW_TYPE_UD);
2300    assert(wm_prog_data->uses_sample_offsets);
2301 
2302    /* Interleave the X/Y coordinates of each sample in order to allow
2303     * a single indirect look-up, by using a MOV for the 16 X
2304     * coordinates, then another MOV for the 16 Y coordinates.
2305     */
2306    for (unsigned i = 0; i < 2; i++) {
2307       const brw_reg reg = retype(brw_vec16_grf(payload.sample_offsets_reg, 4 * i),
2308                                 BRW_TYPE_UB);
2309       ubld.MOV(subscript(sample_offs_xy, BRW_TYPE_UW, i), reg);
2310    }
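   /* Each UD element of sample_offs_xy now holds a sample's X offset in its
    * low word and its Y offset in the high word, matching the UW subscripts
    * used to unpack off_xy below.
    */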
2311 
2312    /* Use indirect addressing to fetch the X/Y offsets of the sample
2313     * index provided for each channel.
2314     */
2315    const brw_reg idx_b = bld.vgrf(BRW_TYPE_UD);
2316    bld.MUL(idx_b, idx, brw_imm_ud(brw_type_size_bytes(BRW_TYPE_UD)));
2317 
2318    const brw_reg off_xy = bld.vgrf(BRW_TYPE_UD);
2319    bld.emit(SHADER_OPCODE_MOV_INDIRECT, off_xy, component(sample_offs_xy, 0),
2320             idx_b, brw_imm_ud(16 * brw_type_size_bytes(BRW_TYPE_UD)));
2321 
2322    /* Convert the selected fixed-point offsets to floating-point
2323     * offsets.
2324     */
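   /* The packed offsets are in 1/16-pixel units, so scale by 0.0625 and
    * subtract 0.5 to obtain pixel-center-relative float offsets, which is
    * what emit_pixel_interpolater_alu_at_offset() consumes.
    */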
2325    const brw_reg offs = bld.vgrf(BRW_TYPE_F, 2);
2326 
2327    for (unsigned i = 0; i < 2; i++) {
2328       const brw_reg tmp = bld.vgrf(BRW_TYPE_F);
2329       bld.MOV(tmp, subscript(off_xy, BRW_TYPE_UW, i));
2330       bld.MUL(tmp, tmp, brw_imm_f(0.0625));
2331       bld.ADD(offset(offs, bld, i), tmp, brw_imm_f(-0.5));
2332    }
2333 
2334    /* Interpolate at the resulting offsets. */
2335    emit_pixel_interpolater_alu_at_offset(bld, dst, offs, interpolation);
2336 }
2337 
2338 /**
2339  * Computes 1 << x, given a D/UD register containing some value x.
2340  */
2341 static brw_reg
2342 intexp2(const fs_builder &bld, const brw_reg &x)
2343 {
2344    assert(x.type == BRW_TYPE_UD || x.type == BRW_TYPE_D);
2345 
2346    return bld.SHL(bld.MOV(retype(brw_imm_d(1), x.type)), x);
2347 }
2348 
2349 static void
2350 emit_gs_end_primitive(nir_to_brw_state &ntb, const nir_src &vertex_count_nir_src)
2351 {
2352    fs_visitor &s = ntb.s;
2353    assert(s.stage == MESA_SHADER_GEOMETRY);
2354 
2355    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data);
2356 
2357    if (s.gs_compile->control_data_header_size_bits == 0)
2358       return;
2359 
2360    /* We can only do EndPrimitive() functionality when the control data
2361     * consists of cut bits.  Fortunately, the only time it isn't is when the
2362     * output type is points, in which case EndPrimitive() is a no-op.
2363     */
2364    if (gs_prog_data->control_data_format !=
2365        GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
2366       return;
2367    }
2368 
2369    /* Cut bits use one bit per vertex. */
2370    assert(s.gs_compile->control_data_bits_per_vertex == 1);
2371 
2372    brw_reg vertex_count = get_nir_src(ntb, vertex_count_nir_src);
2373    vertex_count.type = BRW_TYPE_UD;
2374 
2375    /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
2376     * vertex n, 0 otherwise.  So all we need to do here is mark bit
2377     * (vertex_count - 1) % 32 in the cut_bits register to indicate that
2378     * EndPrimitive() was called after emitting vertex (vertex_count - 1);
2379     * emit_gs_control_data_bits() will take care of the rest.
2380     *
2381     * Note that if EndPrimitive() is called before emitting any vertices, this
2382     * will cause us to set bit 31 of the control_data_bits register to 1.
2383     * That's fine because:
2384     *
2385     * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
2386     *   output, so the hardware will ignore cut bit 31.
2387     *
2388     * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
2389     *   last vertex, so setting cut bit 31 has no effect (since the primitive
2390     *   is automatically ended when the GS terminates).
2391     *
2392     * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
2393     *   control_data_bits register to 0 when the first vertex is emitted.
2394     */
2395 
2396    const fs_builder abld = ntb.bld.annotate("end primitive");
2397 
2398    /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
2399    brw_reg prev_count = abld.ADD(vertex_count, brw_imm_ud(0xffffffffu));
2400    brw_reg mask = intexp2(abld, prev_count);
2401    /* Note: we're relying on the fact that the GEN SHL instruction only pays
2402     * attention to the lower 5 bits of its second source argument, so on this
2403     * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
2404     * ((vertex_count - 1) % 32).
2405     */
2406    abld.OR(s.control_data_bits, s.control_data_bits, mask);
2407 }
2408 
2409 brw_reg
2410 fs_visitor::gs_urb_per_slot_dword_index(const brw_reg &vertex_count)
2411 {
2412    /* We use a single UD register to accumulate control data bits (32 bits
2413     * for each of the SIMD8 channels).  So we need to write a DWord (32 bits)
2414     * at a time.
2415     *
2416     * On platforms < Xe2:
2417     *    Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord)
2418     *    offsets.  We have to select a 128-bit group via the Global and Per-Slot
2419     *    Offsets, then use the Channel Mask phase to enable/disable which DWord
2420     *    within that group to write.  (Remember, different SIMD8 channels may
2421     *    have emitted different numbers of vertices, so we may need per-slot
2422     *    offsets.)
2423     *
2424     *    Channel masking presents an annoying problem: we may have to replicate
2425     *    the data up to 4 times:
2426     *
2427     *    Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data,
2428     *          Data.
2429     *
2430     *    To avoid penalizing shaders that emit a small number of vertices, we
2431     *    can avoid these sometimes: if the size of the control data header is
2432     *    <= 128 bits, then there is only 1 OWord.  All SIMD8 channels will
2433     *    land in the same 128-bit group, so we can skip per-slot offsets.
2434     *
2435     *    Similarly, if the control data header is <= 32 bits, there is only one
2436     *    DWord, so we can skip channel masks.
2437     */
2438    const fs_builder bld = fs_builder(this).at_end();
2439    const fs_builder abld = bld.annotate("urb per slot offset");
2440 
2441    /* Figure out which DWord we're trying to write to using the formula:
2442     *
2443     *    dword_index = (vertex_count - 1) * bits_per_vertex / 32
2444     *
2445     * Since bits_per_vertex is a power of two, and is known at compile
2446     * time, this can be optimized to:
2447     *
2448     *    dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
2449     */
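   /* E.g., with 2 control data bits per vertex (stream IDs),
    * util_last_bit(2) == 2, so this computes (vertex_count - 1) >> 4:
    * a new DWord every 16 vertices.
    */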
2450    brw_reg prev_count = abld.ADD(vertex_count, brw_imm_ud(0xffffffffu));
2451    unsigned log2_bits_per_vertex =
2452       util_last_bit(gs_compile->control_data_bits_per_vertex);
2453    return abld.SHR(prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
2454 }
2455 
2456 brw_reg
2457 fs_visitor::gs_urb_channel_mask(const brw_reg &dword_index)
2458 {
2459    brw_reg channel_mask;
2460 
2461    /* Xe2+ can do URB loads with a byte offset, so we don't need to
2462     * construct a channel mask.
2463     */
2464    if (devinfo->ver >= 20)
2465       return channel_mask;
2466 
2467    /* Channel masking presents an annoying problem: we may have to replicate
2468     * the data up to 4 times:
2469     *
2470     * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
2471     *
2472     * To avoid penalizing shaders that emit a small number of vertices, we
2473     * can avoid these sometimes: if the size of the control data header is
2474     * <= 128 bits, then there is only 1 OWord.  All SIMD8 channels will
2475     * land in the same 128-bit group, so we can skip per-slot offsets.
2476     *
2477     * Similarly, if the control data header is <= 32 bits, there is only one
2478     * DWord, so we can skip channel masks.
2479     */
2480    if (gs_compile->control_data_header_size_bits <= 32)
2481       return channel_mask;
2482 
2483    const fs_builder bld = fs_builder(this).at_end();
2484    const fs_builder ubld = bld.exec_all();
2485 
2486    /* Set the channel masks to 1 << (dword_index % 4), so that we'll
2487     * write to the appropriate DWORD within the OWORD.
2488     */
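   /* E.g., dword_index == 6 selects DWord 2 of its OWord, so the mask is
    * (1 << 2) << 16 == 0x40000.
    */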
2489    brw_reg channel = ubld.AND(dword_index, brw_imm_ud(3u));
2490    /* Then the channel masks need to be in bits 23:16. */
2491    return ubld.SHL(intexp2(ubld, channel), brw_imm_ud(16u));
2492 }
2493 
2494 void
2495 fs_visitor::emit_gs_control_data_bits(const brw_reg &vertex_count)
2496 {
2497    assert(stage == MESA_SHADER_GEOMETRY);
2498    assert(gs_compile->control_data_bits_per_vertex != 0);
2499 
2500    const struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2501 
2502    const fs_builder bld = fs_builder(this).at_end();
2503    const fs_builder abld = bld.annotate("emit control data bits");
2504 
2505    brw_reg dword_index = gs_urb_per_slot_dword_index(vertex_count);
2506    brw_reg channel_mask = gs_urb_channel_mask(dword_index);
2507    brw_reg per_slot_offset;
2508 
2509    const unsigned max_control_data_header_size_bits =
2510       devinfo->ver >= 20 ? 32 : 128;
2511 
2512    if (gs_compile->control_data_header_size_bits > max_control_data_header_size_bits) {
2513       /* Convert dword_index to bytes on Xe2+ since LSC can operate on byte
2514        * offset granularity.
2515        */
2516       if (devinfo->ver >= 20) {
2517          per_slot_offset = abld.SHL(dword_index, brw_imm_ud(2u));
2518       } else {
2519          /* Set the per-slot offset to dword_index / 4, so that we'll write to
2520           * the appropriate OWord within the control data header.
2521           */
2522          per_slot_offset = abld.SHR(dword_index, brw_imm_ud(2u));
2523       }
2524    }
2525 
2526    /* If there are channel masks, add 3 extra copies of the data. */
2527    const unsigned length = 1 + 3 * unsigned(channel_mask.file != BAD_FILE);
2528    assert(length <= 4);
2529    brw_reg sources[4];
2530 
2531    for (unsigned i = 0; i < length; i++)
2532       sources[i] = this->control_data_bits;
2533 
2534    brw_reg srcs[URB_LOGICAL_NUM_SRCS];
2535    srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
2536    srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offset;
2537    srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = channel_mask;
2538    srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(BRW_TYPE_F, length);
2539    srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length);
2540    abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
2541 
2542    fs_inst *inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
2543                              srcs, ARRAY_SIZE(srcs));
2544 
2545    /* We need to increment Global Offset by 256-bits to make room for
2546     * Broadwell's extra "Vertex Count" payload at the beginning of the
2547     * URB entry.  Since this is an OWord message, Global Offset is counted
2548     * in 128-bit units, so we must set it to 2.
2549     */
2550    if (gs_prog_data->static_vertex_count == -1)
2551       inst->offset = 2;
2552 }
2553 
2554 static void
2555 set_gs_stream_control_data_bits(nir_to_brw_state &ntb, const brw_reg &vertex_count,
2556                                 unsigned stream_id)
2557 {
2558    fs_visitor &s = ntb.s;
2559 
2560    /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
2561 
2562    /* Note: we are calling this *before* increasing vertex_count, so
2563     * this->vertex_count == vertex_count - 1 in the formula above.
2564     */
2565 
2566    /* Stream mode uses 2 bits per vertex */
2567    assert(s.gs_compile->control_data_bits_per_vertex == 2);
2568 
2569    /* Must be a valid stream */
2570    assert(stream_id < 4); /* MAX_VERTEX_STREAMS */
2571 
2572    /* Control data bits are initialized to 0 so we don't have to set any
2573     * bits when sending vertices to stream 0.
2574     */
2575    if (stream_id == 0)
2576       return;
2577 
2578    const fs_builder abld = ntb.bld.annotate("set stream control data bits");
2579 
2580    /* reg::sid = stream_id */
2581    brw_reg sid = abld.MOV(brw_imm_ud(stream_id));
2582 
2583    /* reg:shift_count = 2 * (vertex_count - 1) */
2584    brw_reg shift_count = abld.SHL(vertex_count, brw_imm_ud(1u));
2585 
2586    /* Note: we're relying on the fact that the GEN SHL instruction only pays
2587     * attention to the lower 5 bits of its second source argument, so on this
2588     * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
2589     * stream_id << ((2 * (vertex_count - 1)) % 32).
2590     */
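   /* E.g., stream_id == 2 with shift_count == 4 yields mask == 0x20,
    * setting bits 5:4 of control_data_bits.
    */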
2591    brw_reg mask = abld.SHL(sid, shift_count);
2592    abld.OR(s.control_data_bits, s.control_data_bits, mask);
2593 }
2594 
2595 static void
2596 emit_gs_vertex(nir_to_brw_state &ntb, const nir_src &vertex_count_nir_src,
2597                unsigned stream_id)
2598 {
2599    fs_visitor &s = ntb.s;
2600 
2601    assert(s.stage == MESA_SHADER_GEOMETRY);
2602 
2603    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data);
2604 
2605    brw_reg vertex_count = get_nir_src(ntb, vertex_count_nir_src);
2606    vertex_count.type = BRW_TYPE_UD;
2607 
2608    /* Haswell and later hardware ignores the "Render Stream Select" bits
2609     * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
2610     * and instead sends all primitives down the pipeline for rasterization.
2611     * If the SOL stage is enabled, "Render Stream Select" is honored and
2612     * primitives bound to non-zero streams are discarded after stream output.
2613     *
2614     * Since the only purpose of primitives sent to non-zero streams is to
2615     * be recorded by transform feedback, we can simply discard all geometry
2616     * bound to these streams when transform feedback is disabled.
2617     */
2618    if (stream_id > 0 && !s.nir->info.has_transform_feedback_varyings)
2619       return;
2620 
2621    /* If we're outputting 32 control data bits or less, then we can wait
2622     * until the shader is over to output them all.  Otherwise we need to
2623     * output them as we go.  Now is the time to do it, since we're about to
2624     * output the vertex_count'th vertex, so it's guaranteed that the
2625     * control data bits associated with the (vertex_count - 1)th vertex are
2626     * correct.
2627     */
2628    if (s.gs_compile->control_data_header_size_bits > 32) {
2629       const fs_builder abld =
2630          ntb.bld.annotate("emit vertex: emit control data bits");
2631 
2632       /* Only emit control data bits if we've finished accumulating a batch
2633        * of 32 bits.  This is the case when:
2634        *
2635        *     (vertex_count * bits_per_vertex) % 32 == 0
2636        *
2637        * (in other words, when the last 5 bits of vertex_count *
2638        * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
2639        * integer n (which is always the case, since bits_per_vertex is
2640        * always 1 or 2), this is equivalent to requiring that the last 5-n
2641        * bits of vertex_count are 0:
2642        *
2643        *     vertex_count & (2^(5-n) - 1) == 0
2644        *
2645        * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
2646        * equivalent to:
2647        *
2648        *     vertex_count & (32 / bits_per_vertex - 1) == 0
2649        *
2650        * TODO: If vertex_count is an immediate, we could do some of this math
2651        *       at compile time...
2652        */
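      /* E.g., with 2 bits per vertex this tests vertex_count & 15 == 0, so
       * a 32-bit batch of control data is flushed every 16 vertices.
       */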
2653       fs_inst *inst =
2654          abld.AND(ntb.bld.null_reg_d(), vertex_count,
2655                   brw_imm_ud(32u / s.gs_compile->control_data_bits_per_vertex - 1u));
2656       inst->conditional_mod = BRW_CONDITIONAL_Z;
2657 
2658       abld.IF(BRW_PREDICATE_NORMAL);
2659       /* If vertex_count is 0, then no control data bits have been
2660        * accumulated yet, so we can skip emitting them.
2661        */
2662       abld.CMP(ntb.bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
2663                BRW_CONDITIONAL_NEQ);
2664       abld.IF(BRW_PREDICATE_NORMAL);
2665       s.emit_gs_control_data_bits(vertex_count);
2666       abld.emit(BRW_OPCODE_ENDIF);
2667 
2668       /* Reset control_data_bits to 0 so we can start accumulating a new
2669        * batch.
2670        *
2671        * Note: in the case where vertex_count == 0, this neutralizes the
2672        * effect of any call to EndPrimitive() that the shader may have
2673        * made before outputting its first vertex.
2674        */
2675       abld.exec_all().MOV(s.control_data_bits, brw_imm_ud(0u));
2676       abld.emit(BRW_OPCODE_ENDIF);
2677    }
2678 
2679    s.emit_urb_writes(vertex_count);
2680 
2681    /* In stream mode we have to set control data bits for all vertices
2682     * unless we have disabled control data bits completely (which we do
2683     * for MESA_PRIM_POINTS outputs that don't use streams).
2684     */
2685    if (s.gs_compile->control_data_header_size_bits > 0 &&
2686        gs_prog_data->control_data_format ==
2687           GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
2688       set_gs_stream_control_data_bits(ntb, vertex_count, stream_id);
2689    }
2690 }
2691 
2692 static void
2693 brw_combine_with_vec(const fs_builder &bld, const brw_reg &dst,
2694                      const brw_reg &src, unsigned n)
2695 {
2696    assert(n <= NIR_MAX_VEC_COMPONENTS);
2697    brw_reg comps[NIR_MAX_VEC_COMPONENTS];
2698    for (unsigned i = 0; i < n; i++)
2699       comps[i] = offset(src, bld, i);
2700    bld.VEC(dst, comps, n);
2701 }
2702 
2703 static void
2704 emit_gs_input_load(nir_to_brw_state &ntb, const brw_reg &dst,
2705                    const nir_src &vertex_src,
2706                    unsigned base_offset,
2707                    const nir_src &offset_src,
2708                    unsigned num_components,
2709                    unsigned first_component)
2710 {
2711    const fs_builder &bld = ntb.bld;
2712    const struct intel_device_info *devinfo = ntb.devinfo;
2713 
2714    fs_visitor &s = ntb.s;
2715 
2716    assert(brw_type_size_bytes(dst.type) == 4);
2717    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data);
2718    const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
2719 
2720    /* TODO: figure out push input layout for invocations == 1 */
2721    if (gs_prog_data->invocations == 1 &&
2722        nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) &&
2723        4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) {
2724       int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 +
2725                        nir_src_as_uint(vertex_src) * push_reg_count;
2726 
2727       const brw_reg attr = offset(brw_attr_reg(0, dst.type), bld,
2728                                   first_component + imm_offset);
2729       brw_combine_with_vec(bld, dst, attr, num_components);
2730       return;
2731    }
2732 
2733    /* Resort to the pull model.  Ensure the VUE handles are provided. */
2734    assert(gs_prog_data->base.include_vue_handles);
2735 
2736    brw_reg start = s.gs_payload().icp_handle_start;
2737    brw_reg icp_handle = ntb.bld.vgrf(BRW_TYPE_UD);
2738    const unsigned grf_size_bytes = REG_SIZE * reg_unit(devinfo);
2739 
2740    if (gs_prog_data->invocations == 1) {
2741       if (nir_src_is_const(vertex_src)) {
2742          /* The vertex index is constant; just select the proper URB handle. */
2743          icp_handle =
2744             byte_offset(start, nir_src_as_uint(vertex_src) * grf_size_bytes);
2745       } else {
2746          /* The vertex index is non-constant.  We need to use indirect
2747           * addressing to fetch the proper URB handle.
2748           *
2749           * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
2750           * indicating that channel <n> should read the handle from
2751           * DWord <n>.  We convert that to bytes by multiplying by 4.
2752           *
2753           * Next, we convert the vertex index to bytes by multiplying
2754           * by 32/64 (shifting by 5/6), and add the two together.  This is
2755           * the final indirect byte offset.
2756           */
2757          brw_reg sequence = bld.LOAD_SUBGROUP_INVOCATION();
2758 
2759          /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
2760          brw_reg channel_offsets = bld.SHL(sequence, brw_imm_ud(2u));
2761          /* Convert vertex_index to bytes (multiply by 32/64) */
2762          assert(util_is_power_of_two_nonzero(grf_size_bytes)); /* for ffs() */
2763          brw_reg vertex_offset_bytes =
2764             bld.SHL(retype(get_nir_src(ntb, vertex_src), BRW_TYPE_UD),
2765                     brw_imm_ud(ffs(grf_size_bytes) - 1));
2766          brw_reg icp_offset_bytes =
2767             bld.ADD(vertex_offset_bytes, channel_offsets);
2768 
2769          /* Use first_icp_handle as the base offset.  There is one register
2770           * of URB handles per vertex, so inform the register allocator that
2771           * we might read up to nir->info.gs.vertices_in registers.
2772           */
2773          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
2774                   brw_reg(icp_offset_bytes),
2775                   brw_imm_ud(s.nir->info.gs.vertices_in * grf_size_bytes));
2776       }
2777    } else {
2778       assert(gs_prog_data->invocations > 1);
2779 
2780       if (nir_src_is_const(vertex_src)) {
2781          unsigned vertex = nir_src_as_uint(vertex_src);
2782          bld.MOV(icp_handle, component(start, vertex));
2783       } else {
2784          /* The vertex index is non-constant.  We need to use indirect
2785           * addressing to fetch the proper URB handle.
2786           *
2787           * Convert vertex_index to bytes (multiply by 4)
2788           */
2789          brw_reg icp_offset_bytes =
2790             bld.SHL(retype(get_nir_src(ntb, vertex_src), BRW_TYPE_UD),
2791                     brw_imm_ud(2u));
2792 
2793          /* Use first_icp_handle as the base offset.  There is one DWord
2794           * of URB handles per vertex, so inform the register allocator that
2795           * we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
2796           */
2797          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
2798                   brw_reg(icp_offset_bytes),
2799                   brw_imm_ud(DIV_ROUND_UP(s.nir->info.gs.vertices_in, 8) *
2800                              grf_size_bytes));
2801       }
2802    }
2803 
2804    fs_inst *inst;
2805    brw_reg indirect_offset = get_nir_src(ntb, offset_src);
2806 
2807    if (nir_src_is_const(offset_src)) {
2808       brw_reg srcs[URB_LOGICAL_NUM_SRCS];
2809       srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2810 
2811       /* Constant indexing - use global offset. */
2812       if (first_component != 0) {
2813          unsigned read_components = num_components + first_component;
2814          brw_reg tmp = bld.vgrf(dst.type, read_components);
2815          inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs,
2816                          ARRAY_SIZE(srcs));
2817          inst->size_written = read_components *
2818                               tmp.component_size(inst->exec_size);
2819          brw_combine_with_vec(bld, dst, offset(tmp, bld, first_component),
2820                               num_components);
2821       } else {
2822          inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs,
2823                          ARRAY_SIZE(srcs));
2824          inst->size_written = num_components *
2825                               dst.component_size(inst->exec_size);
2826       }
2827       inst->offset = base_offset + nir_src_as_uint(offset_src);
2828    } else {
2829       /* Indirect indexing - use per-slot offsets as well. */
2830       unsigned read_components = num_components + first_component;
2831       brw_reg tmp = bld.vgrf(dst.type, read_components);
2832 
2833       /* Convert oword offset to bytes on Xe2+ */
2834       if (devinfo->ver >= 20)
2835          indirect_offset = bld.SHL(indirect_offset, brw_imm_ud(4u));
2836 
2837       brw_reg srcs[URB_LOGICAL_NUM_SRCS];
2838       srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2839       srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
2840 
2841       if (first_component != 0) {
2842          inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2843                          srcs, ARRAY_SIZE(srcs));
2844          inst->size_written = read_components *
2845                               tmp.component_size(inst->exec_size);
2846          brw_combine_with_vec(bld, dst, offset(tmp, bld, first_component),
2847                               num_components);
2848       } else {
2849          inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst,
2850                          srcs, ARRAY_SIZE(srcs));
2851          inst->size_written = num_components *
2852                               dst.component_size(inst->exec_size);
2853       }
2854       inst->offset = base_offset;
2855    }
2856 }
2857 
2858 static brw_reg
2859 get_indirect_offset(nir_to_brw_state &ntb, nir_intrinsic_instr *instr)
2860 {
2861    const intel_device_info *devinfo = ntb.devinfo;
2862    nir_src *offset_src = nir_get_io_offset_src(instr);
2863 
2864    if (nir_src_is_const(*offset_src)) {
2865       /* The only constant offset we should find is 0.  brw_nir.c's
2866        * add_const_offset_to_base() will fold other constant offsets
2867        * into the "base" index.
2868        */
2869       assert(nir_src_as_uint(*offset_src) == 0);
2870       return brw_reg();
2871    }
2872 
2873    brw_reg offset = get_nir_src(ntb, *offset_src);
2874 
2875    if (devinfo->ver < 20)
2876       return offset;
2877 
2878    /* Convert Owords (16-bytes) to bytes */
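   /* e.g. an indirect offset of 3 OWords becomes 3 << 4 = 48 bytes. */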
2879    return ntb.bld.SHL(retype(offset, BRW_TYPE_UD), brw_imm_ud(4u));
2880 }
2881 
2882 static void
2883 fs_nir_emit_vs_intrinsic(nir_to_brw_state &ntb,
2884                          nir_intrinsic_instr *instr)
2885 {
2886    const fs_builder &bld = ntb.bld;
2887    fs_visitor &s = ntb.s;
2888    assert(s.stage == MESA_SHADER_VERTEX);
2889 
2890    brw_reg dest;
2891    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2892       dest = get_nir_def(ntb, instr->def);
2893 
2894    switch (instr->intrinsic) {
2895    case nir_intrinsic_load_vertex_id:
2896    case nir_intrinsic_load_base_vertex:
2897       unreachable("should be lowered by nir_lower_system_values()");
2898 
2899    case nir_intrinsic_load_input: {
2900       assert(instr->def.bit_size == 32);
2901       const brw_reg src = offset(brw_attr_reg(0, dest.type), bld,
2902                                 nir_intrinsic_base(instr) * 4 +
2903                                 nir_intrinsic_component(instr) +
2904                                 nir_src_as_uint(instr->src[0]));
2905       brw_combine_with_vec(bld, dest, src, instr->num_components);
2906       break;
2907    }
2908 
2909    case nir_intrinsic_load_vertex_id_zero_base:
2910    case nir_intrinsic_load_instance_id:
2911    case nir_intrinsic_load_base_instance:
2912    case nir_intrinsic_load_draw_id:
2913    case nir_intrinsic_load_first_vertex:
2914    case nir_intrinsic_load_is_indexed_draw:
2915       unreachable("lowered by brw_nir_lower_vs_inputs");
2916 
2917    default:
2918       fs_nir_emit_intrinsic(ntb, bld, instr);
2919       break;
2920    }
2921 }
2922 
2923 static brw_reg
2924 get_tcs_single_patch_icp_handle(nir_to_brw_state &ntb, const fs_builder &bld,
2925                                 nir_intrinsic_instr *instr)
2926 {
2927    fs_visitor &s = ntb.s;
2928 
2929    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(s.prog_data);
2930    const nir_src &vertex_src = instr->src[0];
2931    nir_intrinsic_instr *vertex_intrin = nir_src_as_intrinsic(vertex_src);
2932 
2933    const brw_reg start = s.tcs_payload().icp_handle_start;
2934 
2935    brw_reg icp_handle;
2936 
2937    if (nir_src_is_const(vertex_src)) {
2938       /* Emit a MOV to resolve <0,1,0> regioning. */
2939       unsigned vertex = nir_src_as_uint(vertex_src);
2940       icp_handle = bld.MOV(component(start, vertex));
2941    } else if (tcs_prog_data->instances == 1 && vertex_intrin &&
2942               vertex_intrin->intrinsic == nir_intrinsic_load_invocation_id) {
2943       /* For the common case of only 1 instance, an array index of
2944        * gl_InvocationID means reading the handles from the start.  Skip all
2945        * the indirect work.
2946        */
2947       icp_handle = start;
2948    } else {
2949       /* The vertex index is non-constant.  We need to use indirect
2950        * addressing to fetch the proper URB handle.
2951        */
2952       icp_handle = bld.vgrf(BRW_TYPE_UD);
2953 
2954       /* Each ICP handle is a single DWord (4 bytes) */
2955       brw_reg vertex_offset_bytes =
2956          bld.SHL(retype(get_nir_src(ntb, vertex_src), BRW_TYPE_UD),
2957                  brw_imm_ud(2u));
2958 
2959       /* We might read up to 4 registers. */
2960       bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2961                start, vertex_offset_bytes,
2962                brw_imm_ud(4 * REG_SIZE));
2963    }
2964 
2965    return icp_handle;
2966 }
2967 
2968 static brw_reg
2969 get_tcs_multi_patch_icp_handle(nir_to_brw_state &ntb, const fs_builder &bld,
2970                                nir_intrinsic_instr *instr)
2971 {
2972    fs_visitor &s = ntb.s;
2973    const intel_device_info *devinfo = s.devinfo;
2974 
2975    struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) s.key;
2976    const nir_src &vertex_src = instr->src[0];
2977    const unsigned grf_size_bytes = REG_SIZE * reg_unit(devinfo);
2978 
2979    const brw_reg start = s.tcs_payload().icp_handle_start;
2980 
2981    if (nir_src_is_const(vertex_src))
2982       return byte_offset(start, nir_src_as_uint(vertex_src) * grf_size_bytes);
2983 
2984    /* The vertex index is non-constant.  We need to use indirect
2985     * addressing to fetch the proper URB handle.
2986     *
2987     * First, we start with the sequence indicating that channel <n>
2988     * should read the handle from DWord <n>.  We convert that to bytes
2989     * by multiplying by 4.
2990     *
2991     * Next, we convert the vertex index to bytes by multiplying
2992     * by the GRF size (by shifting), and add the two together.  This is
2993     * the final indirect byte offset.
2994     */
2995    brw_reg sequence = bld.LOAD_SUBGROUP_INVOCATION();
2996 
2997    /* Offsets will be 0, 4, 8, ... */
2998    brw_reg channel_offsets = bld.SHL(sequence, brw_imm_ud(2u));
2999    /* Convert vertex_index to bytes (multiply by the GRF size, 32 or 64) */
3000    assert(util_is_power_of_two_nonzero(grf_size_bytes)); /* for ffs() */
3001    brw_reg vertex_offset_bytes =
3002       bld.SHL(retype(get_nir_src(ntb, vertex_src), BRW_TYPE_UD),
3003               brw_imm_ud(ffs(grf_size_bytes) - 1));
3004    brw_reg icp_offset_bytes =
3005       bld.ADD(vertex_offset_bytes, channel_offsets);
3006 
3007    /* Use start of ICP handles as the base offset.  There is one register
3008     * of URB handles per vertex, so inform the register allocator that
3009     * we might read up to brw_tcs_prog_key_input_vertices(tcs_key) registers.
3010     */
3011    brw_reg icp_handle = bld.vgrf(BRW_TYPE_UD);
3012    bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
3013             icp_offset_bytes,
3014             brw_imm_ud(brw_tcs_prog_key_input_vertices(tcs_key) *
3015                        grf_size_bytes));
3016 
3017    return icp_handle;
3018 }
3019 
3020 static void
3021 setup_barrier_message_payload_gfx125(const fs_builder &bld,
3022                                      const brw_reg &msg_payload)
3023 {
3024    const fs_builder ubld = bld.exec_all().group(1, 0);
3025    const struct intel_device_info *devinfo = bld.shader->devinfo;
3026    assert(devinfo->verx10 >= 125);
3027 
3028    /* From BSpec: 54006, mov r0.2[31:24] into m0.2[31:24] and m0.2[23:16] */
3029    brw_reg m0_10ub = horiz_offset(retype(msg_payload, BRW_TYPE_UB), 10);
3030    brw_reg r0_11ub =
3031       stride(suboffset(retype(brw_vec1_grf(0, 0), BRW_TYPE_UB), 11),
3032              0, 1, 0);
3033    ubld.group(2, 0).MOV(m0_10ub, r0_11ub);
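   /* The SIMD2 byte MOV above writes payload bytes 10 and 11 (m0.2[23:16]
    * and m0.2[31:24]) with the value replicated from r0 byte 11
    * (r0.2[31:24]) thanks to the <0,1,0> source region.
    */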
3034 
3035    if (devinfo->ver >= 20) {
3036       /* Use an active threads barrier. */
3037       const brw_reg m0_2ud = component(retype(msg_payload, BRW_TYPE_UD), 2);
3038       ubld.OR(m0_2ud, m0_2ud, brw_imm_ud(1u << 8));
3039    }
3040 }
3041 
3042 static void
3043 emit_barrier(nir_to_brw_state &ntb)
3044 {
3045    const intel_device_info *devinfo = ntb.devinfo;
3046    const fs_builder &bld = ntb.bld;
3047    const fs_builder ubld = bld.exec_all();
3048    const fs_builder hbld = ubld.group(8 * reg_unit(devinfo), 0);
3049    fs_visitor &s = ntb.s;
3050 
3051    /* We are getting the barrier ID from the compute shader header */
3052    assert(gl_shader_stage_uses_workgroup(s.stage));
3053 
3054    /* Zero-initialize the payload */
3055    brw_reg payload = hbld.MOV(brw_imm_ud(0u));
3056 
3057    if (devinfo->verx10 >= 125) {
3058       setup_barrier_message_payload_gfx125(bld, payload);
3059    } else {
3060       assert(gl_shader_stage_is_compute(s.stage));
3061 
3062       brw_reg barrier_id_mask =
3063          brw_imm_ud(devinfo->ver == 9 ? 0x8f000000u : 0x7f000000u);
3064 
3065       /* Copy the barrier id from r0.2 to the message payload reg.2 */
3066       brw_reg r0_2 = brw_reg(retype(brw_vec1_grf(0, 2), BRW_TYPE_UD));
3067       ubld.group(1, 0).AND(component(payload, 2), r0_2, barrier_id_mask);
3068    }
3069 
3070    /* Emit a gateway "barrier" message using the payload we set up, followed
3071     * by a wait instruction.
3072     */
3073    ubld.emit(SHADER_OPCODE_BARRIER, reg_undef, payload);
3074 }
3075 
3076 static void
3077 emit_tcs_barrier(nir_to_brw_state &ntb)
3078 {
3079    const intel_device_info *devinfo = ntb.devinfo;
3080    const fs_builder &bld = ntb.bld;
3081    fs_visitor &s = ntb.s;
3082 
3083    assert(s.stage == MESA_SHADER_TESS_CTRL);
3084    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(s.prog_data);
3085 
3086    brw_reg m0 = bld.vgrf(BRW_TYPE_UD);
3087    brw_reg m0_2 = component(m0, 2);
3088 
3089    const fs_builder chanbld = bld.exec_all().group(1, 0);
3090 
3091    /* Zero the message header */
3092    bld.exec_all().MOV(m0, brw_imm_ud(0u));
3093 
3094    if (devinfo->verx10 >= 125) {
3095       setup_barrier_message_payload_gfx125(bld, m0);
3096    } else if (devinfo->ver >= 11) {
3097       chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_TYPE_UD),
3098                   brw_imm_ud(INTEL_MASK(30, 24)));
3099 
3100       /* Set the Barrier Count and the enable bit */
3101       chanbld.OR(m0_2, m0_2,
3102                  brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15)));
3103    } else {
3104       /* Copy "Barrier ID" from r0.2, bits 16:13 */
3105       chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_TYPE_UD),
3106                   brw_imm_ud(INTEL_MASK(16, 13)));
3107 
3108       /* Shift it up to bits 27:24. */
3109       chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));
3110 
3111       /* Set the Barrier Count and the enable bit */
3112       chanbld.OR(m0_2, m0_2,
3113                  brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
3114    }
3115 
3116    bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
3117 }
3118 
3119 static void
3120 fs_nir_emit_tcs_intrinsic(nir_to_brw_state &ntb,
3121                           nir_intrinsic_instr *instr)
3122 {
3123    const intel_device_info *devinfo = ntb.devinfo;
3124    const fs_builder &bld = ntb.bld;
3125    fs_visitor &s = ntb.s;
3126 
3127    assert(s.stage == MESA_SHADER_TESS_CTRL);
3128    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(s.prog_data);
3129    struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
3130 
3131    brw_reg dst;
3132    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3133       dst = get_nir_def(ntb, instr->def);
3134 
3135    switch (instr->intrinsic) {
3136    case nir_intrinsic_load_primitive_id:
3137       bld.MOV(dst, s.tcs_payload().primitive_id);
3138       break;
3139    case nir_intrinsic_load_invocation_id:
3140       bld.MOV(retype(dst, s.invocation_id.type), s.invocation_id);
3141       break;
3142 
3143    case nir_intrinsic_barrier:
3144       if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
3145          fs_nir_emit_intrinsic(ntb, bld, instr);
3146       if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
3147          if (tcs_prog_data->instances != 1)
3148             emit_tcs_barrier(ntb);
3149       }
3150       break;
3151 
3152    case nir_intrinsic_load_input:
3153       unreachable("nir_lower_io should never give us these.");
3154       break;
3155 
3156    case nir_intrinsic_load_per_vertex_input: {
3157       assert(instr->def.bit_size == 32);
3158       brw_reg indirect_offset = get_indirect_offset(ntb, instr);
3159       unsigned imm_offset = nir_intrinsic_base(instr);
3160       fs_inst *inst;
3161 
3162       const bool multi_patch =
3163          vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH;
3164 
3165       brw_reg icp_handle = multi_patch ?
3166          get_tcs_multi_patch_icp_handle(ntb, bld, instr) :
3167          get_tcs_single_patch_icp_handle(ntb, bld, instr);
3168 
3169       /* We can only read two double components with each URB read, so
3170        * we send two read messages in that case, each one loading up to
3171        * two double components.
3172        */
3173       unsigned num_components = instr->num_components;
3174       unsigned first_component = nir_intrinsic_component(instr);
3175 
3176       brw_reg srcs[URB_LOGICAL_NUM_SRCS];
3177       srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
3178 
3179       if (indirect_offset.file == BAD_FILE) {
3180          /* Constant indexing - use global offset. */
3181          if (first_component != 0) {
3182             unsigned read_components = num_components + first_component;
3183             brw_reg tmp = bld.vgrf(dst.type, read_components);
3184             inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs,
3185                             ARRAY_SIZE(srcs));
3186             brw_combine_with_vec(bld, dst, offset(tmp, bld, first_component),
3187                                  num_components);
3188          } else {
3189             inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs,
3190                             ARRAY_SIZE(srcs));
3191          }
3192          inst->offset = imm_offset;
3193       } else {
3194          /* Indirect indexing - use per-slot offsets as well. */
3195          srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
3196 
3197          if (first_component != 0) {
3198             unsigned read_components = num_components + first_component;
3199             brw_reg tmp = bld.vgrf(dst.type, read_components);
3200             inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3201                             srcs, ARRAY_SIZE(srcs));
3202             brw_combine_with_vec(bld, dst, offset(tmp, bld, first_component),
3203                                  num_components);
3204          } else {
3205             inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst,
3206                             srcs, ARRAY_SIZE(srcs));
3207          }
3208          inst->offset = imm_offset;
3209       }
3210       inst->size_written = (num_components + first_component) *
3211                            inst->dst.component_size(inst->exec_size);
3212 
3213       /* Copy the temporary to the destination to deal with writemasking.
3214        *
3215        * Also attempt to deal with gl_PointSize being in the .w component.
3216        */
3217       if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
3218          assert(brw_type_size_bytes(dst.type) == 4);
3219          inst->dst = bld.vgrf(dst.type, 4);
3220          inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
3221          bld.MOV(dst, offset(inst->dst, bld, 3));
3222       }
3223       break;
3224    }
3225 
3226    case nir_intrinsic_load_output:
3227    case nir_intrinsic_load_per_vertex_output: {
3228       assert(instr->def.bit_size == 32);
3229       brw_reg indirect_offset = get_indirect_offset(ntb, instr);
3230       unsigned imm_offset = nir_intrinsic_base(instr);
3231       unsigned first_component = nir_intrinsic_component(instr);
3232 
3233       fs_inst *inst;
3234       if (indirect_offset.file == BAD_FILE) {
3235          /* This MOV replicates the output handle to all enabled channels
3236           * in SINGLE_PATCH mode.
3237           */
3238          brw_reg patch_handle = bld.MOV(s.tcs_payload().patch_urb_output);
3239 
3240          {
3241             brw_reg srcs[URB_LOGICAL_NUM_SRCS];
3242             srcs[URB_LOGICAL_SRC_HANDLE] = patch_handle;
3243 
3244             if (first_component != 0) {
3245                unsigned read_components =
3246                   instr->num_components + first_component;
3247                brw_reg tmp = bld.vgrf(dst.type, read_components);
3248                inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3249                                srcs, ARRAY_SIZE(srcs));
3250                inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
3251                brw_combine_with_vec(bld, dst, offset(tmp, bld, first_component),
3252                                     instr->num_components);
3253             } else {
3254                inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst,
3255                                srcs, ARRAY_SIZE(srcs));
3256                inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
3257             }
3258             inst->offset = imm_offset;
3259          }
3260       } else {
3261          /* Indirect indexing - use per-slot offsets as well. */
3262          brw_reg srcs[URB_LOGICAL_NUM_SRCS];
3263          srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output;
3264          srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
3265 
3266          if (first_component != 0) {
3267             unsigned read_components =
3268                instr->num_components + first_component;
3269             brw_reg tmp = bld.vgrf(dst.type, read_components);
3270             inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3271                             srcs, ARRAY_SIZE(srcs));
3272             inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
3273             brw_combine_with_vec(bld, dst, offset(tmp, bld, first_component),
3274                                  instr->num_components);
3275          } else {
3276             inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst,
3277                             srcs, ARRAY_SIZE(srcs));
3278             inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
3279          }
3280          inst->offset = imm_offset;
3281       }
3282       break;
3283    }
3284 
3285    case nir_intrinsic_store_output:
3286    case nir_intrinsic_store_per_vertex_output: {
3287       assert(nir_src_bit_size(instr->src[0]) == 32);
3288       brw_reg value = get_nir_src(ntb, instr->src[0], -1);
3289       brw_reg indirect_offset = get_indirect_offset(ntb, instr);
3290       unsigned imm_offset = nir_intrinsic_base(instr);
3291       unsigned mask = nir_intrinsic_write_mask(instr);
3292 
3293       if (mask == 0)
3294          break;
3295 
3296       unsigned num_components = util_last_bit(mask);
3297       unsigned first_component = nir_intrinsic_component(instr);
3298       assert((first_component + num_components) <= 4);
3299 
3300       mask = mask << first_component;
3301 
3302       const bool has_urb_lsc = devinfo->ver >= 20;
3303 
3304       brw_reg mask_reg;
3305       if (mask != WRITEMASK_XYZW)
3306          mask_reg = brw_imm_ud(mask << 16);
3307 
3308       brw_reg sources[4];
3309 
3310       unsigned m = has_urb_lsc ? 0 : first_component;
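      /* Prior to Xe2 the URB write payload is channel-aligned, so disabled
       * components below simply leave holes that the channel mask skips;
       * with LSC URB messages (Xe2+) only the enabled components are packed.
       */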
3311       for (unsigned i = 0; i < num_components; i++) {
3312          int c = i + first_component;
3313          if (mask & (1 << c)) {
3314             sources[m++] = offset(value, bld, i);
3315          } else if (devinfo->ver < 20) {
3316             m++;
3317          }
3318       }
3319 
3320       assert(has_urb_lsc || m == (first_component + num_components));
3321 
3322       brw_reg srcs[URB_LOGICAL_NUM_SRCS];
3323       srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output;
3324       srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
3325       srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask_reg;
3326       srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(BRW_TYPE_F, m);
3327       srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(m);
3328       bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, m, 0);
3329 
3330       fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
3331                                srcs, ARRAY_SIZE(srcs));
3332       inst->offset = imm_offset;
3333       break;
3334    }
3335 
3336    default:
3337       fs_nir_emit_intrinsic(ntb, bld, instr);
3338       break;
3339    }
3340 }
3341 
3342 static void
3343 fs_nir_emit_tes_intrinsic(nir_to_brw_state &ntb,
3344                           nir_intrinsic_instr *instr)
3345 {
3346    const intel_device_info *devinfo = ntb.devinfo;
3347    const fs_builder &bld = ntb.bld;
3348    fs_visitor &s = ntb.s;
3349 
3350    assert(s.stage == MESA_SHADER_TESS_EVAL);
3351    struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(s.prog_data);
3352 
3353    brw_reg dest;
3354    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3355       dest = get_nir_def(ntb, instr->def);
3356 
3357    switch (instr->intrinsic) {
3358    case nir_intrinsic_load_primitive_id:
3359       bld.MOV(dest, s.tes_payload().primitive_id);
3360       break;
3361 
3362    case nir_intrinsic_load_tess_coord:
3363       for (unsigned i = 0; i < 3; i++)
3364          bld.MOV(offset(dest, bld, i), s.tes_payload().coords[i]);
3365       break;
3366 
3367    case nir_intrinsic_load_input:
3368    case nir_intrinsic_load_per_vertex_input: {
3369       assert(instr->def.bit_size == 32);
3370       brw_reg indirect_offset = get_indirect_offset(ntb, instr);
3371       unsigned imm_offset = nir_intrinsic_base(instr);
3372       unsigned first_component = nir_intrinsic_component(instr);
3373 
3374       fs_inst *inst;
3375       if (indirect_offset.file == BAD_FILE) {
3376          /* Arbitrarily only push up to 32 vec4 slots worth of data,
3377           * which is 16 registers (since each holds 2 vec4 slots).
3378           */
3379          const unsigned max_push_slots = 32;
3380          if (imm_offset < max_push_slots) {
3381             const brw_reg src = horiz_offset(brw_attr_reg(0, dest.type),
3382                                             4 * imm_offset + first_component);
3383             brw_reg comps[NIR_MAX_VEC_COMPONENTS];
3384             for (unsigned i = 0; i < instr->num_components; i++) {
3385                comps[i] = component(src, i);
3386             }
3387             bld.VEC(dest, comps, instr->num_components);
3388 
3389             tes_prog_data->base.urb_read_length =
3390                MAX2(tes_prog_data->base.urb_read_length,
3391                     (imm_offset / 2) + 1);
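            /* e.g. pushing vec4 slot 5 requires urb_read_length to cover
             * 5 / 2 + 1 = 3 registers (slots 4-5 live in the third one).
             */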
3392          } else {
3393             /* Replicate the patch handle to all enabled channels */
3394             brw_reg srcs[URB_LOGICAL_NUM_SRCS];
3395             srcs[URB_LOGICAL_SRC_HANDLE] = s.tes_payload().patch_urb_input;
3396 
3397             if (first_component != 0) {
3398                unsigned read_components =
3399                   instr->num_components + first_component;
3400                brw_reg tmp = bld.vgrf(dest.type, read_components);
3401                inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3402                                srcs, ARRAY_SIZE(srcs));
3403                inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
3404                brw_combine_with_vec(bld, dest, offset(tmp, bld, first_component),
3405                                     instr->num_components);
3406             } else {
3407                inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dest,
3408                                srcs, ARRAY_SIZE(srcs));
3409                inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
3410             }
3411             inst->offset = imm_offset;
3412          }
3413       } else {
3414          /* Indirect indexing - use per-slot offsets as well. */
3415 
3416          /* We can only read two double components with each URB read, so
3417           * we send two read messages in that case, each one loading up to
3418           * two double components.
3419           */
3420          unsigned num_components = instr->num_components;
3421 
3422          brw_reg srcs[URB_LOGICAL_NUM_SRCS];
3423          srcs[URB_LOGICAL_SRC_HANDLE] = s.tes_payload().patch_urb_input;
3424          srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
3425 
3426          if (first_component != 0) {
3427             unsigned read_components =
3428                 num_components + first_component;
3429             brw_reg tmp = bld.vgrf(dest.type, read_components);
3430             inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3431                             srcs, ARRAY_SIZE(srcs));
3432             brw_combine_with_vec(bld, dest, offset(tmp, bld, first_component),
3433                                  num_components);
3434          } else {
3435             inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dest,
3436                             srcs, ARRAY_SIZE(srcs));
3437          }
3438          inst->offset = imm_offset;
3439          inst->size_written = (num_components + first_component) *
3440                               inst->dst.component_size(inst->exec_size);
3441       }
3442       break;
3443    }
3444    default:
3445       fs_nir_emit_intrinsic(ntb, bld, instr);
3446       break;
3447    }
3448 }
3449 
3450 static void
3451 fs_nir_emit_gs_intrinsic(nir_to_brw_state &ntb,
3452                          nir_intrinsic_instr *instr)
3453 {
3454    const fs_builder &bld = ntb.bld;
3455    fs_visitor &s = ntb.s;
3456 
3457    assert(s.stage == MESA_SHADER_GEOMETRY);
3458 
3459    brw_reg dest;
3460    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3461       dest = get_nir_def(ntb, instr->def);
3462 
3463    switch (instr->intrinsic) {
3464    case nir_intrinsic_load_primitive_id:
3465       assert(s.stage == MESA_SHADER_GEOMETRY);
3466       assert(brw_gs_prog_data(s.prog_data)->include_primitive_id);
3467       bld.MOV(retype(dest, BRW_TYPE_UD), s.gs_payload().primitive_id);
3468       break;
3469 
3470    case nir_intrinsic_load_input:
3471       unreachable("load_input intrinsics are invalid for the GS stage");
3472 
3473    case nir_intrinsic_load_per_vertex_input:
3474       emit_gs_input_load(ntb, dest, instr->src[0], nir_intrinsic_base(instr),
3475                          instr->src[1], instr->num_components,
3476                          nir_intrinsic_component(instr));
3477       break;
3478 
3479    case nir_intrinsic_emit_vertex_with_counter:
3480       emit_gs_vertex(ntb, instr->src[0], nir_intrinsic_stream_id(instr));
3481 
3482       /* After an EmitVertex() call, the values of all outputs are undefined.
3483        * If this is not in control flow, recreate a fresh set of output
3484        * registers to keep their live ranges separate.
3485        */
3486       if (instr->instr.block->cf_node.parent->type == nir_cf_node_function)
3487          fs_nir_setup_outputs(ntb);
3488       break;
3489 
3490    case nir_intrinsic_end_primitive_with_counter:
3491       emit_gs_end_primitive(ntb, instr->src[0]);
3492       break;
3493 
3494    case nir_intrinsic_set_vertex_and_primitive_count:
3495       bld.MOV(s.final_gs_vertex_count, get_nir_src(ntb, instr->src[0]));
3496       break;
3497 
3498    case nir_intrinsic_load_invocation_id: {
3499       brw_reg val = ntb.system_values[SYSTEM_VALUE_INVOCATION_ID];
3500       assert(val.file != BAD_FILE);
3501       dest.type = val.type;
3502       bld.MOV(dest, val);
3503       break;
3504    }
3505 
3506    default:
3507       fs_nir_emit_intrinsic(ntb, bld, instr);
3508       break;
3509    }
3510 }
3511 
3512 /**
3513  * Fetch the current render target layer index.
3514  */
3515 static brw_reg
3516 fetch_render_target_array_index(const fs_builder &bld)
3517 {
3518    const fs_visitor *v = bld.shader;
3519 
3520    if (bld.shader->devinfo->ver >= 20) {
3521       /* Gfx20+ has separate Render Target Array indices for each pair
3522        * of subspans in order to support multiple polygons, so we need
3523        * to use a <1;8,0> region in order to select the correct word
3524        * for each channel.
3525        */
3526       const brw_reg idx = bld.vgrf(BRW_TYPE_UD);
3527 
3528       for (unsigned i = 0; i < DIV_ROUND_UP(bld.dispatch_width(), 16); i++) {
3529          const fs_builder hbld = bld.group(16, i);
3530          const struct brw_reg reg = retype(brw_vec1_grf(2 * i + 1, 1),
3531                                            BRW_TYPE_UW);
3532          hbld.AND(offset(idx, hbld, i), stride(reg, 1, 8, 0),
3533                   brw_imm_uw(0x7ff));
3534       }
3535 
3536       return idx;
3537    } else if (bld.shader->devinfo->ver >= 12 && v->max_polygons == 2) {
3538       /* According to the BSpec "PS Thread Payload for Normal
3539        * Dispatch", the render target array index is stored as bits
3540        * 26:16 of either the R1.1 or R1.6 poly info dwords, for the
3541        * first and second polygons respectively in multipolygon PS
3542        * dispatch mode.
3543        */
3544       assert(bld.dispatch_width() == 16);
3545       const brw_reg idx = bld.vgrf(BRW_TYPE_UD);
3546 
3547       for (unsigned i = 0; i < v->max_polygons; i++) {
3548          const fs_builder hbld = bld.group(8, i);
3549          const struct brw_reg g1 = brw_uw1_reg(FIXED_GRF, 1, 3 + 10 * i);
3550          hbld.AND(offset(idx, hbld, i), g1, brw_imm_uw(0x7ff));
3551       }
3552 
3553       return idx;
3554    } else if (bld.shader->devinfo->ver >= 12) {
3555       /* The render target array index is provided in the thread payload as
3556        * bits 26:16 of r1.1.
3557        */
3558       const brw_reg idx = bld.vgrf(BRW_TYPE_UD);
3559       bld.AND(idx, brw_uw1_reg(FIXED_GRF, 1, 3),
3560               brw_imm_uw(0x7ff));
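      /* The AND reads the high word of r1.1 directly, so masking with 0x7ff
       * keeps dword bits 26:16.
       */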
3561       return idx;
3562    } else {
3563       /* The render target array index is provided in the thread payload as
3564        * bits 26:16 of r0.0.
3565        */
3566       const brw_reg idx = bld.vgrf(BRW_TYPE_UD);
3567       bld.AND(idx, brw_uw1_reg(FIXED_GRF, 0, 1),
3568               brw_imm_uw(0x7ff));
3569       return idx;
3570    }
3571 }
3572 
3573 static brw_reg
3574 fetch_viewport_index(const fs_builder &bld)
3575 {
3576    const fs_visitor *v = bld.shader;
3577 
3578    if (bld.shader->devinfo->ver >= 20) {
3579       /* Gfx20+ has separate viewport indices for each pair
3580        * of subspans in order to support multiple polygons, so we need
3581        * to use a <1;8,0> region in order to select the correct word
3582        * for each channel.
3583        */
3584       const brw_reg idx = bld.vgrf(BRW_TYPE_UD);
3585 
3586       for (unsigned i = 0; i < DIV_ROUND_UP(bld.dispatch_width(), 16); i++) {
3587          const fs_builder hbld = bld.group(16, i);
3588          const struct brw_reg reg = retype(xe2_vec1_grf(i, 9),
3589                                            BRW_TYPE_UW);
3590          hbld.AND(offset(idx, hbld, i), stride(reg, 1, 8, 0),
3591                   brw_imm_uw(0xf000));
3592       }
3593 
3594       bld.SHR(idx, idx, brw_imm_ud(12));
3595       return idx;
3596    } else if (bld.shader->devinfo->ver >= 12 && v->max_polygons == 2) {
3597       /* According to the BSpec "PS Thread Payload for Normal
3598        * Dispatch", the viewport index is stored as bits
3599        * 30:27 of either the R1.1 or R1.6 poly info dwords, for the
3600        * first and second polygons respectively in multipolygon PS
3601        * dispatch mode.
3602        */
3603       assert(bld.dispatch_width() == 16);
3604       const brw_reg idx = bld.vgrf(BRW_TYPE_UD);
3605       brw_reg vp_idx_per_poly_dw[2] = {
3606          brw_ud1_reg(FIXED_GRF, 1, 1), /* R1.1 bits 30:27 */
3607          brw_ud1_reg(FIXED_GRF, 1, 6), /* R1.6 bits 30:27 */
3608       };
3609 
3610       for (unsigned i = 0; i < v->max_polygons; i++) {
3611          const fs_builder hbld = bld.group(8, i);
3612          hbld.SHR(offset(idx, hbld, i), vp_idx_per_poly_dw[i], brw_imm_ud(27));
3613       }
3614 
3615       return bld.AND(idx, brw_imm_ud(0xf));
3616    } else if (bld.shader->devinfo->ver >= 12) {
3617       /* The viewport index is provided in the thread payload as
3618        * bits 30:27 of r1.1.
3619        */
3620       const brw_reg idx = bld.vgrf(BRW_TYPE_UD);
3621       bld.SHR(idx,
3622               bld.AND(brw_uw1_reg(FIXED_GRF, 1, 3),
3623                       brw_imm_uw(0x7800)),
3624               brw_imm_ud(11));
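      /* 0x7800 selects bits 14:11 of the high word of r1.1 (dword bits
       * 30:27); shifting right by 11 leaves the 4-bit viewport index.
       */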
3625       return idx;
3626    } else {
3627       /* The viewport index is provided in the thread payload as
3628        * bits 30:27 of r0.0.
3629        */
3630       const brw_reg idx = bld.vgrf(BRW_TYPE_UD);
3631       bld.SHR(idx,
3632               bld.AND(brw_uw1_reg(FIXED_GRF, 0, 1),
3633                       brw_imm_uw(0x7800)),
3634               brw_imm_ud(11));
3635       return idx;
3636    }
3637 }
3638 
3639 /* Sample from the MCS surface attached to this multisample texture. */
3640 static brw_reg
3641 emit_mcs_fetch(nir_to_brw_state &ntb, const brw_reg &coordinate, unsigned components,
3642                const brw_reg &texture,
3643                const brw_reg &texture_handle)
3644 {
3645    const fs_builder &bld = ntb.bld;
3646 
3647    const brw_reg dest = bld.vgrf(BRW_TYPE_UD, 4);
3648 
3649    brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
3650    srcs[TEX_LOGICAL_SRC_COORDINATE] = coordinate;
3651    srcs[TEX_LOGICAL_SRC_SURFACE] = texture;
3652    srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0);
3653    srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = texture_handle;
3654    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(components);
3655    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);
3656    srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_d(0);
3657 
3658    fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs,
3659                             ARRAY_SIZE(srcs));
3660 
3661    /* We only care about one or two regs of response, but the sampler always
3662     * writes 4/8.
3663     */
3664    inst->size_written = 4 * dest.component_size(inst->exec_size);
3665 
3666    return dest;
3667 }
3668 
3669 /**
3670  * Fake non-coherent framebuffer read implemented using TXF to fetch from the
3671  * framebuffer at the current fragment coordinates and sample index.
3672  */
3673 static fs_inst *
3674 emit_non_coherent_fb_read(nir_to_brw_state &ntb, const fs_builder &bld, const brw_reg &dst,
3675                           unsigned target)
3676 {
3677    fs_visitor &s = ntb.s;
3678    const struct intel_device_info *devinfo = s.devinfo;
3679 
3680    assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
3681    const brw_wm_prog_key *wm_key =
3682       reinterpret_cast<const brw_wm_prog_key *>(s.key);
3683    assert(!wm_key->coherent_fb_fetch);
3684 
3685    /* Calculate the fragment coordinates. */
3686    const brw_reg coords = bld.vgrf(BRW_TYPE_UD, 3);
3687    bld.MOV(offset(coords, bld, 0), s.pixel_x);
3688    bld.MOV(offset(coords, bld, 1), s.pixel_y);
3689    bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
3690 
3691    /* Calculate the sample index and MCS payload when multisampling.  Luckily
3692     * the MCS fetch message behaves deterministically for UMS surfaces, so it
3693     * shouldn't be necessary to recompile based on whether the framebuffer is
3694     * CMS or UMS.
3695     */
3696    assert(wm_key->multisample_fbo == INTEL_ALWAYS ||
3697           wm_key->multisample_fbo == INTEL_NEVER);
3698    if (wm_key->multisample_fbo &&
3699        ntb.system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
3700       ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(ntb);
3701 
3702    const brw_reg sample = ntb.system_values[SYSTEM_VALUE_SAMPLE_ID];
3703    const brw_reg mcs = wm_key->multisample_fbo ?
3704       emit_mcs_fetch(ntb, coords, 3, brw_imm_ud(target), brw_reg()) : brw_reg();
3705 
3706    /* Use either a normal or a CMS texel fetch message depending on whether
3707     * the framebuffer is single or multisample.  On SKL+ use the wide CMS
3708     * message just in case the framebuffer uses 16x multisampling; it should
3709     * be equivalent to the normal CMS fetch for lower multisampling modes.
3710     */
3711    opcode op;
3712    if (wm_key->multisample_fbo) {
3713       /* On SKL+ use the wide CMS message just in case the framebuffer uses 16x
3714        * multisampling; it should be equivalent to the normal CMS fetch for
3715        * lower multisampling modes.
3716        *
3717        * On Gfx12HP, there is only CMS_W variant available.
3718        */
3719       if (devinfo->verx10 >= 125)
3720          op = SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL;
3721       else
3722          op = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
3723    } else {
3724       op = SHADER_OPCODE_TXF_LOGICAL;
3725    }
3726 
3727    /* Emit the instruction. */
3728    brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
3729    srcs[TEX_LOGICAL_SRC_COORDINATE]       = coords;
3730    srcs[TEX_LOGICAL_SRC_LOD]              = brw_imm_ud(0);
3731    srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX]     = sample;
3732    srcs[TEX_LOGICAL_SRC_MCS]              = mcs;
3733    srcs[TEX_LOGICAL_SRC_SURFACE]          = brw_imm_ud(target);
3734    srcs[TEX_LOGICAL_SRC_SAMPLER]          = brw_imm_ud(0);
3735    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(3);
3736    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS]  = brw_imm_ud(0);
3737    srcs[TEX_LOGICAL_SRC_RESIDENCY]        = brw_imm_ud(0);
3738 
3739    fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
3740    inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3741 
3742    return inst;
3743 }
3744 
3745 /**
3746  * Actual coherent framebuffer read implemented using the native render target
3747  * read message.  Requires SKL+.
3748  */
3749 static fs_inst *
3750 emit_coherent_fb_read(const fs_builder &bld, const brw_reg &dst, unsigned target)
3751 {
3752    fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst);
3753    inst->target = target;
3754    inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3755 
3756    return inst;
3757 }
3758 
3759 static brw_reg
3760 alloc_temporary(const fs_builder &bld, unsigned size, brw_reg *regs, unsigned n)
3761 {
3762    if (n && regs[0].file != BAD_FILE) {
3763       return regs[0];
3764 
3765    } else {
3766       const brw_reg tmp = bld.vgrf(BRW_TYPE_F, size);
3767 
3768       for (unsigned i = 0; i < n; i++)
3769          regs[i] = tmp;
3770 
3771       return tmp;
3772    }
3773 }
3774 
3775 static brw_reg
3776 alloc_frag_output(nir_to_brw_state &ntb, unsigned location)
3777 {
3778    fs_visitor &s = ntb.s;
3779 
3780    assert(s.stage == MESA_SHADER_FRAGMENT);
3781    const brw_wm_prog_key *const key =
3782       reinterpret_cast<const brw_wm_prog_key *>(s.key);
3783    const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION);
3784    const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX);
3785 
3786    if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
3787       return alloc_temporary(ntb.bld, 4, &s.dual_src_output, 1);
3788 
3789    else if (l == FRAG_RESULT_COLOR)
3790       return alloc_temporary(ntb.bld, 4, s.outputs,
3791                              MAX2(key->nr_color_regions, 1));
3792 
3793    else if (l == FRAG_RESULT_DEPTH)
3794       return alloc_temporary(ntb.bld, 1, &s.frag_depth, 1);
3795 
3796    else if (l == FRAG_RESULT_STENCIL)
3797       return alloc_temporary(ntb.bld, 1, &s.frag_stencil, 1);
3798 
3799    else if (l == FRAG_RESULT_SAMPLE_MASK)
3800       return alloc_temporary(ntb.bld, 1, &s.sample_mask, 1);
3801 
3802    else if (l >= FRAG_RESULT_DATA0 &&
3803             l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS)
3804       return alloc_temporary(ntb.bld, 4,
3805                              &s.outputs[l - FRAG_RESULT_DATA0], 1);
3806 
3807    else
3808       unreachable("Invalid location");
3809 }
3810 
3811 static void
3812 emit_is_helper_invocation(nir_to_brw_state &ntb, brw_reg result)
3813 {
3814    const fs_builder &bld = ntb.bld;
3815 
3816    /* Unlike the regular gl_HelperInvocation, that is defined at dispatch,
3817     * the helperInvocationEXT() (aka SpvOpIsHelperInvocationEXT) takes into
3818     * consideration demoted invocations.
3819     */
3820    result.type = BRW_TYPE_UD;
3821 
3822    bld.MOV(result, brw_imm_ud(0));
3823 
3824    /* See brw_sample_mask_reg() for why we split SIMD32 into SIMD16 here. */
3825    unsigned width = bld.dispatch_width();
3826    for (unsigned i = 0; i < DIV_ROUND_UP(width, 16); i++) {
3827       const fs_builder b = bld.group(MIN2(width, 16), i);
3828 
3829       fs_inst *mov = b.MOV(offset(result, b, i), brw_imm_ud(~0));
3830 
3831       /* The at() ensures that any code emitted to get the predicate happens
3832        * before the mov right above.  This is not an issue elsewhere because
3833        * lowering code already set up the builder this way.
3834        */
3835       brw_emit_predicate_on_sample_mask(b.at(NULL, mov), mov);
3836       mov->predicate_inverse = true;
3837    }
3838 }
3839 
3840 static brw_reg
3841 emit_frontfacing_interpolation(nir_to_brw_state &ntb)
3842 {
3843    const intel_device_info *devinfo = ntb.devinfo;
3844    const fs_builder &bld = ntb.bld;
3845    fs_visitor &s = ntb.s;
3846 
3847    brw_reg ff = bld.vgrf(BRW_TYPE_D);
3848 
3849    if (devinfo->ver >= 20) {
3850       /* Gfx20+ has separate back-facing bits for each pair of
3851        * subspans in order to support multiple polygons, so we need to
3852        * use a <1;8,0> region in order to select the correct word for
3853        * each channel.
3854        */
3855       const brw_reg tmp = bld.vgrf(BRW_TYPE_UW);
3856 
3857       for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
3858          const fs_builder hbld = bld.group(16, i);
3859          const struct brw_reg gi_uw = retype(xe2_vec1_grf(i, 9),
3860                                              BRW_TYPE_UW);
3861          hbld.AND(offset(tmp, hbld, i), gi_uw, brw_imm_uw(0x800));
3862       }
3863 
3864       bld.CMP(ff, tmp, brw_imm_uw(0), BRW_CONDITIONAL_Z);
3865 
3866    } else if (devinfo->ver >= 12 && s.max_polygons == 2) {
3867       /* According to the BSpec "PS Thread Payload for Normal
3868        * Dispatch", the front/back facing interpolation bit is stored
3869        * as bit 15 of either the R1.1 or R1.6 poly info field, for the
3870        * first and second polygons respectively in multipolygon PS
3871        * dispatch mode.
3872        */
3873       assert(s.dispatch_width == 16);
3874       brw_reg tmp = bld.vgrf(BRW_TYPE_W);
3875 
3876       for (unsigned i = 0; i < s.max_polygons; i++) {
3877          const fs_builder hbld = bld.group(8, i);
3878          const struct brw_reg g1 = retype(brw_vec1_grf(1, 1 + 5 * i),
3879                                           BRW_TYPE_W);
3880          hbld.ASR(offset(tmp, hbld, i), g1, brw_imm_d(15));
3881       }
3882 
3883       bld.NOT(ff, tmp);
3884 
3885    } else if (devinfo->ver >= 12) {
3886       brw_reg g1 = brw_reg(retype(brw_vec1_grf(1, 1), BRW_TYPE_W));
3887 
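      /* Bit 15 of r1.1 is 0 for front-facing polygons.  ASR by 15 replicates
       * that bit across the word, and the NOT below turns it into ~0 (true)
       * for front-facing and 0 for back-facing.
       */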
3888       brw_reg tmp = bld.vgrf(BRW_TYPE_W);
3889       bld.ASR(tmp, g1, brw_imm_d(15));
3890       bld.NOT(ff, tmp);
3891    } else {
3892       /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
3893        * a boolean result from this (~0/true or 0/false).
3894        *
3895        * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
3896        * this task in only one instruction:
3897        *    - a negation source modifier will flip the bit; and
3898        *    - a W -> D type conversion will sign extend the bit into the high
3899        *      word of the destination.
3900        *
3901        * An ASR 15 fills the low word of the destination.
3902        */
3903       brw_reg g0 = brw_reg(retype(brw_vec1_grf(0, 0), BRW_TYPE_W));
3904 
3905       bld.ASR(ff, negate(g0), brw_imm_d(15));
3906    }
3907 
3908    return ff;
3909 }
3910 
3911 static brw_reg
3912 emit_samplepos_setup(nir_to_brw_state &ntb)
3913 {
3914    const fs_builder &bld = ntb.bld;
3915    fs_visitor &s = ntb.s;
3916 
3917    assert(s.stage == MESA_SHADER_FRAGMENT);
3918    struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
3919 
3920    const fs_builder abld = bld.annotate("compute sample position");
3921    brw_reg pos = abld.vgrf(BRW_TYPE_F, 2);
3922 
3923    if (wm_prog_data->persample_dispatch == INTEL_NEVER) {
3924       /* From ARB_sample_shading specification:
3925        * "When rendering to a non-multisample buffer, or if multisample
3926        *  rasterization is disabled, gl_SamplePosition will always be
3927        *  (0.5, 0.5)."
3928        */
3929       bld.MOV(offset(pos, bld, 0), brw_imm_f(0.5f));
3930       bld.MOV(offset(pos, bld, 1), brw_imm_f(0.5f));
3931       return pos;
3932    }
3933 
3934    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
3935     * mode will be enabled.
3936     *
3937     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
3938     * R31.1:0         Position Offset X/Y for Slot[3:0]
3939     * R31.3:2         Position Offset X/Y for Slot[7:4]
3940     * .....
3941     *
3942     * The X, Y sample positions come in as bytes in the thread payload. So, read
3943     * the positions using vstride=16, width=8, hstride=2.
3944     */
3945    const brw_reg sample_pos_reg =
3946       fetch_payload_reg(abld, s.fs_payload().sample_pos_reg, BRW_TYPE_W);
3947 
3948    for (unsigned i = 0; i < 2; i++) {
3949       brw_reg tmp_d = bld.vgrf(BRW_TYPE_D);
3950       abld.MOV(tmp_d, subscript(sample_pos_reg, BRW_TYPE_B, i));
3951       /* Convert int_sample_pos to floating point */
3952       brw_reg tmp_f = bld.vgrf(BRW_TYPE_F);
3953       abld.MOV(tmp_f, tmp_d);
3954       /* Scale to the range [0, 1] */
3955       abld.MUL(offset(pos, abld, i), tmp_f, brw_imm_f(1 / 16.0f));
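      /* e.g. a payload byte of 8 becomes 8 / 16 = 0.5, the pixel center. */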
3956    }
3957 
3958    if (wm_prog_data->persample_dispatch == INTEL_SOMETIMES) {
3959       check_dynamic_msaa_flag(abld, wm_prog_data,
3960                               INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
3961       for (unsigned i = 0; i < 2; i++) {
3962          set_predicate(BRW_PREDICATE_NORMAL,
3963                        bld.SEL(offset(pos, abld, i), offset(pos, abld, i),
3964                                brw_imm_f(0.5f)));
3965       }
3966    }
3967 
3968    return pos;
3969 }
3970 
3971 static brw_reg
3972 emit_sampleid_setup(nir_to_brw_state &ntb)
3973 {
3974    const intel_device_info *devinfo = ntb.devinfo;
3975    const fs_builder &bld = ntb.bld;
3976    fs_visitor &s = ntb.s;
3977 
3978    assert(s.stage == MESA_SHADER_FRAGMENT);
3979    ASSERTED brw_wm_prog_key *key = (brw_wm_prog_key*) s.key;
3980    struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
3981 
3982    const fs_builder abld = bld.annotate("compute sample id");
3983    brw_reg sample_id = abld.vgrf(BRW_TYPE_UD);
3984 
3985    assert(key->multisample_fbo != INTEL_NEVER);
3986 
3987    /* Sample ID comes in as 4-bit numbers in g1.0:
3988     *
3989     *    15:12 Slot 3 SampleID (only used in SIMD16)
3990     *     11:8 Slot 2 SampleID (only used in SIMD16)
3991     *      7:4 Slot 1 SampleID
3992     *      3:0 Slot 0 SampleID
3993     *
3994     * Each slot corresponds to four channels, so we want to replicate each
3995     * half-byte value to 4 channels in a row:
3996     *
3997     *    dst+0:    .7    .6    .5    .4    .3    .2    .1    .0
3998     *             7:4   7:4   7:4   7:4   3:0   3:0   3:0   3:0
3999     *
4000     *    dst+1:    .7    .6    .5    .4    .3    .2    .1    .0  (if SIMD16)
4001     *           15:12 15:12 15:12 15:12  11:8  11:8  11:8  11:8
4002     *
4003     * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
4004     * channels to read the first byte (7:0), and the second group of 8
4005     * channels to read the second byte (15:8).  Then, we shift right by
4006     * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
4007     * values into place.  Finally, we AND with 0xf to keep the low nibble.
4008     *
4009     *    shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
4010     *    and(16) dst<1>D tmp<8,8,1>W  0xf:W
4011     *
4012     * TODO: These payload bits exist on Gfx7 too, but they appear to always
4013     *       be zero, so this code fails to work.  We should find out why.
4014     */
4015    const brw_reg tmp = abld.vgrf(BRW_TYPE_UW);
4016 
4017    for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
4018       const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
4019       /* According to the "PS Thread Payload for Normal Dispatch"
4020        * pages on the BSpec, the sample ids are stored in R0.8/R1.8
4021        * on gfx20+ and in R1.0/R2.0 on gfx8+.
4022        */
4023       const struct brw_reg id_reg = devinfo->ver >= 20 ? xe2_vec1_grf(i, 8) :
4024                                     brw_vec1_grf(i + 1, 0);
4025       hbld.SHR(offset(tmp, hbld, i),
4026                stride(retype(id_reg, BRW_TYPE_UB), 1, 8, 0),
4027                brw_imm_v(0x44440000));
4028    }
4029 
4030    abld.AND(sample_id, tmp, brw_imm_w(0xf));
4031 
4032    if (key->multisample_fbo == INTEL_SOMETIMES) {
4033       check_dynamic_msaa_flag(abld, wm_prog_data,
4034                               INTEL_MSAA_FLAG_MULTISAMPLE_FBO);
4035       set_predicate(BRW_PREDICATE_NORMAL,
4036                     abld.SEL(sample_id, sample_id, brw_imm_ud(0)));
4037    }
4038 
4039    return sample_id;
4040 }
4041 
4042 static brw_reg
4043 emit_samplemaskin_setup(nir_to_brw_state &ntb)
4044 {
4045    const fs_builder &bld = ntb.bld;
4046    fs_visitor &s = ntb.s;
4047 
4048    assert(s.stage == MESA_SHADER_FRAGMENT);
4049    struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
4050 
4051    /* The HW doesn't provide us with expected values. */
4052    assert(wm_prog_data->coarse_pixel_dispatch != INTEL_ALWAYS);
4053 
4054    brw_reg coverage_mask =
4055       fetch_payload_reg(bld, s.fs_payload().sample_mask_in_reg, BRW_TYPE_UD);
4056 
4057    if (wm_prog_data->persample_dispatch == INTEL_NEVER)
4058       return coverage_mask;
4059 
4060    /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
4061     * and a mask representing which sample is being processed by the
4062     * current shader invocation.
4063     *
4064     * From the OES_sample_variables specification:
4065     * "When per-sample shading is active due to the use of a fragment input
4066     *  qualified by "sample" or due to the use of the gl_SampleID or
4067     *  gl_SamplePosition variables, only the bit for the current sample is
4068     *  set in gl_SampleMaskIn."
4069     */
4070    const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
4071 
4072    if (ntb.system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
4073       ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(ntb);
4074 
4075    brw_reg one = abld.MOV(brw_imm_ud(1));
4076    brw_reg enabled_mask = abld.SHL(one, ntb.system_values[SYSTEM_VALUE_SAMPLE_ID]);
4077    brw_reg mask = abld.AND(enabled_mask, coverage_mask);
4078 
4079    if (wm_prog_data->persample_dispatch == INTEL_ALWAYS)
4080       return mask;
4081 
4082    check_dynamic_msaa_flag(abld, wm_prog_data,
4083                            INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
4084    set_predicate(BRW_PREDICATE_NORMAL, abld.SEL(mask, mask, coverage_mask));
4085 
4086    return mask;
4087 }
4088 
4089 static brw_reg
4090 emit_shading_rate_setup(nir_to_brw_state &ntb)
4091 {
4092    const intel_device_info *devinfo = ntb.devinfo;
4093    const fs_builder &bld = ntb.bld;
4094 
4095    assert(devinfo->ver >= 11);
4096 
4097    struct brw_wm_prog_data *wm_prog_data =
4098       brw_wm_prog_data(bld.shader->prog_data);
4099 
4100    /* Coarse pixel shading size fields overlap with other fields when not in
4101     * coarse pixel dispatch mode, so report 0 in that case.
4102     */
4103    if (wm_prog_data->coarse_pixel_dispatch == INTEL_NEVER)
4104       return brw_imm_ud(0);
4105 
4106    const fs_builder abld = bld.annotate("compute fragment shading rate");
4107 
4108    /* The shading rates provided in the shader are the actual 2D shading
4109     * rate while the SPIR-V built-in is the enum value that has the shading
4110     * rate encoded as a bitfield.  Fortunately, the bitfield value is just
4111     * the shading rate divided by two and shifted.
4112     */
4113 
4114    /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */
4115    brw_reg actual_x = brw_reg(retype(brw_vec1_grf(1, 0), BRW_TYPE_UB));
4116    /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
4117    brw_reg actual_y = byte_offset(actual_x, 1);
4118 
4119    brw_reg int_rate_y = abld.SHR(actual_y, brw_imm_ud(1));
4120    brw_reg int_rate_x = abld.SHR(actual_x, brw_imm_ud(1));
4121 
4122    brw_reg rate = abld.OR(abld.SHL(int_rate_x, brw_imm_ud(2)), int_rate_y);
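   /* The horizontal rate lands in bits 3:2 and the vertical rate in bits
    * 1:0, e.g. a 4x2 coarse pixel size becomes ((4 >> 1) << 2) | (2 >> 1) = 0x9.
    */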
4123 
4124    if (wm_prog_data->coarse_pixel_dispatch == INTEL_ALWAYS)
4125       return rate;
4126 
4127    check_dynamic_msaa_flag(abld, wm_prog_data,
4128                            INTEL_MSAA_FLAG_COARSE_RT_WRITES);
4129    set_predicate(BRW_PREDICATE_NORMAL, abld.SEL(rate, rate, brw_imm_ud(0)));
4130 
4131    return rate;
4132 }
4133 
4134 /* Input data is organized with first the per-primitive values, followed
4135  * by per-vertex values.  The per-vertex will have interpolation information
4136  * associated, so use 4 components for each value.
4137  */
4138 
4139 /* The register location here is relative to the start of the URB
4140  * data.  It will get adjusted to be a real location before
4141  * generate_code() time.
4142  */
4143 static brw_reg
4144 brw_interp_reg(const fs_builder &bld, unsigned location,
4145                unsigned channel, unsigned comp)
4146 {
4147    fs_visitor &s = *bld.shader;
4148    assert(s.stage == MESA_SHADER_FRAGMENT);
4149    assert(BITFIELD64_BIT(location) & ~s.nir->info.per_primitive_inputs);
4150 
4151    const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data);
4152 
4153    assert(prog_data->urb_setup[location] >= 0);
4154    unsigned nr = prog_data->urb_setup[location];
4155    channel += prog_data->urb_setup_channel[location];
4156 
4157    /* Adjust so we start counting from the first per_vertex input. */
4158    assert(nr >= prog_data->num_per_primitive_inputs);
4159    nr -= prog_data->num_per_primitive_inputs;
4160 
4161    const unsigned per_vertex_start = prog_data->num_per_primitive_inputs;
4162    const unsigned regnr = per_vertex_start + (nr * 4) + channel;
4163 
4164    if (s.max_polygons > 1) {
4165       /* In multipolygon dispatch each plane parameter is a
4166        * dispatch_width-wide SIMD vector (see comment in
4167        * assign_urb_setup()), so we need to use offset() instead of
4168        * component() to select the specified parameter.
4169        */
4170       const brw_reg tmp = bld.vgrf(BRW_TYPE_UD);
4171       bld.MOV(tmp, offset(brw_attr_reg(regnr, BRW_TYPE_UD),
4172                           s.dispatch_width, comp));
4173       return retype(tmp, BRW_TYPE_F);
4174    } else {
4175       return component(brw_attr_reg(regnr, BRW_TYPE_F), comp);
4176    }
4177 }
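/* Worked example (illustrative, made-up numbers): with
 * num_per_primitive_inputs == 2, urb_setup[location] == 5,
 * urb_setup_channel[location] == 0, channel == 1 and a single polygon,
 * brw_interp_reg() computes
 *
 *    nr    = 5 - 2 = 3             // index among the per-vertex inputs
 *    regnr = 2 + 3 * 4 + 1 = 15    // 4 components per per-vertex value
 *
 * and returns component 'comp' of attribute register 15.
 */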
4178 
4179 /* The register location here is relative to the start of the URB
4180  * data.  It will get adjusted to be a real location before
4181  * generate_code() time.
4182  */
4183 static brw_reg
4184 brw_per_primitive_reg(const fs_builder &bld, int location, unsigned comp)
4185 {
4186    fs_visitor &s = *bld.shader;
4187    assert(s.stage == MESA_SHADER_FRAGMENT);
4188    assert(BITFIELD64_BIT(location) & s.nir->info.per_primitive_inputs);
4189 
4190    const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data);
4191 
4192    comp += prog_data->urb_setup_channel[location];
4193 
4194    assert(prog_data->urb_setup[location] >= 0);
4195 
4196    const unsigned regnr = prog_data->urb_setup[location] + comp / 4;
4197 
4198    assert(regnr < prog_data->num_per_primitive_inputs);
4199 
4200    if (s.max_polygons > 1) {
4201       /* In multipolygon dispatch each primitive constant is a
4202        * dispatch_width-wide SIMD vector (see comment in
4203        * assign_urb_setup()), so we need to use offset() instead of
4204        * component() to select the specified parameter.
4205        */
4206       const brw_reg tmp = bld.vgrf(BRW_TYPE_UD);
4207       bld.MOV(tmp, offset(brw_attr_reg(regnr, BRW_TYPE_UD),
4208                           s.dispatch_width, comp % 4));
4209       return retype(tmp, BRW_TYPE_F);
4210    } else {
4211       return component(brw_attr_reg(regnr, BRW_TYPE_F), comp % 4);
4212    }
4213 }
4214 
4215 static void
4216 fs_nir_emit_fs_intrinsic(nir_to_brw_state &ntb,
4217                          nir_intrinsic_instr *instr)
4218 {
4219    const intel_device_info *devinfo = ntb.devinfo;
4220    const fs_builder &bld = ntb.bld;
4221    fs_visitor &s = ntb.s;
4222 
4223    assert(s.stage == MESA_SHADER_FRAGMENT);
4224 
4225    brw_reg dest;
4226    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4227       dest = get_nir_def(ntb, instr->def);
4228 
4229    switch (instr->intrinsic) {
4230    case nir_intrinsic_load_front_face:
4231       bld.MOV(retype(dest, BRW_TYPE_D), emit_frontfacing_interpolation(ntb));
4232       break;
4233 
4234    case nir_intrinsic_load_sample_pos:
4235    case nir_intrinsic_load_sample_pos_or_center: {
4236       brw_reg sample_pos = ntb.system_values[SYSTEM_VALUE_SAMPLE_POS];
4237       assert(sample_pos.file != BAD_FILE);
4238       dest.type = sample_pos.type;
4239       bld.MOV(dest, sample_pos);
4240       bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
4241       break;
4242    }
4243 
4244    case nir_intrinsic_load_layer_id:
4245       dest.type = BRW_TYPE_UD;
4246       bld.MOV(dest, fetch_render_target_array_index(bld));
4247       break;
4248 
4249    case nir_intrinsic_is_helper_invocation:
4250       emit_is_helper_invocation(ntb, dest);
4251       break;
4252 
4253    case nir_intrinsic_load_helper_invocation:
4254    case nir_intrinsic_load_sample_mask_in:
4255    case nir_intrinsic_load_sample_id:
4256    case nir_intrinsic_load_frag_shading_rate: {
4257       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
4258       brw_reg val = ntb.system_values[sv];
4259       assert(val.file != BAD_FILE);
4260       dest.type = val.type;
4261       bld.MOV(dest, val);
4262       break;
4263    }
4264 
4265    case nir_intrinsic_store_output: {
4266       const brw_reg src = get_nir_src(ntb, instr->src[0], -1);
4267       const unsigned store_offset = nir_src_as_uint(instr->src[1]);
4268       const unsigned location = nir_intrinsic_base(instr) +
4269          SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION);
4270       const brw_reg new_dest =
4271          offset(retype(alloc_frag_output(ntb, location), src.type),
4272                 bld, nir_intrinsic_component(instr));
4273 
4274       brw_combine_with_vec(bld, new_dest, src, instr->num_components);
4275       break;
4276    }
4277 
4278    case nir_intrinsic_load_output: {
4279       const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
4280                                    BRW_NIR_FRAG_OUTPUT_LOCATION);
4281       assert(l >= FRAG_RESULT_DATA0);
4282       const unsigned load_offset = nir_src_as_uint(instr->src[0]);
4283       const unsigned target = l - FRAG_RESULT_DATA0 + load_offset;
4284       const brw_reg tmp = bld.vgrf(dest.type, 4);
4285 
4286       if (reinterpret_cast<const brw_wm_prog_key *>(s.key)->coherent_fb_fetch)
4287          emit_coherent_fb_read(bld, tmp, target);
4288       else
4289          emit_non_coherent_fb_read(ntb, bld, tmp, target);
4290 
4291       brw_combine_with_vec(bld, dest,
4292                            offset(tmp, bld, nir_intrinsic_component(instr)),
4293                            instr->num_components);
4294       break;
4295    }
4296 
4297    case nir_intrinsic_demote:
4298    case nir_intrinsic_terminate:
4299    case nir_intrinsic_demote_if:
4300    case nir_intrinsic_terminate_if: {
4301       /* We track our discarded pixels in f0.1/f1.0.  By predicating on it, we
4302        * can update just the flag bits that aren't yet discarded.  If there's
4303        * no condition, we emit a CMP of g0 != g0, so all currently executing
4304        * channels will get turned off.
4305        */
4306       fs_inst *cmp = NULL;
4307       if (instr->intrinsic == nir_intrinsic_demote_if ||
4308           instr->intrinsic == nir_intrinsic_terminate_if) {
4309          nir_alu_instr *alu = nir_src_as_alu_instr(instr->src[0]);
4310 
4311          if (alu != NULL &&
4312              alu->op != nir_op_bcsel) {
4313             /* Re-emit the instruction that generated the Boolean value, but
4314              * do not store it.  Since this instruction will be conditional,
4315              * other instructions that want to use the real Boolean value may
4316              * get garbage.  This was a problem for piglit's fs-discard-exit-2
4317              * test.
4318              *
4319              * Ideally we'd detect that the instruction cannot have a
4320              * conditional modifier before emitting the instructions.  Alas,
4321              * that is nigh impossible.  Instead, we're going to assume the
4322              * instruction (or last instruction) generated can have a
4323              * conditional modifier.  If it cannot, fall back to the old-style
4324              * compare, and hope dead code elimination will clean up the
4325              * extra instructions generated.
4326              */
4327             fs_nir_emit_alu(ntb, alu, false);
4328 
4329             cmp = (fs_inst *) s.instructions.get_tail();
4330             if (cmp->conditional_mod == BRW_CONDITIONAL_NONE) {
4331                if (cmp->can_do_cmod())
4332                   cmp->conditional_mod = BRW_CONDITIONAL_Z;
4333                else
4334                   cmp = NULL;
4335             } else {
4336                /* The old sequence that would have been generated is,
4337                 * basically, bool_result == false.  This is equivalent to
4338                 * !bool_result, so negate the old modifier.
4339                 *
4340                 * Unfortunately, we can't do this to most float comparisons
4341              * because of NaN, so we'll have to fall back to the old-style
4342                 * compare.
4343                 *
4344                 * For example, this code (after negation):
4345                 *    (+f1.0) cmp.ge.f1.0(8) null<1>F g30<8,8,1>F     0x0F
4346                 * will provide different results from this:
4347                 *    cmp.l.f0.0(8)   g31<1>F         g30<1,1,0>F     0x0F
4348                 *    (+f1.0) cmp.z.f1.0(8) null<1>D  g31<8,8,1>D     0D
4349                 * because both (NaN >= 0) == false and (NaN < 0) == false.
4350                 *
4351                 * It will still work for == and != though, because
4352                 * (NaN == x) == false and (NaN != x) == true.
4353                 */
4354                if (brw_type_is_float(cmp->src[0].type) &&
4355                    cmp->conditional_mod != BRW_CONDITIONAL_EQ &&
4356                    cmp->conditional_mod != BRW_CONDITIONAL_NEQ) {
4357                   cmp = NULL;
4358                } else {
4359                   cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod);
4360                }
4361             }
4362          }
4363 
4364          if (cmp == NULL) {
4365             cmp = bld.CMP(bld.null_reg_f(), get_nir_src(ntb, instr->src[0]),
4366                           brw_imm_d(0), BRW_CONDITIONAL_Z);
4367          }
4368       } else {
4369          brw_reg some_reg = brw_reg(retype(brw_vec8_grf(0, 0), BRW_TYPE_UW));
4370          cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
4371       }
4372 
4373       cmp->predicate = BRW_PREDICATE_NORMAL;
4374       cmp->flag_subreg = sample_mask_flag_subreg(s);
4375 
4376       fs_inst *jump = bld.emit(BRW_OPCODE_HALT);
4377       jump->flag_subreg = sample_mask_flag_subreg(s);
4378       jump->predicate_inverse = true;
4379 
4380       if (instr->intrinsic == nir_intrinsic_terminate ||
4381           instr->intrinsic == nir_intrinsic_terminate_if) {
4382          jump->predicate = BRW_PREDICATE_NORMAL;
4383       } else {
4384          /* Only jump when the whole quad is demoted.  For historical
4385           * reasons this is also used for discard.
4386           */
4387          jump->predicate = (devinfo->ver >= 20 ? XE2_PREDICATE_ANY :
4388                             BRW_PREDICATE_ALIGN1_ANY4H);
4389       }
4390       break;
4391    }
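   /* Illustrative note (not driver code): the NaN caveat above can be checked
    * with plain C float semantics:
    *
    *    float nan = NAN;         // from <math.h>
    *    nan >= 0.0f;             // false
    *    nan <  0.0f;             // also false, so negating >= is not valid
    *    nan == nan;              // false
    *    nan != nan;              // true, so == / != negate cleanly
    */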
4392 
4393    case nir_intrinsic_load_input:
4394    case nir_intrinsic_load_per_primitive_input: {
4395       /* In Fragment Shaders load_input is used either for flat inputs or
4396        * per-primitive inputs.
4397        */
4398       assert(instr->def.bit_size == 32);
4399       unsigned base = nir_intrinsic_base(instr);
4400       unsigned comp = nir_intrinsic_component(instr);
4401       unsigned num_components = instr->num_components;
4402 
4403       /* TODO(mesh): Multiview. Verify and handle these special cases for Mesh. */
4404 
4405       if (base == VARYING_SLOT_LAYER) {
4406          dest.type = BRW_TYPE_UD;
4407          bld.MOV(dest, fetch_render_target_array_index(bld));
4408          break;
4409       } else if (base == VARYING_SLOT_VIEWPORT) {
4410          dest.type = BRW_TYPE_UD;
4411          bld.MOV(dest, fetch_viewport_index(bld));
4412          break;
4413       }
4414 
4415       if (BITFIELD64_BIT(base) & s.nir->info.per_primitive_inputs) {
4416          assert(base != VARYING_SLOT_PRIMITIVE_INDICES);
4417          for (unsigned int i = 0; i < num_components; i++) {
4418             bld.MOV(offset(dest, bld, i),
4419                     retype(brw_per_primitive_reg(bld, base, comp + i), dest.type));
4420          }
4421       } else {
4422          /* Gfx20+ packs the plane parameters of a single logical
4423           * input in a vec3 format instead of the previously used vec4
4424           * format.
4425           */
4426          const unsigned k = devinfo->ver >= 20 ? 0 : 3;
4427          for (unsigned int i = 0; i < num_components; i++) {
4428             bld.MOV(offset(dest, bld, i),
4429                     retype(brw_interp_reg(bld, base, comp + i, k), dest.type));
4430          }
4431       }
4432       break;
4433    }
4434 
4435    case nir_intrinsic_load_fs_input_interp_deltas: {
4436       assert(s.stage == MESA_SHADER_FRAGMENT);
4437       assert(nir_src_as_uint(instr->src[0]) == 0);
4438       const unsigned base = nir_intrinsic_base(instr);
4439       const unsigned comp = nir_intrinsic_component(instr);
4440       dest.type = BRW_TYPE_F;
4441 
4442       /* Gfx20+ packs the plane parameters of a single logical
4443        * input in a vec3 format instead of the previously used vec4
4444        * format.
4445        */
4446       if (devinfo->ver >= 20) {
4447          bld.MOV(offset(dest, bld, 0), brw_interp_reg(bld, base, comp, 0));
4448          bld.MOV(offset(dest, bld, 1), brw_interp_reg(bld, base, comp, 2));
4449          bld.MOV(offset(dest, bld, 2), brw_interp_reg(bld, base, comp, 1));
4450       } else {
4451          bld.MOV(offset(dest, bld, 0), brw_interp_reg(bld, base, comp, 3));
4452          bld.MOV(offset(dest, bld, 1), brw_interp_reg(bld, base, comp, 1));
4453          bld.MOV(offset(dest, bld, 2), brw_interp_reg(bld, base, comp, 0));
4454       }
4455 
4456       break;
4457    }
4458 
4459    case nir_intrinsic_load_barycentric_pixel:
4460    case nir_intrinsic_load_barycentric_centroid:
4461    case nir_intrinsic_load_barycentric_sample: {
4462       /* Use the delta_xy values computed from the payload */
4463       enum intel_barycentric_mode bary = brw_barycentric_mode(
4464          reinterpret_cast<const brw_wm_prog_key *>(s.key), instr);
4465       const brw_reg srcs[] = { offset(s.delta_xy[bary], bld, 0),
4466                               offset(s.delta_xy[bary], bld, 1) };
4467       bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
4468       break;
4469    }
4470 
4471    case nir_intrinsic_load_barycentric_at_sample: {
4472       const glsl_interp_mode interpolation =
4473          (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
4474 
4475       if (devinfo->ver >= 20) {
4476          emit_pixel_interpolater_alu_at_sample(
4477             bld, dest, retype(get_nir_src(ntb, instr->src[0]),
4478                               BRW_TYPE_UD),
4479             interpolation);
4480 
4481       } else {
4482          const brw_reg sample_src = retype(get_nir_src(ntb, instr->src[0]),
4483                                            BRW_TYPE_UD);
4484          const brw_reg sample_id = bld.emit_uniformize(sample_src);
4485          const brw_reg msg_data = component(bld.group(8, 0).vgrf(BRW_TYPE_UD), 0);
4486 
4487          bld.exec_all().group(1, 0).SHL(msg_data, sample_id, brw_imm_ud(4u));
4488 
4489          brw_reg flag_reg;
4490          struct brw_wm_prog_key *wm_prog_key = (struct brw_wm_prog_key *) s.key;
4491          if (wm_prog_key->multisample_fbo == INTEL_SOMETIMES) {
4492             struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
4493 
4494             check_dynamic_msaa_flag(bld.exec_all().group(8, 0),
4495                                     wm_prog_data,
4496                                     INTEL_MSAA_FLAG_MULTISAMPLE_FBO);
4497             flag_reg = brw_flag_reg(0, 0);
4498          }
4499 
4500          emit_pixel_interpolater_send(bld,
4501                                       FS_OPCODE_INTERPOLATE_AT_SAMPLE,
4502                                       dest,
4503                                       brw_reg(), /* src */
4504                                       msg_data,
4505                                       flag_reg,
4506                                       interpolation);
4507       }
4508       break;
4509    }
4510 
4511    case nir_intrinsic_load_barycentric_at_offset: {
4512       const glsl_interp_mode interpolation =
4513          (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
4514 
4515       if (devinfo->ver >= 20) {
4516          emit_pixel_interpolater_alu_at_offset(
4517             bld, dest,
4518             retype(get_nir_src(ntb, instr->src[0]), BRW_TYPE_F),
4519             interpolation);
4520 
4521       } else if (nir_const_value *const_offset = nir_src_as_const_value(instr->src[0])) {
4522          assert(nir_src_bit_size(instr->src[0]) == 32);
4523          unsigned off_x = const_offset[0].u32 & 0xf;
4524          unsigned off_y = const_offset[1].u32 & 0xf;
4525 
4526          emit_pixel_interpolater_send(bld,
4527                                       FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
4528                                       dest,
4529                                       brw_reg(), /* src */
4530                                       brw_imm_ud(off_x | (off_y << 4)),
4531                                       brw_reg(), /* flag_reg */
4532                                       interpolation);
4533       } else {
4534          brw_reg src = retype(get_nir_src(ntb, instr->src[0], -1), BRW_TYPE_D);
4535          const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
4536          emit_pixel_interpolater_send(bld,
4537                                       opcode,
4538                                       dest,
4539                                       src,
4540                                       brw_imm_ud(0u),
4541                                       brw_reg(), /* flag_reg */
4542                                       interpolation);
4543       }
4544       break;
4545    }
4546 
4547    case nir_intrinsic_load_frag_coord: {
4548       brw_reg comps[4] = { s.pixel_x, s.pixel_y, s.pixel_z, s.wpos_w };
4549       bld.VEC(dest, comps, 4);
4550       break;
4551    }
4552 
4553    case nir_intrinsic_load_interpolated_input: {
4554       assert(instr->src[0].ssa &&
4555              instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
4556       nir_intrinsic_instr *bary_intrinsic =
4557          nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
4558       nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
4559       brw_reg dst_xy;
4560 
4561       if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
4562           bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
4563          /* Use the result of the PI message. */
4564          dst_xy = retype(get_nir_src(ntb, instr->src[0]), BRW_TYPE_F);
4565       } else {
4566          /* Use the delta_xy values computed from the payload */
4567          enum intel_barycentric_mode bary = brw_barycentric_mode(
4568             reinterpret_cast<const brw_wm_prog_key *>(s.key), bary_intrinsic);
4569          dst_xy = s.delta_xy[bary];
4570       }
4571 
4572       for (unsigned int i = 0; i < instr->num_components; i++) {
4573          brw_reg interp =
4574             brw_interp_reg(bld, nir_intrinsic_base(instr),
4575                            nir_intrinsic_component(instr) + i, 0);
4576          interp.type = BRW_TYPE_F;
4577          dest.type = BRW_TYPE_F;
4578 
4579          bld.PLN(offset(dest, bld, i), interp, dst_xy);
4580       }
4581       break;
4582    }
4583 
4584    default:
4585       fs_nir_emit_intrinsic(ntb, bld, instr);
4586       break;
4587    }
4588 }
4589 
4590 static unsigned
4591 brw_workgroup_size(fs_visitor &s)
4592 {
4593    assert(gl_shader_stage_uses_workgroup(s.stage));
4594    assert(!s.nir->info.workgroup_size_variable);
4595    const struct brw_cs_prog_data *cs = brw_cs_prog_data(s.prog_data);
4596    return cs->local_size[0] * cs->local_size[1] * cs->local_size[2];
4597 }
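/* Example (illustrative): for a compute shader with local_size = 8x4x1,
 * brw_workgroup_size() returns 8 * 4 * 1 = 32.  In the barrier handling
 * below, a SIMD32 dispatch then fits the whole workgroup in one HW thread
 * (32 <= dispatch_width), so the barrier is replaced by a scheduling fence
 * that generates no code.
 */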
4598 
4599 static void
4600 fs_nir_emit_cs_intrinsic(nir_to_brw_state &ntb,
4601                          nir_intrinsic_instr *instr)
4602 {
4603    const intel_device_info *devinfo = ntb.devinfo;
4604    const fs_builder &bld = ntb.bld;
4605    fs_visitor &s = ntb.s;
4606 
4607    assert(gl_shader_stage_uses_workgroup(s.stage));
4608    struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(s.prog_data);
4609 
4610    brw_reg dest;
4611    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4612       dest = get_nir_def(ntb, instr->def);
4613 
4614    const fs_builder xbld = dest.is_scalar ? bld.scalar_group() : bld;
4615 
4616    switch (instr->intrinsic) {
4617    case nir_intrinsic_barrier:
4618       if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
4619          fs_nir_emit_intrinsic(ntb, bld, instr);
4620       if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
4621          /* The whole workgroup fits in a single HW thread, so all the
4622           * invocations are already executed in lock-step.  Instead of an actual
4623           * barrier, just emit a scheduling fence, which generates no code.
4624           */
4625          if (!s.nir->info.workgroup_size_variable &&
4626              brw_workgroup_size(s) <= s.dispatch_width) {
4627             bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE);
4628             break;
4629          }
4630 
4631          emit_barrier(ntb);
4632          cs_prog_data->uses_barrier = true;
4633       }
4634       break;
4635 
4636    case nir_intrinsic_load_inline_data_intel: {
4637       const cs_thread_payload &payload = s.cs_payload();
4638       unsigned inline_stride = brw_type_size_bytes(dest.type);
4639       for (unsigned c = 0; c < instr->def.num_components; c++) {
4640          xbld.MOV(offset(dest, xbld, c),
4641                   retype(
4642                      byte_offset(payload.inline_parameter,
4643                                  nir_intrinsic_base(instr) +
4644                                  c * inline_stride),
4645                      dest.type));
4646       }
4647       break;
4648    }
4649 
4650    case nir_intrinsic_load_subgroup_id:
4651       s.cs_payload().load_subgroup_id(bld, dest);
4652       break;
4653 
4654    case nir_intrinsic_load_local_invocation_id:
4655       /* This is only used for hardware generated local IDs. */
4656       assert(cs_prog_data->generate_local_id);
4657 
4658       dest.type = BRW_TYPE_UD;
4659 
4660       for (unsigned i = 0; i < 3; i++)
4661          bld.MOV(offset(dest, bld, i), s.cs_payload().local_invocation_id[i]);
4662       break;
4663 
4664    case nir_intrinsic_load_workgroup_id: {
4665       brw_reg val = ntb.system_values[SYSTEM_VALUE_WORKGROUP_ID];
4666       const fs_builder ubld = bld.scalar_group();
4667 
4668       assert(val.file != BAD_FILE);
4669       assert(val.is_scalar);
4670 
4671       dest.type = val.type;
4672       for (unsigned i = 0; i < 3; i++)
4673          ubld.MOV(offset(dest, ubld, i), offset(val, ubld, i));
4674       break;
4675    }
4676 
4677    case nir_intrinsic_load_num_workgroups: {
4678       assert(instr->def.bit_size == 32);
4679 
4680       cs_prog_data->uses_num_work_groups = true;
4681 
4682       brw_reg srcs[MEMORY_LOGICAL_NUM_SRCS];
4683       srcs[MEMORY_LOGICAL_OPCODE] = brw_imm_ud(LSC_OP_LOAD);
4684       srcs[MEMORY_LOGICAL_MODE] = brw_imm_ud(MEMORY_MODE_UNTYPED);
4685       srcs[MEMORY_LOGICAL_BINDING_TYPE] = brw_imm_ud(LSC_ADDR_SURFTYPE_BTI);
4686       srcs[MEMORY_LOGICAL_BINDING] = brw_imm_ud(0);
4687       srcs[MEMORY_LOGICAL_ADDRESS] = brw_imm_ud(0);
4688       srcs[MEMORY_LOGICAL_COORD_COMPONENTS] = brw_imm_ud(1);
4689       srcs[MEMORY_LOGICAL_ALIGNMENT] = brw_imm_ud(4);
4690       srcs[MEMORY_LOGICAL_DATA_SIZE] = brw_imm_ud(LSC_DATA_SIZE_D32);
4691       srcs[MEMORY_LOGICAL_COMPONENTS] = brw_imm_ud(3);
4692       srcs[MEMORY_LOGICAL_FLAGS] = brw_imm_ud(0);
4693 
4694       fs_inst *inst =
4695          bld.emit(SHADER_OPCODE_MEMORY_LOAD_LOGICAL,
4696                   dest, srcs, MEMORY_LOGICAL_NUM_SRCS);
4697       inst->size_written = 3 * s.dispatch_width * 4;
4698       break;
4699    }
4700 
4701    case nir_intrinsic_load_workgroup_size: {
4702       /* Should have been lowered by brw_nir_lower_cs_intrinsics() or
4703        * iris_setup_uniforms() for the variable group size case.
4704        */
4705       unreachable("Should have been lowered");
4706       break;
4707    }
4708 
4709    case nir_intrinsic_dpas_intel: {
4710       const unsigned sdepth = nir_intrinsic_systolic_depth(instr);
4711       const unsigned rcount = nir_intrinsic_repeat_count(instr);
4712 
4713       const brw_reg_type dest_type =
4714          brw_type_for_nir_type(devinfo, nir_intrinsic_dest_type(instr));
4715       const brw_reg_type src_type =
4716          brw_type_for_nir_type(devinfo, nir_intrinsic_src_type(instr));
4717 
4718       dest = retype(dest, dest_type);
4719       brw_reg src0 = retype(get_nir_src(ntb, instr->src[0]), dest_type);
4720 
4721       fs_builder bld16 = bld.exec_all().group(16, 0);
4722       fs_builder bldn = devinfo->ver >= 20 ? bld16 : bld.exec_all().group(8, 0);
4723 
4724       bldn.DPAS(dest,
4725                 src0,
4726                 retype(get_nir_src(ntb, instr->src[2]), src_type),
4727                 retype(get_nir_src(ntb, instr->src[1]), src_type),
4728                 sdepth,
4729                 rcount)
4730          ->saturate = nir_intrinsic_saturate(instr);
4731 
4732       cs_prog_data->uses_systolic = true;
4733       break;
4734    }
4735 
4736    default:
4737       fs_nir_emit_intrinsic(ntb, bld, instr);
4738       break;
4739    }
4740 }
4741 
4742 static void
4743 emit_rt_lsc_fence(const fs_builder &bld,
4744                   enum lsc_fence_scope scope,
4745                   enum lsc_flush_type flush_type)
4746 {
4747    const intel_device_info *devinfo = bld.shader->devinfo;
4748 
4749    const fs_builder ubld = bld.exec_all().group(8, 0);
4750    brw_reg tmp = ubld.vgrf(BRW_TYPE_UD);
4751    fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, tmp,
4752                              brw_imm_ud(0) /* desc */,
4753                              brw_imm_ud(0) /* ex_desc */,
4754                              brw_vec8_grf(0, 0) /* payload */);
4755    send->sfid = GFX12_SFID_UGM;
4756    send->desc = lsc_fence_msg_desc(devinfo, scope, flush_type, true);
4757    send->mlen = reg_unit(devinfo); /* g0 header */
4758    send->ex_mlen = 0;
4759    /* Temp write for scheduling */
4760    send->size_written = REG_SIZE * reg_unit(devinfo);
4761    send->send_has_side_effects = true;
4762 
4763    ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), tmp);
4764 }
4765 
4766 
4767 static void
4768 fs_nir_emit_bs_intrinsic(nir_to_brw_state &ntb,
4769                          nir_intrinsic_instr *instr)
4770 {
4771    const fs_builder &bld = ntb.bld;
4772    fs_visitor &s = ntb.s;
4773 
4774    assert(brw_shader_stage_is_bindless(s.stage));
4775    const bs_thread_payload &payload = s.bs_payload();
4776 
4777    brw_reg dest;
4778    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4779       dest = get_nir_def(ntb, instr->def);
4780 
4781    const fs_builder xbld = dest.is_scalar ? bld.scalar_group() : bld;
4782 
4783    switch (instr->intrinsic) {
4784    case nir_intrinsic_load_btd_global_arg_addr_intel:
4785       xbld.MOV(dest, retype(payload.global_arg_ptr, dest.type));
4786       break;
4787 
4788    case nir_intrinsic_load_btd_local_arg_addr_intel:
4789       xbld.MOV(dest, retype(payload.local_arg_ptr, dest.type));
4790       break;
4791 
4792    case nir_intrinsic_load_btd_shader_type_intel:
4793       payload.load_shader_type(xbld, dest);
4794       break;
4795 
4796    default:
4797       fs_nir_emit_intrinsic(ntb, bld, instr);
4798       break;
4799    }
4800 }
4801 
4802 static brw_reduce_op
4803 brw_reduce_op_for_nir_reduction_op(nir_op op)
4804 {
4805    switch (op) {
4806    case nir_op_iadd: return BRW_REDUCE_OP_ADD;
4807    case nir_op_fadd: return BRW_REDUCE_OP_ADD;
4808    case nir_op_imul: return BRW_REDUCE_OP_MUL;
4809    case nir_op_fmul: return BRW_REDUCE_OP_MUL;
4810    case nir_op_imin: return BRW_REDUCE_OP_MIN;
4811    case nir_op_umin: return BRW_REDUCE_OP_MIN;
4812    case nir_op_fmin: return BRW_REDUCE_OP_MIN;
4813    case nir_op_imax: return BRW_REDUCE_OP_MAX;
4814    case nir_op_umax: return BRW_REDUCE_OP_MAX;
4815    case nir_op_fmax: return BRW_REDUCE_OP_MAX;
4816    case nir_op_iand: return BRW_REDUCE_OP_AND;
4817    case nir_op_ior:  return BRW_REDUCE_OP_OR;
4818    case nir_op_ixor: return BRW_REDUCE_OP_XOR;
4819    default:
4820       unreachable("Invalid reduction operation");
4821    }
4822 }
4823 
4824 static brw_reg
4825 get_nir_image_intrinsic_image(nir_to_brw_state &ntb, const brw::fs_builder &bld,
4826                               nir_intrinsic_instr *instr)
4827 {
4828    brw_reg surf_index = get_nir_src_imm(ntb, instr->src[0]);
4829    enum brw_reg_type type = brw_type_with_size(BRW_TYPE_UD,
4830                                                brw_type_size_bits(surf_index.type));
4831 
4832    return bld.emit_uniformize(retype(surf_index, type));
4833 }
4834 
4835 static brw_reg
4836 get_nir_buffer_intrinsic_index(nir_to_brw_state &ntb, const brw::fs_builder &bld,
4837                                nir_intrinsic_instr *instr, bool *no_mask_handle = NULL)
4838 {
4839    /* SSBO stores are weird in that their index is in src[1] */
4840    const bool is_store =
4841       instr->intrinsic == nir_intrinsic_store_ssbo ||
4842       instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
4843    nir_src src = is_store ? instr->src[1] : instr->src[0];
4844 
4845    brw_reg surf_index = get_nir_src_imm(ntb, src);
4846 
4847    if (no_mask_handle)
4848       *no_mask_handle = surf_index.is_scalar || surf_index.file == IMM;
4849 
4850    enum brw_reg_type type = brw_type_with_size(BRW_TYPE_UD,
4851                                                brw_type_size_bits(surf_index.type));
4852 
4853    return bld.emit_uniformize(retype(surf_index, type));
4854 }
4855 
4856 /**
4857  * The offsets we get from NIR act as if each SIMD channel has its own blob
4858  * of contiguous space.  However, if we actually place each SIMD channel in
4859  * its own space, we end up with terrible cache performance because each SIMD
4860  * channel accesses a different cache line even when they're all accessing the
4861  * same byte offset.  To deal with this problem, we swizzle the address using
4862  * a simple algorithm which ensures that any time a SIMD message reads or
4863  * writes the same address, it's all in the same cache line.  We have to keep
4864  * the bottom two bits fixed so that we can read/write up to a dword at a time
4865  * and the individual element is contiguous.  We do this by splitting the
4866  * address as follows:
4867  *
4868  *    31                             4-6           2          0
4869  *    +-------------------------------+------------+----------+
4870  *    |        Hi address bits        | chan index | addr low |
4871  *    +-------------------------------+------------+----------+
4872  *
4873  * In other words, the bottom two address bits stay, and the top 30 get
4874  * shifted up so that we can stick the SIMD channel index in the middle.  This
4875  * way, we can access 8, 16, or 32-bit elements and, when accessing a 32-bit
4876  * element at the same logical offset, the scratch read/write instruction acts
4877  * on contiguous elements and we get good cache locality.
4878  */
4879 static brw_reg
4880 swizzle_nir_scratch_addr(nir_to_brw_state &ntb,
4881                          const brw::fs_builder &bld,
4882                          const nir_src &nir_addr_src,
4883                          bool in_dwords)
4884 {
4885    fs_visitor &s = ntb.s;
4886 
4887    const brw_reg chan_index = bld.LOAD_SUBGROUP_INVOCATION();
4888    const unsigned chan_index_bits = ffs(s.dispatch_width) - 1;
4889 
4890    if (nir_src_is_const(nir_addr_src)) {
4891       unsigned nir_addr = nir_src_as_uint(nir_addr_src);
4892       if (in_dwords) {
4893          /* In this case, we know the address is aligned to a DWORD and we want
4894           * the final address in DWORDs.
4895           */
4896          return bld.OR(chan_index,
4897                        brw_imm_ud(nir_addr << (chan_index_bits - 2)));
4898       } else {
4899          /* This case is substantially more annoying because we have to pay
4900           * attention to those pesky two bottom bits.
4901           */
4902          unsigned addr_hi = (nir_addr & ~0x3u) << chan_index_bits;
4903          unsigned addr_lo = (nir_addr &  0x3u);
4904 
4905          return bld.OR(bld.SHL(chan_index, brw_imm_ud(2)),
4906                        brw_imm_ud(addr_lo | addr_hi));
4907       }
4908    }
4909 
4910    const brw_reg nir_addr =
4911       retype(get_nir_src(ntb, nir_addr_src), BRW_TYPE_UD);
4912 
4913    if (in_dwords) {
4914       /* In this case, we know the address is aligned to a DWORD and we want
4915        * the final address in DWORDs.
4916        */
4917       return bld.OR(bld.SHL(nir_addr, brw_imm_ud(chan_index_bits - 2)),
4918                     chan_index);
4919    } else {
4920       /* This case is substantially more annoying because we have to pay
4921        * attention to those pesky two bottom bits.
4922        */
4923       brw_reg chan_addr = bld.SHL(chan_index, brw_imm_ud(2));
4924       brw_reg addr_bits =
4925          bld.OR(bld.AND(nir_addr, brw_imm_ud(0x3u)),
4926                 bld.SHL(bld.AND(nir_addr, brw_imm_ud(~0x3u)),
4927                         brw_imm_ud(chan_index_bits)));
4928       return bld.OR(addr_bits, chan_addr);
4929    }
4930 }
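/* Worked example (illustrative, not driver code): for a SIMD16 dispatch
 * (chan_index_bits == 4) and a constant byte address of 0x1c, the non-dword
 * path above computes
 *
 *    addr_hi = (0x1c & ~0x3u) << 4 = 0x1c0
 *    addr_lo =  0x1c &  0x3u       = 0x0
 *
 * so channel N ends up accessing 0x1c0 | (N << 2).  All sixteen channels
 * that touch byte offset 0x1c therefore land in one 64-byte aligned span
 * instead of sixteen different cache lines.
 */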
4931 
4932 static unsigned
4933 choose_block_size_dwords(const intel_device_info *devinfo, unsigned dwords)
4934 {
4935    const unsigned min_block = 8;
4936    const unsigned max_block = devinfo->has_lsc ? 64 : 32;
4937 
4938    const unsigned block = 1 << util_logbase2(dwords);
4939 
4940    return CLAMP(block, min_block, max_block);
4941 }
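/* Example (illustrative): choose_block_size_dwords() rounds down to a power
 * of two and clamps to the hardware limits, e.g. 24 dwords -> 16, 200 dwords
 * -> 64 with LSC (32 on pre-LSC platforms), and anything below 8 -> 8.
 */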
4942 
4943 static brw_reg
4944 increment_a64_address(const fs_builder &_bld, brw_reg address, uint32_t v, bool use_no_mask)
4945 {
4946    const fs_builder bld = use_no_mask ? _bld.exec_all().group(8, 0) : _bld;
4947 
4948    if (bld.shader->devinfo->has_64bit_int) {
4949       struct brw_reg imm = brw_imm_reg(address.type);
4950       imm.u64 = v;
4951       return bld.ADD(address, imm);
4952    } else {
4953       brw_reg dst = bld.vgrf(BRW_TYPE_UQ);
4954       brw_reg dst_low = subscript(dst, BRW_TYPE_UD, 0);
4955       brw_reg dst_high = subscript(dst, BRW_TYPE_UD, 1);
4956       brw_reg src_low = subscript(address, BRW_TYPE_UD, 0);
4957       brw_reg src_high = subscript(address, BRW_TYPE_UD, 1);
4958 
4959       /* Add low and if that overflows, add carry to high. */
4960       bld.ADD(dst_low, src_low, brw_imm_ud(v))->conditional_mod = BRW_CONDITIONAL_O;
4961       bld.ADD(dst_high, src_high, brw_imm_ud(0x1))->predicate = BRW_PREDICATE_NORMAL;
4962       return dst_low;
4963    }
4964 }
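/* Illustrative sketch (not driver code): on platforms without native 64-bit
 * integer adds, the low/high split above is the usual add-with-carry idiom,
 * roughly:
 *
 *    uint32_t new_lo = lo + v;
 *    if (new_lo < lo)      // unsigned overflow of the low dword
 *       hi += 1;
 *
 * except that the carry is detected with the conditional-modifier/predicate
 * pair on the two ADDs instead of an explicit compare.
 */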
4965 
4966 static brw_reg
4967 emit_fence(const fs_builder &bld, enum opcode opcode,
4968            uint8_t sfid, uint32_t desc,
4969            bool commit_enable, uint8_t bti)
4970 {
4971    assert(opcode == SHADER_OPCODE_INTERLOCK ||
4972           opcode == SHADER_OPCODE_MEMORY_FENCE);
4973 
4974    brw_reg dst = bld.vgrf(BRW_TYPE_UD);
4975    fs_inst *fence = bld.emit(opcode, dst, brw_vec8_grf(0, 0),
4976                              brw_imm_ud(commit_enable),
4977                              brw_imm_ud(bti));
4978    fence->sfid = sfid;
4979    fence->desc = desc;
4980 
4981    return dst;
4982 }
4983 
4984 static uint32_t
4985 lsc_fence_descriptor_for_intrinsic(const struct intel_device_info *devinfo,
4986                                    nir_intrinsic_instr *instr)
4987 {
4988    assert(devinfo->has_lsc);
4989 
4990    enum lsc_fence_scope scope = LSC_FENCE_LOCAL;
4991    enum lsc_flush_type flush_type = LSC_FLUSH_TYPE_NONE;
4992 
4993    if (nir_intrinsic_has_memory_scope(instr)) {
4994       switch (nir_intrinsic_memory_scope(instr)) {
4995       case SCOPE_DEVICE:
4996       case SCOPE_QUEUE_FAMILY:
4997          scope = LSC_FENCE_TILE;
4998          flush_type = LSC_FLUSH_TYPE_EVICT;
4999          break;
5000       case SCOPE_WORKGROUP:
5001          scope = LSC_FENCE_THREADGROUP;
5002          break;
5003       case SCOPE_SHADER_CALL:
5004       case SCOPE_INVOCATION:
5005       case SCOPE_SUBGROUP:
5006       case SCOPE_NONE:
5007          break;
5008       }
5009    } else {
5010       /* No scope defined. */
5011       scope = LSC_FENCE_TILE;
5012       flush_type = LSC_FLUSH_TYPE_EVICT;
5013    }
5014    return lsc_fence_msg_desc(devinfo, scope, flush_type, true);
5015 }
5016 
5017 /**
5018  * Create a MOV to read the timestamp register.
5019  */
5020 static brw_reg
5021 get_timestamp(const fs_builder &bld)
5022 {
5023    fs_visitor &s = *bld.shader;
5024 
5025    brw_reg ts = brw_reg(retype(brw_vec4_reg(ARF,
5026                                           BRW_ARF_TIMESTAMP, 0), BRW_TYPE_UD));
5027 
5028    brw_reg dst = brw_vgrf(s.alloc.allocate(1), BRW_TYPE_UD);
5029 
5030    /* We want to read the 3 fields we care about even if it's not enabled in
5031     * the dispatch.
5032     */
5033    bld.group(4, 0).exec_all().MOV(dst, ts);
5034 
5035    return dst;
5036 }
5037 
5038 static unsigned
5039 component_from_intrinsic(nir_intrinsic_instr *instr)
5040 {
5041    if (nir_intrinsic_has_component(instr))
5042       return nir_intrinsic_component(instr);
5043    else
5044       return 0;
5045 }
5046 
5047 static void
5048 adjust_handle_and_offset(const fs_builder &bld,
5049                          brw_reg &urb_handle,
5050                          unsigned &urb_global_offset)
5051 {
5052    /* Make sure that the URB global offset stays below 2048 (2^11), because
5053     * larger values cannot be encoded in the Message Descriptor offset field.
5054     */
5055    unsigned adjustment = (urb_global_offset >> 11) << 11;
5056 
5057    if (adjustment) {
5058       fs_builder ubld8 = bld.group(8, 0).exec_all();
5059       /* Allocate new register to not overwrite the shared URB handle. */
5060       urb_handle = ubld8.ADD(urb_handle, brw_imm_ud(adjustment));
5061       urb_global_offset -= adjustment;
5062    }
5063 }
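/* Example (illustrative): with urb_global_offset == 2500, the adjustment is
 * (2500 >> 11) << 11 == 2048, so the URB handle is advanced by 2048 and the
 * remaining per-message offset becomes 452, which fits in the 11-bit field.
 */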
5064 
5065 static void
5066 emit_urb_direct_vec4_write(const fs_builder &bld,
5067                            unsigned urb_global_offset,
5068                            const brw_reg &src,
5069                            brw_reg urb_handle,
5070                            unsigned dst_comp_offset,
5071                            unsigned comps,
5072                            unsigned mask)
5073 {
5074    for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
5075       fs_builder bld8 = bld.group(8, q);
5076 
5077       brw_reg payload_srcs[8];
5078       unsigned length = 0;
5079 
5080       for (unsigned i = 0; i < dst_comp_offset; i++)
5081          payload_srcs[length++] = reg_undef;
5082 
5083       for (unsigned c = 0; c < comps; c++)
5084          payload_srcs[length++] = quarter(offset(src, bld, c), q);
5085 
5086       brw_reg srcs[URB_LOGICAL_NUM_SRCS];
5087       srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
5088       srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16);
5089       srcs[URB_LOGICAL_SRC_DATA] = brw_vgrf(bld.shader->alloc.allocate(length),
5090                                             BRW_TYPE_F);
5091       srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length);
5092       bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
5093 
5094       fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
5095                                 reg_undef, srcs, ARRAY_SIZE(srcs));
5096       inst->offset = urb_global_offset;
5097       assert(inst->offset < 2048);
5098    }
5099 }
5100 
5101 static void
5102 emit_urb_direct_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
5103                        const brw_reg &src, brw_reg urb_handle)
5104 {
5105    assert(nir_src_bit_size(instr->src[0]) == 32);
5106 
5107    nir_src *offset_nir_src = nir_get_io_offset_src(instr);
5108    assert(nir_src_is_const(*offset_nir_src));
5109 
5110    const unsigned comps = nir_src_num_components(instr->src[0]);
5111    assert(comps <= 4);
5112 
5113    const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
5114                                      nir_src_as_uint(*offset_nir_src) +
5115                                      component_from_intrinsic(instr);
5116 
5117    /* URB writes are vec4 aligned but the intrinsic offsets are in dwords.
5118     * We can write up to 8 dwords, so single vec4 write is enough.
5119     */
5120    const unsigned comp_shift = offset_in_dwords % 4;
5121    const unsigned mask = nir_intrinsic_write_mask(instr) << comp_shift;
5122 
5123    unsigned urb_global_offset = offset_in_dwords / 4;
5124    adjust_handle_and_offset(bld, urb_handle, urb_global_offset);
5125 
5126    emit_urb_direct_vec4_write(bld, urb_global_offset, src, urb_handle,
5127                               comp_shift, comps, mask);
5128 }
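/* Worked example (illustrative): a 2-component store whose base + offset +
 * component adds up to 6 dwords gives
 *
 *    comp_shift        = 6 % 4 = 2
 *    urb_global_offset = 6 / 4 = 1
 *    mask              = write_mask << 2
 *
 * i.e. the data lands in the z/w channels of the second vec4 slot.
 */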
5129 
5130 static void
5131 emit_urb_direct_vec4_write_xe2(const fs_builder &bld,
5132                                unsigned offset_in_bytes,
5133                                const brw_reg &src,
5134                                brw_reg urb_handle,
5135                                unsigned comps,
5136                                unsigned mask)
5137 {
5138    const struct intel_device_info *devinfo = bld.shader->devinfo;
5139    const unsigned runit = reg_unit(devinfo);
5140    const unsigned write_size = 8 * runit;
5141 
5142    if (offset_in_bytes > 0) {
5143       fs_builder bldall = bld.group(write_size, 0).exec_all();
5144       urb_handle = bldall.ADD(urb_handle, brw_imm_ud(offset_in_bytes));
5145    }
5146 
5147    for (unsigned q = 0; q < bld.dispatch_width() / write_size; q++) {
5148       fs_builder hbld = bld.group(write_size, q);
5149 
5150       assert(comps <= 4);
5151       brw_reg payload_srcs[4];
5152 
5153       for (unsigned c = 0; c < comps; c++)
5154          payload_srcs[c] = horiz_offset(offset(src, bld, c), write_size * q);
5155 
5156       brw_reg srcs[URB_LOGICAL_NUM_SRCS];
5157       srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
5158       srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16);
5159       int nr = bld.shader->alloc.allocate(comps * runit);
5160       srcs[URB_LOGICAL_SRC_DATA] = brw_vgrf(nr, BRW_TYPE_F);
5161       srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(comps);
5162       hbld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, comps, 0);
5163 
5164       hbld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
5165                 reg_undef, srcs, ARRAY_SIZE(srcs));
5166    }
5167 }
5168 
5169 static void
5170 emit_urb_direct_writes_xe2(const fs_builder &bld, nir_intrinsic_instr *instr,
5171                            const brw_reg &src, brw_reg urb_handle)
5172 {
5173    assert(nir_src_bit_size(instr->src[0]) == 32);
5174 
5175    nir_src *offset_nir_src = nir_get_io_offset_src(instr);
5176    assert(nir_src_is_const(*offset_nir_src));
5177 
5178    const unsigned comps = nir_src_num_components(instr->src[0]);
5179    assert(comps <= 4);
5180 
5181    const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
5182                                      nir_src_as_uint(*offset_nir_src) +
5183                                      component_from_intrinsic(instr);
5184 
5185    const unsigned mask = nir_intrinsic_write_mask(instr);
5186 
5187    emit_urb_direct_vec4_write_xe2(bld, offset_in_dwords * 4, src,
5188                                     urb_handle, comps, mask);
5189 }
5190 
5191 static void
5192 emit_urb_indirect_vec4_write(const fs_builder &bld,
5193                              const brw_reg &offset_src,
5194                              unsigned base,
5195                              const brw_reg &src,
5196                              brw_reg urb_handle,
5197                              unsigned dst_comp_offset,
5198                              unsigned comps,
5199                              unsigned mask)
5200 {
5201    for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
5202       fs_builder bld8 = bld.group(8, q);
5203 
5204       /* offset is always positive, so signedness doesn't matter */
5205       assert(offset_src.type == BRW_TYPE_D || offset_src.type == BRW_TYPE_UD);
5206       brw_reg qtr = bld8.MOV(quarter(retype(offset_src, BRW_TYPE_UD), q));
5207       brw_reg off = bld8.SHR(bld8.ADD(qtr, brw_imm_ud(base)), brw_imm_ud(2));
5208 
5209       brw_reg payload_srcs[8];
5210       unsigned length = 0;
5211 
5212       for (unsigned i = 0; i < dst_comp_offset; i++)
5213          payload_srcs[length++] = reg_undef;
5214 
5215       for (unsigned c = 0; c < comps; c++)
5216          payload_srcs[length++] = quarter(offset(src, bld, c), q);
5217 
5218       brw_reg srcs[URB_LOGICAL_NUM_SRCS];
5219       srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
5220       srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
5221       srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16);
5222       srcs[URB_LOGICAL_SRC_DATA] = brw_vgrf(bld.shader->alloc.allocate(length),
5223                                             BRW_TYPE_F);
5224       srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length);
5225       bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
5226 
5227       fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
5228                                 reg_undef, srcs, ARRAY_SIZE(srcs));
5229       inst->offset = 0;
5230    }
5231 }
5232 
5233 static void
5234 emit_urb_indirect_writes_mod(const fs_builder &bld, nir_intrinsic_instr *instr,
5235                              const brw_reg &src, const brw_reg &offset_src,
5236                              brw_reg urb_handle, unsigned mod)
5237 {
5238    assert(nir_src_bit_size(instr->src[0]) == 32);
5239 
5240    const unsigned comps = nir_src_num_components(instr->src[0]);
5241    assert(comps <= 4);
5242 
5243    const unsigned base_in_dwords = nir_intrinsic_base(instr) +
5244                                    component_from_intrinsic(instr);
5245 
5246    const unsigned comp_shift = mod;
5247    const unsigned mask = nir_intrinsic_write_mask(instr) << comp_shift;
5248 
5249    emit_urb_indirect_vec4_write(bld, offset_src, base_in_dwords, src,
5250                                 urb_handle, comp_shift, comps, mask);
5251 }
5252 
5253 static void
5254 emit_urb_indirect_writes_xe2(const fs_builder &bld, nir_intrinsic_instr *instr,
5255                              const brw_reg &src, const brw_reg &offset_src,
5256                              brw_reg urb_handle)
5257 {
5258    assert(nir_src_bit_size(instr->src[0]) == 32);
5259 
5260    const struct intel_device_info *devinfo = bld.shader->devinfo;
5261    const unsigned runit = reg_unit(devinfo);
5262    const unsigned write_size = 8 * runit;
5263 
5264    const unsigned comps = nir_src_num_components(instr->src[0]);
5265    assert(comps <= 4);
5266 
5267    const unsigned base_in_dwords = nir_intrinsic_base(instr) +
5268                                    component_from_intrinsic(instr);
5269 
5270    if (base_in_dwords > 0) {
5271       fs_builder bldall = bld.group(write_size, 0).exec_all();
5272       urb_handle = bldall.ADD(urb_handle, brw_imm_ud(base_in_dwords * 4));
5273    }
5274 
5275    const unsigned mask = nir_intrinsic_write_mask(instr);
5276 
5277    for (unsigned q = 0; q < bld.dispatch_width() / write_size; q++) {
5278       fs_builder wbld = bld.group(write_size, q);
5279 
5280       brw_reg payload_srcs[4];
5281 
5282       for (unsigned c = 0; c < comps; c++)
5283          payload_srcs[c] = horiz_offset(offset(src, bld, c), write_size * q);
5284 
5285       brw_reg addr =
5286          wbld.ADD(wbld.SHL(retype(horiz_offset(offset_src, write_size * q),
5287                                   BRW_TYPE_UD),
5288                            brw_imm_ud(2)), urb_handle);
5289 
5290       brw_reg srcs[URB_LOGICAL_NUM_SRCS];
5291       srcs[URB_LOGICAL_SRC_HANDLE] = addr;
5292       srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16);
5293       int nr = bld.shader->alloc.allocate(comps * runit);
5294       srcs[URB_LOGICAL_SRC_DATA] = brw_vgrf(nr, BRW_TYPE_F);
5295       srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(comps);
5296       wbld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, comps, 0);
5297 
5298       wbld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
5299                 reg_undef, srcs, ARRAY_SIZE(srcs));
5300    }
5301 }
5302 
5303 static void
5304 emit_urb_indirect_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
5305                          const brw_reg &src, const brw_reg &offset_src,
5306                          brw_reg urb_handle)
5307 {
5308    assert(nir_src_bit_size(instr->src[0]) == 32);
5309 
5310    const unsigned comps = nir_src_num_components(instr->src[0]);
5311    assert(comps <= 4);
5312 
5313    const unsigned base_in_dwords = nir_intrinsic_base(instr) +
5314                                    component_from_intrinsic(instr);
5315 
5316    /* Use the URB write message that allows different offsets per slot.  The offset
5317     * is in units of vec4s (128 bits), so we use a write for each component,
5318     * replicating it in the sources and applying the appropriate mask based on
5319     * the dword offset.
5320     */
5321 
5322    for (unsigned c = 0; c < comps; c++) {
5323       if (((1 << c) & nir_intrinsic_write_mask(instr)) == 0)
5324          continue;
5325 
5326       brw_reg src_comp = offset(src, bld, c);
5327 
5328       for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
5329          fs_builder bld8 = bld.group(8, q);
5330 
5331          /* offset is always positive, so signedness doesn't matter */
5332          assert(offset_src.type == BRW_TYPE_D ||
5333                 offset_src.type == BRW_TYPE_UD);
5334 
5335          brw_reg off =
5336             bld8.ADD(quarter(retype(offset_src, BRW_TYPE_UD), q),
5337                      brw_imm_ud(c + base_in_dwords));
5338          brw_reg m = bld8.AND(off, brw_imm_ud(0x3));
5339          brw_reg t = bld8.SHL(bld8.MOV(brw_imm_ud(1)), m);
5340          brw_reg mask = bld8.SHL(t, brw_imm_ud(16));
5341          brw_reg final_offset = bld8.SHR(off, brw_imm_ud(2));
5342 
5343          brw_reg payload_srcs[4];
5344          unsigned length = 0;
5345 
5346          for (unsigned j = 0; j < 4; j++)
5347             payload_srcs[length++] = quarter(src_comp, q);
5348 
5349          brw_reg srcs[URB_LOGICAL_NUM_SRCS];
5350          srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
5351          srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = final_offset;
5352          srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask;
5353          srcs[URB_LOGICAL_SRC_DATA] = brw_vgrf(bld.shader->alloc.allocate(length),
5354                                                BRW_TYPE_F);
5355          srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length);
5356          bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
5357 
5358          fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
5359                                    reg_undef, srcs, ARRAY_SIZE(srcs));
5360          inst->offset = 0;
5361       }
5362    }
5363 }
5364 
5365 static void
5366 emit_urb_direct_reads(const fs_builder &bld, nir_intrinsic_instr *instr,
5367                       const brw_reg &dest, brw_reg urb_handle)
5368 {
5369    assert(instr->def.bit_size == 32);
5370 
5371    unsigned comps = instr->def.num_components;
5372    if (comps == 0)
5373       return;
5374 
5375    nir_src *offset_nir_src = nir_get_io_offset_src(instr);
5376    assert(nir_src_is_const(*offset_nir_src));
5377 
5378    const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
5379                                      nir_src_as_uint(*offset_nir_src) +
5380                                      component_from_intrinsic(instr);
5381 
5382    unsigned urb_global_offset = offset_in_dwords / 4;
5383    adjust_handle_and_offset(bld, urb_handle, urb_global_offset);
5384 
5385    const unsigned comp_offset = offset_in_dwords % 4;
5386    const unsigned num_regs = comp_offset + comps;
5387 
5388    fs_builder ubld8 = bld.group(8, 0).exec_all();
5389    brw_reg data = ubld8.vgrf(BRW_TYPE_UD, num_regs);
5390    brw_reg srcs[URB_LOGICAL_NUM_SRCS];
5391    srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
5392 
5393    fs_inst *inst = ubld8.emit(SHADER_OPCODE_URB_READ_LOGICAL, data,
5394                               srcs, ARRAY_SIZE(srcs));
5395    inst->offset = urb_global_offset;
5396    assert(inst->offset < 2048);
5397    inst->size_written = num_regs * REG_SIZE;
5398 
5399    for (unsigned c = 0; c < comps; c++) {
5400       brw_reg dest_comp = offset(dest, bld, c);
5401       brw_reg data_comp = horiz_stride(offset(data, ubld8, comp_offset + c), 0);
5402       bld.MOV(retype(dest_comp, BRW_TYPE_UD), data_comp);
5403    }
5404 }
5405 
5406 static void
5407 emit_urb_direct_reads_xe2(const fs_builder &bld, nir_intrinsic_instr *instr,
5408                           const brw_reg &dest, brw_reg urb_handle)
5409 {
5410    assert(instr->def.bit_size == 32);
5411 
5412    unsigned comps = instr->def.num_components;
5413    if (comps == 0)
5414       return;
5415 
5416    nir_src *offset_nir_src = nir_get_io_offset_src(instr);
5417    assert(nir_src_is_const(*offset_nir_src));
5418 
5419    fs_builder ubld16 = bld.group(16, 0).exec_all();
5420 
5421    const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
5422                                      nir_src_as_uint(*offset_nir_src) +
5423                                      component_from_intrinsic(instr);
5424 
5425    if (offset_in_dwords > 0)
5426       urb_handle = ubld16.ADD(urb_handle, brw_imm_ud(offset_in_dwords * 4));
5427 
5428    brw_reg data = ubld16.vgrf(BRW_TYPE_UD, comps);
5429    brw_reg srcs[URB_LOGICAL_NUM_SRCS];
5430    srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
5431 
5432    fs_inst *inst = ubld16.emit(SHADER_OPCODE_URB_READ_LOGICAL,
5433                                data, srcs, ARRAY_SIZE(srcs));
5434    inst->size_written = 2 * comps * REG_SIZE;
5435 
5436    for (unsigned c = 0; c < comps; c++) {
5437       brw_reg dest_comp = offset(dest, bld, c);
5438       brw_reg data_comp = horiz_stride(offset(data, ubld16, c), 0);
5439       bld.MOV(retype(dest_comp, BRW_TYPE_UD), data_comp);
5440    }
5441 }
5442 
5443 static void
5444 emit_urb_indirect_reads(const fs_builder &bld, nir_intrinsic_instr *instr,
5445                         const brw_reg &dest, const brw_reg &offset_src, brw_reg urb_handle)
5446 {
5447    assert(instr->def.bit_size == 32);
5448 
5449    unsigned comps = instr->def.num_components;
5450    if (comps == 0)
5451       return;
5452 
5453    brw_reg seq_ud;
5454    {
5455       fs_builder ubld8 = bld.group(8, 0).exec_all();
5456       seq_ud = ubld8.vgrf(BRW_TYPE_UD, 1);
5457       brw_reg seq_uw = ubld8.vgrf(BRW_TYPE_UW, 1);
5458       ubld8.MOV(seq_uw, brw_reg(brw_imm_v(0x76543210)));
5459       ubld8.MOV(seq_ud, seq_uw);
5460       seq_ud = ubld8.SHL(seq_ud, brw_imm_ud(2));
5461    }
5462 
5463    const unsigned base_in_dwords = nir_intrinsic_base(instr) +
5464                                    component_from_intrinsic(instr);
5465 
5466    for (unsigned c = 0; c < comps; c++) {
5467       for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
5468          fs_builder bld8 = bld.group(8, q);
5469 
5470          /* offset is always positive, so signedness doesn't matter */
5471          assert(offset_src.type == BRW_TYPE_D ||
5472                 offset_src.type == BRW_TYPE_UD);
5473          brw_reg off =
5474             bld8.ADD(bld8.MOV(quarter(retype(offset_src, BRW_TYPE_UD), q)),
5475                      brw_imm_ud(base_in_dwords + c));
5476 
5477          STATIC_ASSERT(IS_POT(REG_SIZE) && REG_SIZE > 1);
5478 
5479          brw_reg comp;
5480          comp = bld8.AND(off, brw_imm_ud(0x3));
5481          comp = bld8.SHL(comp, brw_imm_ud(ffs(REG_SIZE) - 1));
5482          comp = bld8.ADD(comp, seq_ud);
5483 
5484          off = bld8.SHR(off, brw_imm_ud(2));
5485 
5486          brw_reg srcs[URB_LOGICAL_NUM_SRCS];
5487          srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
5488          srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
5489 
5490          brw_reg data = bld8.vgrf(BRW_TYPE_UD, 4);
5491 
5492          fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_READ_LOGICAL,
5493                                    data, srcs, ARRAY_SIZE(srcs));
5494          inst->offset = 0;
5495          inst->size_written = 4 * REG_SIZE;
5496 
5497          brw_reg dest_comp = offset(dest, bld, c);
5498          bld8.emit(SHADER_OPCODE_MOV_INDIRECT,
5499                    retype(quarter(dest_comp, q), BRW_TYPE_UD),
5500                    data,
5501                    comp,
5502                    brw_imm_ud(4 * REG_SIZE));
5503       }
5504    }
5505 }
5506 
5507 static void
5508 emit_urb_indirect_reads_xe2(const fs_builder &bld, nir_intrinsic_instr *instr,
5509                             const brw_reg &dest, const brw_reg &offset_src,
5510                             brw_reg urb_handle)
5511 {
5512    assert(instr->def.bit_size == 32);
5513 
5514    unsigned comps = instr->def.num_components;
5515    if (comps == 0)
5516       return;
5517 
5518    fs_builder ubld16 = bld.group(16, 0).exec_all();
5519 
5520    const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
5521                                      component_from_intrinsic(instr);
5522 
5523    if (offset_in_dwords > 0)
5524       urb_handle = ubld16.ADD(urb_handle, brw_imm_ud(offset_in_dwords * 4));
5525 
5526    brw_reg data = ubld16.vgrf(BRW_TYPE_UD, comps);
5527 
5528    for (unsigned q = 0; q < bld.dispatch_width() / 16; q++) {
5529       fs_builder wbld = bld.group(16, q);
5530 
5531       brw_reg addr = wbld.SHL(retype(horiz_offset(offset_src, 16 * q),
5532                                      BRW_TYPE_UD),
5533                               brw_imm_ud(2));
5534 
5535       brw_reg srcs[URB_LOGICAL_NUM_SRCS];
5536       srcs[URB_LOGICAL_SRC_HANDLE] = wbld.ADD(addr, urb_handle);
5537 
5538       fs_inst *inst = wbld.emit(SHADER_OPCODE_URB_READ_LOGICAL,
5539                                  data, srcs, ARRAY_SIZE(srcs));
5540       inst->size_written = 2 * comps * REG_SIZE;
5541 
5542       for (unsigned c = 0; c < comps; c++) {
5543          brw_reg dest_comp = horiz_offset(offset(dest, bld, c), 16 * q);
5544          brw_reg data_comp = offset(data, wbld, c);
5545          wbld.MOV(retype(dest_comp, BRW_TYPE_UD), data_comp);
5546       }
5547    }
5548 }
5549 
5550 static void
5551 emit_task_mesh_store(nir_to_brw_state &ntb,
5552                      const fs_builder &bld, nir_intrinsic_instr *instr,
5553                      const brw_reg &urb_handle)
5554 {
5555    brw_reg src = get_nir_src(ntb, instr->src[0], -1);
5556    nir_src *offset_nir_src = nir_get_io_offset_src(instr);
5557 
5558    if (nir_src_is_const(*offset_nir_src)) {
5559       if (bld.shader->devinfo->ver >= 20)
5560          emit_urb_direct_writes_xe2(bld, instr, src, urb_handle);
5561       else
5562          emit_urb_direct_writes(bld, instr, src, urb_handle);
5563    } else {
5564       if (bld.shader->devinfo->ver >= 20) {
5565          emit_urb_indirect_writes_xe2(bld, instr, src, get_nir_src(ntb, *offset_nir_src), urb_handle);
5566          return;
5567       }
5568       bool use_mod = false;
5569       unsigned mod;
5570 
5571       /* Try to calculate the value of (offset + base) % 4. If we can do
5572        * this, then we can do indirect writes using only 1 URB write.
5573        */
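      /* Illustrative example with assumed numbers: if nir_mod_analysis()
       * below proves that offset % 4 == 2 and the intrinsic's base plus
       * component add up to 5, then mod = (2 + 5) % 4 = 3, i.e. every lane
       * writes the fourth dword of its vec4 slot and
       * emit_urb_indirect_writes_mod() can use a single URB write per
       * component.
       */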
5574       use_mod = nir_mod_analysis(nir_get_scalar(offset_nir_src->ssa, 0), nir_type_uint, 4, &mod);
5575       if (use_mod) {
5576          mod += nir_intrinsic_base(instr) + component_from_intrinsic(instr);
5577          mod %= 4;
5578       }
5579 
5580       if (use_mod) {
5581          emit_urb_indirect_writes_mod(bld, instr, src, get_nir_src(ntb, *offset_nir_src), urb_handle, mod);
5582       } else {
5583          emit_urb_indirect_writes(bld, instr, src, get_nir_src(ntb, *offset_nir_src), urb_handle);
5584       }
5585    }
5586 }
5587 
5588 static void
5589 emit_task_mesh_load(nir_to_brw_state &ntb,
5590                     const fs_builder &bld, nir_intrinsic_instr *instr,
5591                     const brw_reg &urb_handle)
5592 {
5593    brw_reg dest = get_nir_def(ntb, instr->def);
5594    nir_src *offset_nir_src = nir_get_io_offset_src(instr);
5595 
5596    /* TODO(mesh): for per_vertex and per_primitive, if we could keep around
5597     * the non-array-index offset, we could use it to decide if we can perform
5598     * a single large aligned read instead of one per component.
5599     */
5600 
5601    if (nir_src_is_const(*offset_nir_src)) {
5602       if (bld.shader->devinfo->ver >= 20)
5603          emit_urb_direct_reads_xe2(bld, instr, dest, urb_handle);
5604       else
5605          emit_urb_direct_reads(bld, instr, dest, urb_handle);
5606    } else {
5607       if (bld.shader->devinfo->ver >= 20)
5608          emit_urb_indirect_reads_xe2(bld, instr, dest, get_nir_src(ntb, *offset_nir_src), urb_handle);
5609       else
5610          emit_urb_indirect_reads(bld, instr, dest, get_nir_src(ntb, *offset_nir_src), urb_handle);
5611    }
5612 }
5613 
5614 static void
5615 fs_nir_emit_task_mesh_intrinsic(nir_to_brw_state &ntb, const fs_builder &bld,
5616                                 nir_intrinsic_instr *instr)
5617 {
5618    fs_visitor &s = ntb.s;
5619 
5620    assert(s.stage == MESA_SHADER_MESH || s.stage == MESA_SHADER_TASK);
5621    const task_mesh_thread_payload &payload = s.task_mesh_payload();
5622 
5623    brw_reg dest;
5624    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5625       dest = get_nir_def(ntb, instr->def);
5626 
5627    switch (instr->intrinsic) {
5628    case nir_intrinsic_load_draw_id:
5629       dest = retype(dest, BRW_TYPE_UD);
5630       bld.MOV(dest, payload.extended_parameter_0);
5631       break;
5632 
5633    case nir_intrinsic_load_local_invocation_id:
5634       unreachable("local invocation id should have been lowered earlier");
5635       break;
5636 
5637    case nir_intrinsic_load_local_invocation_index:
5638       dest = retype(dest, BRW_TYPE_UD);
5639       bld.MOV(dest, payload.local_index);
5640       break;
5641 
5642    case nir_intrinsic_load_num_workgroups:
5643       dest = retype(dest, BRW_TYPE_UD);
5644       bld.MOV(offset(dest, bld, 0), brw_uw1_grf(0, 13)); /* g0.6 >> 16 */
5645       bld.MOV(offset(dest, bld, 1), brw_uw1_grf(0, 8));  /* g0.4 & 0xffff */
5646       bld.MOV(offset(dest, bld, 2), brw_uw1_grf(0, 9));  /* g0.4 >> 16 */
5647       break;
5648 
5649    case nir_intrinsic_load_workgroup_index:
5650       dest = retype(dest, BRW_TYPE_UD);
5651       bld.MOV(dest, retype(brw_vec1_grf(0, 1), BRW_TYPE_UD));
5652       break;
5653 
5654    default:
5655       fs_nir_emit_cs_intrinsic(ntb, instr);
5656       break;
5657    }
5658 }
5659 
5660 static void
5661 fs_nir_emit_task_intrinsic(nir_to_brw_state &ntb,
5662                            nir_intrinsic_instr *instr)
5663 {
5664    const fs_builder &bld = ntb.bld;
5665    fs_visitor &s = ntb.s;
5666 
5667    assert(s.stage == MESA_SHADER_TASK);
5668    const task_mesh_thread_payload &payload = s.task_mesh_payload();
5669 
5670    switch (instr->intrinsic) {
5671    case nir_intrinsic_store_output:
5672    case nir_intrinsic_store_task_payload:
5673       emit_task_mesh_store(ntb, bld, instr, payload.urb_output);
5674       break;
5675 
5676    case nir_intrinsic_load_output:
5677    case nir_intrinsic_load_task_payload:
5678       emit_task_mesh_load(ntb, bld, instr, payload.urb_output);
5679       break;
5680 
5681    default:
5682       fs_nir_emit_task_mesh_intrinsic(ntb, bld, instr);
5683       break;
5684    }
5685 }
5686 
5687 static void
5688 fs_nir_emit_mesh_intrinsic(nir_to_brw_state &ntb,
5689                            nir_intrinsic_instr *instr)
5690 {
5691    const fs_builder &bld = ntb.bld;
5692    fs_visitor &s = ntb.s;
5693 
5694    assert(s.stage == MESA_SHADER_MESH);
5695    const task_mesh_thread_payload &payload = s.task_mesh_payload();
5696 
5697    switch (instr->intrinsic) {
5698    case nir_intrinsic_store_per_primitive_output:
5699    case nir_intrinsic_store_per_vertex_output:
5700    case nir_intrinsic_store_output:
5701       emit_task_mesh_store(ntb, bld, instr, payload.urb_output);
5702       break;
5703 
5704    case nir_intrinsic_load_per_vertex_output:
5705    case nir_intrinsic_load_per_primitive_output:
5706    case nir_intrinsic_load_output:
5707       emit_task_mesh_load(ntb, bld, instr, payload.urb_output);
5708       break;
5709 
5710    case nir_intrinsic_load_task_payload:
5711       emit_task_mesh_load(ntb, bld, instr, payload.task_urb_input);
5712       break;
5713 
5714    default:
5715       fs_nir_emit_task_mesh_intrinsic(ntb, bld, instr);
5716       break;
5717    }
5718 }
5719 
5720 static void
5721 fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
5722                       const fs_builder &bld, nir_intrinsic_instr *instr)
5723 {
5724    const intel_device_info *devinfo = ntb.devinfo;
5725    fs_visitor &s = ntb.s;
5726 
5727    /* We handle this as a special case */
5728    if (instr->intrinsic == nir_intrinsic_decl_reg) {
5729       assert(nir_intrinsic_num_array_elems(instr) == 0);
5730       unsigned bit_size = nir_intrinsic_bit_size(instr);
5731       unsigned num_components = nir_intrinsic_num_components(instr);
5732       const brw_reg_type reg_type =
5733          brw_type_with_size(bit_size == 8 ? BRW_TYPE_D : BRW_TYPE_F,
5734                             bit_size);
5735 
5736       /* Re-use the destination's slot in the table for the register */
5737       ntb.ssa_values[instr->def.index] =
5738          bld.vgrf(reg_type, num_components);
5739       return;
5740    }
5741 
5742    brw_reg dest;
5743    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5744       dest = get_nir_def(ntb, instr->def);
5745 
5746    const fs_builder xbld = dest.is_scalar ? bld.scalar_group() : bld;
5747 
5748    switch (instr->intrinsic) {
5749    case nir_intrinsic_resource_intel: {
5750       ntb.ssa_bind_infos[instr->def.index].valid = true;
5751       ntb.ssa_bind_infos[instr->def.index].bindless =
5752          (nir_intrinsic_resource_access_intel(instr) &
5753           nir_resource_intel_bindless) != 0;
5754       ntb.ssa_bind_infos[instr->def.index].block =
5755          nir_intrinsic_resource_block_intel(instr);
5756       ntb.ssa_bind_infos[instr->def.index].set =
5757          nir_intrinsic_desc_set(instr);
5758       ntb.ssa_bind_infos[instr->def.index].binding =
5759          nir_intrinsic_binding(instr);
5760 
5761       dest = retype(dest, BRW_TYPE_UD);
5762       ntb.ssa_values[instr->def.index] = dest;
5763 
5764       xbld.MOV(dest,
5765                bld.emit_uniformize(get_nir_src(ntb, instr->src[1])));
5766       break;
5767    }
5768 
5769    case nir_intrinsic_load_reg:
5770    case nir_intrinsic_store_reg:
5771       /* Nothing to do with these. */
5772       break;
5773 
5774    case nir_intrinsic_load_global_constant_uniform_block_intel:
5775    case nir_intrinsic_load_ssbo_uniform_block_intel:
5776    case nir_intrinsic_load_shared_uniform_block_intel:
5777    case nir_intrinsic_load_global_block_intel:
5778    case nir_intrinsic_store_global_block_intel:
5779    case nir_intrinsic_load_shared_block_intel:
5780    case nir_intrinsic_store_shared_block_intel:
5781    case nir_intrinsic_load_ssbo_block_intel:
5782    case nir_intrinsic_store_ssbo_block_intel:
5783    case nir_intrinsic_image_load:
5784    case nir_intrinsic_image_store:
5785    case nir_intrinsic_image_atomic:
5786    case nir_intrinsic_image_atomic_swap:
5787    case nir_intrinsic_bindless_image_load:
5788    case nir_intrinsic_bindless_image_store:
5789    case nir_intrinsic_bindless_image_atomic:
5790    case nir_intrinsic_bindless_image_atomic_swap:
5791    case nir_intrinsic_load_shared:
5792    case nir_intrinsic_store_shared:
5793    case nir_intrinsic_shared_atomic:
5794    case nir_intrinsic_shared_atomic_swap:
5795    case nir_intrinsic_load_ssbo:
5796    case nir_intrinsic_store_ssbo:
5797    case nir_intrinsic_ssbo_atomic:
5798    case nir_intrinsic_ssbo_atomic_swap:
5799    case nir_intrinsic_load_global:
5800    case nir_intrinsic_load_global_constant:
5801    case nir_intrinsic_store_global:
5802    case nir_intrinsic_global_atomic:
5803    case nir_intrinsic_global_atomic_swap:
5804    case nir_intrinsic_load_scratch:
5805    case nir_intrinsic_store_scratch:
5806       fs_nir_emit_memory_access(ntb, bld, xbld, instr);
5807       break;
5808 
5809    case nir_intrinsic_image_size:
5810    case nir_intrinsic_bindless_image_size: {
5811       /* Cube image sizes should have previously been lowered to a 2D array */
5812       assert(nir_intrinsic_image_dim(instr) != GLSL_SAMPLER_DIM_CUBE);
5813 
5814       /* Unlike the [un]typed load and store opcodes, the TXS that this turns
5815        * into will handle the binding table index for us in the generator.
5816        * Incidentally, this means that we can handle bindless with exactly the
5817        * same code.
5818        */
5819       brw_reg image = retype(get_nir_src_imm(ntb, instr->src[0]), BRW_TYPE_UD);
5820       image = bld.emit_uniformize(image);
5821 
5822       assert(nir_src_as_uint(instr->src[1]) == 0);
5823 
5824       brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
5825       if (instr->intrinsic == nir_intrinsic_image_size)
5826          srcs[TEX_LOGICAL_SRC_SURFACE] = image;
5827       else
5828          srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image;
5829       srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
5830       srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(0);
5831       srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);
5832       srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_d(0);
5833 
5834       /* Since the image size is always uniform, we can just emit a SIMD8
5835        * query instruction and splat the result out.
5836        */
5837       const fs_builder ubld = bld.scalar_group();
5838 
5839       brw_reg tmp = ubld.vgrf(BRW_TYPE_UD, 4);
5840       fs_inst *inst = ubld.emit(SHADER_OPCODE_IMAGE_SIZE_LOGICAL,
5841                                 tmp, srcs, ARRAY_SIZE(srcs));
5842       inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
5843 
5844       for (unsigned c = 0; c < instr->def.num_components; ++c) {
5845          bld.MOV(offset(retype(dest, tmp.type), bld, c),
5846                  component(offset(tmp, ubld, c), 0));
5847       }
5848       break;
5849    }
5850 
5851    case nir_intrinsic_barrier:
5852    case nir_intrinsic_begin_invocation_interlock:
5853    case nir_intrinsic_end_invocation_interlock: {
5854       bool ugm_fence, slm_fence, tgm_fence, urb_fence;
5855       enum opcode opcode = BRW_OPCODE_NOP;
5856 
5857       /* Handling interlock intrinsics here will allow the logic for IVB
5858        * render cache (see below) to be reused.
5859        */
5860 
5861       switch (instr->intrinsic) {
5862       case nir_intrinsic_barrier: {
5863          /* Note we only care about the memory part of the
5864           * barrier.  The execution part will be taken care
5865           * of by the stage specific intrinsic handler functions.
5866           */
5867          nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
5868          ugm_fence = modes & (nir_var_mem_ssbo | nir_var_mem_global);
5869          slm_fence = modes & nir_var_mem_shared;
5870          tgm_fence = modes & nir_var_image;
5871          urb_fence = modes & (nir_var_shader_out | nir_var_mem_task_payload);
5872          if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
5873             opcode = SHADER_OPCODE_MEMORY_FENCE;
5874          break;
5875       }
5876 
5877       case nir_intrinsic_begin_invocation_interlock:
5878          /* For beginInvocationInterlockARB(), we will generate a memory fence
5879           * but with a different opcode so that generator can pick SENDC
5880           * instead of SEND.
5881           */
5882          assert(s.stage == MESA_SHADER_FRAGMENT);
5883          ugm_fence = tgm_fence = true;
5884          slm_fence = urb_fence = false;
5885          opcode = SHADER_OPCODE_INTERLOCK;
5886          break;
5887 
5888       case nir_intrinsic_end_invocation_interlock:
5889          /* For endInvocationInterlockARB(), we need to insert a memory fence which
5890           * stalls in the shader until the memory transactions prior to that
5891           * fence are complete.  This ensures that the shader does not end before
5892           * any writes from its critical section have landed.  Otherwise, you can
5893           * end up with a case where the next invocation on that pixel properly
5894           * stalls for previous FS invocation on its pixel to complete but
5895           * doesn't actually wait for the dataport memory transactions from that
5896           * thread to land before submitting its own.
5897           */
5898          assert(s.stage == MESA_SHADER_FRAGMENT);
5899          ugm_fence = tgm_fence = true;
5900          slm_fence = urb_fence = false;
5901          opcode = SHADER_OPCODE_MEMORY_FENCE;
5902          break;
5903 
5904       default:
5905          unreachable("invalid intrinsic");
5906       }
5907 
5908       if (opcode == BRW_OPCODE_NOP)
5909          break;
5910 
5911       if (s.nir->info.shared_size > 0) {
5912          assert(gl_shader_stage_uses_workgroup(s.stage));
5913       } else {
5914          slm_fence = false;
5915       }
5916 
5917       /* If the workgroup fits in a single HW thread, the messages for SLM are
5918        * processed in-order and the shader itself is already synchronized so
5919        * the memory fence is not necessary.
5920        *
5921        * TODO: Check if this applies for many HW threads sharing the same Data Port.
5922        */
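      /* Illustrative example with assumed sizes: an 8x1x1 workgroup
       * dispatched at SIMD16 has brw_workgroup_size() == 8 <= 16, so the
       * whole group lives in one HW thread and the SLM fence is dropped
       * here.
       */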
5923       if (!s.nir->info.workgroup_size_variable &&
5924           slm_fence && brw_workgroup_size(s) <= s.dispatch_width)
5925          slm_fence = false;
5926 
5927       switch (s.stage) {
5928          case MESA_SHADER_TESS_CTRL:
5929          case MESA_SHADER_TASK:
5930          case MESA_SHADER_MESH:
5931             break;
5932          default:
5933             urb_fence = false;
5934             break;
5935       }
5936 
5937       unsigned fence_regs_count = 0;
5938       brw_reg fence_regs[4] = {};
5939 
5940       const fs_builder ubld = bld.group(8, 0);
5941 
5942       /* A memory barrier with acquire semantics requires us to
5943        * guarantee that memory operations of the specified storage
5944        * class sequenced-after the barrier aren't reordered before the
5945        * barrier, nor before any previous atomic operation
5946        * sequenced-before the barrier which may be synchronizing this
5947        * acquire barrier with a prior release sequence.
5948        *
5949        * In order to guarantee the latter we must make sure that any
5950        * such previous operation has completed execution before
5951        * invalidating the relevant caches, since otherwise some cache
5952        * could be polluted by a concurrent thread after its
5953        * invalidation but before the previous atomic completes, which
5954        * could lead to a violation of the expected memory ordering if
5955        * a subsequent memory read hits the polluted cacheline, which
5956        * would return a stale value read from memory before the
5957        * completion of the atomic sequenced-before the barrier.
5958        *
5959        * This ordering inversion can be avoided trivially if the
5960        * operations we need to order are all handled by a single
5961        * in-order cache, since the flush implied by the memory fence
5962        * occurs after any pending operations have completed, however
5963        * that doesn't help us when dealing with multiple caches
5964        * processing requests out of order, in which case we need to
5965        * explicitly stall the EU until any pending memory operations
5966        * have executed.
5967        *
5968        * Note that this might be somewhat heavy-handed in some cases.
5969        * In particular when this memory fence was inserted by
5970        * spirv_to_nir() lowering an atomic with acquire semantics into
5971        * an atomic+barrier sequence we could do a better job by
5972        * synchronizing with respect to that one atomic *only*, but
5973        * that would require additional information not currently
5974        * available to the backend.
5975        *
5976        * XXX - Use an alternative workaround on IVB and ICL, since
5977        *       SYNC.ALLWR is only available on Gfx12+.
5978        */
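      /* A rough sketch (assumed NIR, not taken from this file) of the
       * spirv_to_nir() case mentioned above: an SSBO atomic with acquire
       * semantics may arrive here as
       *
       *    %v = ssbo_atomic(..., atomic_op=...)
       *    barrier(memory_scope=..., memory_semantics=ACQUIRE,
       *            memory_modes=ssbo)
       *
       * and it is the trailing barrier, not the atomic itself, that this
       * code sees.
       */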
5979       if (devinfo->ver >= 12 &&
5980           (!nir_intrinsic_has_memory_scope(instr) ||
5981            (nir_intrinsic_memory_semantics(instr) & NIR_MEMORY_ACQUIRE))) {
5982          ubld.exec_all().group(1, 0).SYNC(TGL_SYNC_ALLWR);
5983       }
5984 
5985       if (devinfo->has_lsc) {
5986          assert(devinfo->verx10 >= 125);
5987          uint32_t desc =
5988             lsc_fence_descriptor_for_intrinsic(devinfo, instr);
5989          if (ugm_fence) {
5990             fence_regs[fence_regs_count++] =
5991                emit_fence(ubld, opcode, GFX12_SFID_UGM, desc,
5992                           true /* commit_enable */,
5993                           0 /* bti; ignored for LSC */);
5994          }
5995 
5996          if (tgm_fence) {
5997             fence_regs[fence_regs_count++] =
5998                emit_fence(ubld, opcode, GFX12_SFID_TGM, desc,
5999                           true /* commit_enable */,
6000                           0 /* bti; ignored for LSC */);
6001          }
6002 
6003          if (slm_fence) {
6004             assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
6005             if (intel_needs_workaround(devinfo, 14014063774)) {
6006                /* Wa_14014063774
6007                 *
6008                 * Before SLM fence compiler needs to insert SYNC.ALLWR in order
6009                 * to avoid the SLM data race.
6010                 */
6011                ubld.exec_all().group(1, 0).SYNC(TGL_SYNC_ALLWR);
6012             }
6013             fence_regs[fence_regs_count++] =
6014                emit_fence(ubld, opcode, GFX12_SFID_SLM, desc,
6015                           true /* commit_enable */,
6016                           0 /* BTI; ignored for LSC */);
6017          }
6018 
6019          if (urb_fence) {
6020             assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
6021             fence_regs[fence_regs_count++] =
6022                emit_fence(ubld, opcode, BRW_SFID_URB, desc,
6023                           true /* commit_enable */,
6024                           0 /* BTI; ignored for LSC */);
6025          }
6026       } else if (devinfo->ver >= 11) {
6027          if (tgm_fence || ugm_fence || urb_fence) {
6028             fence_regs[fence_regs_count++] =
6029                emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
6030                           true /* commit_enable HSD ES # 1404612949 */,
6031                           0 /* BTI = 0 means data cache */);
6032          }
6033 
6034          if (slm_fence) {
6035             assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
6036             fence_regs[fence_regs_count++] =
6037                emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
6038                           true /* commit_enable HSD ES # 1404612949 */,
6039                           GFX7_BTI_SLM);
6040          }
6041       } else {
6042          /* Simulation also complains on Gfx9 if we do not enable commit.
6043           */
6044          const bool commit_enable =
6045             instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
6046             devinfo->ver == 9;
6047 
6048          if (tgm_fence || ugm_fence || slm_fence || urb_fence) {
6049             fence_regs[fence_regs_count++] =
6050                emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
6051                           commit_enable, 0 /* BTI */);
6052          }
6053       }
6054 
6055       assert(fence_regs_count <= ARRAY_SIZE(fence_regs));
6056 
6057       /* Be conservative on Gen11+ and always stall after a fence, since
6058        * there are two different fences and the shader might want to
6059        * synchronize between them.
6060        *
6061        * TODO: Use scope and visibility information for the barriers from NIR
6062        * to make a better decision on whether we need to stall.
6063        */
6064       bool force_stall = devinfo->ver >= 11;
6065 
6066       /* There are four cases where we want to insert a stall:
6067        *
6068        *  1. If we're a nir_intrinsic_end_invocation_interlock.  This is
6069        *     required to ensure that the shader EOT doesn't happen until
6070        *     after the fence returns.  Otherwise, we might end up with the
6071        *     next shader invocation for that pixel not respecting our fence
6072        *     because it may happen on a different HW thread.
6073        *
6074        *  2. If we have multiple fences.  This is required to ensure that
6075        *     they all complete and nothing gets weirdly out-of-order.
6076        *
6077        *  3. If we have no fences.  In this case, we need at least a
6078        *     scheduling barrier to keep the compiler from moving things
6079        *     around in an invalid way.
6080        *
6081        *  4. On Gen11+ and platforms with LSC, we have multiple fence types;
6082        *     without further information about the fence, we need to force a
6083        *     stall.
6084        */
6085       if (instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
6086           fence_regs_count != 1 || devinfo->has_lsc || force_stall) {
6087          ubld.exec_all().group(1, 0).emit(
6088             FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
6089             fence_regs, fence_regs_count);
6090       }
6091 
6092       break;
6093    }
6094 
6095    case nir_intrinsic_shader_clock: {
6096       /* We cannot do anything if there is an event, so ignore it for now */
6097       const brw_reg shader_clock = get_timestamp(bld);
6098       const brw_reg srcs[] = { component(shader_clock, 0),
6099                               component(shader_clock, 1) };
6100       bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
6101       break;
6102    }
6103 
6104    case nir_intrinsic_load_reloc_const_intel: {
6105       uint32_t id = nir_intrinsic_param_idx(instr);
6106       uint32_t base = nir_intrinsic_base(instr);
6107 
6108       assert(dest.is_scalar);
6109 
6110       xbld.emit(SHADER_OPCODE_MOV_RELOC_IMM, retype(dest, BRW_TYPE_D),
6111                 brw_imm_ud(id), brw_imm_ud(base));
6112       break;
6113    }
6114 
6115    case nir_intrinsic_load_uniform: {
6116       /* Offsets are in bytes but they should always be aligned to
6117        * the type size.
6118        */
6119       unsigned base_offset = nir_intrinsic_base(instr);
6120       assert(base_offset % 4 == 0 || base_offset % brw_type_size_bytes(dest.type) == 0);
6121 
6122       brw_reg src = brw_uniform_reg(base_offset / 4, dest.type);
6123 
6124       if (nir_src_is_const(instr->src[0])) {
6125          unsigned load_offset = nir_src_as_uint(instr->src[0]);
6126          assert(load_offset % brw_type_size_bytes(dest.type) == 0);
6127          /* The base offset can only handle 32-bit units, so for 16-bit
6128           * data take the modulo of the offset with 4 bytes and add it to
6129           * the offset to read from within the source register.
6130           */
6131          src.offset = load_offset + base_offset % 4;
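         /* Illustrative example with assumed values: for a 16-bit dest with
          * base_offset = 18 and a constant load_offset = 4, src points at
          * uniform register 18 / 4 = 4 and src.offset = 4 + (18 % 4) = 6
          * bytes into that register.
          */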
6132 
6133          for (unsigned j = 0; j < instr->num_components; j++) {
6134             xbld.MOV(offset(dest, xbld, j), offset(src, xbld, j));
6135          }
6136       } else {
6137          brw_reg indirect = retype(get_nir_src(ntb, instr->src[0]),
6138                                   BRW_TYPE_UD);
6139 
6140          /* We need to pass a size to the MOV_INDIRECT but we don't want it to
6141           * go past the end of the uniform.  In order to keep the n'th
6142           * component from running past, we subtract off the size of all but
6143           * one component of the vector.
6144           */
6145          assert(nir_intrinsic_range(instr) >=
6146                 instr->num_components * brw_type_size_bytes(dest.type));
6147          unsigned read_size = nir_intrinsic_range(instr) -
6148             (instr->num_components - 1) * brw_type_size_bytes(dest.type);
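         /* Illustrative example with assumed values: for a vec4 of 32-bit
          * components with a range of 16 bytes, read_size = 16 - 3 * 4 = 4,
          * so each per-component MOV_INDIRECT is only allowed to read 4
          * bytes past the indirect offset and the last component stays
          * inside the declared range.
          */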
6149 
6150          bool supports_64bit_indirects = !intel_device_info_is_9lp(devinfo);
6151 
6152          if (brw_type_size_bytes(dest.type) != 8 || supports_64bit_indirects) {
6153             for (unsigned j = 0; j < instr->num_components; j++) {
6154                xbld.emit(SHADER_OPCODE_MOV_INDIRECT,
6155                          offset(dest, xbld, j), offset(src, xbld, j),
6156                          indirect, brw_imm_ud(read_size));
6157             }
6158          } else {
6159             const unsigned num_mov_indirects =
6160                brw_type_size_bytes(dest.type) / brw_type_size_bytes(BRW_TYPE_UD);
6161             /* We read a little bit less per MOV_INDIRECT, as they are now
6162              * 32-bit ones instead of 64-bit ones.  Adjust read_size accordingly.
6163              */
6164             const unsigned read_size_32bit = read_size -
6165                 (num_mov_indirects - 1) * brw_type_size_bytes(BRW_TYPE_UD);
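            /* Illustrative example with assumed values: for a single 64-bit
             * component with read_size = 8, num_mov_indirects = 8 / 4 = 2
             * and read_size_32bit = 8 - (2 - 1) * 4 = 4, matching the 32-bit
             * chunks moved by each MOV_INDIRECT below.
             */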
6166             for (unsigned j = 0; j < instr->num_components; j++) {
6167                for (unsigned i = 0; i < num_mov_indirects; i++) {
6168                   xbld.emit(SHADER_OPCODE_MOV_INDIRECT,
6169                             subscript(offset(dest, xbld, j), BRW_TYPE_UD, i),
6170                             subscript(offset(src, xbld, j), BRW_TYPE_UD, i),
6171                             indirect, brw_imm_ud(read_size_32bit));
6172                }
6173             }
6174          }
6175       }
6176       break;
6177    }
6178 
6179    case nir_intrinsic_load_ubo:
6180    case nir_intrinsic_load_ubo_uniform_block_intel: {
6181       brw_reg surface, surface_handle;
6182       bool no_mask_handle = false;
6183 
6184       if (get_nir_src_bindless(ntb, instr->src[0]))
6185          surface_handle = get_nir_buffer_intrinsic_index(ntb, bld, instr, &no_mask_handle);
6186       else
6187          surface = get_nir_buffer_intrinsic_index(ntb, bld, instr, &no_mask_handle);
6188 
6189       const unsigned first_component =
6190          nir_def_first_component_read(&instr->def);
6191       const unsigned last_component =
6192          nir_def_last_component_read(&instr->def);
6193       const unsigned num_components = last_component - first_component + 1;
6194 
6195       if (!nir_src_is_const(instr->src[1])) {
6196          s.prog_data->has_ubo_pull = true;
6197 
6198          if (instr->intrinsic == nir_intrinsic_load_ubo) {
6199             /* load_ubo with non-constant offset. The offset might still be
6200              * uniform on non-LSC platforms when loading fewer than 4
6201              * components.
6202              */
6203             brw_reg base_offset = retype(get_nir_src(ntb, instr->src[1]),
6204                                         BRW_TYPE_UD);
6205 
6206             const unsigned comps_per_load = brw_type_size_bytes(dest.type) == 8 ? 2 : 4;
6207 
6208             for (unsigned i = first_component;
6209                  i <= last_component;
6210                  i += comps_per_load) {
6211                const unsigned remaining = last_component + 1 - i;
6212                xbld.VARYING_PULL_CONSTANT_LOAD(offset(dest, xbld, i),
6213                                                surface, surface_handle,
6214                                                base_offset,
6215                                                i * brw_type_size_bytes(dest.type),
6216                                                instr->def.bit_size / 8,
6217                                                MIN2(remaining, comps_per_load));
6218             }
6219          } else {
6220             /* load_ubo_uniform_block_intel with non-constant offset */
6221             fs_nir_emit_memory_access(ntb, bld, xbld, instr);
6222          }
6223       } else {
6224          /* Even if we are loading doubles, a pull constant load will load
6225           * a 32-bit vec4, so should only reserve vgrf space for that. If we
6226           * need to load a full dvec4 we will have to emit 2 loads. This is
6227           * similar to demote_pull_constants(), except that in that case we
6228           * see individual accesses to each component of the vector and then
6229           * we let CSE deal with duplicate loads. Here we see a vector access
6230           * and we have to split it if necessary.
6231           */
6232          const unsigned type_size = brw_type_size_bytes(dest.type);
6233          const unsigned load_offset =
6234             nir_src_as_uint(instr->src[1]) + first_component * type_size;
6235          const unsigned end_offset = load_offset + num_components * type_size;
6236          const unsigned ubo_block =
6237             brw_nir_ubo_surface_index_get_push_block(instr->src[0]);
6238          const unsigned offset_256b = load_offset / 32;
6239          const unsigned end_256b = DIV_ROUND_UP(end_offset, 32);
6240 
6241          /* See if we've selected this as a push constant candidate */
6242          brw_reg push_reg;
6243          for (int i = 0; i < 4; i++) {
6244             const struct brw_ubo_range *range = &s.prog_data->ubo_ranges[i];
6245             if (range->block == ubo_block &&
6246                 offset_256b >= range->start &&
6247                 end_256b <= range->start + range->length) {
6248 
6249                push_reg = brw_uniform_reg(UBO_START + i, dest.type);
6250                push_reg.offset = load_offset - 32 * range->start;
6251                break;
6252             }
6253          }
6254 
6255          if (push_reg.file != BAD_FILE) {
6256             for (unsigned i = first_component; i <= last_component; i++) {
6257                xbld.MOV(offset(dest, xbld, i),
6258                         byte_offset(push_reg,
6259                                     (i - first_component) * type_size));
6260             }
6261             break;
6262          }
6263 
6264          s.prog_data->has_ubo_pull = true;
6265 
6266          if (instr->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel) {
6267             fs_nir_emit_memory_access(ntb, bld, xbld, instr);
6268             break;
6269          }
6270 
6271          const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
6272          const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
6273 
6274          for (unsigned c = 0; c < num_components;) {
6275             const unsigned base = load_offset + c * type_size;
6276             /* Number of usable components in the next block-aligned load. */
6277             const unsigned count = MIN2(num_components - c,
6278                                         (block_sz - base % block_sz) / type_size);
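            /* Illustrative example with assumed values: with block_sz = 64,
             * load_offset = 56, 32-bit components and num_components = 4,
             * the first iteration gets count = MIN2(4, (64 - 56) / 4) = 2
             * and the second starts at base = 64, so the vec4 is split into
             * two cacheline-aligned pull loads.
             */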
6279 
6280             const brw_reg packed_consts = ubld.vgrf(BRW_TYPE_UD);
6281             brw_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
6282             srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE]        = surface;
6283             srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
6284             srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET]         = brw_imm_ud(base & ~(block_sz - 1));
6285             srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE]           = brw_imm_ud(block_sz);
6286 
6287             ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
6288                       srcs, PULL_UNIFORM_CONSTANT_SRCS);
6289 
6290             const brw_reg consts =
6291                retype(byte_offset(packed_consts, base & (block_sz - 1)),
6292                       dest.type);
6293 
6294             for (unsigned d = 0; d < count; d++) {
6295                xbld.MOV(offset(dest, xbld, first_component + c + d),
6296                         component(consts, d));
6297             }
6298 
6299             c += count;
6300          }
6301       }
6302       break;
6303    }
6304 
6305    case nir_intrinsic_store_output: {
6306       assert(nir_src_bit_size(instr->src[0]) == 32);
6307       brw_reg src = get_nir_src(ntb, instr->src[0], -1);
6308 
6309       unsigned store_offset = nir_src_as_uint(instr->src[1]);
6310       unsigned num_components = instr->num_components;
6311       unsigned first_component = nir_intrinsic_component(instr);
6312 
6313       brw_reg new_dest = retype(offset(s.outputs[instr->const_index[0]], bld,
6314                                       4 * store_offset), src.type);
6315 
6316       brw_combine_with_vec(bld, offset(new_dest, bld, first_component),
6317                            src, num_components);
6318       break;
6319    }
6320 
6321    case nir_intrinsic_get_ssbo_size: {
6322       assert(nir_src_num_components(instr->src[0]) == 1);
6323 
6324       /* A resinfo's sampler message is used to get the buffer size.  The
6325        * SIMD8's writeback message consists of four registers and SIMD16's
6326        * writeback message consists of 8 destination registers (two per each
6327        * component).  Because we are only interested in the first channel of
6328        * the first returned component, where resinfo returns the buffer size
6329        * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
6330        * the dispatch width.
6331        */
6332       const fs_builder ubld = bld.scalar_group();
6333       brw_reg ret_payload = ubld.vgrf(BRW_TYPE_UD, 4);
6334 
6335       /* Set LOD = 0 */
6336       brw_reg src_payload = ubld.MOV(brw_imm_ud(0));
6337 
6338       brw_reg srcs[GET_BUFFER_SIZE_SRCS];
6339       srcs[get_nir_src_bindless(ntb, instr->src[0]) ?
6340            GET_BUFFER_SIZE_SRC_SURFACE_HANDLE :
6341            GET_BUFFER_SIZE_SRC_SURFACE] =
6342          get_nir_buffer_intrinsic_index(ntb, bld, instr);
6343       srcs[GET_BUFFER_SIZE_SRC_LOD] = src_payload;
6344       fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
6345                                 srcs, GET_BUFFER_SIZE_SRCS);
6346       inst->header_size = 0;
6347       inst->mlen = reg_unit(devinfo);
6348       inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
6349 
6350       /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
6351        *
6352        * "Out-of-bounds checking is always performed at a DWord granularity. If
6353        * any part of the DWord is out-of-bounds then the whole DWord is
6354        * considered out-of-bounds."
6355        *
6356        * This implies that types with size smaller than 4-bytes need to be
6357        * padded if they don't complete the last dword of the buffer. But as we
6358        * need to maintain the original size we need to reverse the padding
6359        * calculation to return the correct size to know the number of elements
6360        * of an unsized array. As we stored in the last two bits of the surface
6361        * size the needed padding for the buffer, we calculate here the
6362        * original buffer_size reversing the surface_size calculation:
6363        *
6364        * surface_size = isl_align(buffer_size, 4) +
6365        *                (isl_align(buffer_size, 4) - buffer_size)
6366        *
6367        * buffer_size = (surface_size & ~3) - (surface_size & 3)
6368        */
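      /* Worked example with an assumed buffer_size of 27 bytes:
       * surface_size = 28 + (28 - 27) = 29, and reversing it below gives
       * (29 & ~3) - (29 & 3) = 28 - 1 = 27, the original size.
       */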
6369       brw_reg size_padding  = ubld.AND(ret_payload, brw_imm_ud(3));
6370       brw_reg size_aligned4 = ubld.AND(ret_payload, brw_imm_ud(~3));
6371       brw_reg buffer_size   = ubld.ADD(size_aligned4, negate(size_padding));
6372 
6373       bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
6374       break;
6375    }
6376 
6377    case nir_intrinsic_load_subgroup_size:
6378       /* This should only happen for fragment shaders because every other case
6379        * is lowered in NIR so we can optimize based on it.
6380        */
6381       assert(s.stage == MESA_SHADER_FRAGMENT);
6382       bld.MOV(retype(dest, BRW_TYPE_D), brw_imm_d(s.dispatch_width));
6383       break;
6384 
6385    case nir_intrinsic_load_subgroup_invocation:
6386       bld.MOV(retype(dest, BRW_TYPE_UD), bld.LOAD_SUBGROUP_INVOCATION());
6387       break;
6388 
6389    case nir_intrinsic_load_subgroup_eq_mask:
6390    case nir_intrinsic_load_subgroup_ge_mask:
6391    case nir_intrinsic_load_subgroup_gt_mask:
6392    case nir_intrinsic_load_subgroup_le_mask:
6393    case nir_intrinsic_load_subgroup_lt_mask:
6394       unreachable("not reached");
6395 
6396    case nir_intrinsic_ddx_fine:
6397       bld.emit(FS_OPCODE_DDX_FINE, retype(dest, BRW_TYPE_F),
6398                retype(get_nir_src(ntb, instr->src[0]), BRW_TYPE_F));
6399       break;
6400    case nir_intrinsic_ddx:
6401    case nir_intrinsic_ddx_coarse:
6402       bld.emit(FS_OPCODE_DDX_COARSE, retype(dest, BRW_TYPE_F),
6403                retype(get_nir_src(ntb, instr->src[0]), BRW_TYPE_F));
6404       break;
6405    case nir_intrinsic_ddy_fine:
6406       bld.emit(FS_OPCODE_DDY_FINE, retype(dest, BRW_TYPE_F),
6407                retype(get_nir_src(ntb, instr->src[0]), BRW_TYPE_F));
6408       break;
6409    case nir_intrinsic_ddy:
6410    case nir_intrinsic_ddy_coarse:
6411       bld.emit(FS_OPCODE_DDY_COARSE, retype(dest, BRW_TYPE_F),
6412                retype(get_nir_src(ntb, instr->src[0]), BRW_TYPE_F));
6413       break;
6414 
6415    case nir_intrinsic_vote_any:
6416    case nir_intrinsic_vote_all:
6417    case nir_intrinsic_quad_vote_any:
6418    case nir_intrinsic_quad_vote_all: {
6419       const bool any = instr->intrinsic == nir_intrinsic_vote_any ||
6420                        instr->intrinsic == nir_intrinsic_quad_vote_any;
6421       const bool quad = instr->intrinsic == nir_intrinsic_quad_vote_any ||
6422                         instr->intrinsic == nir_intrinsic_quad_vote_all;
6423 
6424       brw_reg cond = get_nir_src(ntb, instr->src[0]);
6425       const unsigned cluster_size = quad ? 4 : s.dispatch_width;
6426 
6427       bld.emit(any ? SHADER_OPCODE_VOTE_ANY : SHADER_OPCODE_VOTE_ALL,
6428                retype(dest, BRW_TYPE_UD), cond, brw_imm_ud(cluster_size));
6429 
6430       break;
6431    }
6432 
6433    case nir_intrinsic_vote_feq:
6434    case nir_intrinsic_vote_ieq: {
6435       brw_reg value = get_nir_src(ntb, instr->src[0]);
6436       if (instr->intrinsic == nir_intrinsic_vote_feq) {
6437          const unsigned bit_size = nir_src_bit_size(instr->src[0]);
6438          value.type = bit_size == 8 ? BRW_TYPE_B :
6439             brw_type_with_size(BRW_TYPE_F, bit_size);
6440       }
6441       bld.emit(SHADER_OPCODE_VOTE_EQUAL, retype(dest, BRW_TYPE_D), value);
6442       break;
6443    }
6444 
6445    case nir_intrinsic_ballot: {
6446       if (instr->def.bit_size > 32) {
6447          dest.type = BRW_TYPE_UQ;
6448       } else {
6449          dest.type = BRW_TYPE_UD;
6450       }
6451 
6452       brw_reg value = get_nir_src(ntb, instr->src[0]);
6453 
6454       /* A ballot will always be at the full dispatch width even if the
6455        * use of the ballot result is smaller. If the source is_scalar,
6456        * it may be allocated at less than the full dispatch width (e.g.,
6457        * allocated at SIMD8 with SIMD32 dispatch). The input may or may
6458        * not be stride=0. If it is not, the generated ballot
6459        *
6460        *    ballot(32) dst, value<1>
6461        *
6462        * is invalid because it will read out of bounds from value.
6463        *
6464        * To account for this, modify the stride of an is_scalar input to be
6465        * zero.
6466        */
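      /* Illustrative example under the assumptions above: at SIMD32
       * dispatch an is_scalar source may only occupy a SIMD8-sized
       * allocation, so "ballot(32) dst, value<1>" would read past it, while
       * component(value, 0) turns the source into a <0;1,0> region that
       * safely broadcasts lane 0 to all 32 channels.
       */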
6467       if (value.is_scalar)
6468          value = component(value, 0);
6469 
6470       /* Note the use of bld here instead of xbld. As mentioned above, the
6471        * ballot must execute on all SIMD lanes regardless of the amount of
6472        * data (i.e., scalar or not scalar) generated.
6473        */
6474       fs_inst *inst = bld.emit(SHADER_OPCODE_BALLOT, dest, value);
6475 
6476       if (dest.is_scalar)
6477          inst->size_written = dest.component_size(xbld.dispatch_width());
6478 
6479       break;
6480    }
6481 
6482    case nir_intrinsic_read_invocation: {
6483       const brw_reg value = get_nir_src(ntb, instr->src[0]);
6484       const brw_reg invocation = get_nir_src_imm(ntb, instr->src[1]);
6485 
6486       bld.emit(SHADER_OPCODE_READ_FROM_CHANNEL, retype(dest, value.type),
6487                value, invocation);
6488       break;
6489    }
6490 
6491    case nir_intrinsic_read_first_invocation: {
6492       const brw_reg value = get_nir_src(ntb, instr->src[0]);
6493 
6494       bld.emit(SHADER_OPCODE_READ_FROM_LIVE_CHANNEL, retype(dest, value.type), value);
6495       break;
6496    }
6497 
6498    case nir_intrinsic_shuffle: {
6499       const brw_reg value = get_nir_src(ntb, instr->src[0]);
6500       const brw_reg index = get_nir_src(ntb, instr->src[1]);
6501 
6502       bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
6503       break;
6504    }
6505 
6506    case nir_intrinsic_first_invocation: {
6507       brw_reg tmp = bld.vgrf(BRW_TYPE_UD);
6508       bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
6509       bld.MOV(retype(dest, BRW_TYPE_UD),
6510               brw_reg(component(tmp, 0)));
6511       break;
6512    }
6513 
6514    case nir_intrinsic_last_invocation: {
6515       brw_reg tmp = bld.vgrf(BRW_TYPE_UD);
6516       bld.exec_all().emit(SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL, tmp);
6517       bld.MOV(retype(dest, BRW_TYPE_UD),
6518               brw_reg(component(tmp, 0)));
6519       break;
6520    }
6521 
6522    case nir_intrinsic_quad_broadcast: {
6523       const brw_reg value = get_nir_src(ntb, instr->src[0]);
6524       const unsigned index = nir_src_as_uint(instr->src[1]);
6525 
6526       bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),
6527                value, brw_imm_ud(index), brw_imm_ud(4));
6528       break;
6529    }
6530 
6531    case nir_intrinsic_quad_swap_horizontal:
6532    case nir_intrinsic_quad_swap_vertical:
6533    case nir_intrinsic_quad_swap_diagonal: {
6534       const brw_reg value = get_nir_src(ntb, instr->src[0]);
6535 
6536       enum brw_swap_direction dir;
6537       switch (instr->intrinsic) {
6538       case nir_intrinsic_quad_swap_horizontal: dir = BRW_SWAP_HORIZONTAL; break;
6539       case nir_intrinsic_quad_swap_vertical:   dir = BRW_SWAP_VERTICAL;   break;
6540       case nir_intrinsic_quad_swap_diagonal:   dir = BRW_SWAP_DIAGONAL;   break;
6541       default: unreachable("invalid quad swap");
6542       }
6543 
6544       bld.emit(SHADER_OPCODE_QUAD_SWAP, retype(dest, value.type),
6545                value, brw_imm_ud(dir));
6546       break;
6547    }
6548 
6549    case nir_intrinsic_reduce: {
6550       brw_reg src = get_nir_src(ntb, instr->src[0]);
6551       nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
6552       enum brw_reduce_op brw_op = brw_reduce_op_for_nir_reduction_op(op);
6553       unsigned cluster_size = nir_intrinsic_cluster_size(instr);
6554       if (cluster_size == 0 || cluster_size > s.dispatch_width)
6555          cluster_size = s.dispatch_width;
6556 
6557       /* Figure out the source type */
6558       src.type = brw_type_for_nir_type(devinfo,
6559          (nir_alu_type)(nir_op_infos[op].input_types[0] |
6560                         nir_src_bit_size(instr->src[0])));
6561 
6562       bld.emit(SHADER_OPCODE_REDUCE, retype(dest, src.type), src,
6563                brw_imm_ud(brw_op), brw_imm_ud(cluster_size));
6564       break;
6565    }
6566 
6567    case nir_intrinsic_inclusive_scan:
6568    case nir_intrinsic_exclusive_scan: {
6569       brw_reg src = get_nir_src(ntb, instr->src[0]);
6570       nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
6571       enum brw_reduce_op brw_op = brw_reduce_op_for_nir_reduction_op(op);
6572 
6573       /* Figure out the source type */
6574       src.type = brw_type_for_nir_type(devinfo,
6575          (nir_alu_type)(nir_op_infos[op].input_types[0] |
6576                         nir_src_bit_size(instr->src[0])));
6577 
6578       enum opcode opcode = instr->intrinsic == nir_intrinsic_exclusive_scan ?
6579             SHADER_OPCODE_EXCLUSIVE_SCAN : SHADER_OPCODE_INCLUSIVE_SCAN;
6580 
6581       bld.emit(opcode, retype(dest, src.type), src, brw_imm_ud(brw_op));
6582       break;
6583    }
6584 
6585    case nir_intrinsic_load_topology_id_intel: {
6586       /* These move around basically every hardware generation, so don't
6587        * do any unbounded checks and fail if the platform hasn't explicitly
6588        * been enabled here.
6589        */
6590       assert(devinfo->ver >= 12 && devinfo->ver <= 30);
6591 
6592       /* Here is what the layout of SR0 looks like on Gfx12
6593        * https://gfxspecs.intel.com/Predator/Home/Index/47256
6594        *   [13:11] : Slice ID.
6595        *   [10:9]  : Dual-SubSlice ID
6596        *   [8]     : SubSlice ID
6597        *   [7]     : EUID[2] (aka EU Row ID)
6598        *   [6]     : Reserved
6599        *   [5:4]   : EUID[1:0]
6600        *   [2:0]   : Thread ID
6601        *
6602        * Xe2: Engine 3D and GPGPU Programs, EU Overview, Registers and
6603        * Register Regions, ARF Registers, State Register,
6604        * https://gfxspecs.intel.com/Predator/Home/Index/56623
6605        *   [15:11] : Slice ID.
6606        *   [9:8]   : SubSlice ID
6607        *   [6:4]   : EUID
6608        *   [2:0]   : Thread ID
6609        *
6610        * Xe3: Engine 3D and GPGPU Programs, EU Overview, Registers and
6611        * Register Regions, ARF Registers, State Register.
6612        * Bspec 56623 (r55736)
6613        *
6614        *   [17:14] : Slice ID.
6615        *   [11:8]  : SubSlice ID
6616        *   [6:4]   : EUID
6617        *   [3:0]   : Thread ID
6618        */
6619       brw_reg raw_id = bld.vgrf(BRW_TYPE_UD);
6620       bld.UNDEF(raw_id);
6621       bld.emit(SHADER_OPCODE_READ_ARCH_REG, raw_id, retype(brw_sr0_reg(0),
6622                                                            BRW_TYPE_UD));
6623       switch (nir_intrinsic_base(instr)) {
6624       case BRW_TOPOLOGY_ID_DSS:
6625          if (devinfo->ver >= 20) {
6626             /* Xe2+: 3D and GPGPU Programs, Shared Functions, Ray Tracing:
6627              * https://gfxspecs.intel.com/Predator/Home/Index/56936
6628              *
6629              * Note: DSSID in all formulas below is a logical identifier of an
6630              * XeCore (a value that goes from 0 to (number_of_slices *
6631              * number_of_XeCores_per_slice -1). SW can get this value from
6632              * either:
6633              *
6634              *  - Message Control Register LogicalSSID field (only in shaders
6635              *    eligible for Mid-Thread Preemption).
6636              *  - Calculated from the State Register with the following formula:
6637              *    DSSID = StateRegister.SliceID * GT_ARCH_SS_PER_SLICE +
6638              *    StateRegister.SubSliceID, where GT_ARCH_SS_PER_SLICE is an
6639              *    architectural parameter defined per product SKU.
6640              *
6641              * We are using the state register to calculate the DSSID.
6642              */
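            /* Illustrative example with assumed values: with
             * max_subslices_per_slice = 4, SliceID = 2 and SubSliceID = 3,
             * the MUL/ADD below compute DSSID = 2 * 4 + 3 = 11.
             */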
6643             const uint32_t slice_id_mask = devinfo->ver >= 30 ?
6644                                            INTEL_MASK(17, 14) :
6645                                            INTEL_MASK(15, 11);
6646             const uint32_t slice_id_shift = devinfo->ver >= 30 ? 14 : 11;
6647 
6648             const uint32_t subslice_id_mask = devinfo->ver >= 30 ?
6649                                               INTEL_MASK(11, 8) :
6650                                               INTEL_MASK(9, 8);
6651             brw_reg slice_id =
6652                bld.SHR(bld.AND(raw_id, brw_imm_ud(slice_id_mask)),
6653                        brw_imm_ud(slice_id_shift));
6654 
6655             /* Assert that max_subslices_per_slice covers at least the 2 bits
6656              * that we use for the subslice ID.
6657              */
6658             unsigned slice_stride = devinfo->max_subslices_per_slice;
6659             assert(slice_stride >= (1 << 2));
6660             brw_reg subslice_id =
6661                bld.SHR(bld.AND(raw_id, brw_imm_ud(subslice_id_mask)),
6662                        brw_imm_ud(8));
6663             bld.ADD(retype(dest, BRW_TYPE_UD),
6664                     bld.MUL(slice_id, brw_imm_ud(slice_stride)), subslice_id);
6665          } else {
6666             /* Get rid of anything below dualsubslice */
6667             bld.SHR(retype(dest, BRW_TYPE_UD),
6668                     bld.AND(raw_id, brw_imm_ud(0x3fff)), brw_imm_ud(9));
6669          }
6670          break;
6671       case BRW_TOPOLOGY_ID_EU_THREAD_SIMD: {
6672          s.limit_dispatch_width(16, "Topology helper for Ray queries, "
6673                               "not supported in SIMD32 mode.");
6674          brw_reg dst = retype(dest, BRW_TYPE_UD);
6675          brw_reg eu;
6676 
6677          if (devinfo->ver >= 20) {
6678             /* Xe2+: Graphics Engine, 3D and GPGPU Programs, Shared Functions
6679              * Ray Tracing,
6680              * https://gfxspecs.intel.com/Predator/Home/Index/56936
6681              *
6682              * SyncStackID = (EUID[2:0] <<  8) | (ThreadID[2:0] << 4) |
6683              *               SIMDLaneID[3:0];
6684              *
6685              * This section just deals with the EUID part.
6686              *
6687              * The 3-bit EU[2:0] we need to build for ray query memory address
6688              * computations is a bit odd:
6689              *
6690              *   EU[2:0] = raw_id[6:4] (identified as EUID[2:0])
6691              */
6692             eu = bld.SHL(bld.AND(raw_id, brw_imm_ud(INTEL_MASK(6, 4))),
6693                          brw_imm_ud(4));
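            /* Illustrative example with an assumed EUID of 5 (0b101):
             * masking keeps 0b101 in bits [6:4] (0x50) and the extra SHL by
             * 4 moves it to bits [10:8] (0x500), i.e. EUID << 8 as the
             * SyncStackID layout above requires.
             */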
6694          } else {
6695             /* EU[3:0] << 7
6696              *
6697              * The 4bit EU[3:0] we need to build for ray query memory addresses
6698              * computations is a bit odd:
6699              *
6700              *   EU[1:0] = raw_id[5:4] (identified as EUID[1:0])
6701              *   EU[2]   = raw_id[8]   (identified as SubSlice ID)
6702              *   EU[3]   = raw_id[7]   (identified as EUID[2] or Row ID)
6703              */
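                 /* The shifts below place these bits directly at their final
                  * EU[3:0] << 7 positions: raw_id[5:4] << 3 -> bits 8:7
                  * (EU[1:0]), raw_id[7] << 3 -> bit 10 (EU[3]) and
                  * raw_id[8] << 1 -> bit 9 (EU[2]).
                  */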
6704             brw_reg raw5_4 = bld.AND(raw_id, brw_imm_ud(INTEL_MASK(5, 4)));
6705             brw_reg raw7   = bld.AND(raw_id, brw_imm_ud(INTEL_MASK(7, 7)));
6706             brw_reg raw8   = bld.AND(raw_id, brw_imm_ud(INTEL_MASK(8, 8)));
6707             eu = bld.OR(bld.SHL(raw5_4, brw_imm_ud(3)),
6708                         bld.OR(bld.SHL(raw7, brw_imm_ud(3)),
6709                                bld.SHL(raw8, brw_imm_ud(1))));
6710          }
6711 
6712          brw_reg tid;
6713          /* Xe3: Graphics Engine, 3D and GPGPU Programs, Shared Functions
6714           * Ray Tracing, (Bspec 56936 (r56740))
6715           *
6716           * SyncStackID = (EUID[2:0] << 8) | (ThreadID[3:0] << 4) |
6717           * SIMDLaneID[3:0];
6718           *
6719           * ThreadID[3:0] << 4 (ThreadID comes from raw_id[3:0])
6720           *
6721           * On older platforms (< Xe3):
6722           * ThreadID[2:0] << 4 (ThreadID comes from raw_id[2:0])
6723           */
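              /* Illustrative example (arbitrary IDs): EUID = 5, ThreadID = 3
               * and SIMDLaneID = 7 give
               * SyncStackID = (5 << 8) | (3 << 4) | 7 = 0x537.
               */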
6724          const uint32_t raw_id_mask = devinfo->ver >= 30 ?
6725                                       INTEL_MASK(3, 0) :
6726                                       INTEL_MASK(2, 0);
6727          tid = bld.SHL(bld.AND(raw_id, brw_imm_ud(raw_id_mask)),
6728                        brw_imm_ud(4));
6729 
6730          /* SIMDLaneID[3:0] << 0 (use the subgroup invocation) */
6731          assert(bld.dispatch_width() <= 16); /* Limit to 4 bits */
6732          bld.ADD(dst, bld.OR(eu, tid), bld.LOAD_SUBGROUP_INVOCATION());
6733          break;
6734       }
6735       default:
6736          unreachable("Invalid topology id type");
6737       }
6738       break;
6739    }
6740 
6741    case nir_intrinsic_load_btd_stack_id_intel:
6742       if (s.stage == MESA_SHADER_COMPUTE) {
6743          assert(brw_cs_prog_data(s.prog_data)->uses_btd_stack_ids);
6744       } else {
6745          assert(brw_shader_stage_is_bindless(s.stage));
6746       }
6747       /* Stack IDs are always in R1 regardless of whether we're coming from a
6748        * bindless shader or a regular compute shader.
6749        */
6750       bld.MOV(retype(dest, BRW_TYPE_UD),
6751               retype(brw_vec8_grf(1 * reg_unit(devinfo), 0), BRW_TYPE_UW));
6752       break;
6753 
6754    case nir_intrinsic_btd_spawn_intel:
6755       if (s.stage == MESA_SHADER_COMPUTE) {
6756          assert(brw_cs_prog_data(s.prog_data)->uses_btd_stack_ids);
6757       } else {
6758          assert(brw_shader_stage_is_bindless(s.stage));
6759       }
6760       /* Make sure all the pointers to resume shaders have landed where other
6761        * threads can see them.
6762        */
6763       emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE);
6764 
6765       bld.emit(SHADER_OPCODE_BTD_SPAWN_LOGICAL, bld.null_reg_ud(),
6766                bld.emit_uniformize(get_nir_src(ntb, instr->src[0], -1)),
6767                get_nir_src(ntb, instr->src[1]));
6768       break;
6769 
6770    case nir_intrinsic_btd_retire_intel:
6771       if (s.stage == MESA_SHADER_COMPUTE) {
6772          assert(brw_cs_prog_data(s.prog_data)->uses_btd_stack_ids);
6773       } else {
6774          assert(brw_shader_stage_is_bindless(s.stage));
6775       }
6776       /* Make sure all the pointers to resume shaders have landed where other
6777        * threads can see them.
6778        */
6779       emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE);
6780       bld.emit(SHADER_OPCODE_BTD_RETIRE_LOGICAL);
6781       break;
6782 
6783    case nir_intrinsic_trace_ray_intel: {
6784       const bool synchronous = nir_intrinsic_synchronous(instr);
6785       assert(brw_shader_stage_is_bindless(s.stage) || synchronous);
6786 
6787       /* Make sure all the previous RT structure writes are visible to the RT
6788        * fixed function within the DSS, as well as stack pointers to resume
6789        * shaders.
6790        */
6791       emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE);
6792 
6793       brw_reg srcs[RT_LOGICAL_NUM_SRCS];
6794 
6795       brw_reg globals = get_nir_src(ntb, instr->src[0], -1);
6796       srcs[RT_LOGICAL_SRC_GLOBALS] = bld.emit_uniformize(globals);
6797       srcs[RT_LOGICAL_SRC_BVH_LEVEL] = get_nir_src(ntb, instr->src[1]);
6798       srcs[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] = get_nir_src(ntb, instr->src[2]);
6799       srcs[RT_LOGICAL_SRC_SYNCHRONOUS] = brw_imm_ud(synchronous);
6800 
6801       /* Bspec 57508: Structure_SIMD16TraceRayMessage:: RayQuery Enable
6802        *
6803        *    "When this bit is set in the header, Trace Ray Message behaves like
6804        *    a Ray Query. This message requires a write-back message indicating
6805        *    RayQuery for all valid Rays (SIMD lanes) have completed."
6806        */
6807       brw_reg dst = (devinfo->ver >= 20 && synchronous) ?
6808                     bld.vgrf(BRW_TYPE_UD) :
6809                     bld.null_reg_ud();
6810 
6811       bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL, dst, srcs, RT_LOGICAL_NUM_SRCS);
6812 
6813       /* There is no actual value to use in the destination register of the
6814        * synchronous trace instruction. All of the communication with the HW
6815        * unit happens through memory reads/writes. So to ensure that the
6816        * operation has completed before we go read the results in memory, we
6817        * need a barrier followed by an invalidate before accessing memory.
6818        */
6819       if (synchronous) {
6820          bld.SYNC(TGL_SYNC_ALLWR);
6821          emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_INVALIDATE);
6822       }
6823       break;
6824    }
6825 
6826    default:
6827 #ifndef NDEBUG
6828       assert(instr->intrinsic < nir_num_intrinsics);
6829       fprintf(stderr, "intrinsic: %s\n", nir_intrinsic_infos[instr->intrinsic].name);
6830 #endif
6831       unreachable("unknown intrinsic");
6832    }
6833 }
6834 
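     /* Map a NIR bit size to the LSC data size encoding.  Note that 8-bit
      * and 16-bit accesses use the D8U32/D16U32 forms, i.e. each channel
      * still occupies a full 32-bit dword in the register file.
      */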
6835 static enum lsc_data_size
6836 lsc_bits_to_data_size(unsigned bit_size)
6837 {
6838    switch (bit_size / 8) {
6839    case 1:  return LSC_DATA_SIZE_D8U32;
6840    case 2:  return LSC_DATA_SIZE_D16U32;
6841    case 4:  return LSC_DATA_SIZE_D32;
6842    case 8:  return LSC_DATA_SIZE_D64;
6843    default:
6844       unreachable("Unsupported data size.");
6845    }
6846 }
6847 
6848 /**
6849  *
6850  * \param bld  "Normal" builder. This is the full dispatch width of the shader.
6851  *
6852  * \param xbld Builder for the intrinsic. If the intrinsic is convergent, this
6853  *             builder will be scalar_group(). Otherwise it will be the same
6854  *             as bld.
6855  *
6856  * Some places in the function will also use \c ubld. There are two cases of
6857  * this. Sometimes it is to generate intermediate values as SIMD1. Other
6858  * places that use \c ubld need a scalar_group() builder to operate on sources
6859  * to the intrinsic that are is_scalar.
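      *
      * For instance, a convergent block load such as
      * nir_intrinsic_load_ssbo_uniform_block_intel is expected to be emitted
      * with xbld == bld.scalar_group(), while a divergent load such as
      * nir_intrinsic_load_ssbo uses xbld == bld.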
6860  */
6861 static void
6862 fs_nir_emit_memory_access(nir_to_brw_state &ntb,
6863                           const fs_builder &bld,
6864                           const fs_builder &xbld,
6865                           nir_intrinsic_instr *instr)
6866 {
6867    const intel_device_info *devinfo = ntb.devinfo;
6868    fs_visitor &s = ntb.s;
6869 
6870    brw_reg srcs[MEMORY_LOGICAL_NUM_SRCS];
6871 
6872    /* Start with some default values for most cases */
6873 
6874    enum lsc_opcode op = lsc_op_for_nir_intrinsic(instr);
6875    const bool is_store = !nir_intrinsic_infos[instr->intrinsic].has_dest;
6876    const bool is_atomic = lsc_opcode_is_atomic(op);
6877    const bool is_load = !is_store && !is_atomic;
6878    const bool include_helpers = nir_intrinsic_has_access(instr) &&
6879       (nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS);
6880    const unsigned align =
6881       nir_intrinsic_has_align(instr) ? nir_intrinsic_align(instr) : 0;
6882    bool no_mask_handle = false;
6883    int data_src = -1;
6884 
6885    srcs[MEMORY_LOGICAL_OPCODE] = brw_imm_ud(op);
6886    /* BINDING_TYPE, BINDING, and ADDRESS are handled in the switch */
6887    srcs[MEMORY_LOGICAL_COORD_COMPONENTS] = brw_imm_ud(1);
6888    srcs[MEMORY_LOGICAL_ALIGNMENT] = brw_imm_ud(align);
6889    /* DATA_SIZE and CHANNELS are handled below the switch */
6890    srcs[MEMORY_LOGICAL_FLAGS] =
6891       brw_imm_ud(include_helpers ? MEMORY_FLAG_INCLUDE_HELPERS : 0);
6892    /* DATA0 and DATA1 are handled below */
6893 
6894    switch (instr->intrinsic) {
6895    case nir_intrinsic_bindless_image_load:
6896    case nir_intrinsic_bindless_image_store:
6897    case nir_intrinsic_bindless_image_atomic:
6898    case nir_intrinsic_bindless_image_atomic_swap:
6899       srcs[MEMORY_LOGICAL_BINDING_TYPE] = brw_imm_ud(LSC_ADDR_SURFTYPE_BSS);
6900       FALLTHROUGH;
6901    case nir_intrinsic_image_load:
6902    case nir_intrinsic_image_store:
6903    case nir_intrinsic_image_atomic:
6904    case nir_intrinsic_image_atomic_swap:
6905       srcs[MEMORY_LOGICAL_MODE] = brw_imm_ud(MEMORY_MODE_TYPED);
6906       srcs[MEMORY_LOGICAL_BINDING] =
6907          get_nir_image_intrinsic_image(ntb, bld, instr);
6908 
6909       if (srcs[MEMORY_LOGICAL_BINDING_TYPE].file == BAD_FILE)
6910          srcs[MEMORY_LOGICAL_BINDING_TYPE] = brw_imm_ud(LSC_ADDR_SURFTYPE_BTI);
6911 
6912       srcs[MEMORY_LOGICAL_ADDRESS] = get_nir_src(ntb, instr->src[1]);
6913       srcs[MEMORY_LOGICAL_COORD_COMPONENTS] =
6914          brw_imm_ud(nir_image_intrinsic_coord_components(instr));
6915 
6916       data_src = 3;
6917       break;
6918 
6919    case nir_intrinsic_load_ubo_uniform_block_intel:
6920       srcs[MEMORY_LOGICAL_MODE] = brw_imm_ud(MEMORY_MODE_CONSTANT);
6921       FALLTHROUGH;
6922    case nir_intrinsic_load_ssbo:
6923    case nir_intrinsic_store_ssbo:
6924    case nir_intrinsic_ssbo_atomic:
6925    case nir_intrinsic_ssbo_atomic_swap:
6926    case nir_intrinsic_load_ssbo_block_intel:
6927    case nir_intrinsic_store_ssbo_block_intel:
6928    case nir_intrinsic_load_ssbo_uniform_block_intel:
6929       if (srcs[MEMORY_LOGICAL_MODE].file == BAD_FILE)
6930          srcs[MEMORY_LOGICAL_MODE] = brw_imm_ud(MEMORY_MODE_UNTYPED);
6931       srcs[MEMORY_LOGICAL_BINDING_TYPE] =
6932          brw_imm_ud(get_nir_src_bindless(ntb, instr->src[is_store ? 1 : 0]) ?
6933                     LSC_ADDR_SURFTYPE_BSS : LSC_ADDR_SURFTYPE_BTI);
6934       srcs[MEMORY_LOGICAL_BINDING] =
6935          get_nir_buffer_intrinsic_index(ntb, bld, instr, &no_mask_handle);
6936       srcs[MEMORY_LOGICAL_ADDRESS] =
6937          get_nir_src_imm(ntb, instr->src[is_store ? 2 : 1]);
6938 
6939       data_src = is_atomic ? 2 : 0;
6940       break;
6941    case nir_intrinsic_load_shared:
6942    case nir_intrinsic_store_shared:
6943    case nir_intrinsic_shared_atomic:
6944    case nir_intrinsic_shared_atomic_swap:
6945    case nir_intrinsic_load_shared_block_intel:
6946    case nir_intrinsic_store_shared_block_intel:
6947    case nir_intrinsic_load_shared_uniform_block_intel: {
6948       srcs[MEMORY_LOGICAL_MODE] = brw_imm_ud(MEMORY_MODE_SHARED_LOCAL);
6949       srcs[MEMORY_LOGICAL_BINDING_TYPE] = brw_imm_ud(LSC_ADDR_SURFTYPE_FLAT);
6950 
6951       const brw_reg nir_src = get_nir_src(ntb, instr->src[is_store ? 1 : 0]);
6952       const fs_builder ubld = nir_src.is_scalar ? bld.scalar_group() : bld;
6953 
6954       /* If the logical address is not uniform, a call to emit_uniformize
6955        * below will fix it up.
6956        */
6957       srcs[MEMORY_LOGICAL_ADDRESS] =
6958          ubld.ADD(retype(nir_src, BRW_TYPE_UD),
6959                   brw_imm_ud(nir_intrinsic_base(instr)));
6960 
6961       /* If nir_src is_scalar, the MEMORY_LOGICAL_ADDRESS will be allocated at
6962        * scalar_group() size and will have every component the same
6963        * value. This is the definition of is_scalar. Much more importantly,
6964        * setting is_scalar properly also ensures that emit_uniformize (below)
6965        * will handle the value as scalar_group() size instead of full dispatch
6966        * width.
6967        */
6968       srcs[MEMORY_LOGICAL_ADDRESS].is_scalar = nir_src.is_scalar;
6969 
6970       data_src = is_atomic ? 1 : 0;
6971       no_mask_handle = true;
6972       break;
6973    }
6974    case nir_intrinsic_load_scratch:
6975    case nir_intrinsic_store_scratch: {
6976       srcs[MEMORY_LOGICAL_MODE] = brw_imm_ud(MEMORY_MODE_SCRATCH);
6977 
6978       const nir_src &addr = instr->src[is_store ? 1 : 0];
6979 
6980       if (devinfo->verx10 >= 125) {
6981          srcs[MEMORY_LOGICAL_BINDING_TYPE] = brw_imm_ud(LSC_ADDR_SURFTYPE_SS);
6982 
6983          const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
6984          brw_reg bind = ubld.AND(retype(brw_vec1_grf(0, 5), BRW_TYPE_UD),
6985                                  brw_imm_ud(INTEL_MASK(31, 10)));
6986          if (devinfo->ver >= 20)
6987             bind = ubld.SHR(bind, brw_imm_ud(4));
6988 
6989          /* load_scratch / store_scratch cannot be is_scalar yet. */
6990          assert(xbld.dispatch_width() == bld.dispatch_width());
6991 
6992          srcs[MEMORY_LOGICAL_BINDING] = component(bind, 0);
6993          srcs[MEMORY_LOGICAL_ADDRESS] =
6994             swizzle_nir_scratch_addr(ntb, bld, addr, false);
6995       } else {
6996          unsigned bit_size =
6997             is_store ? nir_src_bit_size(instr->src[0]) : instr->def.bit_size;
6998          bool dword_aligned = align >= 4 && bit_size == 32;
6999 
7000          /* load_scratch / store_scratch cannot be is_scalar yet. */
7001          assert(xbld.dispatch_width() == bld.dispatch_width());
7002 
7003          srcs[MEMORY_LOGICAL_BINDING_TYPE] =
7004             brw_imm_ud(LSC_ADDR_SURFTYPE_FLAT);
7005          srcs[MEMORY_LOGICAL_ADDRESS] =
7006             swizzle_nir_scratch_addr(ntb, bld, addr, dword_aligned);
7007       }
7008 
7009       if (is_store)
7010          ++s.shader_stats.spill_count;
7011       else
7012          ++s.shader_stats.fill_count;
7013 
7014       data_src = 0;
7015       break;
7016    }
7017 
7018    case nir_intrinsic_load_global_constant_uniform_block_intel:
7019    case nir_intrinsic_load_global:
7020    case nir_intrinsic_load_global_constant:
7021    case nir_intrinsic_store_global:
7022    case nir_intrinsic_global_atomic:
7023    case nir_intrinsic_global_atomic_swap:
7024    case nir_intrinsic_load_global_block_intel:
7025    case nir_intrinsic_store_global_block_intel:
7026       srcs[MEMORY_LOGICAL_MODE] = brw_imm_ud(MEMORY_MODE_UNTYPED);
7027       srcs[MEMORY_LOGICAL_BINDING_TYPE] = brw_imm_ud(LSC_ADDR_SURFTYPE_FLAT);
7028       srcs[MEMORY_LOGICAL_ADDRESS] = get_nir_src(ntb, instr->src[is_store ? 1 : 0]);
7029       no_mask_handle = srcs[MEMORY_LOGICAL_ADDRESS].is_scalar;
7030 
7031       data_src = is_atomic ? 1 : 0;
7032       break;
7033 
7034    default:
7035       unreachable("unknown memory intrinsic");
7036    }
7037 
7038    unsigned components = is_store ? instr->src[data_src].ssa->num_components
7039                                   : instr->def.num_components;
7040    if (components == 0)
7041       components = instr->num_components;
7042 
7043    srcs[MEMORY_LOGICAL_COMPONENTS] = brw_imm_ud(components);
7044 
7045    const unsigned nir_bit_size =
7046       is_store ? instr->src[data_src].ssa->bit_size : instr->def.bit_size;
7047    enum lsc_data_size data_size = lsc_bits_to_data_size(nir_bit_size);
7048    uint32_t data_bit_size = lsc_data_size_bytes(data_size) * 8;
7049 
7050    srcs[MEMORY_LOGICAL_DATA_SIZE] = brw_imm_ud(data_size);
7051 
7052    const brw_reg_type data_type =
7053       brw_type_with_size(BRW_TYPE_UD, data_bit_size);
7054    const brw_reg_type nir_data_type =
7055       brw_type_with_size(BRW_TYPE_UD, nir_bit_size);
7056    assert(data_bit_size >= nir_bit_size);
7057 
7058    if (!is_load) {
7059       for (unsigned i = 0; i < lsc_op_num_data_values(op); i++) {
7060          brw_reg nir_src =
7061             retype(get_nir_src(ntb, instr->src[data_src + i], -1), nir_data_type);
7062 
7063          if (data_bit_size > nir_bit_size) {
7064             /* Expand e.g. D16 to D16U32 */
7065             srcs[MEMORY_LOGICAL_DATA0 + i] = xbld.vgrf(data_type, components);
7066             for (unsigned c = 0; c < components; c++) {
7067                xbld.MOV(offset(srcs[MEMORY_LOGICAL_DATA0 + i], xbld, c),
7068                         offset(nir_src, xbld, c));
7069             }
7070          } else {
7071             srcs[MEMORY_LOGICAL_DATA0 + i] = nir_src;
7072          }
7073       }
7074    }
7075 
7076    brw_reg dest, nir_dest;
7077    if (!is_store) {
7078       nir_dest = retype(get_nir_def(ntb, instr->def), nir_data_type);
7079       dest = data_bit_size > nir_bit_size ? xbld.vgrf(data_type, components)
7080                                           : nir_dest;
7081    }
7082 
7083    enum opcode opcode = is_load ? SHADER_OPCODE_MEMORY_LOAD_LOGICAL :
7084                         is_store ? SHADER_OPCODE_MEMORY_STORE_LOGICAL :
7085                         SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL;
7086 
7087    const bool convergent_block_load =
7088       instr->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
7089       instr->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel ||
7090       instr->intrinsic == nir_intrinsic_load_shared_uniform_block_intel ||
7091       instr->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel;
7092    const bool block = convergent_block_load ||
7093       instr->intrinsic == nir_intrinsic_load_global_block_intel ||
7094       instr->intrinsic == nir_intrinsic_load_shared_block_intel ||
7095       instr->intrinsic == nir_intrinsic_load_ssbo_block_intel ||
7096       instr->intrinsic == nir_intrinsic_store_global_block_intel ||
7097       instr->intrinsic == nir_intrinsic_store_shared_block_intel ||
7098       instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
7099 
7100    fs_inst *inst;
7101 
7102    if (!block) {
7103       inst = xbld.emit(opcode, dest, srcs, MEMORY_LOGICAL_NUM_SRCS);
7104       inst->size_written *= components;
7105 
7106       if (dest.file != BAD_FILE && data_bit_size > nir_bit_size) {
7107          /* Shrink e.g. D16U32 result back to D16 */
7108          for (unsigned i = 0; i < components; i++) {
7109             xbld.MOV(offset(nir_dest, xbld, i),
7110                      subscript(offset(dest, xbld, i), nir_dest.type, 0));
7111          }
7112       }
7113    } else {
7114       assert(nir_bit_size == 32);
7115 
7116       srcs[MEMORY_LOGICAL_FLAGS] =
7117          brw_imm_ud(MEMORY_FLAG_TRANSPOSE | srcs[MEMORY_LOGICAL_FLAGS].ud);
7118       srcs[MEMORY_LOGICAL_ADDRESS] =
7119          bld.emit_uniformize(srcs[MEMORY_LOGICAL_ADDRESS]);
7120 
7121       const fs_builder ubld = bld.exec_all().group(1, 0);
7122       unsigned total, done;
7123       unsigned first_read_component = 0;
7124 
7125       if (convergent_block_load) {
7126          /* If the address is a constant and alignment permits, skip unread
7127           * leading and trailing components.  (It's probably not worth the
7128           * extra address math for non-constant addresses.)
7129           *
7130           * Note that SLM block loads on HDC platforms need to be 16B aligned.
7131           */
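              /* Illustrative example (constant address assumed): if only
               * components 2..5 of an 8-component 32-bit block load are
               * actually read, the address is advanced by 2 * 4 bytes and
               * only 4 components are fetched.
               */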
7132          if (srcs[MEMORY_LOGICAL_ADDRESS].file == IMM &&
7133              align >= data_bit_size / 8 &&
7134              (devinfo->has_lsc ||
7135               srcs[MEMORY_LOGICAL_MODE].ud != MEMORY_MODE_SHARED_LOCAL)) {
7136             first_read_component = nir_def_first_component_read(&instr->def);
7137             unsigned last_component = nir_def_last_component_read(&instr->def);
7138             srcs[MEMORY_LOGICAL_ADDRESS].u64 +=
7139                first_read_component * (data_bit_size / 8);
7140             components = last_component - first_read_component + 1;
7141          }
7142 
7143          total = ALIGN(components, REG_SIZE * reg_unit(devinfo) / 4);
7144          dest = ubld.vgrf(BRW_TYPE_UD, total);
7145       } else {
7146          total = components * bld.dispatch_width();
7147          dest = nir_dest;
7148       }
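           /* Sizing example (assuming 32-byte GRFs, i.e. reg_unit(devinfo)
            * == 1): a 5-component convergent load is padded to
            * total = ALIGN(5, 8) = 8 dwords, whereas a divergent block
            * access uses components * dispatch_width dwords.
            */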
7149 
7150       brw_reg src = srcs[MEMORY_LOGICAL_DATA0];
7151 
7152       unsigned block_comps = components;
7153 
7154       for (done = 0; done < total; done += block_comps) {
7155          block_comps = choose_block_size_dwords(devinfo, total - done);
7156          const unsigned block_bytes = block_comps * (nir_bit_size / 8);
7157 
7158          srcs[MEMORY_LOGICAL_COMPONENTS] = brw_imm_ud(block_comps);
7159 
7160          brw_reg dst_offset = is_store ? brw_reg() :
7161             retype(byte_offset(dest, done * 4), BRW_TYPE_UD);
7162          if (is_store) {
7163             srcs[MEMORY_LOGICAL_DATA0] =
7164                retype(byte_offset(src, done * 4), BRW_TYPE_UD);
7165          }
7166 
7167          inst = ubld.emit(opcode, dst_offset, srcs, MEMORY_LOGICAL_NUM_SRCS);
7168          inst->has_no_mask_send_params = no_mask_handle;
7169          if (is_load)
7170             inst->size_written = block_bytes;
7171 
7172          if (brw_type_size_bits(srcs[MEMORY_LOGICAL_ADDRESS].type) == 64) {
7173             increment_a64_address(ubld, srcs[MEMORY_LOGICAL_ADDRESS],
7174                                   block_bytes, no_mask_handle);
7175          } else {
7176             srcs[MEMORY_LOGICAL_ADDRESS] =
7177                ubld.ADD(retype(srcs[MEMORY_LOGICAL_ADDRESS], BRW_TYPE_UD),
7178                         brw_imm_ud(block_bytes));
7179          }
7180       }
7181       assert(done == total);
7182 
7183       if (convergent_block_load) {
7184          for (unsigned c = 0; c < components; c++) {
7185             xbld.MOV(retype(offset(nir_dest, xbld, first_read_component + c),
7186                             BRW_TYPE_UD),
7187                      component(dest, c));
7188          }
7189       }
7190    }
7191 }
7192 
7193 static void
7194 fs_nir_emit_texture(nir_to_brw_state &ntb,
7195                     nir_tex_instr *instr)
7196 {
7197    const intel_device_info *devinfo = ntb.devinfo;
7198    const fs_builder &bld = ntb.bld;
7199 
7200    brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
7201 
7202    /* SKL PRMs: Volume 7: 3D-Media-GPGPU:
7203     *
7204     *    "The Pixel Null Mask field, when enabled via the Pixel Null Mask
7205     *     Enable will be incorrect for sample_c when applied to a surface with
7206     *     64-bit per texel format such as R16G16BA16_UNORM. Pixel Null mask
7207     *     Enable may incorrectly report pixels as referencing a Null surface."
7208     *
7209     * We'll take care of this in NIR.
7210     */
7211    assert(!instr->is_sparse || srcs[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE);
7212 
7213    srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_ud(instr->is_sparse);
7214 
7215    int lod_components = 0;
7216 
7217    /* The hardware requires a LOD for buffer textures */
7218    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
7219       srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);
7220 
7221    ASSERTED bool got_lod = false;
7222    ASSERTED bool got_bias = false;
7223    bool pack_lod_bias_and_offset = false;
7224    uint32_t header_bits = 0;
7225    for (unsigned i = 0; i < instr->num_srcs; i++) {
7226       nir_src nir_src = instr->src[i].src;
7227       brw_reg src = get_nir_src(ntb, nir_src, -1);
7228 
7229       /* If the source is not a vector (e.g., a 1D texture coordinate), then
7230        * the eventual LOAD_PAYLOAD lowering will not properly adjust the
7231        * stride, etc., so do it now.
7232        */
7233       if (nir_tex_instr_src_size(instr, i) == 1)
7234          src = offset(src, bld, 0);
7235 
7236       switch (instr->src[i].src_type) {
7237       case nir_tex_src_bias:
7238          assert(!got_lod);
7239          got_bias = true;
7240 
7241          srcs[TEX_LOGICAL_SRC_LOD] =
7242             retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_TYPE_F);
7243          break;
7244       case nir_tex_src_comparator:
7245          srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_TYPE_F);
7246          break;
7247       case nir_tex_src_coord:
7248          switch (instr->op) {
7249          case nir_texop_txf:
7250          case nir_texop_txf_ms:
7251          case nir_texop_txf_ms_mcs_intel:
7252          case nir_texop_samples_identical:
7253             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_TYPE_D);
7254             break;
7255          default:
7256             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_TYPE_F);
7257             break;
7258          }
7259          break;
7260       case nir_tex_src_ddx:
7261          srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_TYPE_F);
7262          lod_components = nir_tex_instr_src_size(instr, i);
7263          break;
7264       case nir_tex_src_ddy:
7265          srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_TYPE_F);
7266          break;
7267       case nir_tex_src_lod:
7268          assert(!got_bias);
7269          got_lod = true;
7270 
7271          switch (instr->op) {
7272          case nir_texop_txs:
7273             srcs[TEX_LOGICAL_SRC_LOD] =
7274                retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_TYPE_UD);
7275             break;
7276          case nir_texop_txf:
7277             srcs[TEX_LOGICAL_SRC_LOD] =
7278                retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_TYPE_D);
7279             break;
7280          default:
7281             srcs[TEX_LOGICAL_SRC_LOD] =
7282                retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_TYPE_F);
7283             break;
7284          }
7285          break;
7286       case nir_tex_src_min_lod:
7287          srcs[TEX_LOGICAL_SRC_MIN_LOD] =
7288             retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_TYPE_F);
7289          break;
7290       case nir_tex_src_ms_index:
7291          srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_TYPE_UD);
7292          break;
7293 
7294       case nir_tex_src_offset: {
7295          uint32_t offset_bits = 0;
7296          if (brw_texture_offset(instr, i, &offset_bits)) {
7297             header_bits |= offset_bits;
7298          } else {
7299             /* On gfx12.5+, if the offsets are not both constant and in the
7300              * {-8,7} range, nir_lower_tex() will have already lowered the
7301              * source offset. So we should never reach this point.
7302              */
7303             assert(devinfo->verx10 < 125);
7304             srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
7305                retype(src, BRW_TYPE_D);
7306          }
7307          break;
7308       }
7309 
7310       case nir_tex_src_projector:
7311          unreachable("should be lowered");
7312 
7313       case nir_tex_src_texture_offset: {
7314          assert(srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE);
7315          /* Emit code to evaluate the actual indexing expression */
7316          srcs[TEX_LOGICAL_SRC_SURFACE] =
7317             bld.emit_uniformize(bld.ADD(retype(src, BRW_TYPE_UD),
7318                                         brw_imm_ud(instr->texture_index)));
7319          assert(srcs[TEX_LOGICAL_SRC_SURFACE].file != BAD_FILE);
7320          break;
7321       }
7322 
7323       case nir_tex_src_sampler_offset: {
7324          /* Emit code to evaluate the actual indexing expression */
7325          srcs[TEX_LOGICAL_SRC_SAMPLER] =
7326             bld.emit_uniformize(bld.ADD(retype(src, BRW_TYPE_UD),
7327                                         brw_imm_ud(instr->sampler_index)));
7328          break;
7329       }
7330 
7331       case nir_tex_src_texture_handle:
7332          assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1);
7333          srcs[TEX_LOGICAL_SRC_SURFACE] = brw_reg();
7334          srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src);
7335          break;
7336 
7337       case nir_tex_src_sampler_handle:
7338          assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
7339          srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_reg();
7340          srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src);
7341          break;
7342 
7343       case nir_tex_src_ms_mcs_intel:
7344          assert(instr->op == nir_texop_txf_ms);
7345          srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_TYPE_D);
7346          break;
7347 
7348       /* If this parameter is present, we are packing offset U, V and LOD/Bias
7349        * into a single (32-bit) value.
7350        */
7351       case nir_tex_src_backend2:
7352          assert(instr->op == nir_texop_tg4);
7353          pack_lod_bias_and_offset = true;
7354          srcs[TEX_LOGICAL_SRC_LOD] =
7355             retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_TYPE_F);
7356          break;
7357 
7358       /* If this parameter is present, we are packing either the explicit LOD
7359        * or LOD bias and the array index into a single (32-bit) value when
7360        * 32-bit texture coordinates are used.
7361        */
7362       case nir_tex_src_backend1:
7363          assert(!got_lod && !got_bias);
7364          got_lod = true;
7365          assert(instr->op == nir_texop_txl || instr->op == nir_texop_txb);
7366          srcs[TEX_LOGICAL_SRC_LOD] =
7367             retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_TYPE_F);
7368          break;
7369 
7370       default:
7371          unreachable("unknown texture source");
7372       }
7373    }
7374 
7375    /* If the surface or sampler were not specified through sources, use the
7376     * instruction index.
7377     */
7378    if (srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE &&
7379        srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE].file == BAD_FILE)
7380       srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(instr->texture_index);
7381    if (srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE &&
7382        srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE].file == BAD_FILE)
7383       srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(instr->sampler_index);
7384 
7385    if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
7386        (instr->op == nir_texop_txf_ms ||
7387         instr->op == nir_texop_samples_identical)) {
7388       srcs[TEX_LOGICAL_SRC_MCS] =
7389          emit_mcs_fetch(ntb, srcs[TEX_LOGICAL_SRC_COORDINATE],
7390                         instr->coord_components,
7391                         srcs[TEX_LOGICAL_SRC_SURFACE],
7392                         srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]);
7393    }
7394 
7395    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
7396    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);
7397 
7398    enum opcode opcode;
7399    switch (instr->op) {
7400    case nir_texop_tex:
7401       opcode = SHADER_OPCODE_TEX_LOGICAL;
7402       break;
7403    case nir_texop_txb:
7404       opcode = FS_OPCODE_TXB_LOGICAL;
7405       break;
7406    case nir_texop_txl:
7407       opcode = SHADER_OPCODE_TXL_LOGICAL;
7408       break;
7409    case nir_texop_txd:
7410       opcode = SHADER_OPCODE_TXD_LOGICAL;
7411       break;
7412    case nir_texop_txf:
7413       opcode = SHADER_OPCODE_TXF_LOGICAL;
7414       break;
7415    case nir_texop_txf_ms:
7416       /* On Gfx12HP there is only CMS_W available. From the Bspec: Shared
7417        * Functions - 3D Sampler - Messages - Message Format:
7418        *
7419        *   ld2dms REMOVEDBY(GEN:HAS:1406788836)
7420        */
7421       if (devinfo->verx10 >= 125)
7422          opcode = SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL;
7423       else
7424          opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
7425       break;
7426    case nir_texop_txf_ms_mcs_intel:
7427       opcode = SHADER_OPCODE_TXF_MCS_LOGICAL;
7428       break;
7429    case nir_texop_query_levels:
7430    case nir_texop_txs:
7431       opcode = SHADER_OPCODE_TXS_LOGICAL;
7432       break;
7433    case nir_texop_lod:
7434       opcode = SHADER_OPCODE_LOD_LOGICAL;
7435       break;
7436    case nir_texop_tg4: {
7437       if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE) {
7438          opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;
7439       } else {
7440          opcode = SHADER_OPCODE_TG4_LOGICAL;
7441          if (devinfo->ver >= 20) {
7442             /* If SPV_AMD_texture_gather_bias_lod extension is enabled, all
7443              * texture gather functions (i.e. the ones which do not take the
7444              * extra bias argument and the ones that do) fetch texels from the
7445              * implicit LOD in the fragment shader stage. In all other shader
7446              * stages, the base level is used instead.
7447              */
7448             if (instr->is_gather_implicit_lod)
7449                opcode = SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL;
7450 
7451             if (got_bias)
7452                opcode = SHADER_OPCODE_TG4_BIAS_LOGICAL;
7453 
7454             if (got_lod)
7455                opcode = SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL;
7456 
7457             if (pack_lod_bias_and_offset) {
7458                if (got_lod)
7459                   opcode = SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL;
7460                if (got_bias)
7461                   opcode = SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL;
7462             }
7463          }
7464       }
7465       break;
7466    }
7467    case nir_texop_texture_samples:
7468       opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;
7469       break;
7470    case nir_texop_samples_identical: {
7471       brw_reg dst = retype(get_nir_def(ntb, instr->def), BRW_TYPE_D);
7472 
7473       /* If mcs is an immediate value, it means there is no MCS.  In that case
7474        * just return false.
7475        */
7476       if (srcs[TEX_LOGICAL_SRC_MCS].file == IMM) {
7477          bld.MOV(dst, brw_imm_ud(0u));
7478       } else {
7479          brw_reg tmp =
7480             bld.OR(srcs[TEX_LOGICAL_SRC_MCS],
7481                    offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
7482          bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
7483       }
7484       return;
7485    }
7486    default:
7487       unreachable("unknown texture opcode");
7488    }
7489 
7490    if (instr->op == nir_texop_tg4) {
7491       header_bits |= instr->component << 16;
7492    }
7493 
7494    brw_reg nir_def_reg = get_nir_def(ntb, instr->def);
7495 
7496    const unsigned dest_size = nir_tex_instr_dest_size(instr);
7497    unsigned dest_comp;
7498    if (instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
7499       unsigned write_mask = nir_def_components_read(&instr->def);
7500       assert(write_mask != 0); /* dead code should have been eliminated */
7501 
7502       dest_comp = util_last_bit(write_mask) - instr->is_sparse;
7503    } else {
7504       dest_comp = 4;
7505    }
7506 
7507    /* Compute the number of physical registers needed to hold a single
7508     * component and round it up to a physical register count.
7509     */
7510    brw_reg_type dst_type = brw_type_for_nir_type(devinfo, instr->dest_type);
7511    const unsigned grf_size = reg_unit(devinfo) * REG_SIZE;
7512    const unsigned per_component_regs =
7513       DIV_ROUND_UP(brw_type_size_bytes(dst_type) * bld.dispatch_width(),
7514                    grf_size);
7515    const unsigned total_regs =
7516       dest_comp * per_component_regs + instr->is_sparse;
7517    /* Allocate enough space for the components + one physical register for the
7518     * residency data.
7519     */
7520    brw_reg dst = brw_vgrf(
7521       bld.shader->alloc.allocate(total_regs * reg_unit(devinfo)),
7522       dst_type);
7523 
7524    fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
7525    inst->offset = header_bits;
7526    inst->size_written = total_regs * grf_size;
7527 
7528    if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
7529       inst->shadow_compare = true;
7530 
7531    /* Wa_14012688258:
7532     *
7533     * Don't trim zeros at the end of payload for sample operations
7534     * in cube and cube arrays.
7535     */
7536    if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
7537        intel_needs_workaround(devinfo, 14012688258)) {
7538 
7539       /* Compiler should send U,V,R parameters even if V,R are 0. */
7540       if (srcs[TEX_LOGICAL_SRC_COORDINATE].file != BAD_FILE)
7541          assert(instr->coord_components >= 3u);
7542 
7543       /* See opt_zero_samples(). */
7544       inst->keep_payload_trailing_zeros = true;
7545    }
7546 
7547    /* With half-float returns, the stride into a GRF allocation for each
7548     * component might be different from where the sampler is storing each
7549     * component. For example, in SIMD8 on DG2 the layout of the data returned
7550     * by the sampler is as follows for a 2-component load:
7551     *
7552     *           _______________________________________________________________
7553     *   g0 : |           unused              |hf7|hf6|hf5|hf4|hf3|hf2|hf1|hf0|
7554     *   g1 : |           unused              |hf7|hf6|hf5|hf4|hf3|hf2|hf1|hf0|
7555     *
7556     * The same issue also happens in SIMD16 on Xe2 because the physical
7557     * register size has doubled but we're still loading data only on half the
7558     * register.
7559     *
7560     * In those cases we need the special remapping case below.
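         *
         * For instance, SIMD8 half-float on a 32-byte GRF: 2 bytes * 8 lanes
         * = 16 bytes per component, which is not a multiple of the GRF size,
         * so non_aligned_component_stride below is true and the LOAD_PAYLOAD
         * remapping path is taken.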
7561     */
7562    const bool non_aligned_component_stride =
7563       (brw_type_size_bytes(dst_type) * bld.dispatch_width()) % grf_size != 0;
7564    if (instr->op != nir_texop_query_levels && !instr->is_sparse &&
7565        !non_aligned_component_stride) {
7566       /* In most cases we can write directly to the result. */
7567       inst->dst = nir_def_reg;
7568    } else {
7569       /* In other cases, we have to reorganize the sampler message's results
7570        * a bit to match the NIR intrinsic's expectations.
7571        */
7572       brw_reg nir_dest[5];
7573       for (unsigned i = 0; i < dest_comp; i++)
7574          nir_dest[i] = byte_offset(dst, i * per_component_regs * grf_size);
7575 
7576       for (unsigned i = dest_comp; i < dest_size; i++)
7577          nir_dest[i].type = dst.type;
7578 
7579       if (instr->op == nir_texop_query_levels) {
7580          /* # levels is in .w */
7581          if (devinfo->ver == 9) {
7582             /**
7583              * Wa_1940217:
7584              *
7585              * When a surface of type SURFTYPE_NULL is accessed by resinfo, the
7586              * MIPCount returned is undefined instead of 0.
7587              */
7588             fs_inst *mov = bld.MOV(bld.null_reg_d(), dst);
7589             mov->conditional_mod = BRW_CONDITIONAL_NZ;
7590             nir_dest[0] = bld.vgrf(BRW_TYPE_D);
7591             fs_inst *sel =
7592                bld.SEL(nir_dest[0], offset(dst, bld, 3), brw_imm_d(0));
7593             sel->predicate = BRW_PREDICATE_NORMAL;
7594          } else {
7595             nir_dest[0] = offset(dst, bld, 3);
7596          }
7597       }
7598 
7599       /* The residency bits are only in the first component. */
7600       if (instr->is_sparse) {
7601          nir_dest[dest_size - 1] =
7602             component(offset(dst, bld, dest_size - 1), 0);
7603       }
7604 
7605       bld.LOAD_PAYLOAD(nir_def_reg, nir_dest, dest_size, 0);
7606    }
7607 }
7608 
7609 static void
7610 fs_nir_emit_jump(nir_to_brw_state &ntb, nir_jump_instr *instr)
7611 {
7612    switch (instr->type) {
7613    case nir_jump_break:
7614       ntb.bld.emit(BRW_OPCODE_BREAK);
7615       break;
7616    case nir_jump_continue:
7617       ntb.bld.emit(BRW_OPCODE_CONTINUE);
7618       break;
7619    case nir_jump_halt:
7620       ntb.bld.emit(BRW_OPCODE_HALT);
7621       break;
7622    case nir_jump_return:
7623    default:
7624       unreachable("unknown jump");
7625    }
7626 }
7627 
7628 /*
7629  * This helper takes a source register and un/shuffles it into the destination
7630  * register.
7631  *
7632  * If the source type size is smaller than the destination type size, the
7633  * operation needed is a component shuffle. The opposite case is an
7634  * unshuffle. If the source and destination type sizes are equal, the
7635  * shuffle is equivalent to a simple MOV.
7636  *
7637  * For example, if the source is a 16-bit type and the destination is
7638  * 32-bit, a 3-component .xyz 16-bit vector in SIMD8 would be:
7639  *
7640  *    |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
7641  *    |z1|z2|z3|z4|z5|z6|z7|z8|  |  |  |  |  |  |  |  |
7642  *
7643  * This helper will return the following 2 32-bit components with the 16-bit
7644  * values shuffled:
7645  *
7646  *    |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
7647  *    |z1   |z2   |z3   |z4   |z5   |z6   |z7   |z8   |
7648  *
7649  * For unshuffle, the example is the opposite: a 64-bit source type
7650  * and a 32-bit destination. A 2-component .xy 64-bit vector in SIMD8
7651  * would be:
7652  *
7653  *    | x1l   x1h | x2l   x2h | x3l   x3h | x4l   x4h |
7654  *    | x5l   x5h | x6l   x6h | x7l   x7h | x8l   x8h |
7655  *    | y1l   y1h | y2l   y2h | y3l   y3h | y4l   y4h |
7656  *    | y5l   y5h | y6l   y6h | y7l   y7h | y8l   y8h |
7657  *
7658  * The returned result would be the following 4 32-bit components unshuffled:
7659  *
7660  *    | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
7661  *    | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
7662  *    | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
7663  *    | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
7664  *
7665  * - Source and destination registers must not overlap.
7666  * - component units are measured in terms of the smaller type between
7667  *   source and destination because we are un/shuffling the smaller
7668  *   components from/into the bigger ones.
7669  * - first_component parameter allows skipping source components.
7670  */
7671 void
7672 shuffle_src_to_dst(const fs_builder &bld,
7673                    const brw_reg &dst,
7674                    const brw_reg &src,
7675                    uint32_t first_component,
7676                    uint32_t components)
7677 {
7678    if (brw_type_size_bytes(src.type) == brw_type_size_bytes(dst.type)) {
7679       assert(!regions_overlap(dst,
7680          brw_type_size_bytes(dst.type) * bld.dispatch_width() * components,
7681          offset(src, bld, first_component),
7682          brw_type_size_bytes(src.type) * bld.dispatch_width() * components));
7683       for (unsigned i = 0; i < components; i++) {
7684          bld.MOV(retype(offset(dst, bld, i), src.type),
7685                  offset(src, bld, i + first_component));
7686       }
7687    } else if (brw_type_size_bytes(src.type) < brw_type_size_bytes(dst.type)) {
7688       /* Source is shuffled into destination */
7689       unsigned size_ratio = brw_type_size_bytes(dst.type) / brw_type_size_bytes(src.type);
7690       assert(!regions_overlap(dst,
7691          brw_type_size_bytes(dst.type) * bld.dispatch_width() *
7692          DIV_ROUND_UP(components, size_ratio),
7693          offset(src, bld, first_component),
7694          brw_type_size_bytes(src.type) * bld.dispatch_width() * components));
7695 
7696       brw_reg_type shuffle_type =
7697          brw_type_with_size(BRW_TYPE_D, brw_type_size_bits(src.type));
7698       for (unsigned i = 0; i < components; i++) {
7699          brw_reg shuffle_component_i =
7700             subscript(offset(dst, bld, i / size_ratio),
7701                       shuffle_type, i % size_ratio);
7702          bld.MOV(shuffle_component_i,
7703                  retype(offset(src, bld, i + first_component), shuffle_type));
7704       }
7705    } else {
7706       /* Source is unshuffled into destination */
7707       unsigned size_ratio = brw_type_size_bytes(src.type) / brw_type_size_bytes(dst.type);
7708       assert(!regions_overlap(dst,
7709          brw_type_size_bytes(dst.type) * bld.dispatch_width() * components,
7710          offset(src, bld, first_component / size_ratio),
7711          brw_type_size_bytes(src.type) * bld.dispatch_width() *
7712          DIV_ROUND_UP(components + (first_component % size_ratio),
7713                       size_ratio)));
7714 
7715       brw_reg_type shuffle_type =
7716          brw_type_with_size(BRW_TYPE_D, brw_type_size_bits(dst.type));
7717       for (unsigned i = 0; i < components; i++) {
7718          brw_reg shuffle_component_i =
7719             subscript(offset(src, bld, (first_component + i) / size_ratio),
7720                       shuffle_type, (first_component + i) % size_ratio);
7721          bld.MOV(retype(offset(dst, bld, i), shuffle_type),
7722                  shuffle_component_i);
7723       }
7724    }
7725 }
7726 
7727 void
7728 shuffle_from_32bit_read(const fs_builder &bld,
7729                         const brw_reg &dst,
7730                         const brw_reg &src,
7731                         uint32_t first_component,
7732                         uint32_t components)
7733 {
7734    assert(brw_type_size_bytes(src.type) == 4);
7735 
7736    /* This function takes components in units of the destination type while
7737     * shuffle_src_to_dst takes components in units of the smallest type.
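         * For example, with a 64-bit destination the smallest type is the
         * 32-bit source, so first_component and components are doubled below
         * before calling shuffle_src_to_dst.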
7738     */
7739    if (brw_type_size_bytes(dst.type) > 4) {
7740       assert(brw_type_size_bytes(dst.type) == 8);
7741       first_component *= 2;
7742       components *= 2;
7743    }
7744 
7745    shuffle_src_to_dst(bld, dst, src, first_component, components);
7746 }
7747 
7748 static void
7749 fs_nir_emit_instr(nir_to_brw_state &ntb, nir_instr *instr)
7750 {
7751 #ifndef NDEBUG
7752    if (unlikely(ntb.annotate)) {
7753       /* Use shader mem_ctx since annotations outlive the NIR conversion. */
7754       ntb.bld = ntb.bld.annotate(nir_instr_as_str(instr, ntb.s.mem_ctx));
7755    }
7756 #endif
7757 
7758    switch (instr->type) {
7759    case nir_instr_type_alu:
7760       fs_nir_emit_alu(ntb, nir_instr_as_alu(instr), true);
7761       break;
7762 
7763    case nir_instr_type_deref:
7764       unreachable("All derefs should've been lowered");
7765       break;
7766 
7767    case nir_instr_type_intrinsic:
7768       switch (ntb.s.stage) {
7769       case MESA_SHADER_VERTEX:
7770          fs_nir_emit_vs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7771          break;
7772       case MESA_SHADER_TESS_CTRL:
7773          fs_nir_emit_tcs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7774          break;
7775       case MESA_SHADER_TESS_EVAL:
7776          fs_nir_emit_tes_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7777          break;
7778       case MESA_SHADER_GEOMETRY:
7779          fs_nir_emit_gs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7780          break;
7781       case MESA_SHADER_FRAGMENT:
7782          fs_nir_emit_fs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7783          break;
7784       case MESA_SHADER_COMPUTE:
7785       case MESA_SHADER_KERNEL:
7786          fs_nir_emit_cs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7787          break;
7788       case MESA_SHADER_RAYGEN:
7789       case MESA_SHADER_ANY_HIT:
7790       case MESA_SHADER_CLOSEST_HIT:
7791       case MESA_SHADER_MISS:
7792       case MESA_SHADER_INTERSECTION:
7793       case MESA_SHADER_CALLABLE:
7794          fs_nir_emit_bs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7795          break;
7796       case MESA_SHADER_TASK:
7797          fs_nir_emit_task_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7798          break;
7799       case MESA_SHADER_MESH:
7800          fs_nir_emit_mesh_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7801          break;
7802       default:
7803          unreachable("unsupported shader stage");
7804       }
7805       break;
7806 
7807    case nir_instr_type_tex:
7808       fs_nir_emit_texture(ntb, nir_instr_as_tex(instr));
7809       break;
7810 
7811    case nir_instr_type_load_const:
7812       fs_nir_emit_load_const(ntb, nir_instr_as_load_const(instr));
7813       break;
7814 
7815    case nir_instr_type_undef:
7816       /* We create a new VGRF for undefs on every use (by handling
7817        * them in get_nir_src()), rather than for each definition.
7818        * This helps register coalescing eliminate MOVs from undef.
7819        */
7820       break;
7821 
7822    case nir_instr_type_jump:
7823       fs_nir_emit_jump(ntb, nir_instr_as_jump(instr));
7824       break;
7825 
7826    default:
7827       unreachable("unknown instruction type");
7828    }
7829 }
7830 
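     /* Translate a NIR float_controls execution mode into a cr0
      * rounding/denorm configuration and the mask of cr0 bits it touches.
      * For example, FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 alone yields
      * mode = BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT and
      * mask = BRW_CR0_RND_MODE_MASK.
      */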
7831 static unsigned
7832 brw_rnd_mode_from_nir(unsigned mode, unsigned *mask)
7833 {
7834    unsigned brw_mode = 0;
7835    *mask = 0;
7836 
7837    if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
7838         FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
7839         FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
7840        mode) {
7841       brw_mode |= BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT;
7842       *mask |= BRW_CR0_RND_MODE_MASK;
7843    }
7844    if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
7845         FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
7846         FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
7847        mode) {
7848       brw_mode |= BRW_RND_MODE_RTNE << BRW_CR0_RND_MODE_SHIFT;
7849       *mask |= BRW_CR0_RND_MODE_MASK;
7850    }
7851    if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) {
7852       brw_mode |= BRW_CR0_FP16_DENORM_PRESERVE;
7853       *mask |= BRW_CR0_FP16_DENORM_PRESERVE;
7854    }
7855    if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) {
7856       brw_mode |= BRW_CR0_FP32_DENORM_PRESERVE;
7857       *mask |= BRW_CR0_FP32_DENORM_PRESERVE;
7858    }
7859    if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) {
7860       brw_mode |= BRW_CR0_FP64_DENORM_PRESERVE;
7861       *mask |= BRW_CR0_FP64_DENORM_PRESERVE;
7862    }
7863    if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16)
7864       *mask |= BRW_CR0_FP16_DENORM_PRESERVE;
7865    if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32)
7866       *mask |= BRW_CR0_FP32_DENORM_PRESERVE;
7867    if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64)
7868       *mask |= BRW_CR0_FP64_DENORM_PRESERVE;
7869    if (mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE)
7870       *mask |= BRW_CR0_FP_MODE_MASK;
7871 
7872    if (*mask != 0)
7873       assert((*mask & brw_mode) == brw_mode);
7874 
7875    return brw_mode;
7876 }
7877 
7878 static void
7879 emit_shader_float_controls_execution_mode(nir_to_brw_state &ntb)
7880 {
7881    const fs_builder &bld = ntb.bld;
7882    fs_visitor &s = ntb.s;
7883 
7884    unsigned execution_mode = s.nir->info.float_controls_execution_mode;
7885    if (execution_mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE)
7886       return;
7887 
7888    fs_builder ubld = bld.exec_all().group(1, 0);
7889    fs_builder abld = ubld.annotate("shader floats control execution mode");
7890    unsigned mask, mode = brw_rnd_mode_from_nir(execution_mode, &mask);
7891 
7892    if (mask == 0)
7893       return;
7894 
7895    abld.emit(SHADER_OPCODE_FLOAT_CONTROL_MODE, bld.null_reg_ud(),
7896              brw_imm_d(mode), brw_imm_d(mask));
7897 }
7898 
7899 /**
7900  * Test the dispatch mask packing assumptions of
7901  * brw_stage_has_packed_dispatch().  Call this from e.g. the top of
7902  * nir_to_brw() to cause a GPU hang if any shader invocation is
7903  * executed with an unexpected dispatch mask.
7904  */
7905 static UNUSED void
7906 brw_fs_test_dispatch_packing(const fs_builder &bld)
7907 {
7908    const fs_visitor *shader = bld.shader;
7909    const gl_shader_stage stage = shader->stage;
7910    const bool uses_vmask =
7911       stage == MESA_SHADER_FRAGMENT &&
7912       brw_wm_prog_data(shader->prog_data)->uses_vmask;
7913 
7914    if (brw_stage_has_packed_dispatch(shader->devinfo, stage,
7915                                      shader->max_polygons,
7916                                      shader->prog_data)) {
7917       const fs_builder ubld = bld.exec_all().group(1, 0);
7918       const brw_reg tmp = component(bld.vgrf(BRW_TYPE_UD), 0);
7919       const brw_reg mask = uses_vmask ? brw_vmask_reg() : brw_dmask_reg();
7920 
7921       ubld.ADD(tmp, mask, brw_imm_ud(1));
7922       ubld.AND(tmp, mask, tmp);
7923 
7924       /* This will loop forever if the dispatch mask doesn't have the expected
7925        * form '2^n-1', in which case tmp will be non-zero.
7926        */
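           /* Worked example: a packed mask such as 0b0111 gives
            * (0b0111 + 1) & 0b0111 = 0 and the loop exits immediately; a
            * sparse mask such as 0b0101 gives 0b0110 & 0b0101 = 0b0100 != 0
            * and the loop spins forever, flagging the broken packing
            * assumption.
            */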
7927       bld.emit(BRW_OPCODE_DO);
7928       bld.CMP(bld.null_reg_ud(), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
7929       set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE));
7930    }
7931 }
7932 
7933 void
7934 nir_to_brw(fs_visitor *s)
7935 {
7936    nir_to_brw_state ntb = {
7937       .s       = *s,
7938       .nir     = s->nir,
7939       .devinfo = s->devinfo,
7940       .mem_ctx = ralloc_context(NULL),
7941       .bld     = fs_builder(s).at_end(),
7942    };
7943 
7944    if (INTEL_DEBUG(DEBUG_ANNOTATION))
7945       ntb.annotate = true;
7946 
7947    if (ENABLE_FS_TEST_DISPATCH_PACKING)
7948       brw_fs_test_dispatch_packing(ntb.bld);
7949 
7950    for (unsigned i = 0; i < s->nir->printf_info_count; i++) {
7951       brw_stage_prog_data_add_printf(s->prog_data,
7952                                      s->mem_ctx,
7953                                      &s->nir->printf_info[i]);
7954    }
7955 
7956    emit_shader_float_controls_execution_mode(ntb);
7957 
7958    /* emit the arrays used for inputs and outputs - load/store intrinsics will
7959     * be converted to reads/writes of these arrays
7960     */
7961    fs_nir_setup_outputs(ntb);
7962    fs_nir_setup_uniforms(ntb.s);
7963    fs_nir_emit_system_values(ntb);
7964    ntb.s.last_scratch = ALIGN(ntb.nir->scratch_size, 4) * ntb.s.dispatch_width;
7965 
7966    fs_nir_emit_impl(ntb, nir_shader_get_entrypoint((nir_shader *)ntb.nir));
7967 
7968    ntb.bld.emit(SHADER_OPCODE_HALT_TARGET);
7969 
7970    ralloc_free(ntb.mem_ctx);
7971 
7972    brw_shader_phase_update(*s, BRW_SHADER_PHASE_AFTER_NIR);
7973 }
7974