1 /*
2  * Copyright © 2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "compiler/glsl/ir.h"
25 #include "brw_fs.h"
26 #include "brw_nir.h"
27 #include "brw_rt.h"
28 #include "brw_eu.h"
29 #include "nir_search_helpers.h"
30 #include "util/u_math.h"
31 #include "util/bitscan.h"
32 
33 using namespace brw;
34 
35 void
36 fs_visitor::emit_nir_code()
37 {
38    emit_shader_float_controls_execution_mode();
39 
40    /* emit the arrays used for inputs and outputs - load/store intrinsics will
41     * be converted to reads/writes of these arrays
42     */
43    nir_setup_outputs();
44    nir_setup_uniforms();
45    nir_emit_system_values();
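   /* Scratch is allocated per channel: pad the per-channel byte count to a
    * multiple of 4 and scale it by the SIMD width.
    */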
46    last_scratch = ALIGN(nir->scratch_size, 4) * dispatch_width;
47 
48    nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir));
49 
50    bld.emit(SHADER_OPCODE_HALT_TARGET);
51 }
52 
53 void
54 fs_visitor::nir_setup_outputs()
55 {
56    if (stage == MESA_SHADER_TESS_CTRL ||
57        stage == MESA_SHADER_TASK ||
58        stage == MESA_SHADER_MESH ||
59        stage == MESA_SHADER_FRAGMENT)
60       return;
61 
62    unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
63 
64    /* Calculate the size of output registers in a separate pass, before
65     * allocating them.  With ARB_enhanced_layouts, multiple output variables
66     * may occupy the same slot, but have different type sizes.
67     */
68    nir_foreach_shader_out_variable(var, nir) {
69       const int loc = var->data.driver_location;
70       const unsigned var_vec4s =
71          var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4)
72                            : type_size_vec4(var->type, true);
73       vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
74    }
75 
76    for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
77       if (vec4s[loc] == 0) {
78          loc++;
79          continue;
80       }
81 
82       unsigned reg_size = vec4s[loc];
83 
84       /* Check if there are any ranges that start within this range and extend
85        * past it. If so, include them in this allocation.
86        */
87       for (unsigned i = 1; i < reg_size; i++) {
88          assert(i + loc < ARRAY_SIZE(vec4s));
89          reg_size = MAX2(vec4s[i + loc] + i, reg_size);
90       }
91 
92       fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * reg_size);
93       for (unsigned i = 0; i < reg_size; i++) {
94          assert(loc + i < ARRAY_SIZE(outputs));
95          outputs[loc + i] = offset(reg, bld, 4 * i);
96       }
97 
98       loc += reg_size;
99    }
100 }
101 
102 void
103 fs_visitor::nir_setup_uniforms()
104 {
105    /* Only the first compile gets to set up uniforms. */
106    if (push_constant_loc)
107       return;
108 
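   /* nir->num_uniforms is in bytes at this point, while the backend counts
    * uniforms in 32-bit units, hence the division by 4.
    */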
109    uniforms = nir->num_uniforms / 4;
110 
111    if (gl_shader_stage_is_compute(stage) && devinfo->verx10 < 125) {
112       /* Add uniforms for builtins after regular NIR uniforms. */
113       assert(uniforms == prog_data->nr_params);
114 
115       uint32_t *param;
116       if (nir->info.workgroup_size_variable &&
117           compiler->lower_variable_group_size) {
118          param = brw_stage_prog_data_add_params(prog_data, 3);
119          for (unsigned i = 0; i < 3; i++) {
120             param[i] = (BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i);
121             group_size[i] = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
122          }
123       }
124 
125       /* Subgroup ID must be the last uniform on the list.  This makes it
126        * easier later to split between cross-thread and per-thread
127        * uniforms.
128        */
129       param = brw_stage_prog_data_add_params(prog_data, 1);
130       *param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
131       subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
132    }
133 }
134 
135 static bool
136 emit_system_values_block(nir_block *block, fs_visitor *v)
137 {
138    fs_reg *reg;
139 
140    nir_foreach_instr(instr, block) {
141       if (instr->type != nir_instr_type_intrinsic)
142          continue;
143 
144       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
145       switch (intrin->intrinsic) {
146       case nir_intrinsic_load_vertex_id:
147       case nir_intrinsic_load_base_vertex:
148          unreachable("should be lowered by nir_lower_system_values().");
149 
150       case nir_intrinsic_load_vertex_id_zero_base:
151       case nir_intrinsic_load_is_indexed_draw:
152       case nir_intrinsic_load_first_vertex:
153       case nir_intrinsic_load_instance_id:
154       case nir_intrinsic_load_base_instance:
155          unreachable("should be lowered by brw_nir_lower_vs_inputs().");
156          break;
157 
158       case nir_intrinsic_load_draw_id:
159          /* For Task/Mesh, draw_id will be handled later in
160           * nir_emit_mesh_task_intrinsic().
161           */
162          if (!gl_shader_stage_is_mesh(v->stage))
163             unreachable("should be lowered by brw_nir_lower_vs_inputs().");
164          break;
165 
166       case nir_intrinsic_load_invocation_id:
167          if (v->stage == MESA_SHADER_TESS_CTRL)
168             break;
169          assert(v->stage == MESA_SHADER_GEOMETRY);
170          reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
171          if (reg->file == BAD_FILE) {
172             const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
173             fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
174             fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
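            /* gl_InvocationID lives in bits 31:27 of the g1 payload dwords;
             * shifting right by 27 leaves just the 5-bit ID value.
             */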
175             abld.SHR(iid, g1, brw_imm_ud(27u));
176             *reg = iid;
177          }
178          break;
179 
180       case nir_intrinsic_load_sample_pos:
181       case nir_intrinsic_load_sample_pos_or_center:
182          assert(v->stage == MESA_SHADER_FRAGMENT);
183          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
184          if (reg->file == BAD_FILE)
185             *reg = v->emit_samplepos_setup();
186          break;
187 
188       case nir_intrinsic_load_sample_id:
189          assert(v->stage == MESA_SHADER_FRAGMENT);
190          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
191          if (reg->file == BAD_FILE)
192             *reg = v->emit_sampleid_setup();
193          break;
194 
195       case nir_intrinsic_load_sample_mask_in:
196          assert(v->stage == MESA_SHADER_FRAGMENT);
197          assert(v->devinfo->ver >= 7);
198          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
199          if (reg->file == BAD_FILE)
200             *reg = v->emit_samplemaskin_setup();
201          break;
202 
203       case nir_intrinsic_load_workgroup_id:
204          assert(gl_shader_stage_uses_workgroup(v->stage));
205          reg = &v->nir_system_values[SYSTEM_VALUE_WORKGROUP_ID];
206          if (reg->file == BAD_FILE)
207             *reg = v->emit_work_group_id_setup();
208          break;
209 
210       case nir_intrinsic_load_helper_invocation:
211          assert(v->stage == MESA_SHADER_FRAGMENT);
212          reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION];
213          if (reg->file == BAD_FILE) {
214             const fs_builder abld =
215                v->bld.annotate("gl_HelperInvocation", NULL);
216 
217             /* On Gfx6+ (gl_HelperInvocation is only exposed on Gfx7+) the
218              * pixel mask is in g1.7 of the thread payload.
219              *
220              * We move the per-channel pixel enable bit to the low bit of each
221              * channel by shifting the byte containing the pixel mask by the
222              * vector immediate 0x76543210UV.
223              *
224              * The region of <1,8,0> reads only 1 byte (the pixel masks for
225              * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
226              * masks for 2 and 3) in SIMD16.
227              */
228             fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);
229 
230             for (unsigned i = 0; i < DIV_ROUND_UP(v->dispatch_width, 16); i++) {
231                const fs_builder hbld = abld.group(MIN2(16, v->dispatch_width), i);
232                hbld.SHR(offset(shifted, hbld, i),
233                         stride(retype(brw_vec1_grf(1 + i, 7),
234                                       BRW_REGISTER_TYPE_UB),
235                                1, 8, 0),
236                         brw_imm_v(0x76543210));
237             }
238 
239             /* A set bit in the pixel mask means the channel is enabled, but
240              * that is the opposite of gl_HelperInvocation so we need to invert
241              * the mask.
242              *
243              * The negate source-modifier bit of logical instructions on Gfx8+
244              * performs 1's complement negation, so we can use that instead of
245              * a NOT instruction.
246              */
247             fs_reg inverted = negate(shifted);
248             if (v->devinfo->ver < 8) {
249                inverted = abld.vgrf(BRW_REGISTER_TYPE_UW);
250                abld.NOT(inverted, shifted);
251             }
252 
253             /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
254              * with 1 and negating.
255              */
256             fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
257             abld.AND(anded, inverted, brw_imm_uw(1));
258 
259             fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
260             abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
261             *reg = dst;
262          }
263          break;
264 
265       case nir_intrinsic_load_frag_shading_rate:
266          reg = &v->nir_system_values[SYSTEM_VALUE_FRAG_SHADING_RATE];
267          if (reg->file == BAD_FILE)
268             *reg = v->emit_shading_rate_setup();
269          break;
270 
271       default:
272          break;
273       }
274    }
275 
276    return true;
277 }
278 
279 void
280 fs_visitor::nir_emit_system_values()
281 {
282    nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
283    for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
284       nir_system_values[i] = fs_reg();
285    }
286 
287    /* Always emit SUBGROUP_INVOCATION.  Dead code will clean it up if we
288     * never end up using it.
289     */
290    {
291       const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
292       fs_reg &reg = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
293       reg = abld.vgrf(BRW_REGISTER_TYPE_UW);
294 
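      /* brw_imm_v(0x76543210) is a packed vector immediate that gives each of
       * the first eight UW channels its own index (0..7).  The ADDs below
       * extend that to 8..15 and 16..31 for SIMD16 and SIMD32 dispatches.
       */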
295       const fs_builder allbld8 = abld.group(8, 0).exec_all();
296       allbld8.MOV(reg, brw_imm_v(0x76543210));
297       if (dispatch_width > 8)
298          allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u));
299       if (dispatch_width > 16) {
300          const fs_builder allbld16 = abld.group(16, 0).exec_all();
301          allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u));
302       }
303    }
304 
305    nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)nir);
306    nir_foreach_block(block, impl)
307       emit_system_values_block(block, this);
308 }
309 
310 void
311 fs_visitor::nir_emit_impl(nir_function_impl *impl)
312 {
313    nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc);
314    for (unsigned i = 0; i < impl->reg_alloc; i++) {
315       nir_locals[i] = fs_reg();
316    }
317 
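   /* Each NIR register becomes a VGRF big enough to hold every array element
    * and component at the register's bit size.
    */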
318    foreach_list_typed(nir_register, reg, node, &impl->registers) {
319       unsigned array_elems =
320          reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
321       unsigned size = array_elems * reg->num_components;
322       const brw_reg_type reg_type = reg->bit_size == 8 ? BRW_REGISTER_TYPE_B :
323          brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F);
324       nir_locals[reg->index] = bld.vgrf(reg_type, size);
325    }
326 
327    nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
328                              impl->ssa_alloc);
329 
330    nir_emit_cf_list(&impl->body);
331 }
332 
333 void
334 fs_visitor::nir_emit_cf_list(exec_list *list)
335 {
336    exec_list_validate(list);
337    foreach_list_typed(nir_cf_node, node, node, list) {
338       switch (node->type) {
339       case nir_cf_node_if:
340          nir_emit_if(nir_cf_node_as_if(node));
341          break;
342 
343       case nir_cf_node_loop:
344          nir_emit_loop(nir_cf_node_as_loop(node));
345          break;
346 
347       case nir_cf_node_block:
348          nir_emit_block(nir_cf_node_as_block(node));
349          break;
350 
351       default:
352          unreachable("Invalid CFG node block");
353       }
354    }
355 }
356 
357 void
358 fs_visitor::nir_emit_if(nir_if *if_stmt)
359 {
360    bool invert;
361    fs_reg cond_reg;
362 
363    /* If the condition has the form !other_condition, use other_condition as
364     * the source, but invert the predicate on the if instruction.
365     */
366    nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition);
367    if (cond != NULL && cond->op == nir_op_inot) {
368       invert = true;
369       cond_reg = get_nir_src(cond->src[0].src);
370       cond_reg = offset(cond_reg, bld, cond->src[0].swizzle[0]);
371    } else {
372       invert = false;
373       cond_reg = get_nir_src(if_stmt->condition);
374    }
375 
376    /* first, put the condition into f0 */
377    fs_inst *inst = bld.MOV(bld.null_reg_d(),
378                            retype(cond_reg, BRW_REGISTER_TYPE_D));
379    inst->conditional_mod = BRW_CONDITIONAL_NZ;
380 
381    bld.IF(BRW_PREDICATE_NORMAL)->predicate_inverse = invert;
382 
383    nir_emit_cf_list(&if_stmt->then_list);
384 
385    if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
386       bld.emit(BRW_OPCODE_ELSE);
387       nir_emit_cf_list(&if_stmt->else_list);
388    }
389 
390    bld.emit(BRW_OPCODE_ENDIF);
391 
392    if (devinfo->ver < 7)
393       limit_dispatch_width(16, "Non-uniform control flow unsupported "
394                            "in SIMD32 mode.");
395 }
396 
397 void
398 fs_visitor::nir_emit_loop(nir_loop *loop)
399 {
400    bld.emit(BRW_OPCODE_DO);
401 
402    nir_emit_cf_list(&loop->body);
403 
404    bld.emit(BRW_OPCODE_WHILE);
405 
406    if (devinfo->ver < 7)
407       limit_dispatch_width(16, "Non-uniform control flow unsupported "
408                            "in SIMD32 mode.");
409 }
410 
411 void
412 fs_visitor::nir_emit_block(nir_block *block)
413 {
414    nir_foreach_instr(instr, block) {
415       nir_emit_instr(instr);
416    }
417 }
418 
419 void
420 fs_visitor::nir_emit_instr(nir_instr *instr)
421 {
422    const fs_builder abld = bld.annotate(NULL, instr);
423 
424    switch (instr->type) {
425    case nir_instr_type_alu:
426       nir_emit_alu(abld, nir_instr_as_alu(instr), true);
427       break;
428 
429    case nir_instr_type_deref:
430       unreachable("All derefs should've been lowered");
431       break;
432 
433    case nir_instr_type_intrinsic:
434       switch (stage) {
435       case MESA_SHADER_VERTEX:
436          nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
437          break;
438       case MESA_SHADER_TESS_CTRL:
439          nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
440          break;
441       case MESA_SHADER_TESS_EVAL:
442          nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
443          break;
444       case MESA_SHADER_GEOMETRY:
445          nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
446          break;
447       case MESA_SHADER_FRAGMENT:
448          nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr));
449          break;
450       case MESA_SHADER_COMPUTE:
451       case MESA_SHADER_KERNEL:
452          nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr));
453          break;
454       case MESA_SHADER_RAYGEN:
455       case MESA_SHADER_ANY_HIT:
456       case MESA_SHADER_CLOSEST_HIT:
457       case MESA_SHADER_MISS:
458       case MESA_SHADER_INTERSECTION:
459       case MESA_SHADER_CALLABLE:
460          nir_emit_bs_intrinsic(abld, nir_instr_as_intrinsic(instr));
461          break;
462       case MESA_SHADER_TASK:
463          nir_emit_task_intrinsic(abld, nir_instr_as_intrinsic(instr));
464          break;
465       case MESA_SHADER_MESH:
466          nir_emit_mesh_intrinsic(abld, nir_instr_as_intrinsic(instr));
467          break;
468       default:
469          unreachable("unsupported shader stage");
470       }
471       break;
472 
473    case nir_instr_type_tex:
474       nir_emit_texture(abld, nir_instr_as_tex(instr));
475       break;
476 
477    case nir_instr_type_load_const:
478       nir_emit_load_const(abld, nir_instr_as_load_const(instr));
479       break;
480 
481    case nir_instr_type_ssa_undef:
482       /* We create a new VGRF for undefs on every use (by handling
483        * them in get_nir_src()), rather than for each definition.
484        * This helps register coalescing eliminate MOVs from undef.
485        */
486       break;
487 
488    case nir_instr_type_jump:
489       nir_emit_jump(abld, nir_instr_as_jump(instr));
490       break;
491 
492    default:
493       unreachable("unknown instruction type");
494    }
495 }
496 
497 /**
498  * Recognizes a parent instruction of nir_op_extract_* and changes the type to
499  * match instr.
500  */
501 bool
502 fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
503                                       const fs_reg &result)
504 {
505    if (!instr->src[0].src.is_ssa ||
506        !instr->src[0].src.ssa->parent_instr)
507       return false;
508 
509    if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
510       return false;
511 
512    nir_alu_instr *src0 =
513       nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
514 
515    if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
516        src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
517       return false;
518 
519    unsigned element = nir_src_as_uint(src0->src[1].src);
520 
521    /* Element type to extract. */
522    const brw_reg_type type = brw_int_type(
523       src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1,
524       src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);
525 
526    fs_reg op0 = get_nir_src(src0->src[0].src);
527    op0.type = brw_type_for_nir_type(devinfo,
528       (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
529                      nir_src_bit_size(src0->src[0].src)));
530    op0 = offset(op0, bld, src0->src[0].swizzle[0]);
531 
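   /* subscript() picks the requested 8- or 16-bit element out of op0, so the
    * extract and the conversion collapse into this single MOV.
    */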
532    bld.MOV(result, subscript(op0, type, element));
533    return true;
534 }
535 
536 bool
537 fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
538                                          const fs_reg &result)
539 {
540    nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src);
541    if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face)
542       return false;
543 
544    if (!nir_src_is_const(instr->src[1].src) ||
545        !nir_src_is_const(instr->src[2].src))
546       return false;
547 
548    const float value1 = nir_src_as_float(instr->src[1].src);
549    const float value2 = nir_src_as_float(instr->src[2].src);
550    if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
551       return false;
552 
553    /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
554    assert(value1 == -value2);
555 
556    fs_reg tmp = vgrf(glsl_type::int_type);
557 
558    if (devinfo->ver >= 12) {
559       /* Bit 15 of g1.1 is 0 if the polygon is front facing. */
560       fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W));
561 
562       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
563        *
564        *    or(8)  tmp.1<2>W  g1.1<0,1,0>W  0x00003f80W
565        *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
566        *
567        * and negate g1.1<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
568        */
569       if (value1 == -1.0f)
570          g1.negate = true;
571 
572       bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
573              g1, brw_imm_uw(0x3f80));
574    } else if (devinfo->ver >= 6) {
575       /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
576       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
577 
578       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
579        *
580        *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
581        *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
582        *
583        * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
584        *
585        * This negation looks like it's safe in practice, because bits 0:4 will
586        * surely be TRIANGLES
587        */
588 
589       if (value1 == -1.0f) {
590          g0.negate = true;
591       }
592 
593       bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
594              g0, brw_imm_uw(0x3f80));
595    } else {
596       /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
597       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
598 
599       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
600        *
601        *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
602        *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
603        *
604        * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
605        *
606        * This negation looks like it's safe in practice, because bits 0:4 will
607        * surely be TRIANGLES
608        */
609 
610       if (value1 == -1.0f) {
611          g1_6.negate = true;
612       }
613 
614       bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
615    }
616    bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));
617 
618    return true;
619 }
620 
621 static void
622 emit_find_msb_using_lzd(const fs_builder &bld,
623                         const fs_reg &result,
624                         const fs_reg &src,
625                         bool is_signed)
626 {
627    fs_inst *inst;
628    fs_reg temp = src;
629 
630    if (is_signed) {
631       /* LZD of an absolute value source almost always does the right
632        * thing.  There are a few problem values:
633        *
634        * * 0x80000000.  Since abs(0x80000000) == 0x80000000, LZD returns
635        *   0.  However, findMSB(int(0x80000000)) == 30.
636        *
637        * * 0xffffffff.  Since abs(0xffffffff) == 1, LZD returns
638        *   31.  Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
639        *
640        *    For a value of zero or negative one, -1 will be returned.
641        *
642        * * Negative powers of two.  LZD(abs(-(1<<x))) returns x, but
643        *   findMSB(-(1<<x)) should return x-1.
644        *
645        * For all negative number cases, including 0x80000000 and
646        * 0xffffffff, the correct value is obtained from LZD if instead of
647        * negating the (already negative) value the logical-not is used.  A
648        * conditional logical-not can be achieved in two instructions.
649        */
650       temp = bld.vgrf(BRW_REGISTER_TYPE_D);
651 
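      /* ASR by 31 yields 0 for non-negative values and ~0 for negative ones;
       * XORing that with the source leaves non-negative values unchanged and
       * replaces negative values with their one's complement.
       */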
652       bld.ASR(temp, src, brw_imm_d(31));
653       bld.XOR(temp, temp, src);
654    }
655 
656    bld.LZD(retype(result, BRW_REGISTER_TYPE_UD),
657            retype(temp, BRW_REGISTER_TYPE_UD));
658 
659    /* LZD counts from the MSB side, while GLSL's findMSB() wants the count
660     * from the LSB side. Subtract the result from 31 to convert the MSB
661     * count into an LSB count.  If no bits are set, LZD will return 32.
662     * 31-32 = -1, which is exactly what findMSB() is supposed to return.
663     */
664    inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31));
665    inst->src[0].negate = true;
666 }
667 
668 static brw_rnd_mode
669 brw_rnd_mode_from_nir_op(const nir_op op) {
670    switch (op) {
671    case nir_op_f2f16_rtz:
672       return BRW_RND_MODE_RTZ;
673    case nir_op_f2f16_rtne:
674       return BRW_RND_MODE_RTNE;
675    default:
676       unreachable("Operation doesn't support rounding mode");
677    }
678 }
679 
680 static brw_rnd_mode
681 brw_rnd_mode_from_execution_mode(unsigned execution_mode)
682 {
683    if (nir_has_any_rounding_mode_rtne(execution_mode))
684       return BRW_RND_MODE_RTNE;
685    if (nir_has_any_rounding_mode_rtz(execution_mode))
686       return BRW_RND_MODE_RTZ;
687    return BRW_RND_MODE_UNSPECIFIED;
688 }
689 
690 fs_reg
691 fs_visitor::prepare_alu_destination_and_sources(const fs_builder &bld,
692                                                 nir_alu_instr *instr,
693                                                 fs_reg *op,
694                                                 bool need_dest)
695 {
696    fs_reg result =
697       need_dest ? get_nir_dest(instr->dest.dest) : bld.null_reg_ud();
698 
699    result.type = brw_type_for_nir_type(devinfo,
700       (nir_alu_type)(nir_op_infos[instr->op].output_type |
701                      nir_dest_bit_size(instr->dest.dest)));
702 
703    assert(!instr->dest.saturate);
704 
705    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
706       /* We don't lower to source modifiers so they should not exist. */
707       assert(!instr->src[i].abs);
708       assert(!instr->src[i].negate);
709 
710       op[i] = get_nir_src(instr->src[i].src);
711       op[i].type = brw_type_for_nir_type(devinfo,
712          (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
713                         nir_src_bit_size(instr->src[i].src)));
714    }
715 
716    /* Move and vecN instructions may still be vectorized.  Return the raw,
717     * vectored source and destination so that fs_visitor::nir_emit_alu can
718     * handle it.  Other callers should not have to handle these kinds of
719     * instructions.
720     */
721    switch (instr->op) {
722    case nir_op_mov:
723    case nir_op_vec2:
724    case nir_op_vec3:
725    case nir_op_vec4:
726    case nir_op_vec8:
727    case nir_op_vec16:
728       return result;
729    default:
730       break;
731    }
732 
733    /* At this point, we have dealt with any instruction that operates on
734     * more than a single channel.  Therefore, we can just adjust the source
735     * and destination registers for that channel and emit the instruction.
736     */
737    unsigned channel = 0;
738    if (nir_op_infos[instr->op].output_size == 0) {
739       /* Since NIR is doing the scalarizing for us, we should only ever see
740        * vectorized operations with a single channel.
741        */
742       assert(util_bitcount(instr->dest.write_mask) == 1);
743       channel = ffs(instr->dest.write_mask) - 1;
744 
745       result = offset(result, bld, channel);
746    }
747 
748    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
749       assert(nir_op_infos[instr->op].input_sizes[i] < 2);
750       op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
751    }
752 
753    return result;
754 }
755 
756 void
757 fs_visitor::resolve_inot_sources(const fs_builder &bld, nir_alu_instr *instr,
758                                  fs_reg *op)
759 {
760    for (unsigned i = 0; i < 2; i++) {
761       nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src);
762 
763       if (inot_instr != NULL && inot_instr->op == nir_op_inot) {
764          /* The source of the inot is now the source of instr. */
765          prepare_alu_destination_and_sources(bld, inot_instr, &op[i], false);
766 
767          assert(!op[i].negate);
768          op[i].negate = true;
769       } else {
770          op[i] = resolve_source_modifiers(op[i]);
771       }
772    }
773 }
774 
775 bool
776 fs_visitor::try_emit_b2fi_of_inot(const fs_builder &bld,
777                                   fs_reg result,
778                                   nir_alu_instr *instr)
779 {
780    if (devinfo->ver < 6 || devinfo->verx10 >= 125)
781       return false;
782 
783    nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src);
784 
785    if (inot_instr == NULL || inot_instr->op != nir_op_inot)
786       return false;
787 
788    /* HF is also possible as a destination on BDW+.  For nir_op_b2i, the set
789     * of valid size-changing combinations is a bit more complex.
790     *
791     * The source restriction is just because I was lazy about generating the
792     * constant below.
793     */
794    if (nir_dest_bit_size(instr->dest.dest) != 32 ||
795        nir_src_bit_size(inot_instr->src[0].src) != 32)
796       return false;
797 
798    /* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0.  Since a can only be 0 or -1,
799     * this is float(1 + a).
800     */
801    fs_reg op;
802 
803    prepare_alu_destination_and_sources(bld, inot_instr, &op, false);
804 
805    /* Ignore the saturate modifier, if there is one.  The result of the
806     * arithmetic can only be 0 or 1, so the clamping will do nothing anyway.
807     */
808    bld.ADD(result, op, brw_imm_d(1));
809 
810    return true;
811 }
812 
813 /**
814  * Emit code for nir_op_fsign possibly fused with a nir_op_fmul
815  *
816  * If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of
817  * the source of \c instr that is a \c nir_op_fsign.
818  */
819 void
820 fs_visitor::emit_fsign(const fs_builder &bld, const nir_alu_instr *instr,
821                        fs_reg result, fs_reg *op, unsigned fsign_src)
822 {
823    fs_inst *inst;
824 
825    assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul);
826    assert(fsign_src < nir_op_infos[instr->op].num_inputs);
827 
828    if (instr->op != nir_op_fsign) {
829       const nir_alu_instr *const fsign_instr =
830          nir_src_as_alu_instr(instr->src[fsign_src].src);
831 
832       /* op[fsign_src] has the nominal result of the fsign, and op[1 -
833        * fsign_src] has the other multiply source.  This must be rearranged so
834        * that op[0] is the source of the fsign and op[1] is the other multiply
835        * source.
836        */
837       if (fsign_src != 0)
838          op[1] = op[0];
839 
840       op[0] = get_nir_src(fsign_instr->src[0].src);
841 
842       const nir_alu_type t =
843          (nir_alu_type)(nir_op_infos[instr->op].input_types[0] |
844                         nir_src_bit_size(fsign_instr->src[0].src));
845 
846       op[0].type = brw_type_for_nir_type(devinfo, t);
847 
848       unsigned channel = 0;
849       if (nir_op_infos[instr->op].output_size == 0) {
850          /* Since NIR is doing the scalarizing for us, we should only ever see
851           * vectorized operations with a single channel.
852           */
853          assert(util_bitcount(instr->dest.write_mask) == 1);
854          channel = ffs(instr->dest.write_mask) - 1;
855       }
856 
857       op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]);
858    }
859 
860    if (type_sz(op[0].type) == 2) {
861       /* AND(val, 0x8000) gives the sign bit.
862        *
863        * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero.
864        */
865       fs_reg zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF);
866       bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ);
867 
868       op[0].type = BRW_REGISTER_TYPE_UW;
869       result.type = BRW_REGISTER_TYPE_UW;
870       bld.AND(result, op[0], brw_imm_uw(0x8000u));
871 
872       if (instr->op == nir_op_fsign)
873          inst = bld.OR(result, result, brw_imm_uw(0x3c00u));
874       else {
875          /* Use XOR here to get the result sign correct. */
876          inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UW));
877       }
878 
879       inst->predicate = BRW_PREDICATE_NORMAL;
880    } else if (type_sz(op[0].type) == 4) {
881       /* AND(val, 0x80000000) gives the sign bit.
882        *
883        * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
884        * zero.
885        */
886       bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
887 
888       op[0].type = BRW_REGISTER_TYPE_UD;
889       result.type = BRW_REGISTER_TYPE_UD;
890       bld.AND(result, op[0], brw_imm_ud(0x80000000u));
891 
892       if (instr->op == nir_op_fsign)
893          inst = bld.OR(result, result, brw_imm_ud(0x3f800000u));
894       else {
895          /* Use XOR here to get the result sign correct. */
896          inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UD));
897       }
898 
899       inst->predicate = BRW_PREDICATE_NORMAL;
900    } else {
901       /* For doubles we do the same but we need to consider:
902        *
903        * - 2-src instructions can't operate with 64-bit immediates
904        * - The sign is encoded in the high 32-bit of each DF
905        * - We need to produce a DF result.
906        */
907 
908       fs_reg zero = vgrf(glsl_type::double_type);
909       bld.MOV(zero, setup_imm_df(bld, 0.0));
910       bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ);
911 
912       bld.MOV(result, zero);
913 
914       fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1);
915       bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1),
916               brw_imm_ud(0x80000000u));
917 
918       if (instr->op == nir_op_fsign) {
919          set_predicate(BRW_PREDICATE_NORMAL,
920                        bld.OR(r, r, brw_imm_ud(0x3ff00000u)));
921       } else {
922          /* This could be done better in some cases.  If the scale is an
923           * immediate with the low 32-bits all 0, emitting a separate XOR and
924           * OR would allow an algebraic optimization to remove the OR.  There
925           * are currently zero instances of fsign(double(x))*IMM in shader-db
926           * or any test suite, so it is hard to care at this time.
927           */
928          fs_reg result_int64 = retype(result, BRW_REGISTER_TYPE_UQ);
929          inst = bld.XOR(result_int64, result_int64,
930                         retype(op[1], BRW_REGISTER_TYPE_UQ));
931       }
932    }
933 }
934 
935 /**
936  * Determine whether sources of a nir_op_fmul can be fused with a nir_op_fsign
937  *
938  * Checks the operands of a \c nir_op_fmul to determine whether or not
939  * \c emit_fsign could fuse the multiplication with the \c sign() calculation.
940  *
941  * \param instr  The multiplication instruction
942  *
943  * \param fsign_src The source of \c instr that may or may not be a
944  *                  \c nir_op_fsign
945  */
946 static bool
947 can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src)
948 {
949    assert(instr->op == nir_op_fmul);
950 
951    nir_alu_instr *const fsign_instr =
952       nir_src_as_alu_instr(instr->src[fsign_src].src);
953 
954    /* Rules:
955     *
956     * 1. instr->src[fsign_src] must be a nir_op_fsign.
957     * 2. The nir_op_fsign can only be used by this multiplication.
958     * 3. The source that is the nir_op_fsign does not have source modifiers.
959     *    \c emit_fsign only examines the source modifiers of the source of the
960     *    \c nir_op_fsign.
961     *
962     * The nir_op_fsign must also not have the saturate modifier, but steps
963     * have already been taken (in nir_opt_algebraic) to ensure that.
964     */
965    return fsign_instr != NULL && fsign_instr->op == nir_op_fsign &&
966           is_used_once(fsign_instr);
967 }
968 
969 void
970 fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr,
971                          bool need_dest)
972 {
973    fs_inst *inst;
974    unsigned execution_mode =
975       bld.shader->nir->info.float_controls_execution_mode;
976 
977    fs_reg op[NIR_MAX_VEC_COMPONENTS];
978    fs_reg result = prepare_alu_destination_and_sources(bld, instr, op, need_dest);
979 
980 #ifndef NDEBUG
981    /* Everything except raw moves, some type conversions, iabs, and ineg
982     * should have 8-bit sources lowered by nir_lower_bit_size in
983     * brw_preprocess_nir or by brw_nir_lower_conversions in
984     * brw_postprocess_nir.
985     */
986    switch (instr->op) {
987    case nir_op_mov:
988    case nir_op_vec2:
989    case nir_op_vec3:
990    case nir_op_vec4:
991    case nir_op_vec8:
992    case nir_op_vec16:
993    case nir_op_i2f16:
994    case nir_op_i2f32:
995    case nir_op_i2i16:
996    case nir_op_i2i32:
997    case nir_op_u2f16:
998    case nir_op_u2f32:
999    case nir_op_u2u16:
1000    case nir_op_u2u32:
1001    case nir_op_iabs:
1002    case nir_op_ineg:
1003    case nir_op_pack_32_4x8_split:
1004       break;
1005 
1006    default:
1007       for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1008          assert(type_sz(op[i].type) > 1);
1009       }
1010    }
1011 #endif
1012 
1013    switch (instr->op) {
1014    case nir_op_mov:
1015    case nir_op_vec2:
1016    case nir_op_vec3:
1017    case nir_op_vec4:
1018    case nir_op_vec8:
1019    case nir_op_vec16: {
1020       fs_reg temp = result;
1021       bool need_extra_copy = false;
1022       for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1023          if (!instr->src[i].src.is_ssa &&
1024              instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
1025             need_extra_copy = true;
1026             temp = bld.vgrf(result.type, 4);
1027             break;
1028          }
1029       }
1030 
1031       for (unsigned i = 0; i < 4; i++) {
1032          if (!(instr->dest.write_mask & (1 << i)))
1033             continue;
1034 
1035          if (instr->op == nir_op_mov) {
1036             bld.MOV(offset(temp, bld, i),
1037                            offset(op[0], bld, instr->src[0].swizzle[i]));
1038          } else {
1039             bld.MOV(offset(temp, bld, i),
1040                            offset(op[i], bld, instr->src[i].swizzle[0]));
1041          }
1042       }
1043 
1044       /* In this case the source and destination registers were the same,
1045        * so we need to insert an extra set of moves in order to deal with
1046        * any swizzling.
1047        */
1048       if (need_extra_copy) {
1049          for (unsigned i = 0; i < 4; i++) {
1050             if (!(instr->dest.write_mask & (1 << i)))
1051                continue;
1052 
1053             bld.MOV(offset(result, bld, i), offset(temp, bld, i));
1054          }
1055       }
1056       return;
1057    }
1058 
1059    case nir_op_i2f32:
1060    case nir_op_u2f32:
1061       if (optimize_extract_to_float(instr, result))
1062          return;
1063       inst = bld.MOV(result, op[0]);
1064       break;
1065 
1066    case nir_op_f2f16_rtne:
1067    case nir_op_f2f16_rtz:
1068    case nir_op_f2f16: {
1069       brw_rnd_mode rnd = BRW_RND_MODE_UNSPECIFIED;
1070 
1071       if (nir_op_f2f16 == instr->op)
1072          rnd = brw_rnd_mode_from_execution_mode(execution_mode);
1073       else
1074          rnd = brw_rnd_mode_from_nir_op(instr->op);
1075 
1076       if (BRW_RND_MODE_UNSPECIFIED != rnd)
1077          bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), brw_imm_d(rnd));
1078 
1079       /* In theory, it would be better to use BRW_OPCODE_F32TO16.  Depending
1080        * on the HW gen, it is either a special hw opcode or just a MOV, and
1081        * brw_F32TO16 (at brw_eu_emit) would do the work to choose.
1082        *
1083        * But to use that opcode we would need to support it in various
1084        * optimizations and lowerings.  Since HF support currently exists
1085        * only on gfx8+, it is better to use the MOV directly, and switch to
1086        * BRW_OPCODE_F32TO16 when/if we add HF support on gfx7.
1087        */
1088       assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
1089       inst = bld.MOV(result, op[0]);
1090       break;
1091    }
1092 
1093    case nir_op_b2i8:
1094    case nir_op_b2i16:
1095    case nir_op_b2i32:
1096    case nir_op_b2i64:
1097    case nir_op_b2f16:
1098    case nir_op_b2f32:
1099    case nir_op_b2f64:
1100       if (try_emit_b2fi_of_inot(bld, result, instr))
1101          break;
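      /* NIR booleans are 0 or -1 when read as signed integers, so negating
       * the D-typed source yields 0 or 1 before the conversion MOV below.
       */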
1102       op[0].type = BRW_REGISTER_TYPE_D;
1103       op[0].negate = !op[0].negate;
1104       FALLTHROUGH;
1105    case nir_op_i2f64:
1106    case nir_op_i2i64:
1107    case nir_op_u2f64:
1108    case nir_op_u2u64:
1109    case nir_op_f2f64:
1110    case nir_op_f2i64:
1111    case nir_op_f2u64:
1112    case nir_op_i2i32:
1113    case nir_op_u2u32:
1114    case nir_op_f2i32:
1115    case nir_op_f2u32:
1116    case nir_op_i2f16:
1117    case nir_op_u2f16:
1118    case nir_op_f2i16:
1119    case nir_op_f2u16:
1120    case nir_op_f2i8:
1121    case nir_op_f2u8:
1122       if (result.type == BRW_REGISTER_TYPE_B ||
1123           result.type == BRW_REGISTER_TYPE_UB ||
1124           result.type == BRW_REGISTER_TYPE_HF)
1125          assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
1126 
1127       if (op[0].type == BRW_REGISTER_TYPE_B ||
1128           op[0].type == BRW_REGISTER_TYPE_UB ||
1129           op[0].type == BRW_REGISTER_TYPE_HF)
1130          assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */
1131 
1132       inst = bld.MOV(result, op[0]);
1133       break;
1134 
1135    case nir_op_i2i8:
1136    case nir_op_u2u8:
1137       assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
1138       FALLTHROUGH;
1139    case nir_op_i2i16:
1140    case nir_op_u2u16: {
1141       /* Emit better code for u2u8(extract_u8(a, b)) and similar patterns.
1142        * Emitting the instructions one by one results in two MOV instructions
1143        * that won't be propagated.  By handling both instructions here, a
1144        * single MOV is emitted.
1145        */
1146       nir_alu_instr *extract_instr = nir_src_as_alu_instr(instr->src[0].src);
1147       if (extract_instr != NULL) {
1148          if (extract_instr->op == nir_op_extract_u8 ||
1149              extract_instr->op == nir_op_extract_i8) {
1150             prepare_alu_destination_and_sources(bld, extract_instr, op, false);
1151 
1152             const unsigned byte = nir_src_as_uint(extract_instr->src[1].src);
1153             const brw_reg_type type =
1154                brw_int_type(1, extract_instr->op == nir_op_extract_i8);
1155 
1156             op[0] = subscript(op[0], type, byte);
1157          } else if (extract_instr->op == nir_op_extract_u16 ||
1158                     extract_instr->op == nir_op_extract_i16) {
1159             prepare_alu_destination_and_sources(bld, extract_instr, op, false);
1160 
1161             const unsigned word = nir_src_as_uint(extract_instr->src[1].src);
1162             const brw_reg_type type =
1163                brw_int_type(2, extract_instr->op == nir_op_extract_i16);
1164 
1165             op[0] = subscript(op[0], type, word);
1166          }
1167       }
1168 
1169       inst = bld.MOV(result, op[0]);
1170       break;
1171    }
1172 
1173    case nir_op_fsat:
1174       inst = bld.MOV(result, op[0]);
1175       inst->saturate = true;
1176       break;
1177 
1178    case nir_op_fneg:
1179    case nir_op_ineg:
1180       op[0].negate = true;
1181       inst = bld.MOV(result, op[0]);
1182       break;
1183 
1184    case nir_op_fabs:
1185    case nir_op_iabs:
1186       op[0].negate = false;
1187       op[0].abs = true;
1188       inst = bld.MOV(result, op[0]);
1189       break;
1190 
1191    case nir_op_f2f32:
1192       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1193          brw_rnd_mode rnd =
1194             brw_rnd_mode_from_execution_mode(execution_mode);
1195          bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1196                   brw_imm_d(rnd));
1197       }
1198 
1199       if (op[0].type == BRW_REGISTER_TYPE_HF)
1200          assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */
1201 
1202       inst = bld.MOV(result, op[0]);
1203       break;
1204 
1205    case nir_op_fsign:
1206       emit_fsign(bld, instr, result, op, 0);
1207       break;
1208 
1209    case nir_op_frcp:
1210       inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
1211       break;
1212 
1213    case nir_op_fexp2:
1214       inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
1215       break;
1216 
1217    case nir_op_flog2:
1218       inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
1219       break;
1220 
1221    case nir_op_fsin:
1222       inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
1223       break;
1224 
1225    case nir_op_fcos:
1226       inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
1227       break;
1228 
1229    case nir_op_fddx_fine:
1230       inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
1231       break;
1232    case nir_op_fddx:
1233    case nir_op_fddx_coarse:
1234       inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
1235       break;
1236    case nir_op_fddy_fine:
1237       inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
1238       break;
1239    case nir_op_fddy:
1240    case nir_op_fddy_coarse:
1241       inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
1242       break;
1243 
1244    case nir_op_fadd:
1245       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1246          brw_rnd_mode rnd =
1247             brw_rnd_mode_from_execution_mode(execution_mode);
1248          bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1249                   brw_imm_d(rnd));
1250       }
1251       FALLTHROUGH;
1252    case nir_op_iadd:
1253       inst = bld.ADD(result, op[0], op[1]);
1254       break;
1255 
1256    case nir_op_iadd3:
1257       inst = bld.ADD3(result, op[0], op[1], op[2]);
1258       break;
1259 
1260    case nir_op_iadd_sat:
1261    case nir_op_uadd_sat:
1262       inst = bld.ADD(result, op[0], op[1]);
1263       inst->saturate = true;
1264       break;
1265 
1266    case nir_op_isub_sat:
1267       bld.emit(SHADER_OPCODE_ISUB_SAT, result, op[0], op[1]);
1268       break;
1269 
1270    case nir_op_usub_sat:
1271       bld.emit(SHADER_OPCODE_USUB_SAT, result, op[0], op[1]);
1272       break;
1273 
1274    case nir_op_irhadd:
1275    case nir_op_urhadd:
1276       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1277       inst = bld.AVG(result, op[0], op[1]);
1278       break;
1279 
1280    case nir_op_ihadd:
1281    case nir_op_uhadd: {
1282       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1283       fs_reg tmp = bld.vgrf(result.type);
1284 
1285       if (devinfo->ver >= 8) {
1286          op[0] = resolve_source_modifiers(op[0]);
1287          op[1] = resolve_source_modifiers(op[1]);
1288       }
1289 
1290       /* AVG(x, y) - ((x ^ y) & 1) */
1291       bld.XOR(tmp, op[0], op[1]);
1292       bld.AND(tmp, tmp, retype(brw_imm_ud(1), result.type));
1293       bld.AVG(result, op[0], op[1]);
1294       inst = bld.ADD(result, result, tmp);
1295       inst->src[1].negate = true;
1296       break;
1297    }
1298 
1299    case nir_op_fmul:
1300       for (unsigned i = 0; i < 2; i++) {
1301          if (can_fuse_fmul_fsign(instr, i)) {
1302             emit_fsign(bld, instr, result, op, i);
1303             return;
1304          }
1305       }
1306 
1307       /* We emit the rounding mode after the previous fsign optimization since
1308        * it won't result in a MUL, but will try to negate the value by other
1309        * means.
1310        */
1311       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1312          brw_rnd_mode rnd =
1313             brw_rnd_mode_from_execution_mode(execution_mode);
1314          bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1315                   brw_imm_d(rnd));
1316       }
1317 
1318       inst = bld.MUL(result, op[0], op[1]);
1319       break;
1320 
1321    case nir_op_imul_2x32_64:
1322    case nir_op_umul_2x32_64:
1323       bld.MUL(result, op[0], op[1]);
1324       break;
1325 
1326    case nir_op_imul_32x16:
1327    case nir_op_umul_32x16: {
1328       const bool ud = instr->op == nir_op_umul_32x16;
1329 
1330       assert(nir_dest_bit_size(instr->dest.dest) == 32);
1331 
1332       /* Before Gfx7, the order of the 32-bit source and the 16-bit source was
1333        * swapped.  The extension isn't enabled on those platforms, so don't
1334        * pretend to support the differences.
1335        */
1336       assert(devinfo->ver >= 7);
1337 
1338       if (op[1].file == IMM)
1339          op[1] = ud ? brw_imm_uw(op[1].ud) : brw_imm_w(op[1].d);
1340       else {
1341          const enum brw_reg_type word_type =
1342             ud ? BRW_REGISTER_TYPE_UW : BRW_REGISTER_TYPE_W;
1343 
1344          op[1] = subscript(op[1], word_type, 0);
1345       }
1346 
1347       const enum brw_reg_type dword_type =
1348          ud ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
1349 
1350       bld.MUL(result, retype(op[0], dword_type), op[1]);
1351       break;
1352    }
1353 
1354    case nir_op_imul:
1355       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1356       bld.MUL(result, op[0], op[1]);
1357       break;
1358 
1359    case nir_op_imul_high:
1360    case nir_op_umul_high:
1361       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1362       if (nir_dest_bit_size(instr->dest.dest) == 32) {
1363          bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
1364       } else {
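         /* For 8- and 16-bit sources, do a full 32-bit multiply and take the
          * high half of each product.
          */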
1365          fs_reg tmp = bld.vgrf(brw_reg_type_from_bit_size(32, op[0].type));
1366          bld.MUL(tmp, op[0], op[1]);
1367          bld.MOV(result, subscript(tmp, result.type, 1));
1368       }
1369       break;
1370 
1371    case nir_op_idiv:
1372    case nir_op_udiv:
1373       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1374       bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
1375       break;
1376 
1377    case nir_op_uadd_carry:
1378       unreachable("Should have been lowered by carry_to_arith().");
1379 
1380    case nir_op_usub_borrow:
1381       unreachable("Should have been lowered by borrow_to_arith().");
1382 
1383    case nir_op_umod:
1384    case nir_op_irem:
1385       /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
1386        * appears that our hardware just does the right thing for signed
1387        * remainder.
1388        */
1389       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1390       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1391       break;
1392 
1393    case nir_op_imod: {
1394       /* Get a regular C-style remainder.  If a % b == 0, set the predicate. */
1395       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1396 
1397       /* Math instructions don't support conditional mod */
1398       inst = bld.MOV(bld.null_reg_d(), result);
1399       inst->conditional_mod = BRW_CONDITIONAL_NZ;
1400 
1401       /* Now, we need to determine if signs of the sources are different.
1402        * When we XOR the sources, the top bit is 0 if they are the same and 1
1403        * if they are different.  We can then use a conditional modifier to
1404        * turn that into a predicate.  This leads us to an XOR.l instruction.
1405        *
1406        * Technically, according to the PRM, you're not allowed to use .l on an
1407        * XOR instruction.  However, empirical experiments and Curro's reading
1408        * of the simulator source both indicate that it's safe.
1409        */
1410       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
1411       inst = bld.XOR(tmp, op[0], op[1]);
1412       inst->predicate = BRW_PREDICATE_NORMAL;
1413       inst->conditional_mod = BRW_CONDITIONAL_L;
1414 
1415       /* If the result of the initial remainder operation is non-zero and the
1416        * two sources have different signs, add in a copy of op[1] to get the
1417        * final integer modulus value.
1418        */
1419       inst = bld.ADD(result, result, op[1]);
1420       inst->predicate = BRW_PREDICATE_NORMAL;
1421       break;
1422    }
1423 
1424    case nir_op_flt32:
1425    case nir_op_fge32:
1426    case nir_op_feq32:
1427    case nir_op_fneu32: {
1428       fs_reg dest = result;
1429 
1430       const uint32_t bit_size =  nir_src_bit_size(instr->src[0].src);
1431       if (bit_size != 32)
1432          dest = bld.vgrf(op[0].type, 1);
1433 
1434       bld.CMP(dest, op[0], op[1], brw_cmod_for_nir_comparison(instr->op));
1435 
1436       if (bit_size > 32) {
1437          bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
1438       } else if (bit_size < 32) {
1439          /* When we convert the result to 32-bit we need to be careful and do
1440           * it as a signed conversion to get sign extension (for 32-bit true)
1441           */
1442          const brw_reg_type src_type =
1443             brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
1444 
1445          bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
1446       }
1447       break;
1448    }
1449 
1450    case nir_op_ilt32:
1451    case nir_op_ult32:
1452    case nir_op_ige32:
1453    case nir_op_uge32:
1454    case nir_op_ieq32:
1455    case nir_op_ine32: {
1456       fs_reg dest = result;
1457 
1458       const uint32_t bit_size = type_sz(op[0].type) * 8;
1459       if (bit_size != 32)
1460          dest = bld.vgrf(op[0].type, 1);
1461 
1462       bld.CMP(dest, op[0], op[1],
1463               brw_cmod_for_nir_comparison(instr->op));
1464 
1465       if (bit_size > 32) {
1466          bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
1467       } else if (bit_size < 32) {
1468          /* When we convert the result to 32-bit we need to be careful and do
1469           * it as a signed conversion to get sign extension (for 32-bit true)
1470           */
1471          const brw_reg_type src_type =
1472             brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
1473 
1474          bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
1475       }
1476       break;
1477    }
1478 
1479    case nir_op_inot:
1480       if (devinfo->ver >= 8) {
1481          nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src);
1482 
1483          if (inot_src_instr != NULL &&
1484              (inot_src_instr->op == nir_op_ior ||
1485               inot_src_instr->op == nir_op_ixor ||
1486               inot_src_instr->op == nir_op_iand)) {
1487             /* The sources of the source logical instruction are now the
1488              * sources of the instruction that will be generated.
1489              */
1490             prepare_alu_destination_and_sources(bld, inot_src_instr, op, false);
1491             resolve_inot_sources(bld, inot_src_instr, op);
1492 
1493             /* Smash all of the sources and destination to be signed.  This
1494              * doesn't matter for the operation of the instruction, but cmod
1495              * propagation fails on unsigned sources with negation (due to
1496              * fs_inst::can_do_cmod returning false).
1497              */
1498             result.type =
1499                brw_type_for_nir_type(devinfo,
1500                                      (nir_alu_type)(nir_type_int |
1501                                                     nir_dest_bit_size(instr->dest.dest)));
1502             op[0].type =
1503                brw_type_for_nir_type(devinfo,
1504                                      (nir_alu_type)(nir_type_int |
1505                                                     nir_src_bit_size(inot_src_instr->src[0].src)));
1506             op[1].type =
1507                brw_type_for_nir_type(devinfo,
1508                                      (nir_alu_type)(nir_type_int |
1509                                                     nir_src_bit_size(inot_src_instr->src[1].src)));
1510 
1511             /* For XOR, only invert one of the sources.  Arbitrarily choose
1512              * the first source.
1513              */
1514             op[0].negate = !op[0].negate;
1515             if (inot_src_instr->op != nir_op_ixor)
1516                op[1].negate = !op[1].negate;
1517 
1518             switch (inot_src_instr->op) {
1519             case nir_op_ior:
1520                bld.AND(result, op[0], op[1]);
1521                return;
1522 
1523             case nir_op_iand:
1524                bld.OR(result, op[0], op[1]);
1525                return;
1526 
1527             case nir_op_ixor:
1528                bld.XOR(result, op[0], op[1]);
1529                return;
1530 
1531             default:
1532                unreachable("impossible opcode");
1533             }
1534          }
1535          op[0] = resolve_source_modifiers(op[0]);
1536       }
1537       bld.NOT(result, op[0]);
1538       break;
1539    case nir_op_ixor:
1540       if (devinfo->ver >= 8) {
1541          resolve_inot_sources(bld, instr, op);
1542       }
1543       bld.XOR(result, op[0], op[1]);
1544       break;
1545    case nir_op_ior:
1546       if (devinfo->ver >= 8) {
1547          resolve_inot_sources(bld, instr, op);
1548       }
1549       bld.OR(result, op[0], op[1]);
1550       break;
1551    case nir_op_iand:
1552       if (devinfo->ver >= 8) {
1553          resolve_inot_sources(bld, instr, op);
1554       }
1555       bld.AND(result, op[0], op[1]);
1556       break;
1557 
1558    case nir_op_fdot2:
1559    case nir_op_fdot3:
1560    case nir_op_fdot4:
1561    case nir_op_b32all_fequal2:
1562    case nir_op_b32all_iequal2:
1563    case nir_op_b32all_fequal3:
1564    case nir_op_b32all_iequal3:
1565    case nir_op_b32all_fequal4:
1566    case nir_op_b32all_iequal4:
1567    case nir_op_b32any_fnequal2:
1568    case nir_op_b32any_inequal2:
1569    case nir_op_b32any_fnequal3:
1570    case nir_op_b32any_inequal3:
1571    case nir_op_b32any_fnequal4:
1572    case nir_op_b32any_inequal4:
1573       unreachable("Lowered by nir_lower_alu_reductions");
1574 
1575    case nir_op_ldexp:
1576       unreachable("not reached: should be handled by ldexp_to_arith()");
1577 
1578    case nir_op_fsqrt:
1579       inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
1580       break;
1581 
1582    case nir_op_frsq:
1583       inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
1584       break;
1585 
1586    case nir_op_i2b32:
1587    case nir_op_f2b32: {
1588       uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1589       if (bit_size == 64) {
1590          /* two-argument instructions can't take 64-bit immediates */
1591          fs_reg zero;
1592          fs_reg tmp;
1593 
1594          if (instr->op == nir_op_f2b32) {
1595             zero = vgrf(glsl_type::double_type);
1596             tmp = vgrf(glsl_type::double_type);
1597             bld.MOV(zero, setup_imm_df(bld, 0.0));
1598          } else {
1599             zero = vgrf(glsl_type::int64_t_type);
1600             tmp = vgrf(glsl_type::int64_t_type);
1601             bld.MOV(zero, brw_imm_q(0));
1602          }
1603 
1604          /* A SIMD16 execution needs to be split into two instructions, so use
1605           * a vgrf instead of the flag register as dst so instruction splitting
1606           * works
1607           */
1608          bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ);
1609          bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0));
1610       } else {
1611          fs_reg zero;
1612          if (bit_size == 32) {
1613             zero = instr->op == nir_op_f2b32 ? brw_imm_f(0.0f) : brw_imm_d(0);
1614          } else {
1615             assert(bit_size == 16);
1616             zero = instr->op == nir_op_f2b32 ?
1617                retype(brw_imm_w(0), BRW_REGISTER_TYPE_HF) : brw_imm_w(0);
1618          }
1619          bld.CMP(result, op[0], zero, BRW_CONDITIONAL_NZ);
1620       }
1621       break;
1622    }
1623 
1624    case nir_op_ftrunc:
1625       inst = bld.RNDZ(result, op[0]);
1626       if (devinfo->ver < 6) {
1627          set_condmod(BRW_CONDITIONAL_R, inst);
1628          set_predicate(BRW_PREDICATE_NORMAL,
1629                        bld.ADD(result, result, brw_imm_f(1.0f)));
1630          inst = bld.MOV(result, result); /* for potential saturation */
1631       }
1632       break;
1633 
1634    case nir_op_fceil: {
1635       op[0].negate = !op[0].negate;
1636       fs_reg temp = vgrf(glsl_type::float_type);
1637       bld.RNDD(temp, op[0]);
1638       temp.negate = true;
1639       inst = bld.MOV(result, temp);
1640       break;
1641    }
1642    case nir_op_ffloor:
1643       inst = bld.RNDD(result, op[0]);
1644       break;
1645    case nir_op_ffract:
1646       inst = bld.FRC(result, op[0]);
1647       break;
1648    case nir_op_fround_even:
1649       inst = bld.RNDE(result, op[0]);
1650       if (devinfo->ver < 6) {
1651          set_condmod(BRW_CONDITIONAL_R, inst);
1652          set_predicate(BRW_PREDICATE_NORMAL,
1653                        bld.ADD(result, result, brw_imm_f(1.0f)));
1654          inst = bld.MOV(result, result); /* for potential saturation */
1655       }
1656       break;
1657 
1658    case nir_op_fquantize2f16: {
1659       fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
1660       fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
1661       fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);
1662 
1663       /* The destination stride must be at least as big as the source stride. */
1664       tmp16.type = devinfo->ver > 7
1665          ? BRW_REGISTER_TYPE_HF : BRW_REGISTER_TYPE_W;
1666       tmp16.stride = 2;
1667 
1668       /* Check for denormal */
1669       fs_reg abs_src0 = op[0];
1670       abs_src0.abs = true;
1671       bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
1672               BRW_CONDITIONAL_L);
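      /* 2^-14 is the smallest normal half-precision value, so anything
       * smaller in magnitude would become a denorm in the F32->F16 conversion
       * below and is instead replaced with the appropriately signed zero.
       */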
1673       /* Get the appropriately signed zero */
1674       bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
1675               retype(op[0], BRW_REGISTER_TYPE_UD),
1676               brw_imm_ud(0x80000000));
1677       /* Do the actual F32 -> F16 -> F32 conversion */
1678       bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]);
1679       bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16);
1680       /* Select that or zero based on normal status */
1681       inst = bld.SEL(result, zero, tmp32);
1682       inst->predicate = BRW_PREDICATE_NORMAL;
1683       break;
1684    }
1685 
1686    case nir_op_imin:
1687    case nir_op_umin:
1688    case nir_op_fmin:
1689       inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
1690       break;
1691 
1692    case nir_op_imax:
1693    case nir_op_umax:
1694    case nir_op_fmax:
1695       inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
1696       break;
1697 
1698    case nir_op_pack_snorm_2x16:
1699    case nir_op_pack_snorm_4x8:
1700    case nir_op_pack_unorm_2x16:
1701    case nir_op_pack_unorm_4x8:
1702    case nir_op_unpack_snorm_2x16:
1703    case nir_op_unpack_snorm_4x8:
1704    case nir_op_unpack_unorm_2x16:
1705    case nir_op_unpack_unorm_4x8:
1706    case nir_op_unpack_half_2x16:
1707    case nir_op_pack_half_2x16:
1708       unreachable("not reached: should be handled by lower_packing_builtins");
1709 
1710    case nir_op_unpack_half_2x16_split_x_flush_to_zero:
1711       assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode);
1712       FALLTHROUGH;
1713    case nir_op_unpack_half_2x16_split_x:
1714       inst = bld.emit(BRW_OPCODE_F16TO32, result,
1715                       subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
1716       break;
1717 
1718    case nir_op_unpack_half_2x16_split_y_flush_to_zero:
1719       assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode);
1720       FALLTHROUGH;
1721    case nir_op_unpack_half_2x16_split_y:
1722       inst = bld.emit(BRW_OPCODE_F16TO32, result,
1723                       subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
1724       break;
1725 
1726    case nir_op_pack_64_2x32_split:
1727    case nir_op_pack_32_2x16_split:
1728       bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
1729       break;
1730 
1731    case nir_op_pack_32_4x8_split:
1732       bld.emit(FS_OPCODE_PACK, result, op, 4);
1733       break;
1734 
1735    case nir_op_unpack_64_2x32_split_x:
1736    case nir_op_unpack_64_2x32_split_y: {
1737       if (instr->op == nir_op_unpack_64_2x32_split_x)
1738          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0));
1739       else
1740          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
1741       break;
1742    }
1743 
1744    case nir_op_unpack_32_2x16_split_x:
1745    case nir_op_unpack_32_2x16_split_y: {
1746       if (instr->op == nir_op_unpack_32_2x16_split_x)
1747          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
1748       else
1749          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
1750       break;
1751    }
1752 
1753    case nir_op_fpow:
1754       inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
1755       break;
1756 
1757    case nir_op_bitfield_reverse:
1758       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1759       bld.BFREV(result, op[0]);
1760       break;
1761 
1762    case nir_op_bit_count:
1763       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1764       bld.CBIT(result, op[0]);
1765       break;
1766 
1767    case nir_op_ufind_msb: {
1768       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1769       emit_find_msb_using_lzd(bld, result, op[0], false);
1770       break;
1771    }
1772 
1773    case nir_op_uclz:
1774       assert(nir_dest_bit_size(instr->dest.dest) == 32);
1775       bld.LZD(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
1776       break;
1777 
1778    case nir_op_ifind_msb: {
1779       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1780 
1781       if (devinfo->ver < 7) {
1782          emit_find_msb_using_lzd(bld, result, op[0], true);
1783       } else {
1784          bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
1785 
1786          /* FBH counts from the MSB side, while GLSL's findMSB() wants the
1787           * count from the LSB side. If FBH didn't return an error
1788           * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
1789           * count into an LSB count.
1790           */
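         /* Worked example (illustrative only): for an input of 0x00000010,
          * FBH returns 27 (the count from the MSB side), and the predicated
          * 31 - 27 below yields 4, the LSB-based index findMSB() expects.
          * For an input of 0, FBH returns 0xFFFFFFFF, the CMP below then
          * fails the NZ test, the ADD is skipped, and the -1 result is kept,
          * matching findMSB(0).
          */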
1791          bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
1792 
1793          inst = bld.ADD(result, result, brw_imm_d(31));
1794          inst->predicate = BRW_PREDICATE_NORMAL;
1795          inst->src[0].negate = true;
1796       }
1797       break;
1798    }
1799 
1800    case nir_op_find_lsb:
1801       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1802 
1803       if (devinfo->ver < 7) {
1804          fs_reg temp = vgrf(glsl_type::int_type);
1805 
1806          /* (x & -x) generates a value that consists of only the LSB of x.
1807           * For all powers of 2, findMSB(y) == findLSB(y).
1808           */
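         /* Worked example (illustrative only): for x = 0x68 (binary 01101000),
          * -x is ...10011000, so x & -x = 00001000 = 8, a power of two, and
          * findMSB(8) = 3, which equals findLSB(0x68).
          */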
1809          fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D);
1810          fs_reg negated_src = src;
1811 
1812          /* One must be negated, and the other must be non-negated.  It
1813           * doesn't matter which is which.
1814           */
1815          negated_src.negate = true;
1816          src.negate = false;
1817 
1818          bld.AND(temp, src, negated_src);
1819          emit_find_msb_using_lzd(bld, result, temp, false);
1820       } else {
1821          bld.FBL(result, op[0]);
1822       }
1823       break;
1824 
1825    case nir_op_ubitfield_extract:
1826    case nir_op_ibitfield_extract:
1827       unreachable("should have been lowered");
1828    case nir_op_ubfe:
1829    case nir_op_ibfe:
1830       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1831       bld.BFE(result, op[2], op[1], op[0]);
1832       break;
1833    case nir_op_bfm:
1834       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1835       bld.BFI1(result, op[0], op[1]);
1836       break;
1837    case nir_op_bfi:
1838       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1839       bld.BFI2(result, op[0], op[1], op[2]);
1840       break;
1841 
1842    case nir_op_bitfield_insert:
1843       unreachable("not reached: should have been lowered");
1844 
1845    /* For all shift operations:
1846     *
1847     * Gen4 - Gen7: After application of source modifiers, the low 5 bits of
1848     * src1 are used as an unsigned value for the shift count.
1849     *
1850     * Gen8: As with earlier platforms, but for Q and UQ types on src0, the low
1851     * 6 bits of src1 are used.
1852     *
1853     * Gen9+: The low bits of src1 matching the size of src0 (e.g., 4-bits for
1854     * W or UW src0).
1855     *
1856     * The implication is that the following instruction will produce a
1857     * different result on Gen9+ than on previous platforms:
1858     *
1859     *    shr(8)    g4<1>UW    g12<8,8,1>UW    0x0010UW
1860     *
1861     * where Gen9+ will shift by zero, and earlier platforms will shift by 16.
1862     *
1863     * This does not seem to be the case.  Experimentally, it has been
1864     * determined that shifts of 16-bit values on Gen8 behave properly.  Shifts
1865     * of 8-bit values on both Gen8 and Gen9 do not.  Gen11+ lowers 8-bit
1866     * values, so those platforms were not tested.  No features expose access
1867     * to 8- or 16-bit types on Gen7 or earlier, so those platforms were not
1868     * tested either.  See
1869     * https://gitlab.freedesktop.org/mesa/crucible/-/merge_requests/76.
1870     *
1871     * This is part of the reason 8-bit values are lowered to 16-bit on all
1872     * platforms.
1873     */
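   /* Worked example of the documented masking above (illustrative only): with
    * a UW src0 and src1 = 0x0010, Gen9+ would mask the count to 16 & 0xF = 0
    * (no shift), while Gen8 and earlier would mask to 16 & 0x1F = 16 and shift
    * the value away entirely.
    */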
1874    case nir_op_ishl:
1875       bld.SHL(result, op[0], op[1]);
1876       break;
1877    case nir_op_ishr:
1878       bld.ASR(result, op[0], op[1]);
1879       break;
1880    case nir_op_ushr:
1881       bld.SHR(result, op[0], op[1]);
1882       break;
1883 
1884    case nir_op_urol:
1885       bld.ROL(result, op[0], op[1]);
1886       break;
1887    case nir_op_uror:
1888       bld.ROR(result, op[0], op[1]);
1889       break;
1890 
1891    case nir_op_pack_half_2x16_split:
1892       bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
1893       break;
1894 
1895    case nir_op_sdot_4x8_iadd:
1896    case nir_op_sdot_4x8_iadd_sat:
1897       inst = bld.DP4A(retype(result, BRW_REGISTER_TYPE_D),
1898                       retype(op[2], BRW_REGISTER_TYPE_D),
1899                       retype(op[0], BRW_REGISTER_TYPE_D),
1900                       retype(op[1], BRW_REGISTER_TYPE_D));
1901 
1902       if (instr->op == nir_op_sdot_4x8_iadd_sat)
1903          inst->saturate = true;
1904       break;
1905 
1906    case nir_op_udot_4x8_uadd:
1907    case nir_op_udot_4x8_uadd_sat:
1908       inst = bld.DP4A(retype(result, BRW_REGISTER_TYPE_UD),
1909                       retype(op[2], BRW_REGISTER_TYPE_UD),
1910                       retype(op[0], BRW_REGISTER_TYPE_UD),
1911                       retype(op[1], BRW_REGISTER_TYPE_UD));
1912 
1913       if (instr->op == nir_op_udot_4x8_uadd_sat)
1914          inst->saturate = true;
1915       break;
1916 
1917    case nir_op_sudot_4x8_iadd:
1918    case nir_op_sudot_4x8_iadd_sat:
1919       inst = bld.DP4A(retype(result, BRW_REGISTER_TYPE_D),
1920                       retype(op[2], BRW_REGISTER_TYPE_D),
1921                       retype(op[0], BRW_REGISTER_TYPE_D),
1922                       retype(op[1], BRW_REGISTER_TYPE_UD));
1923 
1924       if (instr->op == nir_op_sudot_4x8_iadd_sat)
1925          inst->saturate = true;
1926       break;
1927 
1928    case nir_op_ffma:
1929       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1930          brw_rnd_mode rnd =
1931             brw_rnd_mode_from_execution_mode(execution_mode);
1932          bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1933                   brw_imm_d(rnd));
1934       }
1935 
1936       inst = bld.MAD(result, op[2], op[1], op[0]);
1937       break;
1938 
1939    case nir_op_flrp:
1940       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1941          brw_rnd_mode rnd =
1942             brw_rnd_mode_from_execution_mode(execution_mode);
1943          bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1944                   brw_imm_d(rnd));
1945       }
1946 
1947       inst = bld.LRP(result, op[0], op[1], op[2]);
1948       break;
1949 
1950    case nir_op_b32csel:
1951       if (optimize_frontfacing_ternary(instr, result))
1952          return;
1953 
1954       bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
1955       inst = bld.SEL(result, op[1], op[2]);
1956       inst->predicate = BRW_PREDICATE_NORMAL;
1957       break;
1958 
1959    case nir_op_extract_u8:
1960    case nir_op_extract_i8: {
1961       unsigned byte = nir_src_as_uint(instr->src[1].src);
1962 
1963       /* The PRMs say:
1964        *
1965        *    BDW+
1966        *    There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
1967        *    Use two instructions and a word or DWord intermediate integer type.
1968        */
1969       if (nir_dest_bit_size(instr->dest.dest) == 64) {
1970          const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
1971 
1972          if (instr->op == nir_op_extract_i8) {
1973             /* If we need to sign extend, extract to a word first */
1974             fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W);
1975             bld.MOV(w_temp, subscript(op[0], type, byte));
1976             bld.MOV(result, w_temp);
1977          } else if (byte & 1) {
1978             /* Extract the high byte from the word containing the desired byte
1979              * offset.
1980              */
1981             bld.SHR(result,
1982                     subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2),
1983                     brw_imm_uw(8));
1984          } else {
1985             /* Otherwise use an AND with 0xff and a word type */
1986             bld.AND(result,
1987                     subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2),
1988                     brw_imm_uw(0xff));
1989          }
1990       } else {
1991          const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
1992          bld.MOV(result, subscript(op[0], type, byte));
1993       }
1994       break;
1995    }
1996 
1997    case nir_op_extract_u16:
1998    case nir_op_extract_i16: {
1999       const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
2000       unsigned word = nir_src_as_uint(instr->src[1].src);
2001       bld.MOV(result, subscript(op[0], type, word));
2002       break;
2003    }
2004 
2005    default:
2006       unreachable("unhandled instruction");
2007    }
2008 
2009    /* If we need to do a boolean resolve, replace the result with -(x & 1)
2010     * to sign extend the low bit to 0/~0
2011     */
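   /* Worked example (illustrative only): a result of 0x00000003 becomes
    * -(3 & 1) = 0xFFFFFFFF (true), while any value with a zero low bit
    * becomes -(0) = 0x00000000 (false).
    */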
2012    if (devinfo->ver <= 5 &&
2013        !result.is_null() &&
2014        (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
2015       fs_reg masked = vgrf(glsl_type::int_type);
2016       bld.AND(masked, result, brw_imm_d(1));
2017       masked.negate = true;
2018       bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
2019    }
2020 }
2021 
2022 void
2023 fs_visitor::nir_emit_load_const(const fs_builder &bld,
2024                                 nir_load_const_instr *instr)
2025 {
2026    const brw_reg_type reg_type =
2027       brw_reg_type_from_bit_size(instr->def.bit_size, BRW_REGISTER_TYPE_D);
2028    fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
2029 
2030    switch (instr->def.bit_size) {
2031    case 8:
2032       for (unsigned i = 0; i < instr->def.num_components; i++)
2033          bld.MOV(offset(reg, bld, i), setup_imm_b(bld, instr->value[i].i8));
2034       break;
2035 
2036    case 16:
2037       for (unsigned i = 0; i < instr->def.num_components; i++)
2038          bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value[i].i16));
2039       break;
2040 
2041    case 32:
2042       for (unsigned i = 0; i < instr->def.num_components; i++)
2043          bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value[i].i32));
2044       break;
2045 
2046    case 64:
2047       assert(devinfo->ver >= 7);
2048       if (devinfo->ver == 7) {
2049          /* We don't get 64-bit integer types until gfx8 */
2050          for (unsigned i = 0; i < instr->def.num_components; i++) {
2051             bld.MOV(retype(offset(reg, bld, i), BRW_REGISTER_TYPE_DF),
2052                     setup_imm_df(bld, instr->value[i].f64));
2053          }
2054       } else {
2055          for (unsigned i = 0; i < instr->def.num_components; i++)
2056             bld.MOV(offset(reg, bld, i), brw_imm_q(instr->value[i].i64));
2057       }
2058       break;
2059 
2060    default:
2061       unreachable("Invalid bit size");
2062    }
2063 
2064    nir_ssa_values[instr->def.index] = reg;
2065 }
2066 
2067 fs_reg
2068 fs_visitor::get_nir_src(const nir_src &src)
2069 {
2070    fs_reg reg;
2071    if (src.is_ssa) {
2072       if (nir_src_is_undef(src)) {
2073          const brw_reg_type reg_type =
2074             brw_reg_type_from_bit_size(src.ssa->bit_size, BRW_REGISTER_TYPE_D);
2075          reg = bld.vgrf(reg_type, src.ssa->num_components);
2076       } else {
2077          reg = nir_ssa_values[src.ssa->index];
2078       }
2079    } else {
2080       /* We don't handle indirects on locals */
2081       assert(src.reg.indirect == NULL);
2082       reg = offset(nir_locals[src.reg.reg->index], bld,
2083                    src.reg.base_offset * src.reg.reg->num_components);
2084    }
2085 
2086    if (nir_src_bit_size(src) == 64 && devinfo->ver == 7) {
2087       /* The only 64-bit type available on gfx7 is DF, so use that. */
2088       reg.type = BRW_REGISTER_TYPE_DF;
2089    } else {
2090       /* To avoid floating-point denorm flushing problems, set the type by
2091        * default to an integer type - instructions that need floating point
2092        * semantics will set this to F if they need to
2093        */
2094       reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src),
2095                                             BRW_REGISTER_TYPE_D);
2096    }
2097 
2098    return reg;
2099 }
2100 
2101 /**
2102  * Return an IMM for constants; otherwise call get_nir_src() as normal.
2103  *
2104  * This function should not be called on any value which may be 64 bits.
2105  * We could theoretically support 64-bit on gfx8+ but we choose not to
2106  * because it wouldn't work in general (no gfx7 support) and there are
2107  * enough restrictions in 64-bit immediates that you can't take the return
2108  * value and treat it the same as the result of get_nir_src().
2109  */
2110 fs_reg
2111 fs_visitor::get_nir_src_imm(const nir_src &src)
2112 {
2113    assert(nir_src_bit_size(src) == 32);
2114    return nir_src_is_const(src) ?
2115           fs_reg(brw_imm_d(nir_src_as_int(src))) : get_nir_src(src);
2116 }
2117 
2118 fs_reg
2119 fs_visitor::get_nir_dest(const nir_dest &dest)
2120 {
2121    if (dest.is_ssa) {
2122       const brw_reg_type reg_type =
2123          brw_reg_type_from_bit_size(dest.ssa.bit_size,
2124                                     dest.ssa.bit_size == 8 ?
2125                                     BRW_REGISTER_TYPE_D :
2126                                     BRW_REGISTER_TYPE_F);
2127       nir_ssa_values[dest.ssa.index] =
2128          bld.vgrf(reg_type, dest.ssa.num_components);
2129       bld.UNDEF(nir_ssa_values[dest.ssa.index]);
2130       return nir_ssa_values[dest.ssa.index];
2131    } else {
2132       /* We don't handle indirects on locals */
2133       assert(dest.reg.indirect == NULL);
2134       return offset(nir_locals[dest.reg.reg->index], bld,
2135                     dest.reg.base_offset * dest.reg.reg->num_components);
2136    }
2137 }
2138 
2139 void
2140 fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
2141                          unsigned wr_mask)
2142 {
2143    for (unsigned i = 0; i < 4; i++) {
2144       if (!((wr_mask >> i) & 1))
2145          continue;
2146 
2147       fs_inst *new_inst = new(mem_ctx) fs_inst(inst);
2148       new_inst->dst = offset(new_inst->dst, bld, i);
2149       for (unsigned j = 0; j < new_inst->sources; j++)
2150          if (new_inst->src[j].file == VGRF)
2151             new_inst->src[j] = offset(new_inst->src[j], bld, i);
2152 
2153       bld.emit(new_inst);
2154    }
2155 }
2156 
2157 static fs_inst *
2158 emit_pixel_interpolater_send(const fs_builder &bld,
2159                              enum opcode opcode,
2160                              const fs_reg &dst,
2161                              const fs_reg &src,
2162                              const fs_reg &desc,
2163                              glsl_interp_mode interpolation)
2164 {
2165    struct brw_wm_prog_data *wm_prog_data =
2166       brw_wm_prog_data(bld.shader->stage_prog_data);
2167 
2168    fs_inst *inst = bld.emit(opcode, dst, src, desc);
2169    /* 2 floats per slot returned */
2170    inst->size_written = 2 * dst.component_size(inst->exec_size);
2171    if (interpolation == INTERP_MODE_NOPERSPECTIVE) {
2172       inst->pi_noperspective = true;
2173       /* TGL BSpec says:
2174        *     This field cannot be set to "Linear Interpolation"
2175        *     unless Non-Perspective Barycentric Enable in 3DSTATE_CLIP is enabled"
2176        */
2177       wm_prog_data->uses_nonperspective_interp_modes = true;
2178    }
2179 
2180    wm_prog_data->pulls_bary = true;
2181 
2182    return inst;
2183 }
2184 
2185 /**
2186  * Computes 1 << x, given a D/UD register containing some value x.
2187  */
2188 static fs_reg
2189 intexp2(const fs_builder &bld, const fs_reg &x)
2190 {
2191    assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);
2192 
2193    fs_reg result = bld.vgrf(x.type, 1);
2194    fs_reg one = bld.vgrf(x.type, 1);
2195 
2196    bld.MOV(one, retype(brw_imm_d(1), one.type));
2197    bld.SHL(result, one, x);
2198    return result;
2199 }
2200 
2201 void
2202 fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
2203 {
2204    assert(stage == MESA_SHADER_GEOMETRY);
2205 
2206    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2207 
2208    if (gs_compile->control_data_header_size_bits == 0)
2209       return;
2210 
2211    /* We can only do EndPrimitive() functionality when the control data
2212     * consists of cut bits.  Fortunately, the only time it isn't is when the
2213     * output type is points, in which case EndPrimitive() is a no-op.
2214     */
2215    if (gs_prog_data->control_data_format !=
2216        GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
2217       return;
2218    }
2219 
2220    /* Cut bits use one bit per vertex. */
2221    assert(gs_compile->control_data_bits_per_vertex == 1);
2222 
2223    fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
2224    vertex_count.type = BRW_REGISTER_TYPE_UD;
2225 
2226    /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
2227     * vertex n, 0 otherwise.  So all we need to do here is mark bit
2228     * (vertex_count - 1) % 32 in the cut_bits register to indicate that
2229     * EndPrimitive() was called after emitting vertex (vertex_count - 1);
2230     * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
2231     *
2232     * Note that if EndPrimitive() is called before emitting any vertices, this
2233     * will cause us to set bit 31 of the control_data_bits register to 1.
2234     * That's fine because:
2235     *
2236     * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
2237     *   output, so the hardware will ignore cut bit 31.
2238     *
2239     * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
2240     *   last vertex, so setting cut bit 31 has no effect (since the primitive
2241     *   is automatically ended when the GS terminates).
2242     *
2243     * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
2244     *   control_data_bits register to 0 when the first vertex is emitted.
2245     */
2246 
2247    const fs_builder abld = bld.annotate("end primitive");
2248 
2249    /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
2250    fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2251    abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
2252    fs_reg mask = intexp2(abld, prev_count);
2253    /* Note: we're relying on the fact that the GEN SHL instruction only pays
2254     * attention to the lower 5 bits of its second source argument, so on this
2255     * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
2256     * ((vertex_count - 1) % 32).
2257     */
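   /* Worked example (illustrative only): for vertex_count = 34, prev_count is
    * 33; the SHL only uses 33 & 31 = 1, so mask = 1 << 1, i.e. cut bit
    * (34 - 1) % 32, as intended.
    */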
2258    abld.OR(this->control_data_bits, this->control_data_bits, mask);
2259 }
2260 
2261 void
2262 fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
2263 {
2264    assert(stage == MESA_SHADER_GEOMETRY);
2265    assert(gs_compile->control_data_bits_per_vertex != 0);
2266 
2267    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2268 
2269    const fs_builder abld = bld.annotate("emit control data bits");
2270    const fs_builder fwa_bld = bld.exec_all();
2271 
2272    /* We use a single UD register to accumulate control data bits (32 bits
2273     * for each of the SIMD8 channels).  So we need to write a DWord (32 bits)
2274     * at a time.
2275     *
2276     * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
2277     * We have to select a 128-bit group via the Global and Per-Slot Offsets, then
2278     * use the Channel Mask phase to enable/disable which DWord within that
2279     * group to write.  (Remember, different SIMD8 channels may have emitted
2280     * different numbers of vertices, so we may need per-slot offsets.)
2281     *
2282     * Channel masking presents an annoying problem: we may have to replicate
2283     * the data up to 4 times:
2284     *
2285     * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
2286     *
2287     * To avoid penalizing shaders that emit a small number of vertices, we
2288     * can avoid these sometimes: if the size of the control data header is
2289     * <= 128 bits, then there is only 1 OWord.  All SIMD8 channels will land
2290     * in the same 128-bit group, so we can skip per-slot offsets.
2291     *
2292     * Similarly, if the control data header is <= 32 bits, there is only one
2293     * DWord, so we can skip channel masks.
2294     */
2295    fs_reg channel_mask, per_slot_offset;
2296 
2297    if (gs_compile->control_data_header_size_bits > 32)
2298       channel_mask = vgrf(glsl_type::uint_type);
2299 
2300    if (gs_compile->control_data_header_size_bits > 128)
2301       per_slot_offset = vgrf(glsl_type::uint_type);
2302 
2303    /* Figure out which DWord we're trying to write to using the formula:
2304     *
2305     *    dword_index = (vertex_count - 1) * bits_per_vertex / 32
2306     *
2307     * Since bits_per_vertex is a power of two, and is known at compile
2308     * time, this can be optimized to:
2309     *
2310     *    dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
2311     */
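   /* Worked example (illustrative only): with bits_per_vertex = 2,
    * util_last_bit(2) = 2 below, so the shift is 6 - 2 = 4 and
    * dword_index = (vertex_count - 1) >> 4, which equals
    * (vertex_count - 1) * 2 / 32.
    */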
2312    if (channel_mask.file != BAD_FILE || per_slot_offset.file != BAD_FILE) {
2313       fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2314       fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2315       abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
2316       unsigned log2_bits_per_vertex =
2317          util_last_bit(gs_compile->control_data_bits_per_vertex);
2318       abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
2319 
2320       if (per_slot_offset.file != BAD_FILE) {
2321          /* Set the per-slot offset to dword_index / 4, so that we'll write to
2322           * the appropriate OWord within the control data header.
2323           */
2324          abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u));
2325       }
2326 
2327       /* Set the channel masks to 1 << (dword_index % 4), so that we'll
2328        * write to the appropriate DWORD within the OWORD.
2329        */
2330       fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2331       fwa_bld.AND(channel, dword_index, brw_imm_ud(3u));
2332       channel_mask = intexp2(fwa_bld, channel);
2333       /* Then the channel masks need to be in bits 23:16. */
2334       fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u));
2335    }
2336 
2337    /* Store the control data bits in the message payload and send it. */
2338    const unsigned header_size = 1 + unsigned(channel_mask.file != BAD_FILE) +
2339       unsigned(per_slot_offset.file != BAD_FILE);
2340 
2341    /* If there are channel masks, add 3 extra copies of the data. */
2342    const unsigned length = 1 + 3 * unsigned(channel_mask.file != BAD_FILE);
2343 
2344    fs_reg sources[4];
2345 
2346    for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
2347       sources[i] = this->control_data_bits;
2348 
2349    fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2350    srcs[URB_LOGICAL_SRC_HANDLE] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
2351    srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offset;
2352    srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = channel_mask;
2353    srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, alloc.allocate(length),
2354                                        BRW_REGISTER_TYPE_F);
2355    abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
2356 
2357    fs_inst *inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
2358                              srcs, ARRAY_SIZE(srcs));
2359    inst->mlen = header_size + length;
2360    /* We need to increment Global Offset by 256-bits to make room for
2361     * Broadwell's extra "Vertex Count" payload at the beginning of the
2362     * URB entry.  Since this is an OWord message, Global Offset is counted
2363     * in 128-bit units, so we must set it to 2.
2364     */
2365    if (gs_prog_data->static_vertex_count == -1)
2366       inst->offset = 2;
2367 }
2368 
2369 void
2370 fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
2371                                             unsigned stream_id)
2372 {
2373    /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
2374 
2375    /* Note: we are calling this *before* increasing vertex_count, so
2376     * this->vertex_count == vertex_count - 1 in the formula above.
2377     */
2378 
2379    /* Stream mode uses 2 bits per vertex */
2380    assert(gs_compile->control_data_bits_per_vertex == 2);
2381 
2382    /* Must be a valid stream */
2383    assert(stream_id < MAX_VERTEX_STREAMS);
2384 
2385    /* Control data bits are initialized to 0 so we don't have to set any
2386     * bits when sending vertices to stream 0.
2387     */
2388    if (stream_id == 0)
2389       return;
2390 
2391    const fs_builder abld = bld.annotate("set stream control data bits", NULL);
2392 
2393    /* reg::sid = stream_id */
2394    fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2395    abld.MOV(sid, brw_imm_ud(stream_id));
2396 
2397    /* reg:shift_count = 2 * (vertex_count - 1) */
2398    fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2399    abld.SHL(shift_count, vertex_count, brw_imm_ud(1u));
2400 
2401    /* Note: we're relying on the fact that the GEN SHL instruction only pays
2402     * attention to the lower 5 bits of its second source argument, so on this
2403     * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
2404     * stream_id << ((2 * (vertex_count - 1)) % 32).
2405     */
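   /* Worked example (illustrative only): with stream_id = 2 and a vertex_count
    * register value of 5 (the sixth vertex is about to be emitted),
    * shift_count = 10 and mask = 2 << 10 = 0x800, writing the value 2 into the
    * two-bit field at bits 11:10 that belongs to that vertex.
    */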
2406    fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2407    abld.SHL(mask, sid, shift_count);
2408    abld.OR(this->control_data_bits, this->control_data_bits, mask);
2409 }
2410 
2411 void
2412 fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
2413                            unsigned stream_id)
2414 {
2415    assert(stage == MESA_SHADER_GEOMETRY);
2416 
2417    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2418 
2419    fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
2420    vertex_count.type = BRW_REGISTER_TYPE_UD;
2421 
2422    /* Haswell and later hardware ignores the "Render Stream Select" bits
2423     * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
2424     * and instead sends all primitives down the pipeline for rasterization.
2425     * If the SOL stage is enabled, "Render Stream Select" is honored and
2426     * primitives bound to non-zero streams are discarded after stream output.
2427     *
2428     * Since the only purpose of primitives sent to non-zero streams is to
2429     * be recorded by transform feedback, we can simply discard all geometry
2430     * bound to these streams when transform feedback is disabled.
2431     */
2432    if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
2433       return;
2434 
2435    /* If we're outputting 32 control data bits or less, then we can wait
2436     * until the shader is over to output them all.  Otherwise we need to
2437     * output them as we go.  Now is the time to do it, since we're about to
2438     * output the vertex_count'th vertex, so it's guaranteed that the
2439     * control data bits associated with the (vertex_count - 1)th vertex are
2440     * correct.
2441     */
2442    if (gs_compile->control_data_header_size_bits > 32) {
2443       const fs_builder abld =
2444          bld.annotate("emit vertex: emit control data bits");
2445 
2446       /* Only emit control data bits if we've finished accumulating a batch
2447        * of 32 bits.  This is the case when:
2448        *
2449        *     (vertex_count * bits_per_vertex) % 32 == 0
2450        *
2451        * (in other words, when the last 5 bits of vertex_count *
2452        * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
2453        * integer n (which is always the case, since bits_per_vertex is
2454        * always 1 or 2), this is equivalent to requiring that the last 5-n
2455        * bits of vertex_count are 0:
2456        *
2457        *     vertex_count & (2^(5-n) - 1) == 0
2458        *
2459        * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
2460        * equivalent to:
2461        *
2462        *     vertex_count & (32 / bits_per_vertex - 1) == 0
2463        *
2464        * TODO: If vertex_count is an immediate, we could do some of this math
2465        *       at compile time...
2466        */
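      /* Worked example (illustrative only): with bits_per_vertex = 2 the mask
       * below is 32 / 2 - 1 = 15, so the AND is zero once every 16 vertices,
       * i.e. exactly when another 32 bits of control data have accumulated.
       */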
2467       fs_inst *inst =
2468          abld.AND(bld.null_reg_d(), vertex_count,
2469                   brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u));
2470       inst->conditional_mod = BRW_CONDITIONAL_Z;
2471 
2472       abld.IF(BRW_PREDICATE_NORMAL);
2473       /* If vertex_count is 0, then no control data bits have been
2474        * accumulated yet, so we can skip emitting them.
2475        */
2476       abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
2477                BRW_CONDITIONAL_NEQ);
2478       abld.IF(BRW_PREDICATE_NORMAL);
2479       emit_gs_control_data_bits(vertex_count);
2480       abld.emit(BRW_OPCODE_ENDIF);
2481 
2482       /* Reset control_data_bits to 0 so we can start accumulating a new
2483        * batch.
2484        *
2485        * Note: in the case where vertex_count == 0, this neutralizes the
2486        * effect of any call to EndPrimitive() that the shader may have
2487        * made before outputting its first vertex.
2488        */
2489       inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u));
2490       inst->force_writemask_all = true;
2491       abld.emit(BRW_OPCODE_ENDIF);
2492    }
2493 
2494    emit_urb_writes(vertex_count);
2495 
2496    /* In stream mode we have to set control data bits for all vertices
2497     * unless we have disabled control data bits completely (which we do
2498     * for GL_POINTS outputs that don't use streams).
2499     */
2500    if (gs_compile->control_data_header_size_bits > 0 &&
2501        gs_prog_data->control_data_format ==
2502           GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
2503       set_gs_stream_control_data_bits(vertex_count, stream_id);
2504    }
2505 }
2506 
2507 void
2508 fs_visitor::emit_gs_input_load(const fs_reg &dst,
2509                                const nir_src &vertex_src,
2510                                unsigned base_offset,
2511                                const nir_src &offset_src,
2512                                unsigned num_components,
2513                                unsigned first_component)
2514 {
2515    assert(type_sz(dst.type) == 4);
2516    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2517    const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
2518 
2519    /* TODO: figure out push input layout for invocations == 1 */
2520    if (gs_prog_data->invocations == 1 &&
2521        nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) &&
2522        4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) {
2523       int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 +
2524                        nir_src_as_uint(vertex_src) * push_reg_count;
2525       for (unsigned i = 0; i < num_components; i++) {
2526          bld.MOV(offset(dst, bld, i),
2527                  fs_reg(ATTR, imm_offset + i + first_component, dst.type));
2528       }
2529       return;
2530    }
2531 
2532    /* Resort to the pull model.  Ensure the VUE handles are provided. */
2533    assert(gs_prog_data->base.include_vue_handles);
2534 
2535    unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2;
2536    fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2537 
2538    if (gs_prog_data->invocations == 1) {
2539       if (nir_src_is_const(vertex_src)) {
2540          /* The vertex index is constant; just select the proper URB handle. */
2541          icp_handle =
2542             retype(brw_vec8_grf(first_icp_handle + nir_src_as_uint(vertex_src), 0),
2543                    BRW_REGISTER_TYPE_UD);
2544       } else {
2545          /* The vertex index is non-constant.  We need to use indirect
2546           * addressing to fetch the proper URB handle.
2547           *
2548           * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
2549           * indicating that channel <n> should read the handle from
2550           * DWord <n>.  We convert that to bytes by multiplying by 4.
2551           *
2552           * Next, we convert the vertex index to bytes by multiplying
2553           * by 32 (shifting by 5), and add the two together.  This is
2554           * the final indirect byte offset.
2555           */
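         /* Worked example (illustrative only): for a dynamic vertex index of
          * 3, vertex_offset_bytes = 3 << 5 = 96 and the per-channel byte
          * offsets become 96, 100, ..., 124, i.e. the eight DWords of the
          * URB-handle register belonging to vertex 3.
          */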
2556          fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);
2557          fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2558          fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2559          fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2560 
2561          /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
2562          bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
2563          /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
2564          bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
2565          /* Convert vertex_index to bytes (multiply by 32) */
2566          bld.SHL(vertex_offset_bytes,
2567                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2568                  brw_imm_ud(5u));
2569          bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2570 
2571          /* Use first_icp_handle as the base offset.  There is one register
2572           * of URB handles per vertex, so inform the register allocator that
2573           * we might read up to nir->info.gs.vertices_in registers.
2574           */
2575          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2576                   retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2577                   fs_reg(icp_offset_bytes),
2578                   brw_imm_ud(nir->info.gs.vertices_in * REG_SIZE));
2579       }
2580    } else {
2581       assert(gs_prog_data->invocations > 1);
2582 
2583       if (nir_src_is_const(vertex_src)) {
2584          unsigned vertex = nir_src_as_uint(vertex_src);
2585          assert(devinfo->ver >= 9 || vertex <= 5);
2586          bld.MOV(icp_handle,
2587                  retype(brw_vec1_grf(first_icp_handle + vertex / 8, vertex % 8),
2588                         BRW_REGISTER_TYPE_UD));
2589       } else {
2590          /* The vertex index is non-constant.  We need to use indirect
2591           * addressing to fetch the proper URB handle.
2592           *
2593           */
2594          fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2595 
2596          /* Convert vertex_index to bytes (multiply by 4) */
2597          bld.SHL(icp_offset_bytes,
2598                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2599                  brw_imm_ud(2u));
2600 
2601          /* Use first_icp_handle as the base offset.  There is one DWord
2602           * of URB handles per vertex, so inform the register allocator that
2603           * we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
2604           */
2605          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2606                   retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2607                   fs_reg(icp_offset_bytes),
2608                   brw_imm_ud(DIV_ROUND_UP(nir->info.gs.vertices_in, 8) *
2609                              REG_SIZE));
2610       }
2611    }
2612 
2613    fs_inst *inst;
2614    fs_reg indirect_offset = get_nir_src(offset_src);
2615 
2616    if (nir_src_is_const(offset_src)) {
2617       fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2618       srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2619 
2620       /* Constant indexing - use global offset. */
2621       if (first_component != 0) {
2622          unsigned read_components = num_components + first_component;
2623          fs_reg tmp = bld.vgrf(dst.type, read_components);
2624          inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs,
2625                          ARRAY_SIZE(srcs));
2626          inst->size_written = read_components *
2627                               tmp.component_size(inst->exec_size);
2628          for (unsigned i = 0; i < num_components; i++) {
2629             bld.MOV(offset(dst, bld, i),
2630                     offset(tmp, bld, i + first_component));
2631          }
2632       } else {
2633          inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs,
2634                          ARRAY_SIZE(srcs));
2635          inst->size_written = num_components *
2636                               dst.component_size(inst->exec_size);
2637       }
2638       inst->offset = base_offset + nir_src_as_uint(offset_src);
2639       inst->mlen = 1;
2640    } else {
2641       /* Indirect indexing - use per-slot offsets as well. */
2642       unsigned read_components = num_components + first_component;
2643       fs_reg tmp = bld.vgrf(dst.type, read_components);
2644 
2645       fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2646       srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2647       srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
2648 
2649       if (first_component != 0) {
2650          inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2651                          srcs, ARRAY_SIZE(srcs));
2652          inst->size_written = read_components *
2653                               tmp.component_size(inst->exec_size);
2654          for (unsigned i = 0; i < num_components; i++) {
2655             bld.MOV(offset(dst, bld, i),
2656                     offset(tmp, bld, i + first_component));
2657          }
2658       } else {
2659          inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst,
2660                          srcs, ARRAY_SIZE(srcs));
2661          inst->size_written = num_components *
2662                               dst.component_size(inst->exec_size);
2663       }
2664       inst->offset = base_offset;
2665       inst->mlen = 2;
2666    }
2667 }
2668 
2669 fs_reg
2670 fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
2671 {
2672    nir_src *offset_src = nir_get_io_offset_src(instr);
2673 
2674    if (nir_src_is_const(*offset_src)) {
2675       /* The only constant offset we should find is 0.  brw_nir.c's
2676        * add_const_offset_to_base() will fold other constant offsets
2677        * into instr->const_index[0].
2678        */
2679       assert(nir_src_as_uint(*offset_src) == 0);
2680       return fs_reg();
2681    }
2682 
2683    return get_nir_src(*offset_src);
2684 }
2685 
2686 void
2687 fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
2688                                   nir_intrinsic_instr *instr)
2689 {
2690    assert(stage == MESA_SHADER_VERTEX);
2691 
2692    fs_reg dest;
2693    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2694       dest = get_nir_dest(instr->dest);
2695 
2696    switch (instr->intrinsic) {
2697    case nir_intrinsic_load_vertex_id:
2698    case nir_intrinsic_load_base_vertex:
2699       unreachable("should be lowered by nir_lower_system_values()");
2700 
2701    case nir_intrinsic_load_input: {
2702       assert(nir_dest_bit_size(instr->dest) == 32);
2703       fs_reg src = fs_reg(ATTR, nir_intrinsic_base(instr) * 4, dest.type);
2704       src = offset(src, bld, nir_intrinsic_component(instr));
2705       src = offset(src, bld, nir_src_as_uint(instr->src[0]));
2706 
2707       for (unsigned i = 0; i < instr->num_components; i++)
2708          bld.MOV(offset(dest, bld, i), offset(src, bld, i));
2709       break;
2710    }
2711 
2712    case nir_intrinsic_load_vertex_id_zero_base:
2713    case nir_intrinsic_load_instance_id:
2714    case nir_intrinsic_load_base_instance:
2715    case nir_intrinsic_load_draw_id:
2716    case nir_intrinsic_load_first_vertex:
2717    case nir_intrinsic_load_is_indexed_draw:
2718       unreachable("lowered by brw_nir_lower_vs_inputs");
2719 
2720    default:
2721       nir_emit_intrinsic(bld, instr);
2722       break;
2723    }
2724 }
2725 
2726 fs_reg
2727 fs_visitor::get_tcs_single_patch_icp_handle(const fs_builder &bld,
2728                                             nir_intrinsic_instr *instr)
2729 {
2730    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
2731    const nir_src &vertex_src = instr->src[0];
2732    nir_intrinsic_instr *vertex_intrin = nir_src_as_intrinsic(vertex_src);
2733    fs_reg icp_handle;
2734 
2735    if (nir_src_is_const(vertex_src)) {
2736       /* Emit a MOV to resolve <0,1,0> regioning. */
2737       icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2738       unsigned vertex = nir_src_as_uint(vertex_src);
2739       bld.MOV(icp_handle,
2740               retype(brw_vec1_grf(1 + (vertex >> 3), vertex & 7),
2741                      BRW_REGISTER_TYPE_UD));
2742    } else if (tcs_prog_data->instances == 1 && vertex_intrin &&
2743               vertex_intrin->intrinsic == nir_intrinsic_load_invocation_id) {
2744       /* For the common case of only 1 instance, an array index of
2745        * gl_InvocationID means reading g1.  Skip all the indirect work.
2746        */
2747       icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
2748    } else {
2749       /* The vertex index is non-constant.  We need to use indirect
2750        * addressing to fetch the proper URB handle.
2751        */
2752       icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2753 
2754       /* Each ICP handle is a single DWord (4 bytes) */
2755       fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2756       bld.SHL(vertex_offset_bytes,
2757               retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2758               brw_imm_ud(2u));
2759 
2760       /* Start at g1.  We might read up to 4 registers. */
2761       bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2762                retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes,
2763                brw_imm_ud(4 * REG_SIZE));
2764    }
2765 
2766    return icp_handle;
2767 }
2768 
2769 fs_reg
2770 fs_visitor::get_tcs_eight_patch_icp_handle(const fs_builder &bld,
2771                                            nir_intrinsic_instr *instr)
2772 {
2773    struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
2774    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
2775    const nir_src &vertex_src = instr->src[0];
2776 
2777    unsigned first_icp_handle = tcs_prog_data->include_primitive_id ? 3 : 2;
2778 
2779    if (nir_src_is_const(vertex_src)) {
2780       return fs_reg(retype(brw_vec8_grf(first_icp_handle +
2781                                         nir_src_as_uint(vertex_src), 0),
2782                            BRW_REGISTER_TYPE_UD));
2783    }
2784 
2785    /* The vertex index is non-constant.  We need to use indirect
2786     * addressing to fetch the proper URB handle.
2787     *
2788     * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
2789     * indicating that channel <n> should read the handle from
2790     * DWord <n>.  We convert that to bytes by multiplying by 4.
2791     *
2792     * Next, we convert the vertex index to bytes by multiplying
2793     * by 32 (shifting by 5), and add the two together.  This is
2794     * the final indirect byte offset.
2795     */
2796    fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2797    fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);
2798    fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2799    fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2800    fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2801 
2802    /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
2803    bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
2804    /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
2805    bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
2806    /* Convert vertex_index to bytes (multiply by 32) */
2807    bld.SHL(vertex_offset_bytes,
2808            retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2809            brw_imm_ud(5u));
2810    bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2811 
2812    /* Use first_icp_handle as the base offset.  There is one register
2813     * of URB handles per vertex, so inform the register allocator that
2814     * we might read up to nir->info.gs.vertices_in registers.
2815     */
2816    bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2817             retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2818             icp_offset_bytes, brw_imm_ud(tcs_key->input_vertices * REG_SIZE));
2819 
2820    return icp_handle;
2821 }
2822 
2823 struct brw_reg
2824 fs_visitor::get_tcs_output_urb_handle()
2825 {
2826    struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
2827 
2828    if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH) {
2829       return retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
2830    } else {
2831       assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);
2832       return retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
2833    }
2834 }
2835 
2836 void
2837 fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
2838                                    nir_intrinsic_instr *instr)
2839 {
2840    assert(stage == MESA_SHADER_TESS_CTRL);
2841    struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
2842    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
2843    struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
2844 
2845    bool eight_patch =
2846       vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH;
2847 
2848    fs_reg dst;
2849    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2850       dst = get_nir_dest(instr->dest);
2851 
2852    switch (instr->intrinsic) {
2853    case nir_intrinsic_load_primitive_id:
2854       bld.MOV(dst, fs_reg(eight_patch ? brw_vec8_grf(2, 0)
2855                                       : brw_vec1_grf(0, 1)));
2856       break;
2857    case nir_intrinsic_load_invocation_id:
2858       bld.MOV(retype(dst, invocation_id.type), invocation_id);
2859       break;
2860    case nir_intrinsic_load_patch_vertices_in:
2861       bld.MOV(retype(dst, BRW_REGISTER_TYPE_D),
2862               brw_imm_d(tcs_key->input_vertices));
2863       break;
2864 
2865    case nir_intrinsic_control_barrier: {
2866       if (tcs_prog_data->instances == 1)
2867          break;
2868 
2869       fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2870       fs_reg m0_2 = component(m0, 2);
2871 
2872       const fs_builder chanbld = bld.exec_all().group(1, 0);
2873 
2874       /* Zero the message header */
2875       bld.exec_all().MOV(m0, brw_imm_ud(0u));
2876 
2877       if (devinfo->verx10 >= 125) {
2878          /* From BSpec: 54006, mov r0.2[31:24] into m0.2[31:24] and m0.2[23:16] */
2879          fs_reg m0_10ub = component(retype(m0, BRW_REGISTER_TYPE_UB), 10);
2880          fs_reg r0_11ub =
2881             stride(suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UB), 11),
2882                    0, 1, 0);
2883          bld.exec_all().group(2, 0).MOV(m0_10ub, r0_11ub);
2884       } else if (devinfo->ver >= 11) {
2885          chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
2886                      brw_imm_ud(INTEL_MASK(30, 24)));
2887 
2888          /* Set the Barrier Count and the enable bit */
2889          chanbld.OR(m0_2, m0_2,
2890                     brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15)));
2891       } else {
2892          /* Copy "Barrier ID" from r0.2, bits 16:13 */
2893          chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
2894                      brw_imm_ud(INTEL_MASK(16, 13)));
2895 
2896          /* Shift it up to bits 27:24. */
2897          chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));
2898 
2899          /* Set the Barrier Count and the enable bit */
2900          chanbld.OR(m0_2, m0_2,
2901                     brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
2902       }
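      /* At this point m0.2 carries the barrier ID copied from r0.2, together
       * with the barrier count and enable bit where the generation requires
       * them, in the layout expected by the gateway barrier message.
       */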
2903 
2904       bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
2905       break;
2906    }
2907 
2908    case nir_intrinsic_load_input:
2909       unreachable("nir_lower_io should never give us these.");
2910       break;
2911 
2912    case nir_intrinsic_load_per_vertex_input: {
2913       assert(nir_dest_bit_size(instr->dest) == 32);
2914       fs_reg indirect_offset = get_indirect_offset(instr);
2915       unsigned imm_offset = instr->const_index[0];
2916       fs_inst *inst;
2917 
2918       fs_reg icp_handle =
2919          eight_patch ? get_tcs_eight_patch_icp_handle(bld, instr)
2920                      : get_tcs_single_patch_icp_handle(bld, instr);
2921 
2922       /* We can only read two double components with each URB read, so
2923        * we send two read messages in that case, each one loading up to
2924        * two double components.
2925        */
2926       unsigned num_components = instr->num_components;
2927       unsigned first_component = nir_intrinsic_component(instr);
2928 
2929       fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2930       srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2931 
2932       if (indirect_offset.file == BAD_FILE) {
2933          /* Constant indexing - use global offset. */
2934          if (first_component != 0) {
2935             unsigned read_components = num_components + first_component;
2936             fs_reg tmp = bld.vgrf(dst.type, read_components);
2937             inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs,
2938                             ARRAY_SIZE(srcs));
2939             for (unsigned i = 0; i < num_components; i++) {
2940                bld.MOV(offset(dst, bld, i),
2941                        offset(tmp, bld, i + first_component));
2942             }
2943          } else {
2944             inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs,
2945                             ARRAY_SIZE(srcs));
2946          }
2947          inst->offset = imm_offset;
2948          inst->mlen = 1;
2949       } else {
2950          /* Indirect indexing - use per-slot offsets as well. */
2951          srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
2952 
2953          if (first_component != 0) {
2954             unsigned read_components = num_components + first_component;
2955             fs_reg tmp = bld.vgrf(dst.type, read_components);
2956             inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2957                             srcs, ARRAY_SIZE(srcs));
2958             for (unsigned i = 0; i < num_components; i++) {
2959                bld.MOV(offset(dst, bld, i),
2960                        offset(tmp, bld, i + first_component));
2961             }
2962          } else {
2963             inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst,
2964                             srcs, ARRAY_SIZE(srcs));
2965          }
2966          inst->offset = imm_offset;
2967          inst->mlen = 2;
2968       }
2969       inst->size_written = (num_components + first_component) *
2970                            inst->dst.component_size(inst->exec_size);
2971 
2972       /* Copy the temporary to the destination to deal with writemasking.
2973        *
2974        * Also attempt to deal with gl_PointSize being in the .w component.
2975        */
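      /* A constant offset of 0 reads the first vec4 slot of the vertex, where
       * gl_PointSize is expected to live in the .w component; read the whole
       * vec4 and then MOV component 3 into the destination.
       */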
2976       if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
2977          assert(type_sz(dst.type) == 4);
2978          inst->dst = bld.vgrf(dst.type, 4);
2979          inst->size_written = 4 * REG_SIZE;
2980          bld.MOV(dst, offset(inst->dst, bld, 3));
2981       }
2982       break;
2983    }
2984 
2985    case nir_intrinsic_load_output:
2986    case nir_intrinsic_load_per_vertex_output: {
2987       assert(nir_dest_bit_size(instr->dest) == 32);
2988       fs_reg indirect_offset = get_indirect_offset(instr);
2989       unsigned imm_offset = instr->const_index[0];
2990       unsigned first_component = nir_intrinsic_component(instr);
2991 
2992       struct brw_reg output_handles = get_tcs_output_urb_handle();
2993 
2994       fs_inst *inst;
2995       if (indirect_offset.file == BAD_FILE) {
2996          /* This MOV replicates the output handle to all enabled channels
2997           * in SINGLE_PATCH mode.
2998           */
2999          fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
3000          bld.MOV(patch_handle, output_handles);
3001 
3002          {
3003             fs_reg srcs[URB_LOGICAL_NUM_SRCS];
3004             srcs[URB_LOGICAL_SRC_HANDLE] = patch_handle;
3005 
3006             if (first_component != 0) {
3007                unsigned read_components =
3008                   instr->num_components + first_component;
3009                fs_reg tmp = bld.vgrf(dst.type, read_components);
3010                inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3011                                srcs, ARRAY_SIZE(srcs));
3012                inst->size_written = read_components * REG_SIZE;
3013                for (unsigned i = 0; i < instr->num_components; i++) {
3014                   bld.MOV(offset(dst, bld, i),
3015                           offset(tmp, bld, i + first_component));
3016                }
3017             } else {
3018                inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst,
3019                                srcs, ARRAY_SIZE(srcs));
3020                inst->size_written = instr->num_components * REG_SIZE;
3021             }
3022             inst->offset = imm_offset;
3023             inst->mlen = 1;
3024          }
3025       } else {
3026          /* Indirect indexing - use per-slot offsets as well. */
3027          fs_reg srcs[URB_LOGICAL_NUM_SRCS];
3028          srcs[URB_LOGICAL_SRC_HANDLE] = output_handles;
3029          srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
3030 
3031          if (first_component != 0) {
3032             unsigned read_components =
3033                instr->num_components + first_component;
3034             fs_reg tmp = bld.vgrf(dst.type, read_components);
3035             inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3036                             srcs, ARRAY_SIZE(srcs));
3037             inst->size_written = read_components * REG_SIZE;
3038             for (unsigned i = 0; i < instr->num_components; i++) {
3039                bld.MOV(offset(dst, bld, i),
3040                        offset(tmp, bld, i + first_component));
3041             }
3042          } else {
3043             inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst,
3044                             srcs, ARRAY_SIZE(srcs));
3045             inst->size_written = instr->num_components * REG_SIZE;
3046          }
3047          inst->offset = imm_offset;
3048          inst->mlen = 2;
3049       }
3050       break;
3051    }
3052 
3053    case nir_intrinsic_store_output:
3054    case nir_intrinsic_store_per_vertex_output: {
3055       assert(nir_src_bit_size(instr->src[0]) == 32);
3056       fs_reg value = get_nir_src(instr->src[0]);
3057       fs_reg indirect_offset = get_indirect_offset(instr);
3058       unsigned imm_offset = instr->const_index[0];
3059       unsigned mask = instr->const_index[1];
3060 
3061       if (mask == 0)
3062          break;
3063 
3064       unsigned num_components = util_last_bit(mask);
3065 
3066       /* We can only pack two 64-bit components in a single message, so send
3067        * 2 messages if we have more components
3068        */
3069       unsigned first_component = nir_intrinsic_component(instr);
3070       mask = mask << first_component;
3071 
3072       fs_reg mask_reg;
3073       if (mask != WRITEMASK_XYZW)
3074          mask_reg = brw_imm_ud(mask << 16);
3075 
3076       fs_reg sources[4];
3077 
3078       for (unsigned i = 0; i < num_components; i++) {
3079          if (!(mask & (1 << (i + first_component))))
3080             continue;
3081 
3082          sources[i + first_component] = offset(value, bld, i);
3083       }
3084 
3085       unsigned header_size = 1 + unsigned(indirect_offset.file != BAD_FILE) +
3086          unsigned(mask != WRITEMASK_XYZW);
3087       const unsigned length = num_components + first_component;
3088 
3089       fs_reg srcs[URB_LOGICAL_NUM_SRCS];
3090       srcs[URB_LOGICAL_SRC_HANDLE] = get_tcs_output_urb_handle();
3091       srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
3092       srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask_reg;
3093       srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, alloc.allocate(length),
3094                                           BRW_REGISTER_TYPE_F);
3095       bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
3096 
3097       fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
3098                                srcs, ARRAY_SIZE(srcs));
3099       inst->offset = imm_offset;
3100       inst->mlen = header_size + length;
3101       break;
3102    }
3103 
3104    default:
3105       nir_emit_intrinsic(bld, instr);
3106       break;
3107    }
3108 }
3109 
3110 void
3111 fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
3112                                    nir_intrinsic_instr *instr)
3113 {
3114    assert(stage == MESA_SHADER_TESS_EVAL);
3115    struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data);
3116 
3117    fs_reg dest;
3118    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3119       dest = get_nir_dest(instr->dest);
3120 
3121    switch (instr->intrinsic) {
3122    case nir_intrinsic_load_primitive_id:
3123       bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1)));
3124       break;
3125    case nir_intrinsic_load_tess_coord:
3126       /* gl_TessCoord is part of the payload in g1-3 */
3127       for (unsigned i = 0; i < 3; i++) {
3128          bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0)));
3129       }
3130       break;
3131 
3132    case nir_intrinsic_load_input:
3133    case nir_intrinsic_load_per_vertex_input: {
3134       assert(nir_dest_bit_size(instr->dest) == 32);
3135       fs_reg indirect_offset = get_indirect_offset(instr);
3136       unsigned imm_offset = instr->const_index[0];
3137       unsigned first_component = nir_intrinsic_component(instr);
3138 
3139       fs_inst *inst;
3140       if (indirect_offset.file == BAD_FILE) {
3141          /* Arbitrarily only push up to 32 vec4 slots worth of data,
3142           * which is 16 registers (since each holds 2 vec4 slots).
3143           */
3144          const unsigned max_push_slots = 32;
3145          if (imm_offset < max_push_slots) {
3146             fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type);
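            /* Each pushed ATTR register holds two vec4 slots, so vec4 slot S
             * component C is register S / 2, component 4 * (S % 2) + C; e.g.
             * slot 5, component 2 reads component 6 of ATTR register 2.
             */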
3147             for (int i = 0; i < instr->num_components; i++) {
3148                unsigned comp = 4 * (imm_offset % 2) + i + first_component;
3149                bld.MOV(offset(dest, bld, i), component(src, comp));
3150             }
3151 
3152             tes_prog_data->base.urb_read_length =
3153                MAX2(tes_prog_data->base.urb_read_length,
3154                     (imm_offset / 2) + 1);
3155          } else {
3156             /* Replicate the patch handle to all enabled channels */
3157             fs_reg srcs[URB_LOGICAL_NUM_SRCS];
3158             srcs[URB_LOGICAL_SRC_HANDLE] =
3159                retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
3160 
3161             if (first_component != 0) {
3162                unsigned read_components =
3163                   instr->num_components + first_component;
3164                fs_reg tmp = bld.vgrf(dest.type, read_components);
3165                inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3166                                srcs, ARRAY_SIZE(srcs));
3167                inst->size_written = read_components * REG_SIZE;
3168                for (unsigned i = 0; i < instr->num_components; i++) {
3169                   bld.MOV(offset(dest, bld, i),
3170                           offset(tmp, bld, i + first_component));
3171                }
3172             } else {
3173                inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dest,
3174                                srcs, ARRAY_SIZE(srcs));
3175                inst->size_written = instr->num_components * REG_SIZE;
3176             }
3177             inst->mlen = 1;
3178             inst->offset = imm_offset;
3179          }
3180       } else {
3181          /* Indirect indexing - use per-slot offsets as well. */
3182 
3183          /* We can only read two double components with each URB read, so
3184           * we send two read messages in that case, each one loading up to
3185           * two double components.
3186           */
3187          unsigned num_components = instr->num_components;
3188 
3189          fs_reg srcs[URB_LOGICAL_NUM_SRCS];
3190          srcs[URB_LOGICAL_SRC_HANDLE] =
3191             retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
3192          srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
3193 
3194          if (first_component != 0) {
3195             unsigned read_components =
3196                 num_components + first_component;
3197             fs_reg tmp = bld.vgrf(dest.type, read_components);
3198             inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3199                             srcs, ARRAY_SIZE(srcs));
3200             for (unsigned i = 0; i < num_components; i++) {
3201                bld.MOV(offset(dest, bld, i),
3202                        offset(tmp, bld, i + first_component));
3203             }
3204          } else {
3205             inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dest,
3206                             srcs, ARRAY_SIZE(srcs));
3207          }
3208          inst->mlen = 2;
3209          inst->offset = imm_offset;
3210          inst->size_written = (num_components + first_component) *
3211                               inst->dst.component_size(inst->exec_size);
3212       }
3213       break;
3214    }
3215    default:
3216       nir_emit_intrinsic(bld, instr);
3217       break;
3218    }
3219 }
3220 
3221 void
3222 fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
3223                                   nir_intrinsic_instr *instr)
3224 {
3225    assert(stage == MESA_SHADER_GEOMETRY);
3226    fs_reg indirect_offset;
3227 
3228    fs_reg dest;
3229    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3230       dest = get_nir_dest(instr->dest);
3231 
3232    switch (instr->intrinsic) {
3233    case nir_intrinsic_load_primitive_id:
3234       assert(stage == MESA_SHADER_GEOMETRY);
3235       assert(brw_gs_prog_data(prog_data)->include_primitive_id);
3236       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
3237               retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
3238       break;
3239 
3240    case nir_intrinsic_load_input:
3241       unreachable("load_input intrinsics are invalid for the GS stage");
3242 
3243    case nir_intrinsic_load_per_vertex_input:
3244       emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
3245                          instr->src[1], instr->num_components,
3246                          nir_intrinsic_component(instr));
3247       break;
3248 
3249    case nir_intrinsic_emit_vertex_with_counter:
3250       emit_gs_vertex(instr->src[0], instr->const_index[0]);
3251       break;
3252 
3253    case nir_intrinsic_end_primitive_with_counter:
3254       emit_gs_end_primitive(instr->src[0]);
3255       break;
3256 
3257    case nir_intrinsic_set_vertex_and_primitive_count:
3258       bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
3259       break;
3260 
3261    case nir_intrinsic_load_invocation_id: {
3262       fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
3263       assert(val.file != BAD_FILE);
3264       dest.type = val.type;
3265       bld.MOV(dest, val);
3266       break;
3267    }
3268 
3269    default:
3270       nir_emit_intrinsic(bld, instr);
3271       break;
3272    }
3273 }
3274 
3275 /**
3276  * Fetch the current render target layer index.
3277  */
3278 static fs_reg
3279 fetch_render_target_array_index(const fs_builder &bld)
3280 {
3281    if (bld.shader->devinfo->ver >= 12) {
3282       /* The render target array index is provided in the thread payload as
3283        * bits 26:16 of r1.1.
3284        */
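      /* brw_uw1_reg(..., 1, 3) selects the high word of r1.1 (bits 31:16);
       * masking with 0x7ff then keeps the 11-bit layer index in bits 26:16.
       */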
3285       const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
3286       bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 3),
3287               brw_imm_uw(0x7ff));
3288       return idx;
3289    } else if (bld.shader->devinfo->ver >= 6) {
3290       /* The render target array index is provided in the thread payload as
3291        * bits 26:16 of r0.0.
3292        */
3293       const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
3294       bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1),
3295               brw_imm_uw(0x7ff));
3296       return idx;
3297    } else {
3298       /* Pre-SNB we only ever render into the first layer of the framebuffer
3299        * since layered rendering is not implemented.
3300        */
3301       return brw_imm_ud(0);
3302    }
3303 }
3304 
3305 /**
3306  * Fake non-coherent framebuffer read implemented using TXF to fetch from the
3307  * framebuffer at the current fragment coordinates and sample index.
3308  */
3309 fs_inst *
3310 fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst,
3311                                       unsigned target)
3312 {
3313    const struct intel_device_info *devinfo = bld.shader->devinfo;
3314 
3315    assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
3316    const brw_wm_prog_key *wm_key =
3317       reinterpret_cast<const brw_wm_prog_key *>(key);
3318    assert(!wm_key->coherent_fb_fetch);
3319 
3320    /* Calculate the fragment coordinates. */
3321    const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
3322    bld.MOV(offset(coords, bld, 0), pixel_x);
3323    bld.MOV(offset(coords, bld, 1), pixel_y);
3324    bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
3325 
3326    /* Calculate the sample index and MCS payload when multisampling.  Luckily
3327     * the MCS fetch message behaves deterministically for UMS surfaces, so it
3328     * shouldn't be necessary to recompile based on whether the framebuffer is
3329     * CMS or UMS.
3330     */
3331    if (wm_key->multisample_fbo &&
3332        nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
3333       nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup();
3334 
3335    const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
3336    const fs_reg mcs = wm_key->multisample_fbo ?
3337       emit_mcs_fetch(coords, 3, brw_imm_ud(target), fs_reg()) : fs_reg();
3338 
3339    /* Use either a normal or a CMS texel fetch message depending on whether
3340     * the framebuffer is single or multisample.  On SKL+ use the wide CMS
3341     * message just in case the framebuffer uses 16x multisampling, it should
3342     * be equivalent to the normal CMS fetch for lower multisampling modes.
3343     */
3344    opcode op;
3345    if (wm_key->multisample_fbo) {
3346       /* On SKL+ use the wide CMS message just in case the framebuffer uses 16x
3347        * multisampling, it should be equivalent to the normal CMS fetch for
3348        * lower multisampling modes.
3349        *
3350        * On Gfx12HP, only the CMS_W variant is available.
3351        */
3352       if (devinfo->verx10 >= 125)
3353          op = SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL;
3354       else if (devinfo->ver >= 9)
3355          op = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
3356       else
3357          op = SHADER_OPCODE_TXF_CMS_LOGICAL;
3358    } else {
3359       op = SHADER_OPCODE_TXF_LOGICAL;
3360    }
3361 
3362    /* Emit the instruction. */
3363    fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
3364    srcs[TEX_LOGICAL_SRC_COORDINATE]       = coords;
3365    srcs[TEX_LOGICAL_SRC_LOD]              = brw_imm_ud(0);
3366    srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX]     = sample;
3367    srcs[TEX_LOGICAL_SRC_MCS]              = mcs;
3368    srcs[TEX_LOGICAL_SRC_SURFACE]          = brw_imm_ud(target);
3369    srcs[TEX_LOGICAL_SRC_SAMPLER]          = brw_imm_ud(0);
3370    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(3);
3371    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS]  = brw_imm_ud(0);
3372 
3373    fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
3374    inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3375 
3376    return inst;
3377 }
3378 
3379 /**
3380  * Actual coherent framebuffer read implemented using the native render target
3381  * read message.  Requires SKL+.
3382  */
3383 static fs_inst *
3384 emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target)
3385 {
3386    assert(bld.shader->devinfo->ver >= 9);
3387    fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst);
3388    inst->target = target;
3389    inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3390 
3391    return inst;
3392 }
3393 
3394 static fs_reg
3395 alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n)
3396 {
3397    if (n && regs[0].file != BAD_FILE) {
3398       return regs[0];
3399 
3400    } else {
3401       const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size);
3402 
3403       for (unsigned i = 0; i < n; i++)
3404          regs[i] = tmp;
3405 
3406       return tmp;
3407    }
3408 }
3409 
3410 static fs_reg
3411 alloc_frag_output(fs_visitor *v, unsigned location)
3412 {
3413    assert(v->stage == MESA_SHADER_FRAGMENT);
3414    const brw_wm_prog_key *const key =
3415       reinterpret_cast<const brw_wm_prog_key *>(v->key);
3416    const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION);
3417    const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX);
3418 
3419    if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
3420       return alloc_temporary(v->bld, 4, &v->dual_src_output, 1);
3421 
3422    else if (l == FRAG_RESULT_COLOR)
3423       return alloc_temporary(v->bld, 4, v->outputs,
3424                              MAX2(key->nr_color_regions, 1));
3425 
3426    else if (l == FRAG_RESULT_DEPTH)
3427       return alloc_temporary(v->bld, 1, &v->frag_depth, 1);
3428 
3429    else if (l == FRAG_RESULT_STENCIL)
3430       return alloc_temporary(v->bld, 1, &v->frag_stencil, 1);
3431 
3432    else if (l == FRAG_RESULT_SAMPLE_MASK)
3433       return alloc_temporary(v->bld, 1, &v->sample_mask, 1);
3434 
3435    else if (l >= FRAG_RESULT_DATA0 &&
3436             l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS)
3437       return alloc_temporary(v->bld, 4,
3438                              &v->outputs[l - FRAG_RESULT_DATA0], 1);
3439 
3440    else
3441       unreachable("Invalid location");
3442 }
3443 
3444 void
3445 fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
3446                                   nir_intrinsic_instr *instr)
3447 {
3448    assert(stage == MESA_SHADER_FRAGMENT);
3449 
3450    fs_reg dest;
3451    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3452       dest = get_nir_dest(instr->dest);
3453 
3454    switch (instr->intrinsic) {
3455    case nir_intrinsic_load_front_face:
3456       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
3457               emit_frontfacing_interpolation());
3458       break;
3459 
3460    case nir_intrinsic_load_sample_pos:
3461    case nir_intrinsic_load_sample_pos_or_center: {
3462       fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
3463       assert(sample_pos.file != BAD_FILE);
3464       dest.type = sample_pos.type;
3465       bld.MOV(dest, sample_pos);
3466       bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
3467       break;
3468    }
3469 
3470    case nir_intrinsic_load_layer_id:
3471       dest.type = BRW_REGISTER_TYPE_UD;
3472       bld.MOV(dest, fetch_render_target_array_index(bld));
3473       break;
3474 
3475    case nir_intrinsic_is_helper_invocation:
3476       emit_is_helper_invocation(dest);
3477       break;
3478 
3479    case nir_intrinsic_load_helper_invocation:
3480    case nir_intrinsic_load_sample_mask_in:
3481    case nir_intrinsic_load_sample_id:
3482    case nir_intrinsic_load_frag_shading_rate: {
3483       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3484       fs_reg val = nir_system_values[sv];
3485       assert(val.file != BAD_FILE);
3486       dest.type = val.type;
3487       bld.MOV(dest, val);
3488       break;
3489    }
3490 
3491    case nir_intrinsic_store_output: {
3492       const fs_reg src = get_nir_src(instr->src[0]);
3493       const unsigned store_offset = nir_src_as_uint(instr->src[1]);
3494       const unsigned location = nir_intrinsic_base(instr) +
3495          SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION);
3496       const fs_reg new_dest = retype(alloc_frag_output(this, location),
3497                                      src.type);
3498 
3499       for (unsigned j = 0; j < instr->num_components; j++)
3500          bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
3501                  offset(src, bld, j));
3502 
3503       break;
3504    }
3505 
3506    case nir_intrinsic_load_output: {
3507       const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
3508                                    BRW_NIR_FRAG_OUTPUT_LOCATION);
3509       assert(l >= FRAG_RESULT_DATA0);
3510       const unsigned load_offset = nir_src_as_uint(instr->src[0]);
3511       const unsigned target = l - FRAG_RESULT_DATA0 + load_offset;
3512       const fs_reg tmp = bld.vgrf(dest.type, 4);
3513 
3514       if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch)
3515          emit_coherent_fb_read(bld, tmp, target);
3516       else
3517          emit_non_coherent_fb_read(bld, tmp, target);
3518 
3519       for (unsigned j = 0; j < instr->num_components; j++) {
3520          bld.MOV(offset(dest, bld, j),
3521                  offset(tmp, bld, nir_intrinsic_component(instr) + j));
3522       }
3523 
3524       break;
3525    }
3526 
3527    case nir_intrinsic_demote:
3528    case nir_intrinsic_discard:
3529    case nir_intrinsic_terminate:
3530    case nir_intrinsic_demote_if:
3531    case nir_intrinsic_discard_if:
3532    case nir_intrinsic_terminate_if: {
3533       /* We track our discarded pixels in f0.1/f1.0.  By predicating on it, we
3534        * can update just the flag bits that aren't yet discarded.  If there's
3535        * no condition, we emit a CMP of g0 != g0, so all currently executing
3536        * channels will get turned off.
3537        */
3538       fs_inst *cmp = NULL;
3539       if (instr->intrinsic == nir_intrinsic_demote_if ||
3540           instr->intrinsic == nir_intrinsic_discard_if ||
3541           instr->intrinsic == nir_intrinsic_terminate_if) {
3542          nir_alu_instr *alu = nir_src_as_alu_instr(instr->src[0]);
3543 
3544          if (alu != NULL &&
3545              alu->op != nir_op_bcsel &&
3546              (devinfo->ver > 5 ||
3547               (alu->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) != BRW_NIR_BOOLEAN_NEEDS_RESOLVE ||
3548               alu->op == nir_op_fneu32 || alu->op == nir_op_feq32 ||
3549               alu->op == nir_op_flt32 || alu->op == nir_op_fge32 ||
3550               alu->op == nir_op_ine32 || alu->op == nir_op_ieq32 ||
3551               alu->op == nir_op_ilt32 || alu->op == nir_op_ige32 ||
3552               alu->op == nir_op_ult32 || alu->op == nir_op_uge32)) {
3553             /* Re-emit the instruction that generated the Boolean value, but
3554              * do not store it.  Since this instruction will be conditional,
3555              * other instructions that want to use the real Boolean value may
3556              * get garbage.  This was a problem for piglit's fs-discard-exit-2
3557              * test.
3558              *
3559              * Ideally we'd detect that the instruction cannot have a
3560              * conditional modifier before emitting the instructions.  Alas,
3561              * that is nigh impossible.  Instead, we're going to assume the
3562              * instruction (or last instruction) generated can have a
3563              * conditional modifier.  If it cannot, fall back to the old-style
3564              * compare, and hope dead code elimination will clean up the
3565              * extra instructions generated.
3566              */
3567             nir_emit_alu(bld, alu, false);
3568 
3569             cmp = (fs_inst *) instructions.get_tail();
3570             if (cmp->conditional_mod == BRW_CONDITIONAL_NONE) {
3571                if (cmp->can_do_cmod())
3572                   cmp->conditional_mod = BRW_CONDITIONAL_Z;
3573                else
3574                   cmp = NULL;
3575             } else {
3576                /* The old sequence that would have been generated is,
3577                 * basically, bool_result == false.  This is equivalent to
3578                 * !bool_result, so negate the old modifier.
3579                 */
3580                cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod);
3581             }
3582          }
3583 
3584          if (cmp == NULL) {
3585             cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
3586                           brw_imm_d(0), BRW_CONDITIONAL_Z);
3587          }
3588       } else {
3589          fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
3590                                        BRW_REGISTER_TYPE_UW));
3591          cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
3592       }
3593 
3594       cmp->predicate = BRW_PREDICATE_NORMAL;
3595       cmp->flag_subreg = sample_mask_flag_subreg(this);
3596 
3597       fs_inst *jump = bld.emit(BRW_OPCODE_HALT);
3598       jump->flag_subreg = sample_mask_flag_subreg(this);
3599       jump->predicate_inverse = true;
3600 
3601       if (instr->intrinsic == nir_intrinsic_terminate ||
3602           instr->intrinsic == nir_intrinsic_terminate_if) {
3603          jump->predicate = BRW_PREDICATE_NORMAL;
3604       } else {
3605          /* Only jump when the whole quad is demoted.  For historical
3606           * reasons this is also used for discard.
3607           */
3608          jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H;
3609       }
3610 
3611       if (devinfo->ver < 7)
3612          limit_dispatch_width(
3613             16, "Fragment discard/demote not implemented in SIMD32 mode.\n");
3614       break;
3615    }
3616 
3617    case nir_intrinsic_load_input: {
3618       /* In Fragment Shaders load_input is used either for flat inputs or
3619        * per-primitive inputs.
3620        */
3621       assert(nir_dest_bit_size(instr->dest) == 32);
3622       unsigned base = nir_intrinsic_base(instr);
3623       unsigned comp = nir_intrinsic_component(instr);
3624       unsigned num_components = instr->num_components;
3625 
3626       /* TODO(mesh): Multiview. Verify and handle these special cases for Mesh. */
3627 
3628       /* Special case fields in the VUE header */
3629       if (base == VARYING_SLOT_LAYER)
3630          comp = 1;
3631       else if (base == VARYING_SLOT_VIEWPORT)
3632          comp = 2;
3633 
3634       if (BITFIELD64_BIT(base) & nir->info.per_primitive_inputs) {
3635          assert(base != VARYING_SLOT_PRIMITIVE_INDICES);
3636          for (unsigned int i = 0; i < num_components; i++) {
3637             bld.MOV(offset(dest, bld, i),
3638                     retype(component(per_primitive_reg(base), comp + i), dest.type));
3639          }
3640       } else {
3641          for (unsigned int i = 0; i < num_components; i++) {
3642             bld.MOV(offset(dest, bld, i),
3643                     retype(component(interp_reg(base, comp + i), 3), dest.type));
3644          }
3645       }
3646       break;
3647    }
3648 
3649    case nir_intrinsic_load_fs_input_interp_deltas: {
3650       assert(stage == MESA_SHADER_FRAGMENT);
3651       assert(nir_src_as_uint(instr->src[0]) == 0);
3652       fs_reg interp = interp_reg(nir_intrinsic_base(instr),
3653                                  nir_intrinsic_component(instr));
3654       dest.type = BRW_REGISTER_TYPE_F;
3655       bld.MOV(offset(dest, bld, 0), component(interp, 3));
3656       bld.MOV(offset(dest, bld, 1), component(interp, 1));
3657       bld.MOV(offset(dest, bld, 2), component(interp, 0));
3658       break;
3659    }
3660 
3661    case nir_intrinsic_load_barycentric_pixel:
3662    case nir_intrinsic_load_barycentric_centroid:
3663    case nir_intrinsic_load_barycentric_sample: {
3664       /* Use the delta_xy values computed from the payload */
3665       enum brw_barycentric_mode bary = brw_barycentric_mode(instr);
3666       const fs_reg srcs[] = { offset(this->delta_xy[bary], bld, 0),
3667                               offset(this->delta_xy[bary], bld, 1) };
3668       bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
3669       break;
3670    }
3671 
3672    case nir_intrinsic_load_barycentric_at_sample: {
3673       const glsl_interp_mode interpolation =
3674          (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3675 
3676       if (nir_src_is_const(instr->src[0])) {
3677          unsigned msg_data = nir_src_as_uint(instr->src[0]) << 4;
3678 
3679          emit_pixel_interpolater_send(bld,
3680                                       FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3681                                       dest,
3682                                       fs_reg(), /* src */
3683                                       brw_imm_ud(msg_data),
3684                                       interpolation);
3685       } else {
3686          const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
3687                                           BRW_REGISTER_TYPE_UD);
3688 
3689          if (nir_src_is_always_uniform(instr->src[0])) {
3690             const fs_reg sample_id = bld.emit_uniformize(sample_src);
3691             const fs_reg msg_data = vgrf(glsl_type::uint_type);
3692             bld.exec_all().group(1, 0)
3693                .SHL(msg_data, sample_id, brw_imm_ud(4u));
3694             emit_pixel_interpolater_send(bld,
3695                                          FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3696                                          dest,
3697                                          fs_reg(), /* src */
3698                                          component(msg_data, 0),
3699                                          interpolation);
3700          } else {
3701             /* Make a loop that sends a message to the pixel interpolater
3702              * for the sample number in each live channel. If there are
3703              * multiple channels with the same sample number then these
3704              * will be handled simultaneously with a single iteration of
3705              * the loop.
3706              */
3707             bld.emit(BRW_OPCODE_DO);
3708 
3709             /* Get the next live sample number into sample_id_reg */
3710             const fs_reg sample_id = bld.emit_uniformize(sample_src);
3711 
3712             /* Set the flag register so that we can perform the send
3713              * message on all channels that have the same sample number
3714              */
3715             bld.CMP(bld.null_reg_ud(),
3716                     sample_src, sample_id,
3717                     BRW_CONDITIONAL_EQ);
3718             const fs_reg msg_data = vgrf(glsl_type::uint_type);
3719             bld.exec_all().group(1, 0)
3720                .SHL(msg_data, sample_id, brw_imm_ud(4u));
3721             fs_inst *inst =
3722                emit_pixel_interpolater_send(bld,
3723                                             FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3724                                             dest,
3725                                             fs_reg(), /* src */
3726                                             component(msg_data, 0),
3727                                             interpolation);
3728             set_predicate(BRW_PREDICATE_NORMAL, inst);
3729 
3730             /* Continue the loop if there are any live channels left */
3731             set_predicate_inv(BRW_PREDICATE_NORMAL,
3732                               true, /* inverse */
3733                               bld.emit(BRW_OPCODE_WHILE));
3734          }
3735       }
3736       break;
3737    }
3738 
3739    case nir_intrinsic_load_barycentric_at_offset: {
3740       const glsl_interp_mode interpolation =
3741          (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3742 
3743       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3744 
3745       if (const_offset) {
3746          assert(nir_src_bit_size(instr->src[0]) == 32);
3747          unsigned off_x = const_offset[0].u32 & 0xf;
3748          unsigned off_y = const_offset[1].u32 & 0xf;
3749 
3750          emit_pixel_interpolater_send(bld,
3751                                       FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
3752                                       dest,
3753                                       fs_reg(), /* src */
3754                                       brw_imm_ud(off_x | (off_y << 4)),
3755                                       interpolation);
3756       } else {
3757          fs_reg src = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_D);
3758          const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
3759          emit_pixel_interpolater_send(bld,
3760                                       opcode,
3761                                       dest,
3762                                       src,
3763                                       brw_imm_ud(0u),
3764                                       interpolation);
3765       }
3766       break;
3767    }
3768 
3769    case nir_intrinsic_load_frag_coord:
3770       emit_fragcoord_interpolation(dest);
3771       break;
3772 
3773    case nir_intrinsic_load_interpolated_input: {
3774       assert(instr->src[0].ssa &&
3775              instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
3776       nir_intrinsic_instr *bary_intrinsic =
3777          nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3778       nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
3779       enum glsl_interp_mode interp_mode =
3780          (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);
3781       fs_reg dst_xy;
3782 
3783       if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
3784           bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
3785          /* Use the result of the PI message. */
3786          dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);
3787       } else {
3788          /* Use the delta_xy values computed from the payload */
3789          enum brw_barycentric_mode bary = brw_barycentric_mode(bary_intrinsic);
3790          dst_xy = this->delta_xy[bary];
3791       }
3792 
3793       for (unsigned int i = 0; i < instr->num_components; i++) {
3794          fs_reg interp =
3795             component(interp_reg(nir_intrinsic_base(instr),
3796                                  nir_intrinsic_component(instr) + i), 0);
3797          interp.type = BRW_REGISTER_TYPE_F;
3798          dest.type = BRW_REGISTER_TYPE_F;
3799 
3800          if (devinfo->ver < 6 && interp_mode == INTERP_MODE_SMOOTH) {
3801             fs_reg tmp = vgrf(glsl_type::float_type);
3802             bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);
3803             bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);
3804          } else {
3805             bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
3806          }
3807       }
3808       break;
3809    }
3810 
3811    default:
3812       nir_emit_intrinsic(bld, instr);
3813       break;
3814    }
3815 }
3816 
3817 void
3818 fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
3819                                   nir_intrinsic_instr *instr)
3820 {
3821    assert(gl_shader_stage_uses_workgroup(stage));
3822    struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
3823 
3824    fs_reg dest;
3825    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3826       dest = get_nir_dest(instr->dest);
3827 
3828    switch (instr->intrinsic) {
3829    case nir_intrinsic_control_barrier:
3830       /* The whole workgroup fits in a single HW thread, so all the
3831        * invocations are already executed lock-step.  Instead of an actual
3832        * barrier just emit a scheduling fence, that will generate no code.
3833        * barrier, just emit a scheduling fence, which will generate no code.
3834       if (!nir->info.workgroup_size_variable &&
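      /* For example, an 8x4x1 workgroup (32 invocations) compiled at SIMD32
       * executes in a single EU thread, so no cross-thread synchronization
       * is required.
       */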
3835           workgroup_size() <= dispatch_width) {
3836          bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE);
3837          break;
3838       }
3839 
3840       emit_barrier();
3841       cs_prog_data->uses_barrier = true;
3842       break;
3843 
3844    case nir_intrinsic_load_subgroup_id:
3845       if (devinfo->verx10 >= 125)
3846          bld.AND(retype(dest, BRW_REGISTER_TYPE_UD),
3847                  retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
3848                  brw_imm_ud(INTEL_MASK(7, 0)));
3849       else
3850          bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), subgroup_id);
3851       break;
3852 
3853    case nir_intrinsic_load_local_invocation_id:
3854    case nir_intrinsic_load_workgroup_id: {
3855       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3856       fs_reg val = nir_system_values[sv];
3857       assert(val.file != BAD_FILE);
3858       dest.type = val.type;
3859       for (unsigned i = 0; i < 3; i++)
3860          bld.MOV(offset(dest, bld, i), offset(val, bld, i));
3861       break;
3862    }
3863 
3864    case nir_intrinsic_load_num_workgroups: {
3865       assert(nir_dest_bit_size(instr->dest) == 32);
3866 
3867       cs_prog_data->uses_num_work_groups = true;
3868 
3869       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3870       srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(0);
3871       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
3872       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(3); /* num components */
3873       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = brw_imm_ud(0);
3874       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
3875       fs_inst *inst =
3876          bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
3877                   dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
3878       inst->size_written = 3 * dispatch_width * 4;
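      /* Three 32-bit components (the X/Y/Z group counts) per invocation; the
       * driver is expected to bind the dispatch parameters at surface 0 when
       * uses_num_work_groups is set.
       */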
3879       break;
3880    }
3881 
3882    case nir_intrinsic_shared_atomic_add:
3883    case nir_intrinsic_shared_atomic_imin:
3884    case nir_intrinsic_shared_atomic_umin:
3885    case nir_intrinsic_shared_atomic_imax:
3886    case nir_intrinsic_shared_atomic_umax:
3887    case nir_intrinsic_shared_atomic_and:
3888    case nir_intrinsic_shared_atomic_or:
3889    case nir_intrinsic_shared_atomic_xor:
3890    case nir_intrinsic_shared_atomic_exchange:
3891    case nir_intrinsic_shared_atomic_comp_swap:
3892       nir_emit_shared_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr);
3893       break;
3894    case nir_intrinsic_shared_atomic_fmin:
3895    case nir_intrinsic_shared_atomic_fmax:
3896    case nir_intrinsic_shared_atomic_fcomp_swap:
3897       nir_emit_shared_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr);
3898       break;
3899 
3900    case nir_intrinsic_load_shared: {
3901       assert(devinfo->ver >= 7);
3902       assert(nir_intrinsic_base(instr) == 0);
3903 
3904       const unsigned bit_size = nir_dest_bit_size(instr->dest);
3905       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3906       srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM);
3907       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[0]);
3908       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
3909       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
3910 
3911       /* Make dest unsigned because that's what the temporary will be */
3912       dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
3913 
3914       /* Read the vector */
3915       assert(nir_dest_bit_size(instr->dest) <= 32);
3916       assert(nir_intrinsic_align(instr) > 0);
3917       if (nir_dest_bit_size(instr->dest) == 32 &&
3918           nir_intrinsic_align(instr) >= 4) {
3919          assert(nir_dest_num_components(instr->dest) <= 4);
3920          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
3921          fs_inst *inst =
3922             bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
3923                      dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
3924          inst->size_written = instr->num_components * dispatch_width * 4;
3925       } else {
3926          assert(nir_dest_num_components(instr->dest) == 1);
3927          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
3928 
3929          fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
3930          bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
3931                   read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
3932          bld.MOV(dest, subscript(read_result, dest.type, 0));
3933       }
3934       break;
3935    }
3936 
3937    case nir_intrinsic_store_shared: {
3938       assert(devinfo->ver >= 7);
3939       assert(nir_intrinsic_base(instr) == 0);
3940 
3941       const unsigned bit_size = nir_src_bit_size(instr->src[0]);
3942       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3943       srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM);
3944       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
3945       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
3946       /* No point in masking with the sample mask; we're handling compute
3947        * intrinsics here.
3948        */
3949       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
3950 
3951       fs_reg data = get_nir_src(instr->src[0]);
3952       data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
3953 
3954       assert(nir_src_bit_size(instr->src[0]) <= 32);
3955       assert(nir_intrinsic_write_mask(instr) ==
3956              (1u << instr->num_components) - 1);
3957       assert(nir_intrinsic_align(instr) > 0);
3958       if (nir_src_bit_size(instr->src[0]) == 32 &&
3959           nir_intrinsic_align(instr) >= 4) {
3960          assert(nir_src_num_components(instr->src[0]) <= 4);
3961          srcs[SURFACE_LOGICAL_SRC_DATA] = data;
3962          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
3963          bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
3964                   fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
3965       } else {
3966          assert(nir_src_num_components(instr->src[0]) == 1);
3967          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
3968 
3969          srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
3970          bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
3971 
3972          bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
3973                   fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
3974       }
3975       break;
3976    }
3977 
3978    case nir_intrinsic_load_workgroup_size: {
3979       /* For non-variable case, this should've been lowered already. */
3980       assert(nir->info.workgroup_size_variable);
3981 
3982       assert(compiler->lower_variable_group_size);
3983       assert(gl_shader_stage_is_compute(stage));
3984 
3985       for (unsigned i = 0; i < 3; i++) {
3986          bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD),
3987             group_size[i]);
3988       }
3989       break;
3990    }
3991 
3992    default:
3993       nir_emit_intrinsic(bld, instr);
3994       break;
3995    }
3996 }
3997 
3998 static void
3999 emit_rt_lsc_fence(const fs_builder &bld,
4000                   enum lsc_fence_scope scope,
4001                   enum lsc_flush_type flush_type)
4002 {
4003    const intel_device_info *devinfo = bld.shader->devinfo;
4004 
4005    const fs_builder ubld = bld.exec_all().group(8, 0);
4006    fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4007    fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, tmp,
4008                              brw_imm_ud(0) /* desc */,
4009                              brw_imm_ud(0) /* ex_desc */,
4010                              brw_vec8_grf(0, 0) /* payload */);
4011    send->sfid = GFX12_SFID_UGM;
4012    send->desc = lsc_fence_msg_desc(devinfo, scope, flush_type, true);
4013    send->mlen = 1; /* g0 header */
4014    send->ex_mlen = 0;
4015    send->size_written = REG_SIZE; /* Temp write for scheduling */
4016    send->send_has_side_effects = true;
4017 
4018    ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), tmp);
4019 }
4020 
4021 
4022 void
4023 fs_visitor::nir_emit_bs_intrinsic(const fs_builder &bld,
4024                                   nir_intrinsic_instr *instr)
4025 {
4026    assert(brw_shader_stage_is_bindless(stage));
4027 
4028    fs_reg dest;
4029    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4030       dest = get_nir_dest(instr->dest);
4031 
4032    switch (instr->intrinsic) {
4033    case nir_intrinsic_load_btd_global_arg_addr_intel:
4034       bld.MOV(dest, retype(brw_vec1_grf(2, 0), dest.type));
4035       break;
4036 
4037    case nir_intrinsic_load_btd_local_arg_addr_intel:
4038       bld.MOV(dest, retype(brw_vec1_grf(2, 2), dest.type));
4039       break;
4040 
4041    case nir_intrinsic_load_btd_shader_type_intel: {
4042       fs_reg ud_dest = retype(dest, BRW_REGISTER_TYPE_UD);
4043       bld.MOV(ud_dest, retype(brw_vec1_grf(0, 3), ud_dest.type));
4044       bld.AND(ud_dest, ud_dest, brw_imm_ud(0xf));
4045       break;
4046    }
4047 
4048    default:
4049       nir_emit_intrinsic(bld, instr);
4050       break;
4051    }
4052 }
4053 
4054 static fs_reg
4055 brw_nir_reduction_op_identity(const fs_builder &bld,
4056                               nir_op op, brw_reg_type type)
4057 {
4058    nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);
4059    switch (type_sz(type)) {
4060    case 1:
4061       if (type == BRW_REGISTER_TYPE_UB) {
4062          return brw_imm_uw(value.u8);
4063       } else {
4064          assert(type == BRW_REGISTER_TYPE_B);
4065          return brw_imm_w(value.i8);
4066       }
4067    case 2:
4068       return retype(brw_imm_uw(value.u16), type);
4069    case 4:
4070       return retype(brw_imm_ud(value.u32), type);
4071    case 8:
4072       if (type == BRW_REGISTER_TYPE_DF)
4073          return setup_imm_df(bld, value.f64);
4074       else
4075          return retype(brw_imm_u64(value.u64), type);
4076    default:
4077       unreachable("Invalid type size");
4078    }
4079 }
4080 
4081 static opcode
4082 brw_op_for_nir_reduction_op(nir_op op)
4083 {
4084    switch (op) {
4085    case nir_op_iadd: return BRW_OPCODE_ADD;
4086    case nir_op_fadd: return BRW_OPCODE_ADD;
4087    case nir_op_imul: return BRW_OPCODE_MUL;
4088    case nir_op_fmul: return BRW_OPCODE_MUL;
4089    case nir_op_imin: return BRW_OPCODE_SEL;
4090    case nir_op_umin: return BRW_OPCODE_SEL;
4091    case nir_op_fmin: return BRW_OPCODE_SEL;
4092    case nir_op_imax: return BRW_OPCODE_SEL;
4093    case nir_op_umax: return BRW_OPCODE_SEL;
4094    case nir_op_fmax: return BRW_OPCODE_SEL;
4095    case nir_op_iand: return BRW_OPCODE_AND;
4096    case nir_op_ior:  return BRW_OPCODE_OR;
4097    case nir_op_ixor: return BRW_OPCODE_XOR;
4098    default:
4099       unreachable("Invalid reduction operation");
4100    }
4101 }
4102 
4103 static brw_conditional_mod
4104 brw_cond_mod_for_nir_reduction_op(nir_op op)
4105 {
4106    switch (op) {
4107    case nir_op_iadd: return BRW_CONDITIONAL_NONE;
4108    case nir_op_fadd: return BRW_CONDITIONAL_NONE;
4109    case nir_op_imul: return BRW_CONDITIONAL_NONE;
4110    case nir_op_fmul: return BRW_CONDITIONAL_NONE;
4111    case nir_op_imin: return BRW_CONDITIONAL_L;
4112    case nir_op_umin: return BRW_CONDITIONAL_L;
4113    case nir_op_fmin: return BRW_CONDITIONAL_L;
4114    case nir_op_imax: return BRW_CONDITIONAL_GE;
4115    case nir_op_umax: return BRW_CONDITIONAL_GE;
4116    case nir_op_fmax: return BRW_CONDITIONAL_GE;
4117    case nir_op_iand: return BRW_CONDITIONAL_NONE;
4118    case nir_op_ior:  return BRW_CONDITIONAL_NONE;
4119    case nir_op_ixor: return BRW_CONDITIONAL_NONE;
4120    default:
4121       unreachable("Invalid reduction operation");
4122    }
4123 }
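/* These three helpers are meant to be used together when lowering subgroup
 * reductions and scans: the identity value seeds inactive channels, and the
 * opcode plus conditional modifier implement the combining step (min/max map
 * to SEL with .l/.ge, the remaining ops to plain ADD/MUL/AND/OR/XOR).
 */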
4124 
4125 fs_reg
4126 fs_visitor::get_nir_image_intrinsic_image(const brw::fs_builder &bld,
4127                                           nir_intrinsic_instr *instr)
4128 {
4129    fs_reg image = retype(get_nir_src_imm(instr->src[0]), BRW_REGISTER_TYPE_UD);
4130    fs_reg surf_index = image;
4131 
4132    return bld.emit_uniformize(surf_index);
4133 }
4134 
4135 fs_reg
4136 fs_visitor::get_nir_ssbo_intrinsic_index(const brw::fs_builder &bld,
4137                                          nir_intrinsic_instr *instr)
4138 {
4139    /* SSBO stores are weird in that their index is in src[1] */
4140    const bool is_store =
4141       instr->intrinsic == nir_intrinsic_store_ssbo ||
4142       instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
4143    const unsigned src = is_store ? 1 : 0;
4144 
4145    if (nir_src_is_const(instr->src[src])) {
4146       return brw_imm_ud(nir_src_as_uint(instr->src[src]));
4147    } else {
4148       return bld.emit_uniformize(get_nir_src(instr->src[src]));
4149    }
4150 }
4151 
4152 /**
4153  * The offsets we get from NIR act as if each SIMD channel has its own blob
4154  * of contiguous space.  However, if we actually place each SIMD channel in
4155  * its own space, we end up with terrible cache performance because each SIMD
4156  * channel accesses a different cache line even when they're all accessing the
4157  * same byte offset.  To deal with this problem, we swizzle the address using
4158  * a simple algorithm which ensures that any time a SIMD message reads or
4159  * writes the same address, it's all in the same cache line.  We have to keep
4160  * the bottom two bits fixed so that we can read/write up to a dword at a time
4161  * and the individual element is contiguous.  We do this by splitting the
4162  * address as follows:
4163  *
4164  *    31                             4-6           2          0
4165  *    +-------------------------------+------------+----------+
4166  *    |        Hi address bits        | chan index | addr low |
4167  *    +-------------------------------+------------+----------+
4168  *
4169  * In other words, the bottom two address bits stay, and the top 30 get
4170  * shifted up so that we can stick the SIMD channel index in the middle.  This
4171  * way, we can access 8, 16, or 32-bit elements and, when every channel
4172  * accesses a 32-bit element at the same logical offset, the scratch
4173  * read/write instruction acts on contiguous elements and we get good cache
4174  * locality.
4174  */
4175 fs_reg
4176 fs_visitor::swizzle_nir_scratch_addr(const brw::fs_builder &bld,
4177                                      const fs_reg &nir_addr,
4178                                      bool in_dwords)
4179 {
4180    const fs_reg &chan_index =
4181       nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
4182    const unsigned chan_index_bits = ffs(dispatch_width) - 1;
4183 
4184    fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD);
4185    if (in_dwords) {
4186       /* In this case, we know the address is aligned to a DWORD and we want
4187        * the final address in DWORDs.
4188        */
4189       bld.SHL(addr, nir_addr, brw_imm_ud(chan_index_bits - 2));
4190       bld.OR(addr, addr, chan_index);
4191    } else {
4192       /* This case is substantially more annoying because we have to pay
4193        * attention to those pesky two bottom bits.
4194        */
4195       fs_reg addr_hi = bld.vgrf(BRW_REGISTER_TYPE_UD);
4196       bld.AND(addr_hi, nir_addr, brw_imm_ud(~0x3u));
4197       bld.SHL(addr_hi, addr_hi, brw_imm_ud(chan_index_bits));
4198       fs_reg chan_addr = bld.vgrf(BRW_REGISTER_TYPE_UD);
4199       bld.SHL(chan_addr, chan_index, brw_imm_ud(2));
4200       bld.AND(addr, nir_addr, brw_imm_ud(0x3u));
4201       bld.OR(addr, addr, addr_hi);
4202       bld.OR(addr, addr, chan_addr);
4203    }
4204    return addr;
4205 }
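
/* Worked example (added; the values are hypothetical): with dispatch_width == 16
 * (so chan_index_bits == 4), in_dwords == false, nir_addr == 0x37 and channel 5:
 *
 *    addr_hi   = (0x37 & ~3) << 4   = 0x340
 *    chan_addr = 5 << 2             = 0x14
 *    addr_low  = 0x37 & 3           = 0x3
 *    addr      = 0x340 | 0x14 | 0x3 = 0x357
 *
 * All 16 channels touching byte 0x37 thus land in the same cache line while the
 * bottom two bits of the original address are preserved.
 */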
4206 
4207 static unsigned
4208 choose_oword_block_size_dwords(unsigned dwords)
4209 {
4210    unsigned block;
4211    if (dwords >= 32) {
4212       block = 32;
4213    } else if (dwords >= 16) {
4214       block = 16;
4215    } else {
4216       block = 8;
4217    }
4218    assert(block <= dwords);
4219    return block;
4220 }
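
/* Note (added): callers are expected to request at least 8 DWords (two OWords);
 * e.g. 24 DWords picks a 16-DWord block and the caller's loop is expected to
 * fetch the remaining 8 DWords with a follow-up message, while fewer than 8
 * DWords would trip the assert above.
 */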
4221 
4222 static void
4223 increment_a64_address(const fs_builder &bld, fs_reg address, uint32_t v)
4224 {
4225    if (bld.shader->devinfo->has_64bit_int) {
4226       bld.ADD(address, address, brw_imm_ud(v));
4227    } else {
4228       fs_reg low = retype(address, BRW_REGISTER_TYPE_UD);
4229       fs_reg high = offset(low, bld, 1);
4230 
4231       /* Add low and if that overflows, add carry to high. */
4232       bld.ADD(low, low, brw_imm_ud(v))->conditional_mod = BRW_CONDITIONAL_O;
4233       bld.ADD(high, high, brw_imm_ud(0x1))->predicate = BRW_PREDICATE_NORMAL;
4234    }
4235 }
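
/* Sketch of the no-64-bit-int path above (added): the first ADD sets the flag
 * via conditional mod O (overflow) and the ADD of 1 to the high DWord is
 * predicated on that flag, i.e. a manual add-with-carry across the 64-bit
 * address split into two 32-bit halves.
 */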
4236 
4237 static fs_reg
4238 emit_fence(const fs_builder &bld, enum opcode opcode,
4239            uint8_t sfid, uint32_t desc,
4240            bool commit_enable, uint8_t bti)
4241 {
4242    assert(opcode == SHADER_OPCODE_INTERLOCK ||
4243           opcode == SHADER_OPCODE_MEMORY_FENCE);
4244 
4245    fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);
4246    fs_inst *fence = bld.emit(opcode, dst, brw_vec8_grf(0, 0),
4247                              brw_imm_ud(commit_enable),
4248                              brw_imm_ud(bti));
4249    fence->sfid = sfid;
4250    fence->desc = desc;
4251 
4252    return dst;
4253 }
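
/* Usage note (added): the returned register holds the fence's commit return
 * value; callers collect these (see fence_regs[] below) and hand them to a
 * FS_OPCODE_SCHEDULING_FENCE so later instructions cannot be scheduled ahead
 * of the fences they depend on.
 */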
4254 
4255 static uint32_t
4256 lsc_fence_descriptor_for_intrinsic(const struct intel_device_info *devinfo,
4257                                    nir_intrinsic_instr *instr)
4258 {
4259    assert(devinfo->has_lsc);
4260 
4261    enum lsc_fence_scope scope = LSC_FENCE_LOCAL;
4262    enum lsc_flush_type flush_type = LSC_FLUSH_TYPE_NONE;
4263 
4264    if (nir_intrinsic_has_memory_scope(instr)) {
4265       switch (nir_intrinsic_memory_scope(instr)) {
4266       case NIR_SCOPE_DEVICE:
4267       case NIR_SCOPE_QUEUE_FAMILY:
4268          scope = LSC_FENCE_TILE;
4269          flush_type = LSC_FLUSH_TYPE_EVICT;
4270          break;
4271       case NIR_SCOPE_WORKGROUP:
4272          scope = LSC_FENCE_THREADGROUP;
4273          flush_type = LSC_FLUSH_TYPE_EVICT;
4274          break;
4275       case NIR_SCOPE_SHADER_CALL:
4276       case NIR_SCOPE_INVOCATION:
4277       case NIR_SCOPE_SUBGROUP:
4278       case NIR_SCOPE_NONE:
4279          break;
4280       }
4281    } else {
4282       /* No scope defined. */
4283       scope = LSC_FENCE_TILE;
4284       flush_type = LSC_FLUSH_TYPE_EVICT;
4285    }
4286    return lsc_fence_msg_desc(devinfo, scope, flush_type, true);
4287 }
4288 
4289 void
4290 fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
4291 {
4292    fs_reg dest;
4293    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4294       dest = get_nir_dest(instr->dest);
4295 
4296    switch (instr->intrinsic) {
4297    case nir_intrinsic_image_load:
4298    case nir_intrinsic_image_store:
4299    case nir_intrinsic_image_atomic_add:
4300    case nir_intrinsic_image_atomic_imin:
4301    case nir_intrinsic_image_atomic_umin:
4302    case nir_intrinsic_image_atomic_imax:
4303    case nir_intrinsic_image_atomic_umax:
4304    case nir_intrinsic_image_atomic_and:
4305    case nir_intrinsic_image_atomic_or:
4306    case nir_intrinsic_image_atomic_xor:
4307    case nir_intrinsic_image_atomic_exchange:
4308    case nir_intrinsic_image_atomic_comp_swap:
4309    case nir_intrinsic_bindless_image_load:
4310    case nir_intrinsic_bindless_image_store:
4311    case nir_intrinsic_bindless_image_atomic_add:
4312    case nir_intrinsic_bindless_image_atomic_imin:
4313    case nir_intrinsic_bindless_image_atomic_umin:
4314    case nir_intrinsic_bindless_image_atomic_imax:
4315    case nir_intrinsic_bindless_image_atomic_umax:
4316    case nir_intrinsic_bindless_image_atomic_and:
4317    case nir_intrinsic_bindless_image_atomic_or:
4318    case nir_intrinsic_bindless_image_atomic_xor:
4319    case nir_intrinsic_bindless_image_atomic_exchange:
4320    case nir_intrinsic_bindless_image_atomic_comp_swap: {
4321       /* Get some metadata from the image intrinsic. */
4322       const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
4323 
4324       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4325 
4326       switch (instr->intrinsic) {
4327       case nir_intrinsic_image_load:
4328       case nir_intrinsic_image_store:
4329       case nir_intrinsic_image_atomic_add:
4330       case nir_intrinsic_image_atomic_imin:
4331       case nir_intrinsic_image_atomic_umin:
4332       case nir_intrinsic_image_atomic_imax:
4333       case nir_intrinsic_image_atomic_umax:
4334       case nir_intrinsic_image_atomic_and:
4335       case nir_intrinsic_image_atomic_or:
4336       case nir_intrinsic_image_atomic_xor:
4337       case nir_intrinsic_image_atomic_exchange:
4338       case nir_intrinsic_image_atomic_comp_swap:
4339          srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4340             get_nir_image_intrinsic_image(bld, instr);
4341          break;
4342 
4343       default:
4344          /* Bindless */
4345          srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] =
4346             bld.emit_uniformize(get_nir_src(instr->src[0]));
4347          break;
4348       }
4349 
4350       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
4351       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] =
4352          brw_imm_ud(nir_image_intrinsic_coord_components(instr));
4353 
4354       /* Emit an image load, store or atomic op. */
4355       if (instr->intrinsic == nir_intrinsic_image_load ||
4356           instr->intrinsic == nir_intrinsic_bindless_image_load) {
4357          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4358          srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
4359          fs_inst *inst =
4360             bld.emit(SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
4361                      dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4362          inst->size_written = instr->num_components * dispatch_width * 4;
4363       } else if (instr->intrinsic == nir_intrinsic_image_store ||
4364                  instr->intrinsic == nir_intrinsic_bindless_image_store) {
4365          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4366          srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[3]);
4367          srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
4368          bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
4369                   fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4370       } else {
4371          unsigned num_srcs = info->num_srcs;
4372          int op = brw_aop_for_nir_intrinsic(instr);
4373          if (op == BRW_AOP_INC || op == BRW_AOP_DEC) {
4374             assert(num_srcs == 4);
4375             num_srcs = 3;
4376          }
4377 
4378          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
4379 
4380          fs_reg data;
4381          if (num_srcs >= 4)
4382             data = get_nir_src(instr->src[3]);
4383          if (num_srcs >= 5) {
4384             fs_reg tmp = bld.vgrf(data.type, 2);
4385             fs_reg sources[2] = { data, get_nir_src(instr->src[4]) };
4386             bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
4387             data = tmp;
4388          }
4389          srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4390          srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
4391 
4392          bld.emit(SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
4393                   dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4394       }
4395       break;
4396    }
4397 
4398    case nir_intrinsic_image_size:
4399    case nir_intrinsic_bindless_image_size: {
4400       /* Cube image sizes should have previously been lowered to a 2D array */
4401       assert(nir_intrinsic_image_dim(instr) != GLSL_SAMPLER_DIM_CUBE);
4402 
4403       /* Unlike the [un]typed load and store opcodes, the TXS that this turns
4404        * into will handle the binding table index for us in the generator.
4405        * Incidentally, this means that we can handle bindless with exactly the
4406        * same code.
4407        */
4408       fs_reg image = retype(get_nir_src_imm(instr->src[0]),
4409                             BRW_REGISTER_TYPE_UD);
4410       image = bld.emit_uniformize(image);
4411 
4412       assert(nir_src_as_uint(instr->src[1]) == 0);
4413 
4414       fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
4415       if (instr->intrinsic == nir_intrinsic_image_size)
4416          srcs[TEX_LOGICAL_SRC_SURFACE] = image;
4417       else
4418          srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image;
4419       srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
4420       srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(0);
4421       srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);
4422 
4423       /* Since the image size is always uniform, we can just emit a SIMD8
4424        * query instruction and splat the result out.
4425        */
4426       const fs_builder ubld = bld.exec_all().group(8, 0);
4427 
4428       fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
4429       fs_inst *inst = ubld.emit(SHADER_OPCODE_IMAGE_SIZE_LOGICAL,
4430                                 tmp, srcs, ARRAY_SIZE(srcs));
4431       inst->size_written = 4 * REG_SIZE;
4432 
4433       for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) {
4434          bld.MOV(offset(retype(dest, tmp.type), bld, c),
4435                  component(offset(tmp, ubld, c), 0));
4436       }
4437       break;
4438    }
4439 
4440    case nir_intrinsic_image_load_raw_intel: {
4441       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4442       srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4443          get_nir_image_intrinsic_image(bld, instr);
4444       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
4445       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4446       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4447       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
4448 
4449       fs_inst *inst =
4450          bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4451                   dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4452       inst->size_written = instr->num_components * dispatch_width * 4;
4453       break;
4454    }
4455 
4456    case nir_intrinsic_image_store_raw_intel: {
4457       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4458       srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4459          get_nir_image_intrinsic_image(bld, instr);
4460       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
4461       srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[2]);
4462       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4463       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4464       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
4465 
4466       bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
4467                fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4468       break;
4469    }
4470 
4471    case nir_intrinsic_scoped_barrier:
4472       assert(nir_intrinsic_execution_scope(instr) == NIR_SCOPE_NONE);
4473       FALLTHROUGH;
4474    case nir_intrinsic_group_memory_barrier:
4475    case nir_intrinsic_memory_barrier_shared:
4476    case nir_intrinsic_memory_barrier_buffer:
4477    case nir_intrinsic_memory_barrier_image:
4478    case nir_intrinsic_memory_barrier:
4479    case nir_intrinsic_begin_invocation_interlock:
4480    case nir_intrinsic_end_invocation_interlock: {
4481       bool ugm_fence, slm_fence, tgm_fence, urb_fence;
4482       const enum opcode opcode =
4483          instr->intrinsic == nir_intrinsic_begin_invocation_interlock ?
4484          SHADER_OPCODE_INTERLOCK : SHADER_OPCODE_MEMORY_FENCE;
4485 
4486       switch (instr->intrinsic) {
4487       case nir_intrinsic_scoped_barrier: {
4488          nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
4489          ugm_fence = modes & (nir_var_mem_ssbo | nir_var_mem_global);
4490          slm_fence = modes & nir_var_mem_shared;
4491          tgm_fence = modes & nir_var_image;
4492          urb_fence = modes & (nir_var_shader_out | nir_var_mem_task_payload);
4493          break;
4494       }
4495 
4496       case nir_intrinsic_begin_invocation_interlock:
4497       case nir_intrinsic_end_invocation_interlock:
4498          /* For beginInvocationInterlockARB(), we will generate a memory fence
4499           * but with a different opcode so that generator can pick SENDC
4500           * instead of SEND.
4501           *
4502           * For endInvocationInterlockARB(), we need to insert a memory fence which
4503           * stalls in the shader until the memory transactions prior to that
4504           * fence are complete.  This ensures that the shader does not end before
4505           * any writes from its critical section have landed.  Otherwise, you can
4506           * end up with a case where the next invocation on that pixel properly
4507           * stalls for previous FS invocation on its pixel to complete but
4508           * doesn't actually wait for the dataport memory transactions from that
4509           * thread to land before submitting its own.
4510           *
4511           * Handling them here will allow the logic for IVB render cache (see
4512           * below) to be reused.
4513           */
4514          assert(stage == MESA_SHADER_FRAGMENT);
4515          ugm_fence = tgm_fence = true;
4516          slm_fence = urb_fence = false;
4517          break;
4518 
4519       default:
4520          ugm_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared &&
4521                      instr->intrinsic != nir_intrinsic_memory_barrier_image;
4522          slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier ||
4523                      instr->intrinsic == nir_intrinsic_memory_barrier ||
4524                      instr->intrinsic == nir_intrinsic_memory_barrier_shared;
4525          tgm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier ||
4526                      instr->intrinsic == nir_intrinsic_memory_barrier ||
4527                      instr->intrinsic == nir_intrinsic_memory_barrier_image;
4528          urb_fence = instr->intrinsic == nir_intrinsic_memory_barrier;
4529          break;
4530       }
4531 
4532       if (nir->info.shared_size > 0) {
4533          assert(gl_shader_stage_uses_workgroup(stage));
4534       } else {
4535          slm_fence = false;
4536       }
4537 
4538       /* If the workgroup fits in a single HW thread, the messages for SLM are
4539        * processed in-order and the shader itself is already synchronized so
4540        * the memory fence is not necessary.
4541        *
4542        * TODO: Check whether this also applies when many HW threads share the
4543        * same Data Port.
4543        */
4544       if (!nir->info.workgroup_size_variable &&
4545           slm_fence && workgroup_size() <= dispatch_width)
4546          slm_fence = false;
4547 
4548       switch (stage) {
4549          case MESA_SHADER_TESS_CTRL:
4550          case MESA_SHADER_TASK:
4551          case MESA_SHADER_MESH:
4552             break;
4553          default:
4554             urb_fence = false;
4555             break;
4556       }
4557 
4558       unsigned fence_regs_count = 0;
4559       fs_reg fence_regs[4] = {};
4560 
4561       const fs_builder ubld = bld.group(8, 0);
4562 
4563       if (devinfo->has_lsc) {
4564          assert(devinfo->verx10 >= 125);
4565          uint32_t desc =
4566             lsc_fence_descriptor_for_intrinsic(devinfo, instr);
4567          if (ugm_fence) {
4568             fence_regs[fence_regs_count++] =
4569                emit_fence(ubld, opcode, GFX12_SFID_UGM, desc,
4570                           true /* commit_enable */,
4571                           0 /* bti; ignored for LSC */);
4572          }
4573 
4574          if (tgm_fence) {
4575             fence_regs[fence_regs_count++] =
4576                emit_fence(ubld, opcode, GFX12_SFID_TGM, desc,
4577                           true /* commit_enable */,
4578                           0 /* bti; ignored for LSC */);
4579          }
4580 
4581          if (slm_fence) {
4582             assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
4583             fence_regs[fence_regs_count++] =
4584                emit_fence(ubld, opcode, GFX12_SFID_SLM, desc,
4585                           true /* commit_enable */,
4586                           0 /* BTI; ignored for LSC */);
4587          }
4588 
4589          if (urb_fence) {
4590             assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
4591             fence_regs[fence_regs_count++] =
4592                emit_fence(ubld, opcode, BRW_SFID_URB, desc,
4593                           true /* commit_enable */,
4594                           0 /* BTI; ignored for LSC */);
4595          }
4596       } else if (devinfo->ver >= 11) {
4597          if (tgm_fence || ugm_fence || urb_fence) {
4598             fence_regs[fence_regs_count++] =
4599                emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
4600                           true /* commit_enable HSD ES # 1404612949 */,
4601                           0 /* BTI = 0 means data cache */);
4602          }
4603 
4604          if (slm_fence) {
4605             assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
4606             fence_regs[fence_regs_count++] =
4607                emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
4608                           true /* commit_enable HSD ES # 1404612949 */,
4609                           GFX7_BTI_SLM);
4610          }
4611       } else {
4612          /* Prior to Icelake, they're all lumped into a single cache except on
4613           * Ivy Bridge and Bay Trail where typed messages actually go through
4614           * the render cache.  There, we need both fences because we may
4615           * access storage images as either typed or untyped.
4616           */
4617          const bool render_fence = tgm_fence && devinfo->verx10 == 70;
4618 
4619          /* Simulation also complains on Gfx9 if we do not enable commit.
4620           */
4621          const bool commit_enable = render_fence ||
4622             instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
4623             devinfo->ver == 9;
4624 
4625          if (tgm_fence || ugm_fence || slm_fence || urb_fence) {
4626             fence_regs[fence_regs_count++] =
4627                emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
4628                           commit_enable, 0 /* BTI */);
4629          }
4630 
4631          if (render_fence) {
4632             fence_regs[fence_regs_count++] =
4633                emit_fence(ubld, opcode, GFX6_SFID_DATAPORT_RENDER_CACHE, 0,
4634                           commit_enable, /* bti */ 0);
4635          }
4636       }
4637 
4638       assert(fence_regs_count <= ARRAY_SIZE(fence_regs));
4639 
4640       /* There are four cases where we want to insert a stall:
4641        *
4642        *  1. If we're a nir_intrinsic_end_invocation_interlock.  This is
4643        *     required to ensure that the shader EOT doesn't happen until
4644        *     after the fence returns.  Otherwise, we might end up with the
4645        *     next shader invocation for that pixel not respecting our fence
4646        *     because it may happen on a different HW thread.
4647        *
4648        *  2. If we have multiple fences.  This is required to ensure that
4649        *     they all complete and nothing gets weirdly out-of-order.
4650        *
4651        *  3. If we have no fences.  In this case, we need at least a
4652        *     scheduling barrier to keep the compiler from moving things
4653        *     around in an invalid way.
4654        *
4655        *  4. On platforms with LSC.
4656        */
4657       if (instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
4658           fence_regs_count != 1 || devinfo->has_lsc) {
4659          ubld.exec_all().group(1, 0).emit(
4660             FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
4661             fence_regs, fence_regs_count);
4662       }
4663 
4664       break;
4665    }
4666 
4667    case nir_intrinsic_memory_barrier_tcs_patch:
4668       break;
4669 
4670    case nir_intrinsic_shader_clock: {
4671       /* We cannot do anything if there is an event, so ignore it for now */
4672       const fs_reg shader_clock = get_timestamp(bld);
4673       const fs_reg srcs[] = { component(shader_clock, 0),
4674                               component(shader_clock, 1) };
4675       bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
4676       break;
4677    }
4678 
4679    case nir_intrinsic_image_samples:
4680       /* The driver does not support multi-sampled images. */
4681       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
4682       break;
4683 
4684    case nir_intrinsic_load_reloc_const_intel: {
4685       uint32_t id = nir_intrinsic_param_idx(instr);
4686       bld.emit(SHADER_OPCODE_MOV_RELOC_IMM,
4687                dest, brw_imm_ud(id));
4688       break;
4689    }
4690 
4691    case nir_intrinsic_load_uniform: {
4692       /* Offsets are in bytes but they should always be aligned to
4693        * the type size.
4694        */
4695       assert(instr->const_index[0] % 4 == 0 ||
4696              instr->const_index[0] % type_sz(dest.type) == 0);
4697 
4698       fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type);
4699 
4700       if (nir_src_is_const(instr->src[0])) {
4701          unsigned load_offset = nir_src_as_uint(instr->src[0]);
4702          assert(load_offset % type_sz(dest.type) == 0);
4703          /* For 16-bit types we add the modulo of the const_index[0]
4704           * offset so we can access elements that are not 32-bit aligned.
4705           */
4706          src.offset = load_offset + instr->const_index[0] % 4;
4707 
4708          for (unsigned j = 0; j < instr->num_components; j++) {
4709             bld.MOV(offset(dest, bld, j), offset(src, bld, j));
4710          }
4711       } else {
4712          fs_reg indirect = retype(get_nir_src(instr->src[0]),
4713                                   BRW_REGISTER_TYPE_UD);
4714 
4715          /* We need to pass a size to the MOV_INDIRECT but we don't want it to
4716           * go past the end of the uniform.  In order to keep the n'th
4717           * component from running past, we subtract off the size of all but
4718           * one component of the vector.
4719           */
4720          assert(instr->const_index[1] >=
4721                 instr->num_components * (int) type_sz(dest.type));
4722          unsigned read_size = instr->const_index[1] -
4723             (instr->num_components - 1) * type_sz(dest.type);
4724 
4725          bool supports_64bit_indirects =
4726             devinfo->platform != INTEL_PLATFORM_CHV && !intel_device_info_is_9lp(devinfo);
4727 
4728          if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
4729             for (unsigned j = 0; j < instr->num_components; j++) {
4730                bld.emit(SHADER_OPCODE_MOV_INDIRECT,
4731                         offset(dest, bld, j), offset(src, bld, j),
4732                         indirect, brw_imm_ud(read_size));
4733             }
4734          } else {
4735             const unsigned num_mov_indirects =
4736                type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD);
4737             /* We read a little bit less per MOV INDIRECT, as they are now
4738              * 32-bit ones instead of 64-bit. Adjust read_size accordingly.
4739              */
4740             const unsigned read_size_32bit = read_size -
4741                 (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD);
4742             for (unsigned j = 0; j < instr->num_components; j++) {
4743                for (unsigned i = 0; i < num_mov_indirects; i++) {
4744                   bld.emit(SHADER_OPCODE_MOV_INDIRECT,
4745                            subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i),
4746                            subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i),
4747                            indirect, brw_imm_ud(read_size_32bit));
4748                }
4749             }
4750          }
4751       }
4752       break;
4753    }
4754 
4755    case nir_intrinsic_load_ubo: {
4756       fs_reg surf_index;
4757       if (nir_src_is_const(instr->src[0])) {
4758          const unsigned index = nir_src_as_uint(instr->src[0]);
4759          surf_index = brw_imm_ud(index);
4760       } else {
4761          /* The block index is not a constant. Evaluate the index expression
4762           * per-channel and add the base UBO index; we have to select a value
4763           * from any live channel.
4764           */
4765          surf_index = vgrf(glsl_type::uint_type);
4766          bld.MOV(surf_index, get_nir_src(instr->src[0]));
4767          surf_index = bld.emit_uniformize(surf_index);
4768       }
4769 
4770       if (!nir_src_is_const(instr->src[1])) {
4771          fs_reg base_offset = retype(get_nir_src(instr->src[1]),
4772                                      BRW_REGISTER_TYPE_UD);
4773 
4774          for (int i = 0; i < instr->num_components; i++)
4775             VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
4776                                        base_offset, i * type_sz(dest.type),
4777                                        nir_dest_bit_size(instr->dest) / 8);
4778 
4779          prog_data->has_ubo_pull = true;
4780       } else {
4781          /* Even if we are loading doubles, a pull constant load will load
4782           * a 32-bit vec4, so we should only reserve vgrf space for that. If we
4783           * need to load a full dvec4 we will have to emit 2 loads. This is
4784           * similar to demote_pull_constants(), except that in that case we
4785           * see individual accesses to each component of the vector and then
4786           * we let CSE deal with duplicate loads. Here we see a vector access
4787           * and we have to split it if necessary.
4788           */
4789          const unsigned type_size = type_sz(dest.type);
4790          const unsigned load_offset = nir_src_as_uint(instr->src[1]);
4791 
4792          /* See if we've selected this as a push constant candidate */
4793          if (nir_src_is_const(instr->src[0])) {
4794             const unsigned ubo_block = nir_src_as_uint(instr->src[0]);
4795             const unsigned offset_256b = load_offset / 32;
4796 
4797             fs_reg push_reg;
4798             for (int i = 0; i < 4; i++) {
4799                const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
4800                if (range->block == ubo_block &&
4801                    offset_256b >= range->start &&
4802                    offset_256b < range->start + range->length) {
4803 
4804                   push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type);
4805                   push_reg.offset = load_offset - 32 * range->start;
4806                   break;
4807                }
4808             }
4809 
4810             if (push_reg.file != BAD_FILE) {
4811                for (unsigned i = 0; i < instr->num_components; i++) {
4812                   bld.MOV(offset(dest, bld, i),
4813                           byte_offset(push_reg, i * type_size));
4814                }
4815                break;
4816             }
4817          }
4818 
4819          prog_data->has_ubo_pull = true;
4820 
4821          const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
4822          const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
4823          const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4824 
4825          for (unsigned c = 0; c < instr->num_components;) {
4826             const unsigned base = load_offset + c * type_size;
4827             /* Number of usable components in the next block-aligned load. */
4828             const unsigned count = MIN2(instr->num_components - c,
4829                                         (block_sz - base % block_sz) / type_size);
4830 
4831             ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
4832                       packed_consts, surf_index,
4833                       brw_imm_ud(base & ~(block_sz - 1)));
4834 
4835             const fs_reg consts =
4836                retype(byte_offset(packed_consts, base & (block_sz - 1)),
4837                       dest.type);
4838 
4839             for (unsigned d = 0; d < count; d++)
4840                bld.MOV(offset(dest, bld, c + d), component(consts, d));
4841 
4842             c += count;
4843          }
4844       }
4845       break;
4846    }
4847 
4848    case nir_intrinsic_load_global:
4849    case nir_intrinsic_load_global_constant: {
4850       assert(devinfo->ver >= 8);
4851 
4852       assert(nir_dest_bit_size(instr->dest) <= 32);
4853       assert(nir_intrinsic_align(instr) > 0);
4854       fs_reg srcs[A64_LOGICAL_NUM_SRCS];
4855       srcs[A64_LOGICAL_ADDRESS] = get_nir_src(instr->src[0]);
4856       srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */
4857       srcs[A64_LOGICAL_ENABLE_HELPERS] =
4858          brw_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS);
4859 
4860       if (nir_dest_bit_size(instr->dest) == 32 &&
4861           nir_intrinsic_align(instr) >= 4) {
4862          assert(nir_dest_num_components(instr->dest) <= 4);
4863 
4864          srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components);
4865 
4866          fs_inst *inst =
4867             bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL, dest,
4868                      srcs, A64_LOGICAL_NUM_SRCS);
4869          inst->size_written = instr->num_components *
4870                               inst->dst.component_size(inst->exec_size);
4871       } else {
4872          const unsigned bit_size = nir_dest_bit_size(instr->dest);
4873          assert(nir_dest_num_components(instr->dest) == 1);
4874          fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
4875 
4876          srcs[A64_LOGICAL_ARG] = brw_imm_ud(bit_size);
4877 
4878          bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL, tmp,
4879                   srcs, A64_LOGICAL_NUM_SRCS);
4880          bld.MOV(dest, subscript(tmp, dest.type, 0));
4881       }
4882       break;
4883    }
4884 
4885    case nir_intrinsic_store_global: {
4886       assert(devinfo->ver >= 8);
4887 
4888       assert(nir_src_bit_size(instr->src[0]) <= 32);
4889       assert(nir_intrinsic_write_mask(instr) ==
4890              (1u << instr->num_components) - 1);
4891       assert(nir_intrinsic_align(instr) > 0);
4892 
4893       fs_reg srcs[A64_LOGICAL_NUM_SRCS];
4894       srcs[A64_LOGICAL_ADDRESS] = get_nir_src(instr->src[1]);
4895       srcs[A64_LOGICAL_ENABLE_HELPERS] =
4896          brw_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS);
4897 
4898       if (nir_src_bit_size(instr->src[0]) == 32 &&
4899           nir_intrinsic_align(instr) >= 4) {
4900          assert(nir_src_num_components(instr->src[0]) <= 4);
4901 
4902          srcs[A64_LOGICAL_SRC] = get_nir_src(instr->src[0]); /* Data */
4903          srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components);
4904 
4905          bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL, fs_reg(),
4906                   srcs, A64_LOGICAL_NUM_SRCS);
4907       } else {
4908          assert(nir_src_num_components(instr->src[0]) == 1);
4909          const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4910          brw_reg_type data_type =
4911             brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4912          fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
4913          bld.MOV(tmp, retype(get_nir_src(instr->src[0]), data_type));
4914 
4915          srcs[A64_LOGICAL_SRC] = tmp;
4916          srcs[A64_LOGICAL_ARG] = brw_imm_ud(nir_src_bit_size(instr->src[0]));
4917 
4918          bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL, fs_reg(),
4919                   srcs, A64_LOGICAL_NUM_SRCS);
4920       }
4921       break;
4922    }
4923 
4924    case nir_intrinsic_global_atomic_add:
4925    case nir_intrinsic_global_atomic_imin:
4926    case nir_intrinsic_global_atomic_umin:
4927    case nir_intrinsic_global_atomic_imax:
4928    case nir_intrinsic_global_atomic_umax:
4929    case nir_intrinsic_global_atomic_and:
4930    case nir_intrinsic_global_atomic_or:
4931    case nir_intrinsic_global_atomic_xor:
4932    case nir_intrinsic_global_atomic_exchange:
4933    case nir_intrinsic_global_atomic_comp_swap:
4934       nir_emit_global_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr);
4935       break;
4936    case nir_intrinsic_global_atomic_fadd:
4937    case nir_intrinsic_global_atomic_fmin:
4938    case nir_intrinsic_global_atomic_fmax:
4939    case nir_intrinsic_global_atomic_fcomp_swap:
4940       nir_emit_global_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr);
4941       break;
4942 
4943    case nir_intrinsic_load_global_const_block_intel: {
4944       assert(nir_dest_bit_size(instr->dest) == 32);
4945       assert(instr->num_components == 8 || instr->num_components == 16);
4946 
4947       const fs_builder ubld = bld.exec_all().group(instr->num_components, 0);
4948       fs_reg load_val;
4949 
4950       bool is_pred_const = nir_src_is_const(instr->src[1]);
4951       if (is_pred_const && nir_src_as_uint(instr->src[1]) == 0) {
4952          /* In this case, we don't want the UBO load at all.  We really
4953           * shouldn't get here but it's possible.
4954           */
4955          load_val = brw_imm_ud(0);
4956       } else {
4957          /* The uniformize below may stomp the flag, so do this first. */
4958          fs_reg addr = bld.emit_uniformize(get_nir_src(instr->src[0]));
4959 
4960          load_val = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4961 
4962          /* If the predicate is constant and we got here, then it's non-zero
4963           * and we don't need the predicate at all.
4964           */
4965          if (!is_pred_const) {
4966             /* Load the predicate */
4967             fs_reg pred = bld.emit_uniformize(get_nir_src(instr->src[1]));
4968             fs_inst *mov = ubld.MOV(bld.null_reg_d(), pred);
4969             mov->conditional_mod = BRW_CONDITIONAL_NZ;
4970 
4971             /* Stomp the destination with 0 if we're OOB */
4972             mov = ubld.MOV(load_val, brw_imm_ud(0));
4973             mov->predicate = BRW_PREDICATE_NORMAL;
4974             mov->predicate_inverse = true;
4975          }
4976 
4977          fs_reg srcs[A64_LOGICAL_NUM_SRCS];
4978          srcs[A64_LOGICAL_ADDRESS] = addr;
4979          srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */
4980          srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components);
4981          /* This intrinsic loads memory from a uniform address, sometimes
4982           * shared across lanes. We never need to mask it.
4983           */
4984          srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0);
4985 
4986          fs_inst *load = ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL,
4987                                    load_val, srcs, A64_LOGICAL_NUM_SRCS);
4988          if (!is_pred_const)
4989             load->predicate = BRW_PREDICATE_NORMAL;
4990       }
4991 
4992       /* From the HW perspective, we just did a single SIMD16 instruction
4993        * which loaded a dword in each SIMD channel.  From NIR's perspective,
4994        * this instruction returns a vec16.  Any users of this data in the
4995        * back-end will expect a vec16 per SIMD channel so we have to emit a
4996        * pile of MOVs to resolve this discrepancy.  Fortunately, copy-prop
4997        * will generally clean them up for us.
4998        */
4999       for (unsigned i = 0; i < instr->num_components; i++) {
5000          bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD),
5001                  component(load_val, i));
5002       }
5003       break;
5004    }
5005 
5006    case nir_intrinsic_load_ssbo: {
5007       assert(devinfo->ver >= 7);
5008 
5009       const unsigned bit_size = nir_dest_bit_size(instr->dest);
5010       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5011       srcs[SURFACE_LOGICAL_SRC_SURFACE] =
5012          get_nir_ssbo_intrinsic_index(bld, instr);
5013       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
5014       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
5015       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
5016 
5017       /* Make dest unsigned because that's what the temporary will be */
5018       dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
5019 
5020       /* Read the vector */
5021       assert(nir_dest_bit_size(instr->dest) <= 32);
5022       assert(nir_intrinsic_align(instr) > 0);
5023       if (nir_dest_bit_size(instr->dest) == 32 &&
5024           nir_intrinsic_align(instr) >= 4) {
5025          assert(nir_dest_num_components(instr->dest) <= 4);
5026          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
5027          fs_inst *inst =
5028             bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
5029                      dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5030          inst->size_written = instr->num_components * dispatch_width * 4;
5031       } else {
5032          assert(nir_dest_num_components(instr->dest) == 1);
5033          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
5034 
5035          fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
5036          bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
5037                   read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
5038          bld.MOV(dest, subscript(read_result, dest.type, 0));
5039       }
5040       break;
5041    }
5042 
5043    case nir_intrinsic_store_ssbo: {
5044       assert(devinfo->ver >= 7);
5045 
5046       const unsigned bit_size = nir_src_bit_size(instr->src[0]);
5047       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5048       srcs[SURFACE_LOGICAL_SRC_SURFACE] =
5049          get_nir_ssbo_intrinsic_index(bld, instr);
5050       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[2]);
5051       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
5052       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
5053 
5054       fs_reg data = get_nir_src(instr->src[0]);
5055       data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
5056 
5057       assert(nir_src_bit_size(instr->src[0]) <= 32);
5058       assert(nir_intrinsic_write_mask(instr) ==
5059              (1u << instr->num_components) - 1);
5060       assert(nir_intrinsic_align(instr) > 0);
5061       if (nir_src_bit_size(instr->src[0]) == 32 &&
5062           nir_intrinsic_align(instr) >= 4) {
5063          assert(nir_src_num_components(instr->src[0]) <= 4);
5064          srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5065          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
5066          bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
5067                   fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5068       } else {
5069          assert(nir_src_num_components(instr->src[0]) == 1);
5070          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
5071 
5072          srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
5073          bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
5074 
5075          bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
5076                   fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5077       }
5078       break;
5079    }
5080 
5081    case nir_intrinsic_store_output: {
5082       assert(nir_src_bit_size(instr->src[0]) == 32);
5083       fs_reg src = get_nir_src(instr->src[0]);
5084 
5085       unsigned store_offset = nir_src_as_uint(instr->src[1]);
5086       unsigned num_components = instr->num_components;
5087       unsigned first_component = nir_intrinsic_component(instr);
5088 
5089       fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
5090                                       4 * store_offset), src.type);
5091       for (unsigned j = 0; j < num_components; j++) {
5092          bld.MOV(offset(new_dest, bld, j + first_component),
5093                  offset(src, bld, j));
5094       }
5095       break;
5096    }
5097 
5098    case nir_intrinsic_ssbo_atomic_add:
5099    case nir_intrinsic_ssbo_atomic_imin:
5100    case nir_intrinsic_ssbo_atomic_umin:
5101    case nir_intrinsic_ssbo_atomic_imax:
5102    case nir_intrinsic_ssbo_atomic_umax:
5103    case nir_intrinsic_ssbo_atomic_and:
5104    case nir_intrinsic_ssbo_atomic_or:
5105    case nir_intrinsic_ssbo_atomic_xor:
5106    case nir_intrinsic_ssbo_atomic_exchange:
5107    case nir_intrinsic_ssbo_atomic_comp_swap:
5108       nir_emit_ssbo_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr);
5109       break;
5110    case nir_intrinsic_ssbo_atomic_fadd:
5111    case nir_intrinsic_ssbo_atomic_fmin:
5112    case nir_intrinsic_ssbo_atomic_fmax:
5113    case nir_intrinsic_ssbo_atomic_fcomp_swap:
5114       nir_emit_ssbo_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr);
5115       break;
5116 
5117    case nir_intrinsic_get_ssbo_size: {
5118       assert(nir_src_num_components(instr->src[0]) == 1);
5119       unsigned ssbo_index = nir_src_is_const(instr->src[0]) ?
5120                             nir_src_as_uint(instr->src[0]) : 0;
5121 
5122       /* A resinfo sampler message is used to get the buffer size.  The
5123        * SIMD8 writeback message consists of four registers and the SIMD16
5124        * writeback message consists of eight destination registers (two per
5125        * component).  Because we are only interested in the first channel of
5126        * the first returned component, where resinfo returns the buffer size
5127        * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
5128        * the dispatch width.
5129        */
5130       const fs_builder ubld = bld.exec_all().group(8, 0);
5131       fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
5132       fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
5133 
5134       /* Set LOD = 0 */
5135       ubld.MOV(src_payload, brw_imm_d(0));
5136 
5137       const unsigned index = ssbo_index;
5138       fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
5139                                 src_payload, brw_imm_ud(index));
5140       inst->header_size = 0;
5141       inst->mlen = 1;
5142       inst->size_written = 4 * REG_SIZE;
5143 
5144       /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
5145        *
5146        * "Out-of-bounds checking is always performed at a DWord granularity. If
5147        * any part of the DWord is out-of-bounds then the whole DWord is
5148        * considered out-of-bounds."
5149        *
5150        * This implies that types smaller than 4 bytes need to be padded if
5151        * they don't fill the last DWord of the buffer. But since we need the
5152        * original size in order to report the correct number of elements of
5153        * an unsized array, we have to reverse that padding calculation. Since
5154        * the needed padding was stored in the bottom two bits of the surface
5155        * size, we recover the original buffer_size here by inverting the
5156        * surface_size calculation:
5157        *
5158        * surface_size = isl_align(buffer_size, 4) +
5159        *                (isl_align(buffer_size, 4) - buffer_size)
5160        *
5161        * buffer_size = (surface_size & ~3) - (surface_size & 3)
5162        */
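
      /* Worked example (added; the numbers are illustrative): for
       * buffer_size = 27, surface_size = align(27, 4) + (align(27, 4) - 27)
       * = 28 + 1 = 29, and we recover (29 & ~3) - (29 & 3) = 28 - 1 = 27.
       */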
5163 
5164       fs_reg size_aligned4 = ubld.vgrf(BRW_REGISTER_TYPE_UD);
5165       fs_reg size_padding = ubld.vgrf(BRW_REGISTER_TYPE_UD);
5166       fs_reg buffer_size = ubld.vgrf(BRW_REGISTER_TYPE_UD);
5167 
5168       ubld.AND(size_padding, ret_payload, brw_imm_ud(3));
5169       ubld.AND(size_aligned4, ret_payload, brw_imm_ud(~3));
5170       ubld.ADD(buffer_size, size_aligned4, negate(size_padding));
5171 
5172       bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
5173       break;
5174    }
5175 
5176    case nir_intrinsic_load_scratch: {
5177       assert(devinfo->ver >= 7);
5178 
5179       assert(nir_dest_num_components(instr->dest) == 1);
5180       const unsigned bit_size = nir_dest_bit_size(instr->dest);
5181       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5182 
5183       if (devinfo->verx10 >= 125) {
5184          const fs_builder ubld = bld.exec_all().group(1, 0);
5185          fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0);
5186          ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
5187                           brw_imm_ud(~0x3ffu));
5188          srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle;
5189       } else if (devinfo->ver >= 8) {
5190          srcs[SURFACE_LOGICAL_SRC_SURFACE] =
5191             brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
5192       } else {
5193          srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS);
5194       }
5195 
5196       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
5197       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
5198       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
5199       const fs_reg nir_addr = get_nir_src(instr->src[0]);
5200 
5201       /* Make dest unsigned because that's what the temporary will be */
5202       dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
5203 
5204       /* Read the vector */
5205       assert(nir_dest_num_components(instr->dest) == 1);
5206       assert(nir_dest_bit_size(instr->dest) <= 32);
5207       assert(nir_intrinsic_align(instr) > 0);
5208       if (nir_dest_bit_size(instr->dest) == 32 &&
5209           nir_intrinsic_align(instr) >= 4) {
5210          if (devinfo->verx10 >= 125) {
5211             assert(nir_dest_bit_size(instr->dest) == 32 &&
5212                    nir_intrinsic_align(instr) >= 4);
5213 
5214             srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5215                swizzle_nir_scratch_addr(bld, nir_addr, false);
5216             srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1);
5217 
5218             bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
5219                      dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5220          } else {
5221             /* The offset for a DWORD scattered message is in dwords. */
5222             srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5223                swizzle_nir_scratch_addr(bld, nir_addr, true);
5224 
5225             bld.emit(SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL,
5226                      dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5227          }
5228       } else {
5229          srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5230             swizzle_nir_scratch_addr(bld, nir_addr, false);
5231 
5232          fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
5233          bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
5234                   read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
5235          bld.MOV(dest, read_result);
5236       }
5237 
5238       shader_stats.fill_count += DIV_ROUND_UP(dispatch_width, 16);
5239       break;
5240    }
5241 
5242    case nir_intrinsic_store_scratch: {
5243       assert(devinfo->ver >= 7);
5244 
5245       assert(nir_src_num_components(instr->src[0]) == 1);
5246       const unsigned bit_size = nir_src_bit_size(instr->src[0]);
5247       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5248 
5249       if (devinfo->verx10 >= 125) {
5250          const fs_builder ubld = bld.exec_all().group(1, 0);
5251          fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0);
5252          ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
5253                           brw_imm_ud(~0x3ffu));
5254          srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle;
5255       } else if (devinfo->ver >= 8) {
5256          srcs[SURFACE_LOGICAL_SRC_SURFACE] =
5257             brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
5258       } else {
5259          srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS);
5260       }
5261 
5262       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
5263       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
5264       /**
5265        * While this instruction has side-effects, it should not be predicated
5266        * on sample mask, because otherwise fs helper invocations would
5267        * load undefined values from scratch memory. And scratch memory
5268        * load-stores are produced from operations without side-effects, thus
5269        * they should not have different behaviour in the helper invocations.
5270        */
5271       srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
5272       const fs_reg nir_addr = get_nir_src(instr->src[1]);
5273 
5274       fs_reg data = get_nir_src(instr->src[0]);
5275       data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
5276 
5277       assert(nir_src_num_components(instr->src[0]) == 1);
5278       assert(nir_src_bit_size(instr->src[0]) <= 32);
5279       assert(nir_intrinsic_write_mask(instr) == 1);
5280       assert(nir_intrinsic_align(instr) > 0);
5281       if (nir_src_bit_size(instr->src[0]) == 32 &&
5282           nir_intrinsic_align(instr) >= 4) {
5283          if (devinfo->verx10 >= 125) {
5284             srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5285 
5286             srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5287                swizzle_nir_scratch_addr(bld, nir_addr, false);
5288             srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1);
5289 
5290             bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
5291                      dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5292          } else {
5293             srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5294 
5295             /* The offset for a DWORD scattered message is in dwords. */
5296             srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5297                swizzle_nir_scratch_addr(bld, nir_addr, true);
5298 
5299             bld.emit(SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL,
5300                      fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5301          }
5302       } else {
5303          srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
5304          bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
5305 
5306          srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5307             swizzle_nir_scratch_addr(bld, nir_addr, false);
5308 
5309          bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
5310                   fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5311       }
5312       shader_stats.spill_count += DIV_ROUND_UP(dispatch_width, 16);
5313       break;
5314    }
5315 
5316    case nir_intrinsic_load_subgroup_size:
5317       /* This should only happen for fragment shaders because every other case
5318        * is lowered in NIR so we can optimize based on it.
5319        */
5320       assert(stage == MESA_SHADER_FRAGMENT);
5321       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(dispatch_width));
5322       break;
5323 
5324    case nir_intrinsic_load_subgroup_invocation:
5325       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
5326               nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
5327       break;
5328 
5329    case nir_intrinsic_load_subgroup_eq_mask:
5330    case nir_intrinsic_load_subgroup_ge_mask:
5331    case nir_intrinsic_load_subgroup_gt_mask:
5332    case nir_intrinsic_load_subgroup_le_mask:
5333    case nir_intrinsic_load_subgroup_lt_mask:
5334       unreachable("not reached");
5335 
5336    case nir_intrinsic_vote_any: {
5337       const fs_builder ubld = bld.exec_all().group(1, 0);
5338 
5339       /* The any/all predicates do not consider channel enables. To prevent
5340        * dead channels from affecting the result, we initialize the flag with
5341        * the identity value for the logical operation.
5342        */
5343       if (dispatch_width == 32) {
5344          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
5345          ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
5346                          brw_imm_ud(0));
5347       } else {
5348          ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0));
5349       }
5350       bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);
5351 
5352       /* For some reason, the any/all predicates don't work properly with
5353        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
5354        * doesn't read the correct subset of the flag register and you end up
5355        * getting garbage in the second half.  Work around this by using a pair
5356        * of 1-wide MOVs and scattering the result.
5357        */
5358       fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
5359       ubld.MOV(res1, brw_imm_d(0));
5360       set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ANY8H :
5361                     dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :
5362                                            BRW_PREDICATE_ALIGN1_ANY32H,
5363                     ubld.MOV(res1, brw_imm_d(-1)));
5364 
5365       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
5366       break;
5367    }
5368    case nir_intrinsic_vote_all: {
5369       const fs_builder ubld = bld.exec_all().group(1, 0);
5370 
5371       /* The any/all predicates do not consider channel enables. To prevent
5372        * dead channels from affecting the result, we initialize the flag
5373        * with the identity value for the logical operation.
5374        */
5375       if (dispatch_width == 32) {
5376          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
5377          ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
5378                          brw_imm_ud(0xffffffff));
5379       } else {
5380          ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
5381       }
5382       bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);
5383 
5384       /* For some reason, the any/all predicates don't work properly with
5385        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
5386        * doesn't read the correct subset of the flag register and you end up
5387        * getting garbage in the second half.  Work around this by using a pair
5388        * of 1-wide MOVs and scattering the result.
5389        */
5390       fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
5391       ubld.MOV(res1, brw_imm_d(0));
5392       set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
5393                     dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
5394                                            BRW_PREDICATE_ALIGN1_ALL32H,
5395                     ubld.MOV(res1, brw_imm_d(-1)));
5396 
5397       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
5398       break;
5399    }
5400    case nir_intrinsic_vote_feq:
5401    case nir_intrinsic_vote_ieq: {
5402       fs_reg value = get_nir_src(instr->src[0]);
5403       if (instr->intrinsic == nir_intrinsic_vote_feq) {
5404          const unsigned bit_size = nir_src_bit_size(instr->src[0]);
5405          value.type = bit_size == 8 ? BRW_REGISTER_TYPE_B :
5406             brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F);
5407       }
5408 
5409       fs_reg uniformized = bld.emit_uniformize(value);
5410       const fs_builder ubld = bld.exec_all().group(1, 0);
5411 
5412       /* The any/all predicates do not consider channel enables. To prevent
5413        * dead channels from affecting the result, we initialize the flag
5414        * with the identity value for the logical operation.
5415        */
5416       if (dispatch_width == 32) {
5417          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
5418          ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
5419                          brw_imm_ud(0xffffffff));
5420       } else {
5421          ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
5422       }
5423       bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z);
5424 
5425       /* For some reason, the any/all predicates don't work properly with
5426        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
5427        * doesn't read the correct subset of the flag register and you end up
5428        * getting garbage in the second half.  Work around this by using a pair
5429        * of 1-wide MOVs and scattering the result.
5430        */
5431       fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
5432       ubld.MOV(res1, brw_imm_d(0));
5433       set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
5434                     dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
5435                                            BRW_PREDICATE_ALIGN1_ALL32H,
5436                     ubld.MOV(res1, brw_imm_d(-1)));
5437 
5438       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
5439       break;
5440    }
5441 
5442    case nir_intrinsic_ballot: {
5443       const fs_reg value = retype(get_nir_src(instr->src[0]),
5444                                   BRW_REGISTER_TYPE_UD);
5445       struct brw_reg flag = brw_flag_reg(0, 0);
5446       /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well
5447        * as f0.0.  This is a problem for fragment programs as we currently use
5448        * f0.1 for discards.  Fortunately, we don't support SIMD32 fragment
5449        * programs yet so this isn't a problem.  When we do, something will
5450        * have to change.
5451        */
5452       if (dispatch_width == 32)
5453          flag.type = BRW_REGISTER_TYPE_UD;
5454 
5455       bld.exec_all().group(1, 0).MOV(flag, brw_imm_ud(0u));
5456       bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ);
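      /* After this CMP the flag register holds one bit per channel that passed
       * the comparison (dead channels keep the 0 written above), which is
       * exactly the ballot value copied into the destination below.
       */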
5457 
5458       if (instr->dest.ssa.bit_size > 32) {
5459          dest.type = BRW_REGISTER_TYPE_UQ;
5460       } else {
5461          dest.type = BRW_REGISTER_TYPE_UD;
5462       }
5463       bld.MOV(dest, flag);
5464       break;
5465    }
5466 
5467    case nir_intrinsic_read_invocation: {
5468       const fs_reg value = get_nir_src(instr->src[0]);
5469       const fs_reg invocation = get_nir_src(instr->src[1]);
5470       fs_reg tmp = bld.vgrf(value.type);
5471 
5472       bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value,
5473                           bld.emit_uniformize(invocation));
5474 
5475       bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0)));
5476       break;
5477    }
5478 
5479    case nir_intrinsic_read_first_invocation: {
5480       const fs_reg value = get_nir_src(instr->src[0]);
5481       bld.MOV(retype(dest, value.type), bld.emit_uniformize(value));
5482       break;
5483    }
5484 
5485    case nir_intrinsic_shuffle: {
5486       const fs_reg value = get_nir_src(instr->src[0]);
5487       const fs_reg index = get_nir_src(instr->src[1]);
5488 
5489       bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
5490       break;
5491    }
5492 
5493    case nir_intrinsic_first_invocation: {
5494       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
5495       bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
5496       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
5497               fs_reg(component(tmp, 0)));
5498       break;
5499    }
5500 
5501    case nir_intrinsic_last_invocation: {
5502       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
5503       bld.exec_all().emit(SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL, tmp);
5504       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
5505               fs_reg(component(tmp, 0)));
5506       break;
5507    }
5508 
5509    case nir_intrinsic_quad_broadcast: {
5510       const fs_reg value = get_nir_src(instr->src[0]);
5511       const unsigned index = nir_src_as_uint(instr->src[1]);
5512 
5513       bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),
5514                value, brw_imm_ud(index), brw_imm_ud(4));
5515       break;
5516    }
5517 
5518    case nir_intrinsic_quad_swap_horizontal: {
5519       const fs_reg value = get_nir_src(instr->src[0]);
5520       const fs_reg tmp = bld.vgrf(value.type);
5521       if (devinfo->ver <= 7) {
5522          /* The hardware doesn't seem to support these crazy regions with
5523           * compressed instructions on gfx7 and earlier so we fall back to
5524           * using quad swizzles.  Fortunately, we don't support 64-bit
5525           * anything in Vulkan on gfx7.
5526           */
5527          assert(nir_src_bit_size(instr->src[0]) == 32);
5528          const fs_builder ubld = bld.exec_all();
5529          ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
5530                    brw_imm_ud(BRW_SWIZZLE4(1,0,3,2)));
5532       } else {
5533          const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0);
5534 
5535          const fs_reg src_left = horiz_stride(value, 2);
5536          const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
5537          const fs_reg tmp_left = horiz_stride(tmp, 2);
5538          const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);
5539 
5540          ubld.MOV(tmp_left, src_right);
5541          ubld.MOV(tmp_right, src_left);
5542 
5543       }
5544       bld.MOV(retype(dest, value.type), tmp);
5545       break;
5546    }
5547 
5548    case nir_intrinsic_quad_swap_vertical: {
5549       const fs_reg value = get_nir_src(instr->src[0]);
5550       if (nir_src_bit_size(instr->src[0]) == 32) {
5551          /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
5552          const fs_reg tmp = bld.vgrf(value.type);
5553          const fs_builder ubld = bld.exec_all();
5554          ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
5555                    brw_imm_ud(BRW_SWIZZLE4(2,3,0,1)));
5556          bld.MOV(retype(dest, value.type), tmp);
5557       } else {
5558          /* For larger data types, we have to either emit dispatch_width many
5559           * MOVs or else fall back to doing indirects.
5560           */
5561          fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
5562          bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
5563                       brw_imm_w(0x2));
5564          bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
5565       }
5566       break;
5567    }
5568 
5569    case nir_intrinsic_quad_swap_diagonal: {
5570       const fs_reg value = get_nir_src(instr->src[0]);
5571       if (nir_src_bit_size(instr->src[0]) == 32) {
5572          /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
5573          const fs_reg tmp = bld.vgrf(value.type);
5574          const fs_builder ubld = bld.exec_all();
5575          ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
5576                    brw_imm_ud(BRW_SWIZZLE4(3,2,1,0)));
5577          bld.MOV(retype(dest, value.type), tmp);
5578       } else {
5579          /* For larger data types, we have to either emit dispatch_width many
5580           * MOVs or else fall back to doing indirects.
5581           */
5582          fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
5583          bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
5584                       brw_imm_w(0x3));
5585          bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
5586       }
5587       break;
5588    }
5589 
5590    case nir_intrinsic_reduce: {
5591       fs_reg src = get_nir_src(instr->src[0]);
5592       nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
5593       unsigned cluster_size = nir_intrinsic_cluster_size(instr);
5594       if (cluster_size == 0 || cluster_size > dispatch_width)
5595          cluster_size = dispatch_width;
5596 
5597       /* Figure out the source type */
5598       src.type = brw_type_for_nir_type(devinfo,
5599          (nir_alu_type)(nir_op_infos[redop].input_types[0] |
5600                         nir_src_bit_size(instr->src[0])));
5601 
5602       fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
5603       opcode brw_op = brw_op_for_nir_reduction_op(redop);
5604       brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
5605 
5606       /* Set up a register for all of our scratching around and initialize it
5607        * to the reduction operation's identity value.
5608        */
5609       fs_reg scan = bld.vgrf(src.type);
5610       bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
5611 
5612       bld.emit_scan(brw_op, scan, cluster_size, cond_mod);
5613 
5614       dest.type = src.type;
5615       if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {
5616          /* In this case, the CLUSTER_BROADCAST instruction isn't needed
5617           * because the distance between clusters is at least 2 GRFs, so we
5618           * can skip its weird striding and just do regular MOVs from the
5619           * last channel of each cluster.
5620           */
5621          assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);
5622          const unsigned groups =
5623             (dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);
5624          const unsigned group_size = dispatch_width / groups;
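         /* Each group copies from the last channel of its cluster, which is
          * where emit_scan() left the fully-accumulated value.
          */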
5625          for (unsigned i = 0; i < groups; i++) {
5626             const unsigned cluster = (i * group_size) / cluster_size;
5627             const unsigned comp = cluster * cluster_size + (cluster_size - 1);
5628             bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),
5629                                          component(scan, comp));
5630          }
5631       } else {
5632          bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,
5633                   brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size));
5634       }
5635       break;
5636    }
5637 
5638    case nir_intrinsic_inclusive_scan:
5639    case nir_intrinsic_exclusive_scan: {
5640       fs_reg src = get_nir_src(instr->src[0]);
5641       nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
5642 
5643       /* Figure out the source type */
5644       src.type = brw_type_for_nir_type(devinfo,
5645          (nir_alu_type)(nir_op_infos[redop].input_types[0] |
5646                         nir_src_bit_size(instr->src[0])));
5647 
5648       fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
5649       opcode brw_op = brw_op_for_nir_reduction_op(redop);
5650       brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
5651 
5652       /* Set up a register for all of our scratching around and initialize it
5653        * to the reduction operation's identity value.
5654        */
5655       fs_reg scan = bld.vgrf(src.type);
5656       const fs_builder allbld = bld.exec_all();
5657       allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
5658 
5659       if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
5660          /* Exclusive scan is a bit harder because we have to do an annoying
5661           * shift of the contents before we can begin.  To make things worse,
5662           * we can't do this with a normal stride; we have to use indirects.
5663           */
5664          fs_reg shifted = bld.vgrf(src.type);
5665          fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
5666          allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
5667                          brw_imm_w(-1));
5668          allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
5669          allbld.group(1, 0).MOV(component(shifted, 0), identity);
5670          scan = shifted;
5671       }
5672 
5673       bld.emit_scan(brw_op, scan, dispatch_width, cond_mod);
5674 
5675       bld.MOV(retype(dest, src.type), scan);
5676       break;
5677    }
5678 
5679    case nir_intrinsic_load_global_block_intel: {
5680       assert(nir_dest_bit_size(instr->dest) == 32);
5681 
5682       fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[0]));
5683 
5684       const fs_builder ubld1 = bld.exec_all().group(1, 0);
5685       const fs_builder ubld8 = bld.exec_all().group(8, 0);
5686       const fs_builder ubld16 = bld.exec_all().group(16, 0);
5687 
5688       const unsigned total = instr->num_components * dispatch_width;
5689       unsigned loaded = 0;
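      /* Read the whole dispatch_width x num_components dword payload in
       * oword-block sized chunks, bumping the A64 address past each chunk as
       * we go.
       */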
5690 
5691       while (loaded < total) {
5692          const unsigned block =
5693             choose_oword_block_size_dwords(total - loaded);
5694          const unsigned block_bytes = block * 4;
5695 
5696          const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
5697 
5698          fs_reg srcs[A64_LOGICAL_NUM_SRCS];
5699          srcs[A64_LOGICAL_ADDRESS] = address;
5700          srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */
5701          srcs[A64_LOGICAL_ARG] = brw_imm_ud(block);
5702          srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(1);
5703          ubld.emit(SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
5704                    retype(byte_offset(dest, loaded * 4), BRW_REGISTER_TYPE_UD),
5705                    srcs, A64_LOGICAL_NUM_SRCS)->size_written = block_bytes;
5706 
5707          increment_a64_address(ubld1, address, block_bytes);
5708          loaded += block;
5709       }
5710 
5711       assert(loaded == total);
5712       break;
5713    }
5714 
5715    case nir_intrinsic_store_global_block_intel: {
5716       assert(nir_src_bit_size(instr->src[0]) == 32);
5717 
5718       fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[1]));
5719       fs_reg src = get_nir_src(instr->src[0]);
5720 
5721       const fs_builder ubld1 = bld.exec_all().group(1, 0);
5722       const fs_builder ubld8 = bld.exec_all().group(8, 0);
5723       const fs_builder ubld16 = bld.exec_all().group(16, 0);
5724 
5725       const unsigned total = instr->num_components * dispatch_width;
5726       unsigned written = 0;
5727 
5728       while (written < total) {
5729          const unsigned block =
5730             choose_oword_block_size_dwords(total - written);
5731 
5732          fs_reg srcs[A64_LOGICAL_NUM_SRCS];
5733          srcs[A64_LOGICAL_ADDRESS] = address;
5734          srcs[A64_LOGICAL_SRC] = retype(byte_offset(src, written * 4),
5735                                         BRW_REGISTER_TYPE_UD);
5736          srcs[A64_LOGICAL_ARG] = brw_imm_ud(block);
5737          srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0);
5738 
5739          const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
5740          ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL, fs_reg(),
5741                    srcs, A64_LOGICAL_NUM_SRCS);
5742 
5743          const unsigned block_bytes = block * 4;
5744          increment_a64_address(ubld1, address, block_bytes);
5745          written += block;
5746       }
5747 
5748       assert(written == total);
5749       break;
5750    }
5751 
5752    case nir_intrinsic_load_shared_block_intel:
5753    case nir_intrinsic_load_ssbo_block_intel: {
5754       assert(nir_dest_bit_size(instr->dest) == 32);
5755 
5756       const bool is_ssbo =
5757          instr->intrinsic == nir_intrinsic_load_ssbo_block_intel;
5758       fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[is_ssbo ? 1 : 0]));
5759 
5760       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5761       srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ?
5762          get_nir_ssbo_intrinsic_index(bld, instr) : fs_reg(brw_imm_ud(GFX7_BTI_SLM));
5763       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address;
5764 
5765       const fs_builder ubld1 = bld.exec_all().group(1, 0);
5766       const fs_builder ubld8 = bld.exec_all().group(8, 0);
5767       const fs_builder ubld16 = bld.exec_all().group(16, 0);
5768 
5769       const unsigned total = instr->num_components * dispatch_width;
5770       unsigned loaded = 0;
5771 
5772       while (loaded < total) {
5773          const unsigned block =
5774             choose_oword_block_size_dwords(total - loaded);
5775          const unsigned block_bytes = block * 4;
5776 
5777          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block);
5778 
5779          const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
5780          ubld.emit(SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
5781                    retype(byte_offset(dest, loaded * 4), BRW_REGISTER_TYPE_UD),
5782                    srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = block_bytes;
5783 
5784          ubld1.ADD(address, address, brw_imm_ud(block_bytes));
5785          loaded += block;
5786       }
5787 
5788       assert(loaded == total);
5789       break;
5790    }
5791 
5792    case nir_intrinsic_store_shared_block_intel:
5793    case nir_intrinsic_store_ssbo_block_intel: {
5794       assert(nir_src_bit_size(instr->src[0]) == 32);
5795 
5796       const bool is_ssbo =
5797          instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
5798 
5799       fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[is_ssbo ? 2 : 1]));
5800       fs_reg src = get_nir_src(instr->src[0]);
5801 
5802       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5803       srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ?
5804          get_nir_ssbo_intrinsic_index(bld, instr) : fs_reg(brw_imm_ud(GFX7_BTI_SLM));
5805       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address;
5806 
5807       const fs_builder ubld1 = bld.exec_all().group(1, 0);
5808       const fs_builder ubld8 = bld.exec_all().group(8, 0);
5809       const fs_builder ubld16 = bld.exec_all().group(16, 0);
5810 
5811       const unsigned total = instr->num_components * dispatch_width;
5812       unsigned written = 0;
5813 
5814       while (written < total) {
5815          const unsigned block =
5816             choose_oword_block_size_dwords(total - written);
5817 
5818          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block);
5819          srcs[SURFACE_LOGICAL_SRC_DATA] =
5820             retype(byte_offset(src, written * 4), BRW_REGISTER_TYPE_UD);
5821 
5822          const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
5823          ubld.emit(SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL,
5824                    fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5825 
5826          const unsigned block_bytes = block * 4;
5827          ubld1.ADD(address, address, brw_imm_ud(block_bytes));
5828          written += block;
5829       }
5830 
5831       assert(written == total);
5832       break;
5833    }
5834 
5835    case nir_intrinsic_load_topology_id_intel: {
5836        /* These move around basically every hardware generation, so don't
5837         * do any >= checks and fail if the platform hasn't explicitly
5838         * been enabled here.
5839         */
5840       assert(devinfo->ver == 12);
5841 
5842       /* Here is what the layout of SR0 looks like on Gfx12:
5843        *   [13:11] : Slice ID.
5844        *   [10:9]  : Dual-SubSlice ID
5845        *   [8]     : SubSlice ID
5846        *   [7]     : EUID[2] (aka EU Row ID)
5847        *   [6]     : Reserved
5848        *   [5:4]   : EUID[1:0]
5849        *   [2:0]   : Thread ID
5850        */
5851       fs_reg raw_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
5852       bld.emit(SHADER_OPCODE_READ_SR_REG, raw_id, brw_imm_ud(0));
5853       switch (nir_intrinsic_base(instr)) {
5854       case BRW_TOPOLOGY_ID_DSS:
5855          bld.AND(raw_id, raw_id, brw_imm_ud(0x3fff));
5856          /* Get rid of anything below dualsubslice */
5857          bld.SHR(retype(dest, BRW_REGISTER_TYPE_UD), raw_id, brw_imm_ud(9));
5858          break;
5859       case BRW_TOPOLOGY_ID_EU_THREAD_SIMD: {
5860          limit_dispatch_width(16, "Topology helper for Ray queries, "
5861                               "not supported in SIMD32 mode.");
5862          fs_reg dst = retype(dest, BRW_REGISTER_TYPE_UD);
5863 
5864          /* EU[3:0] << 7
5865           *
5866           * The 4-bit EU[3:0] we need to build for ray query memory address
5867           * computations is a bit odd:
5868           *
5869           *   EU[1:0] = raw_id[5:4] (identified as EUID[1:0])
5870           *   EU[2]   = raw_id[8]   (identified as SubSlice ID)
5871           *   EU[3]   = raw_id[7]   (identified as EUID[2] or Row ID)
5872           */
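         /* Purely illustrative example: raw_id = 0x1b4 has raw_id[5:4] = 3,
          * raw_id[8] = 1 and raw_id[7] = 1, so EU[3:0] = 0xf and the shifts
          * below OR 0xf << 7 = 0x780 into dst.
          */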
5873          {
5874             fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
5875             bld.AND(tmp, raw_id, brw_imm_ud(INTEL_MASK(7, 7)));
5876             bld.SHL(dst, tmp, brw_imm_ud(3));
5877             bld.AND(tmp, raw_id, brw_imm_ud(INTEL_MASK(8, 8)));
5878             bld.SHL(tmp, tmp, brw_imm_ud(1));
5879             bld.OR(dst, dst, tmp);
5880             bld.AND(tmp, raw_id, brw_imm_ud(INTEL_MASK(5, 4)));
5881             bld.SHL(tmp, tmp, brw_imm_ud(3));
5882             bld.OR(dst, dst, tmp);
5883          }
5884 
5885          /* ThreadID[2:0] << 4 (ThreadID comes from raw_id[2:0]) */
5886          {
5887             bld.AND(raw_id, raw_id, brw_imm_ud(INTEL_MASK(2, 0)));
5888             bld.SHL(raw_id, raw_id, brw_imm_ud(4));
5889             bld.OR(dst, dst, raw_id);
5890          }
5891 
5892          /* LaneID[0:3] << 0 (We build up LaneID by putting the right number
5893           *                   in each lane)
5894           */
5895          fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW);
5896          const fs_builder ubld8 = bld.exec_all().group(8, 0);
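         /* brw_imm_v() packs eight 4-bit values, so this single MOV writes
          * lane IDs 0..7 into the low half of tmp.
          */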
5897          ubld8.MOV(quarter(tmp, 0), brw_imm_v(0x76543210));
5898          if (bld.dispatch_width() == 16) {
5899             /* Set the upper half of the register to 0xfedcba98. */
5900             ubld8.ADD(quarter(tmp, 1), quarter(tmp, 0), brw_imm_ud(8));
5901          }
5902          bld.ADD(dst, dst, tmp);
5903          break;
5904       }
5905       default:
5906          unreachable("Invalid topology id type");
5907       }
5908       break;
5909    }
5910 
5911    case nir_intrinsic_load_btd_stack_id_intel:
5912       if (stage == MESA_SHADER_COMPUTE) {
5913          assert(brw_cs_prog_data(prog_data)->uses_btd_stack_ids);
5914       } else {
5915          assert(brw_shader_stage_is_bindless(stage));
5916       }
5917       /* Stack IDs are always in R1 regardless of whether we're coming from a
5918        * bindless shader or a regular compute shader.
5919        */
5920       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
5921               retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW));
5922       break;
5923 
5924    case nir_intrinsic_btd_spawn_intel:
5925       if (stage == MESA_SHADER_COMPUTE) {
5926          assert(brw_cs_prog_data(prog_data)->uses_btd_stack_ids);
5927       } else {
5928          assert(brw_shader_stage_is_bindless(stage));
5929       }
5930       /* Make sure all the pointers to resume shaders have landed where other
5931        * threads can see them.
5932        */
5933       emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE);
5934 
5935       bld.emit(SHADER_OPCODE_BTD_SPAWN_LOGICAL, bld.null_reg_ud(),
5936                bld.emit_uniformize(get_nir_src(instr->src[0])),
5937                get_nir_src(instr->src[1]));
5938       break;
5939 
5940    case nir_intrinsic_btd_retire_intel:
5941       if (stage == MESA_SHADER_COMPUTE) {
5942          assert(brw_cs_prog_data(prog_data)->uses_btd_stack_ids);
5943       } else {
5944          assert(brw_shader_stage_is_bindless(stage));
5945       }
5946       /* Make sure all the pointers to resume shaders have landed where other
5947        * threads can see them.
5948        */
5949       emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE);
5950       bld.emit(SHADER_OPCODE_BTD_RETIRE_LOGICAL);
5951       break;
5952 
5953    case nir_intrinsic_trace_ray_intel: {
5954       const bool synchronous = nir_intrinsic_synchronous(instr);
5955       assert(brw_shader_stage_is_bindless(stage) || synchronous);
5956 
5957       /* Make sure all the previous RT structure writes are visible to the RT
5958        * fixed function within the DSS, as well as stack pointers to resume
5959        * shaders.
5960        */
5961       emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE);
5962 
5963       fs_reg srcs[RT_LOGICAL_NUM_SRCS];
5964 
5965       fs_reg globals = get_nir_src(instr->src[0]);
5966       srcs[RT_LOGICAL_SRC_GLOBALS] = bld.emit_uniformize(globals);
5967       srcs[RT_LOGICAL_SRC_BVH_LEVEL] = get_nir_src(instr->src[1]);
5968       srcs[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] = get_nir_src(instr->src[2]);
5969       srcs[RT_LOGICAL_SRC_SYNCHRONOUS] = brw_imm_ud(synchronous);
5970       bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL, bld.null_reg_ud(),
5971                srcs, RT_LOGICAL_NUM_SRCS);
5972 
5973       /* There is no actual value to use in the destination register of the
5974        * synchronous trace instruction. All of the communication with the HW
5975        * unit happens through memory reads/writes. So to ensure that the
5976        * operation has completed before we go read the results in memory, we
5977        * need a barrier followed by an invalidate before accessing memory.
5978        */
5979       if (synchronous) {
5980          bld.emit(BRW_OPCODE_SYNC, bld.null_reg_ud(), brw_imm_ud(TGL_SYNC_ALLWR));
5981          emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_INVALIDATE);
5982       }
5983       break;
5984    }
5985 
5986    default:
5987 #ifndef NDEBUG
5988       assert(instr->intrinsic < nir_num_intrinsics);
5989       fprintf(stderr, "intrinsic: %s\n", nir_intrinsic_infos[instr->intrinsic].name);
5990 #endif
5991       unreachable("unknown intrinsic");
5992    }
5993 }
5994 
5995 void
5996 fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
5997                                  int op, nir_intrinsic_instr *instr)
5998 {
5999    /* The BTI untyped atomic messages only support 32-bit atomics.  If you
6000     * just look at the big table of messages in Vol 7 of the SKL PRM, 64-bit
6001     * atomics appear to exist.  However, if you look at Vol 2a, there are no
6002     * message descriptors provided for Qword atomic ops except for A64 messages.
6003     */
6004    assert(nir_dest_bit_size(instr->dest) == 32 ||
6005           (nir_dest_bit_size(instr->dest) == 64 && devinfo->has_lsc));
6006 
6007    fs_reg dest;
6008    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
6009       dest = get_nir_dest(instr->dest);
6010 
6011    fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6012    srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr);
6013    srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
6014    srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
6015    srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
6016    srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
6017 
6018    fs_reg data;
6019    if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
6020       data = get_nir_src(instr->src[2]);
6021 
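   /* CMPWR needs two operands, so both are packed into a single payload with
    * LOAD_PAYLOAD and handed to the logical message as one data source.
    */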
6022    if (op == BRW_AOP_CMPWR) {
6023       fs_reg tmp = bld.vgrf(data.type, 2);
6024       fs_reg sources[2] = { data, get_nir_src(instr->src[3]) };
6025       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
6026       data = tmp;
6027    }
6028    srcs[SURFACE_LOGICAL_SRC_DATA] = data;
6029 
6030    /* Emit the actual atomic operation */
6031 
6032    bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
6033             dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
6034 }
6035 
6036 void
6037 fs_visitor::nir_emit_ssbo_atomic_float(const fs_builder &bld,
6038                                        int op, nir_intrinsic_instr *instr)
6039 {
6040    fs_reg dest;
6041    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
6042       dest = get_nir_dest(instr->dest);
6043 
6044    fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6045    srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr);
6046    srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
6047    srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
6048    srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
6049    srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
6050 
6051    fs_reg data = get_nir_src(instr->src[2]);
6052    if (op == BRW_AOP_FCMPWR) {
6053       fs_reg tmp = bld.vgrf(data.type, 2);
6054       fs_reg sources[2] = { data, get_nir_src(instr->src[3]) };
6055       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
6056       data = tmp;
6057    }
6058    srcs[SURFACE_LOGICAL_SRC_DATA] = data;
6059 
6060    /* Emit the actual atomic operation */
6061 
6062    bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL,
6063             dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
6064 }
6065 
6066 void
6067 fs_visitor::nir_emit_shared_atomic(const fs_builder &bld,
6068                                    int op, nir_intrinsic_instr *instr)
6069 {
6070    fs_reg dest;
6071    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
6072       dest = get_nir_dest(instr->dest);
6073 
6074    fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6075    srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM);
6076    srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
6077    srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
6078    srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
6079 
6080    fs_reg data;
6081    if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
6082       data = get_nir_src(instr->src[1]);
6083    if (op == BRW_AOP_CMPWR) {
6084       fs_reg tmp = bld.vgrf(data.type, 2);
6085       fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
6086       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
6087       data = tmp;
6088    }
6089    srcs[SURFACE_LOGICAL_SRC_DATA] = data;
6090 
6091    /* Get the offset */
6092    if (nir_src_is_const(instr->src[0])) {
6093       srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
6094          brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0]));
6095    } else {
6096       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type);
6097       bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
6098 	      retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
6099 	      brw_imm_ud(instr->const_index[0]));
6100    }
6101 
6102    /* Emit the actual atomic operation */
6103 
6104    bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
6105             dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
6106 }
6107 
6108 void
6109 fs_visitor::nir_emit_shared_atomic_float(const fs_builder &bld,
6110                                          int op, nir_intrinsic_instr *instr)
6111 {
6112    fs_reg dest;
6113    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
6114       dest = get_nir_dest(instr->dest);
6115 
6116    fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6117    srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM);
6118    srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
6119    srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
6120    srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
6121 
6122    fs_reg data = get_nir_src(instr->src[1]);
6123    if (op == BRW_AOP_FCMPWR) {
6124       fs_reg tmp = bld.vgrf(data.type, 2);
6125       fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
6126       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
6127       data = tmp;
6128    }
6129    srcs[SURFACE_LOGICAL_SRC_DATA] = data;
6130 
6131    /* Get the offset */
6132    if (nir_src_is_const(instr->src[0])) {
6133       srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
6134          brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0]));
6135    } else {
6136       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type);
6137       bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
6138 	      retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
6139 	      brw_imm_ud(instr->const_index[0]));
6140    }
6141 
6142    /* Emit the actual atomic operation */
6143 
6144    bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL,
6145             dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
6146 }
6147 
6148 static fs_reg
6149 expand_to_32bit(const fs_builder &bld, const fs_reg &src)
6150 {
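   /* The 32-bit atomic messages want a full dword per channel, so 16-bit
    * sources get zero-extended into a fresh UD temporary; wider types pass
    * through untouched.
    */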
6151    if (type_sz(src.type) == 2) {
6152       fs_reg src32 = bld.vgrf(BRW_REGISTER_TYPE_UD);
6153       bld.MOV(src32, retype(src, BRW_REGISTER_TYPE_UW));
6154       return src32;
6155    } else {
6156       return src;
6157    }
6158 }
6159 
6160 void
6161 fs_visitor::nir_emit_global_atomic(const fs_builder &bld,
6162                                    int op, nir_intrinsic_instr *instr)
6163 {
6164    fs_reg dest;
6165    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
6166       dest = get_nir_dest(instr->dest);
6167 
6168    fs_reg addr = get_nir_src(instr->src[0]);
6169 
6170    fs_reg data;
6171    if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
6172       data = expand_to_32bit(bld, get_nir_src(instr->src[1]));
6173 
6174    if (op == BRW_AOP_CMPWR) {
6175       fs_reg tmp = bld.vgrf(data.type, 2);
6176       fs_reg sources[2] = {
6177          data,
6178          expand_to_32bit(bld, get_nir_src(instr->src[2]))
6179       };
6180       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
6181       data = tmp;
6182    }
6183 
6184    fs_reg srcs[A64_LOGICAL_NUM_SRCS];
6185    srcs[A64_LOGICAL_ADDRESS] = addr;
6186    srcs[A64_LOGICAL_SRC] = data;
6187    srcs[A64_LOGICAL_ARG] = brw_imm_ud(op);
6188    srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0);
6189 
6190    switch (nir_dest_bit_size(instr->dest)) {
6191    case 16: {
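      /* The 16-bit A64 atomic message returns its result in 32-bit containers,
       * so land it in a UD temporary and copy the low word into the real
       * destination.
       */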
6192       fs_reg dest32 = bld.vgrf(BRW_REGISTER_TYPE_UD);
6193       bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL, dest32,
6194                srcs, A64_LOGICAL_NUM_SRCS);
6195       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UW), dest32);
6196       break;
6197    }
6198    case 32:
6199       bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, dest,
6200                srcs, A64_LOGICAL_NUM_SRCS);
6201       break;
6202    case 64:
6203       bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL, dest,
6204                srcs, A64_LOGICAL_NUM_SRCS);
6205       break;
6206    default:
6207       unreachable("Unsupported bit size");
6208    }
6209 }
6210 
6211 void
6212 fs_visitor::nir_emit_global_atomic_float(const fs_builder &bld,
6213                                          int op, nir_intrinsic_instr *instr)
6214 {
6215    assert(nir_intrinsic_infos[instr->intrinsic].has_dest);
6216    fs_reg dest = get_nir_dest(instr->dest);
6217 
6218    fs_reg addr = get_nir_src(instr->src[0]);
6219 
6220    assert(op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC);
6221    fs_reg data = expand_to_32bit(bld, get_nir_src(instr->src[1]));
6222 
6223    if (op == BRW_AOP_FCMPWR) {
6224       fs_reg tmp = bld.vgrf(data.type, 2);
6225       fs_reg sources[2] = {
6226          data,
6227          expand_to_32bit(bld, get_nir_src(instr->src[2]))
6228       };
6229       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
6230       data = tmp;
6231    }
6232 
6233    fs_reg srcs[A64_LOGICAL_NUM_SRCS];
6234    srcs[A64_LOGICAL_ADDRESS] = addr;
6235    srcs[A64_LOGICAL_SRC] = data;
6236    srcs[A64_LOGICAL_ARG] = brw_imm_ud(op);
6237    srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0);
6238 
6239    switch (nir_dest_bit_size(instr->dest)) {
6240    case 16: {
6241       fs_reg dest32 = bld.vgrf(BRW_REGISTER_TYPE_UD);
6242       bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL, dest32,
6243                srcs, A64_LOGICAL_NUM_SRCS);
6244       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UW), dest32);
6245       break;
6246    }
6247    case 32:
6248       bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL, dest,
6249                srcs, A64_LOGICAL_NUM_SRCS);
6250       break;
6251    case 64:
6252       bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL, dest,
6253                srcs, A64_LOGICAL_NUM_SRCS);
6254       break;
6255    default:
6256       unreachable("Unsupported bit size");
6257    }
6258 }
6259 
6260 void
6261 fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
6262 {
6263    unsigned texture = instr->texture_index;
6264    unsigned sampler = instr->sampler_index;
6265 
6266    fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
6267 
6268    srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture);
6269    srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler);
6270 
6271    int lod_components = 0;
6272 
6273    /* The hardware requires a LOD for buffer textures */
6274    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
6275       srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);
6276 
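   /* Accumulates bits that end up in the sampler message header via
    * inst->offset below: packed constant texel offsets and, for tg4, the
    * gather channel select.
    */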
6277    uint32_t header_bits = 0;
6278    for (unsigned i = 0; i < instr->num_srcs; i++) {
6279       fs_reg src = get_nir_src(instr->src[i].src);
6280       switch (instr->src[i].src_type) {
6281       case nir_tex_src_bias:
6282          srcs[TEX_LOGICAL_SRC_LOD] =
6283             retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
6284          break;
6285       case nir_tex_src_comparator:
6286          srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F);
6287          break;
6288       case nir_tex_src_coord:
6289          switch (instr->op) {
6290          case nir_texop_txf:
6291          case nir_texop_txf_ms:
6292          case nir_texop_txf_ms_mcs_intel:
6293          case nir_texop_samples_identical:
6294             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D);
6295             break;
6296          default:
6297             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F);
6298             break;
6299          }
6300 
6301          /* Wa_14013363432:
6302           *
6303           * Compiler should send U,V,R parameters even if V,R are 0.
6304           */
6305          if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && devinfo->verx10 == 125)
6306             assert(instr->coord_components >= 3u);
6307          break;
6308       case nir_tex_src_ddx:
6309          srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F);
6310          lod_components = nir_tex_instr_src_size(instr, i);
6311          break;
6312       case nir_tex_src_ddy:
6313          srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F);
6314          break;
6315       case nir_tex_src_lod:
6316          switch (instr->op) {
6317          case nir_texop_txs:
6318             srcs[TEX_LOGICAL_SRC_LOD] =
6319                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD);
6320             break;
6321          case nir_texop_txf:
6322             srcs[TEX_LOGICAL_SRC_LOD] =
6323                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D);
6324             break;
6325          default:
6326             srcs[TEX_LOGICAL_SRC_LOD] =
6327                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
6328             break;
6329          }
6330          break;
6331       case nir_tex_src_min_lod:
6332          srcs[TEX_LOGICAL_SRC_MIN_LOD] =
6333             retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
6334          break;
6335       case nir_tex_src_ms_index:
6336          srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD);
6337          break;
6338 
6339       case nir_tex_src_offset: {
6340          uint32_t offset_bits = 0;
6341          if (brw_texture_offset(instr, i, &offset_bits)) {
6342             header_bits |= offset_bits;
6343          } else {
6344             /* On gfx12.5+, if the offsets are not both constant and in the
6345              * {-8,7} range, nir_lower_tex() will have already lowered the
6346              * source offset. So we should never reach this point.
6347              */
6348             assert(devinfo->verx10 < 125);
6349             srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
6350                retype(src, BRW_REGISTER_TYPE_D);
6351          }
6352          break;
6353       }
6354 
6355       case nir_tex_src_projector:
6356          unreachable("should be lowered");
6357 
6358       case nir_tex_src_texture_offset: {
6359          /* Emit code to evaluate the actual indexing expression */
6360          fs_reg tmp = vgrf(glsl_type::uint_type);
6361          bld.ADD(tmp, src, brw_imm_ud(texture));
6362          srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
6363          break;
6364       }
6365 
6366       case nir_tex_src_sampler_offset: {
6367          /* Emit code to evaluate the actual indexing expression */
6368          fs_reg tmp = vgrf(glsl_type::uint_type);
6369          bld.ADD(tmp, src, brw_imm_ud(sampler));
6370          srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
6371          break;
6372       }
6373 
6374       case nir_tex_src_texture_handle:
6375          assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1);
6376          srcs[TEX_LOGICAL_SRC_SURFACE] = fs_reg();
6377          srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src);
6378          break;
6379 
6380       case nir_tex_src_sampler_handle:
6381          assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
6382          srcs[TEX_LOGICAL_SRC_SAMPLER] = fs_reg();
6383          srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src);
6384          break;
6385 
6386       case nir_tex_src_ms_mcs_intel:
6387          assert(instr->op == nir_texop_txf_ms);
6388          srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);
6389          break;
6390 
6391       default:
6392          unreachable("unknown texture source");
6393       }
6394    }
6395 
6396    if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
6397        (instr->op == nir_texop_txf_ms ||
6398         instr->op == nir_texop_samples_identical)) {
6399       if (devinfo->ver >= 7 &&
6400           key_tex->compressed_multisample_layout_mask & (1 << texture)) {
6401          srcs[TEX_LOGICAL_SRC_MCS] =
6402             emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE],
6403                            instr->coord_components,
6404                            srcs[TEX_LOGICAL_SRC_SURFACE],
6405                            srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]);
6406       } else {
6407          srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u);
6408       }
6409    }
6410 
6411    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
6412    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);
6413 
6414    enum opcode opcode;
6415    switch (instr->op) {
6416    case nir_texop_tex:
6417       opcode = SHADER_OPCODE_TEX_LOGICAL;
6418       break;
6419    case nir_texop_txb:
6420       opcode = FS_OPCODE_TXB_LOGICAL;
6421       break;
6422    case nir_texop_txl:
6423       opcode = SHADER_OPCODE_TXL_LOGICAL;
6424       break;
6425    case nir_texop_txd:
6426       opcode = SHADER_OPCODE_TXD_LOGICAL;
6427       break;
6428    case nir_texop_txf:
6429       opcode = SHADER_OPCODE_TXF_LOGICAL;
6430       break;
6431    case nir_texop_txf_ms:
6432       /* On Gfx12HP there is only CMS_W available. From the Bspec: Shared
6433        * Functions - 3D Sampler - Messages - Message Format:
6434        *
6435        *   ld2dms REMOVEDBY(GEN:HAS:1406788836)
6436        */
6437       if (devinfo->verx10 >= 125)
6438          opcode = SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL;
6439       else if ((key_tex->msaa_16 & (1 << sampler)))
6440          opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
6441       else
6442          opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
6443       break;
6444    case nir_texop_txf_ms_mcs_intel:
6445       opcode = SHADER_OPCODE_TXF_MCS_LOGICAL;
6446       break;
6447    case nir_texop_query_levels:
6448    case nir_texop_txs:
6449       opcode = SHADER_OPCODE_TXS_LOGICAL;
6450       break;
6451    case nir_texop_lod:
6452       opcode = SHADER_OPCODE_LOD_LOGICAL;
6453       break;
6454    case nir_texop_tg4:
6455       if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
6456          opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;
6457       else
6458          opcode = SHADER_OPCODE_TG4_LOGICAL;
6459       break;
6460    case nir_texop_texture_samples:
6461       opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;
6462       break;
6463    case nir_texop_samples_identical: {
6464       fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);
6465 
6466       /* If mcs is an immediate value, it means there is no MCS.  In that case
6467        * just return false.
6468        */
6469       if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) {
6470          bld.MOV(dst, brw_imm_ud(0u));
6471       } else if ((key_tex->msaa_16 & (1 << sampler))) {
6472          fs_reg tmp = vgrf(glsl_type::uint_type);
6473          bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS],
6474                 offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
6475          bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
6476       } else {
6477          bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u),
6478                  BRW_CONDITIONAL_EQ);
6479       }
6480       return;
6481    }
6482    default:
6483       unreachable("unknown texture opcode");
6484    }
6485 
6486    if (instr->op == nir_texop_tg4) {
6487       if (instr->component == 1 &&
6488           key_tex->gather_channel_quirk_mask & (1 << texture)) {
6489          /* gather4 sampler is broken for green channel on RG32F --
6490           * we must ask for blue instead.
6491           */
6492          header_bits |= 2 << 16;
6493       } else {
6494          header_bits |= instr->component << 16;
6495       }
6496    }
6497 
6498    fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4);
6499    fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
6500    inst->offset = header_bits;
6501 
6502    const unsigned dest_size = nir_tex_instr_dest_size(instr);
6503    if (devinfo->ver >= 9 &&
6504        instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
6505       unsigned write_mask = instr->dest.is_ssa ?
6506                             nir_ssa_def_components_read(&instr->dest.ssa):
6507                             (1 << dest_size) - 1;
6508       assert(write_mask != 0); /* dead code should have been eliminated */
6509       inst->size_written = util_last_bit(write_mask) *
6510                            inst->dst.component_size(inst->exec_size);
6511    } else {
6512       inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
6513    }
6514 
6515    if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
6516       inst->shadow_compare = true;
6517 
6518    fs_reg nir_dest[5];
6519    for (unsigned i = 0; i < dest_size; i++)
6520       nir_dest[i] = offset(dst, bld, i);
6521 
6522    if (instr->op == nir_texop_query_levels) {
6523       /* # levels is in .w */
6524       if (devinfo->ver <= 9) {
6525          /**
6526           * Wa_1940217:
6527           *
6528           * When a surface of type SURFTYPE_NULL is accessed by resinfo, the
6529           * MIPCount returned is undefined instead of 0.
6530           */
6531          fs_inst *mov = bld.MOV(bld.null_reg_d(), dst);
6532          mov->conditional_mod = BRW_CONDITIONAL_NZ;
6533          nir_dest[0] = bld.vgrf(BRW_REGISTER_TYPE_D);
6534          fs_inst *sel = bld.SEL(nir_dest[0], offset(dst, bld, 3), brw_imm_d(0));
6535          sel->predicate = BRW_PREDICATE_NORMAL;
6536       } else {
6537          nir_dest[0] = offset(dst, bld, 3);
6538       }
6539    } else if (instr->op == nir_texop_txs &&
6540               dest_size >= 3 && devinfo->ver < 7) {
6541       /* Gfx4-6 return 0 instead of 1 for single layer surfaces. */
6542       fs_reg depth = offset(dst, bld, 2);
6543       nir_dest[2] = vgrf(glsl_type::int_type);
6544       bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
6545    }
6546 
6547    bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0);
6548 }
6549 
6550 void
6551 fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
6552 {
6553    switch (instr->type) {
6554    case nir_jump_break:
6555       bld.emit(BRW_OPCODE_BREAK);
6556       break;
6557    case nir_jump_continue:
6558       bld.emit(BRW_OPCODE_CONTINUE);
6559       break;
6560    case nir_jump_halt:
6561       bld.emit(BRW_OPCODE_HALT);
6562       break;
6563    case nir_jump_return:
6564    default:
6565       unreachable("unknown jump");
6566    }
6567 }
6568 
6569 /*
6570  * This helper takes a source register and un/shuffles it into the destination
6571  * register.
6572  *
6573  * If the source type size is smaller than the destination type size, the
6574  * operation needed is a component shuffle. The opposite case would be an
6575  * unshuffle. If the source and destination type sizes are equal, a shuffle
6576  * is done that is equivalent to a simple MOV.
6577  *
6578  * For example, if the source is a 16-bit type and the destination is 32-bit,
6579  * a 3-component .xyz 16-bit vector on SIMD8 would be:
6580  *
6581  *    |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
6582  *    |z1|z2|z3|z4|z5|z6|z7|z8|  |  |  |  |  |  |  |  |
6583  *
6584  * This helper will return the following 2 32-bit components with the 16-bit
6585  * values shuffled:
6586  *
6587  *    |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
6588  *    |z1   |z2   |z3   |z4   |z5   |z6   |z7   |z8   |
6589  *
6590  * For unshuffle, the example would be the opposite: a 64-bit source type
6591  * and a 32-bit destination. A 2-component .xy 64-bit vector on SIMD8
6592  * would be:
6593  *
6594  *    | x1l   x1h | x2l   x2h | x3l   x3h | x4l   x4h |
6595  *    | x5l   x5h | x6l   x6h | x7l   x7h | x8l   x8h |
6596  *    | y1l   y1h | y2l   y2h | y3l   y3h | y4l   y4h |
6597  *    | y5l   y5h | y6l   y6h | y7l   y7h | y8l   y8h |
6598  *
6599  * The returned result would be the following 4 32-bit components unshuffled:
6600  *
6601  *    | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
6602  *    | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
6603  *    | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
6604  *    | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
6605  *
6606  * - Source and destination registers must not overlap.
6607  * - Component units are measured in terms of the smaller type between the
6608  *   source and destination because we are un/shuffling the smaller
6609  *   components from/into the bigger ones.
6610  * - first_component parameter allows skipping source components.
6611  */
6612 void
6613 shuffle_src_to_dst(const fs_builder &bld,
6614                    const fs_reg &dst,
6615                    const fs_reg &src,
6616                    uint32_t first_component,
6617                    uint32_t components)
6618 {
6619    if (type_sz(src.type) == type_sz(dst.type)) {
6620       assert(!regions_overlap(dst,
6621          type_sz(dst.type) * bld.dispatch_width() * components,
6622          offset(src, bld, first_component),
6623          type_sz(src.type) * bld.dispatch_width() * components));
6624       for (unsigned i = 0; i < components; i++) {
6625          bld.MOV(retype(offset(dst, bld, i), src.type),
6626                  offset(src, bld, i + first_component));
6627       }
6628    } else if (type_sz(src.type) < type_sz(dst.type)) {
6629       /* Source is shuffled into destination */
6630       unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
6631       assert(!regions_overlap(dst,
6632          type_sz(dst.type) * bld.dispatch_width() *
6633          DIV_ROUND_UP(components, size_ratio),
6634          offset(src, bld, first_component),
6635          type_sz(src.type) * bld.dispatch_width() * components));
6636 
6637       brw_reg_type shuffle_type =
6638          brw_reg_type_from_bit_size(8 * type_sz(src.type),
6639                                     BRW_REGISTER_TYPE_D);
6640       for (unsigned i = 0; i < components; i++) {
6641          fs_reg shuffle_component_i =
6642             subscript(offset(dst, bld, i / size_ratio),
6643                       shuffle_type, i % size_ratio);
6644          bld.MOV(shuffle_component_i,
6645                  retype(offset(src, bld, i + first_component), shuffle_type));
6646       }
6647    } else {
6648       /* Source is unshuffled into destination */
6649       unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
6650       assert(!regions_overlap(dst,
6651          type_sz(dst.type) * bld.dispatch_width() * components,
6652          offset(src, bld, first_component / size_ratio),
6653          type_sz(src.type) * bld.dispatch_width() *
6654          DIV_ROUND_UP(components + (first_component % size_ratio),
6655                       size_ratio)));
6656 
6657       brw_reg_type shuffle_type =
6658          brw_reg_type_from_bit_size(8 * type_sz(dst.type),
6659                                     BRW_REGISTER_TYPE_D);
6660       for (unsigned i = 0; i < components; i++) {
6661          fs_reg shuffle_component_i =
6662             subscript(offset(src, bld, (first_component + i) / size_ratio),
6663                       shuffle_type, (first_component + i) % size_ratio);
6664          bld.MOV(retype(offset(dst, bld, i), shuffle_type),
6665                  shuffle_component_i);
6666       }
6667    }
6668 }
6669 
6670 void
6671 shuffle_from_32bit_read(const fs_builder &bld,
6672                         const fs_reg &dst,
6673                         const fs_reg &src,
6674                         uint32_t first_component,
6675                         uint32_t components)
6676 {
6677    assert(type_sz(src.type) == 4);
6678 
6679    /* This function takes components in units of the destination type while
6680     * shuffle_src_to_dst takes components in units of the smallest type
6681     */
6682    if (type_sz(dst.type) > 4) {
6683       assert(type_sz(dst.type) == 8);
6684       first_component *= 2;
6685       components *= 2;
6686    }
6687 
6688    shuffle_src_to_dst(bld, dst, src, first_component, components);
6689 }
6690 
6691 fs_reg
6692 setup_imm_df(const fs_builder &bld, double v)
6693 {
6694    const struct intel_device_info *devinfo = bld.shader->devinfo;
6695    assert(devinfo->ver >= 7);
6696 
6697    if (devinfo->ver >= 8)
6698       return brw_imm_df(v);
6699 
6700    /* gfx7.5 does not support DF immediates directly, but the DIM
6701     * instruction allows setting a 64-bit immediate value.
6702     */
6703    if (devinfo->platform == INTEL_PLATFORM_HSW) {
6704       const fs_builder ubld = bld.exec_all().group(1, 0);
6705       fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1);
6706       ubld.DIM(dst, brw_imm_df(v));
6707       return component(dst, 0);
6708    }
6709 
6710    /* gfx7 does not support DF immediates, so we generate a 64-bit constant by
6711     * writing the low 32 bits of the constant to suboffset 0 of a VGRF and
6712     * the high 32 bits to suboffset 4 and then applying a stride of 0.
6713     *
6714     * Alternatively, we could also produce a normal VGRF (without stride 0)
6715     * by writing to all the channels in the VGRF, however, that would hit the
6716     * gfx7 bug where we have to split writes that span more than 1 register
6717     * into instructions with a width of 4 (otherwise the write to the second
6718     * register written runs into an execmask hardware bug) which isn't very
6719     * nice.
6720     */
6721    union {
6722       double d;
6723       struct {
6724          uint32_t i1;
6725          uint32_t i2;
6726       };
6727    } di;
6728 
6729    di.d = v;
6730 
6731    const fs_builder ubld = bld.exec_all().group(1, 0);
6732    const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
6733    ubld.MOV(tmp, brw_imm_ud(di.i1));
6734    ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2));
6735 
6736    return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
6737 }
6738 
6739 fs_reg
6740 setup_imm_b(const fs_builder &bld, int8_t v)
6741 {
6742    const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_B);
6743    bld.MOV(tmp, brw_imm_w(v));
6744    return tmp;
6745 }
6746 
6747 fs_reg
6748 setup_imm_ub(const fs_builder &bld, uint8_t v)
6749 {
6750    const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UB);
6751    bld.MOV(tmp, brw_imm_uw(v));
6752    return tmp;
6753 }
6754