1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_fs.h"
25 #include "brw_fs_builder.h"
26 #include "brw_nir.h"
27 #include "brw_eu.h"
28 #include "nir.h"
29 #include "nir_intrinsics.h"
30 #include "nir_search_helpers.h"
31 #include "dev/intel_debug.h"
32 #include "util/u_math.h"
33 #include "util/bitscan.h"
34
35 #include <vector>
36
37 using namespace brw;
38
39 struct brw_fs_bind_info {
40 bool valid;
41 bool bindless;
42 unsigned block;
43 unsigned set;
44 unsigned binding;
45 };
46
47 struct nir_to_brw_state {
48 fs_visitor &s;
49 const nir_shader *nir;
50 const intel_device_info *devinfo;
51 void *mem_ctx;
52
53 /* Points to the end of the program. Annotated with the current NIR
54 * instruction when applicable.
55 */
56 fs_builder bld;
57
58 brw_reg *ssa_values;
59 struct brw_fs_bind_info *ssa_bind_infos;
60 brw_reg *system_values;
61
62 bool annotate;
63 };
64
65 static brw_reg get_nir_src(nir_to_brw_state &ntb, const nir_src &src, int channel = 0);
66 static brw_reg get_nir_def(nir_to_brw_state &ntb, const nir_def &def, bool all_sources_uniform = false);
67 static nir_component_mask_t get_nir_write_mask(const nir_def &def);
68
69 static void fs_nir_emit_intrinsic(nir_to_brw_state &ntb, const fs_builder &bld, nir_intrinsic_instr *instr);
70 static brw_reg emit_samplepos_setup(nir_to_brw_state &ntb);
71 static brw_reg emit_sampleid_setup(nir_to_brw_state &ntb);
72 static brw_reg emit_samplemaskin_setup(nir_to_brw_state &ntb);
73 static brw_reg emit_shading_rate_setup(nir_to_brw_state &ntb);
74
75 static void fs_nir_emit_impl(nir_to_brw_state &ntb, nir_function_impl *impl);
76 static void fs_nir_emit_cf_list(nir_to_brw_state &ntb, exec_list *list);
77 static void fs_nir_emit_if(nir_to_brw_state &ntb, nir_if *if_stmt);
78 static void fs_nir_emit_loop(nir_to_brw_state &ntb, nir_loop *loop);
79 static void fs_nir_emit_block(nir_to_brw_state &ntb, nir_block *block);
80 static void fs_nir_emit_instr(nir_to_brw_state &ntb, nir_instr *instr);
81
82 static void fs_nir_emit_memory_access(nir_to_brw_state &ntb,
83 const fs_builder &bld,
84 const fs_builder &xbld,
85 nir_intrinsic_instr *instr);
86
87 static void brw_combine_with_vec(const fs_builder &bld, const brw_reg &dst,
88 const brw_reg &src, unsigned n);
89
90 static bool
91 brw_texture_offset(const nir_tex_instr *tex, unsigned src,
92 uint32_t *offset_bits_out)
93 {
94 if (!nir_src_is_const(tex->src[src].src))
95 return false;
96
97 const unsigned num_components = nir_tex_instr_src_size(tex, src);
98
99 /* Combine all three offsets into a single unsigned dword:
100 *
101 * bits 11:8 - U Offset (X component)
102 * bits 7:4 - V Offset (Y component)
103 * bits 3:0 - R Offset (Z component)
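    *
    * For example, constant offsets (1, -2, 3) pack as
    * (1 & 0xF) << 8 | (-2 & 0xF) << 4 | (3 & 0xF) << 0 = 0x1E3.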
104 */
105 uint32_t offset_bits = 0;
106 for (unsigned i = 0; i < num_components; i++) {
107 int offset = nir_src_comp_as_int(tex->src[src].src, i);
108
109 /* offset out of bounds; caller will handle it. */
110 if (offset > 7 || offset < -8)
111 return false;
112
113 const unsigned shift = 4 * (2 - i);
114 offset_bits |= (offset & 0xF) << shift;
115 }
116
117 *offset_bits_out = offset_bits;
118
119 return true;
120 }
121
122 static brw_reg
123 setup_imm_b(const fs_builder &bld, int8_t v)
124 {
125 const brw_reg tmp = bld.vgrf(BRW_TYPE_B);
126 bld.MOV(tmp, brw_imm_w(v));
127 return tmp;
128 }
129
130 static void
131 fs_nir_setup_outputs(nir_to_brw_state &ntb)
132 {
133 fs_visitor &s = ntb.s;
134
135 if (s.stage == MESA_SHADER_TESS_CTRL ||
136 s.stage == MESA_SHADER_TASK ||
137 s.stage == MESA_SHADER_MESH ||
138 s.stage == MESA_SHADER_FRAGMENT ||
139 s.stage == MESA_SHADER_COMPUTE)
140 return;
141
142 unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
143
144 /* Calculate the size of output registers in a separate pass, before
145 * allocating them. With ARB_enhanced_layouts, multiple output variables
146 * may occupy the same slot, but have different type sizes.
147 */
148 nir_foreach_shader_out_variable(var, s.nir) {
149 const int loc = var->data.driver_location;
150 const unsigned var_vec4s = nir_variable_count_slots(var, var->type);
151 vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
152 }
153
154 for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
155 if (vec4s[loc] == 0) {
156 loc++;
157 continue;
158 }
159
160 unsigned reg_size = vec4s[loc];
161
162 /* Check if there are any ranges that start within this range and extend
163 * past it. If so, include them in this allocation.
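       *
       * For example, a 3-slot range starting at loc combined with a 2-slot
       * range starting at loc + 2 becomes a single 4-slot allocation.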
164 */
165 for (unsigned i = 1; i < reg_size; i++) {
166 assert(i + loc < ARRAY_SIZE(vec4s));
167 reg_size = MAX2(vec4s[i + loc] + i, reg_size);
168 }
169
170 brw_reg reg = ntb.bld.vgrf(BRW_TYPE_F, 4 * reg_size);
171 for (unsigned i = 0; i < reg_size; i++) {
172 assert(loc + i < ARRAY_SIZE(s.outputs));
173 s.outputs[loc + i] = offset(reg, ntb.bld, 4 * i);
174 }
175
176 loc += reg_size;
177 }
178 }
179
180 static void
181 fs_nir_setup_uniforms(fs_visitor &s)
182 {
183 const intel_device_info *devinfo = s.devinfo;
184
185 /* Only the first compile gets to set up uniforms. */
186 if (s.uniforms)
187 return;
188
189 s.uniforms = s.nir->num_uniforms / 4;
190
191 if (gl_shader_stage_is_compute(s.stage) && devinfo->verx10 < 125) {
192 /* Add uniforms for builtins after regular NIR uniforms. */
193 assert(s.uniforms == s.prog_data->nr_params);
194
195       /* Subgroup ID must be the last uniform on the list. This will make
196        * it easier later to split between cross-thread and per-thread
197        * uniforms.
198 */
199 uint32_t *param = brw_stage_prog_data_add_params(s.prog_data, 1);
200 *param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
201 s.uniforms++;
202 }
203 }
204
205 static brw_reg
206 emit_work_group_id_setup(nir_to_brw_state &ntb)
207 {
208 fs_visitor &s = ntb.s;
209 const fs_builder &bld = ntb.bld.scalar_group();
210
211 assert(gl_shader_stage_is_compute(s.stage));
212
213 brw_reg id = bld.vgrf(BRW_TYPE_UD, 3);
214
215 id.is_scalar = true;
216
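   /* The compute thread payload carries the workgroup ID in X in r0.1,
    * and the Y and Z workgroup IDs in r0.6 and r0.7.
    */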
217 struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_TYPE_UD));
218 bld.MOV(id, r0_1);
219
220 struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_TYPE_UD));
221 struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_TYPE_UD));
222 bld.MOV(offset(id, bld, 1), r0_6);
223 bld.MOV(offset(id, bld, 2), r0_7);
224
225 return id;
226 }
227
228 static bool
229 emit_system_values_block(nir_to_brw_state &ntb, nir_block *block)
230 {
231 fs_visitor &s = ntb.s;
232 brw_reg *reg;
233
234 nir_foreach_instr(instr, block) {
235 if (instr->type != nir_instr_type_intrinsic)
236 continue;
237
238 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
239 switch (intrin->intrinsic) {
240 case nir_intrinsic_load_vertex_id:
241 case nir_intrinsic_load_base_vertex:
242 unreachable("should be lowered by nir_lower_system_values().");
243
244 case nir_intrinsic_load_vertex_id_zero_base:
245 case nir_intrinsic_load_is_indexed_draw:
246 case nir_intrinsic_load_first_vertex:
247 case nir_intrinsic_load_instance_id:
248 case nir_intrinsic_load_base_instance:
249 unreachable("should be lowered by brw_nir_lower_vs_inputs().");
250 break;
251
252 case nir_intrinsic_load_draw_id:
253 /* For Task/Mesh, draw_id will be handled later in
254 * nir_emit_mesh_task_intrinsic().
255 */
256 if (!gl_shader_stage_is_mesh(s.stage))
257 unreachable("should be lowered by brw_nir_lower_vs_inputs().");
258 break;
259
260 case nir_intrinsic_load_invocation_id:
261 if (s.stage == MESA_SHADER_TESS_CTRL)
262 break;
263 assert(s.stage == MESA_SHADER_GEOMETRY);
264 reg = &ntb.system_values[SYSTEM_VALUE_INVOCATION_ID];
265 if (reg->file == BAD_FILE) {
266 *reg = s.gs_payload().instance_id;
267 }
268 break;
269
270 case nir_intrinsic_load_sample_pos:
271 case nir_intrinsic_load_sample_pos_or_center:
272 assert(s.stage == MESA_SHADER_FRAGMENT);
273 reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_POS];
274 if (reg->file == BAD_FILE)
275 *reg = emit_samplepos_setup(ntb);
276 break;
277
278 case nir_intrinsic_load_sample_id:
279 assert(s.stage == MESA_SHADER_FRAGMENT);
280 reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_ID];
281 if (reg->file == BAD_FILE)
282 *reg = emit_sampleid_setup(ntb);
283 break;
284
285 case nir_intrinsic_load_sample_mask_in:
286 assert(s.stage == MESA_SHADER_FRAGMENT);
287 reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
288 if (reg->file == BAD_FILE)
289 *reg = emit_samplemaskin_setup(ntb);
290 break;
291
292 case nir_intrinsic_load_workgroup_id:
293 if (gl_shader_stage_is_mesh(s.stage))
294 unreachable("should be lowered by nir_lower_compute_system_values().");
295 assert(gl_shader_stage_is_compute(s.stage));
296 reg = &ntb.system_values[SYSTEM_VALUE_WORKGROUP_ID];
297 if (reg->file == BAD_FILE)
298 *reg = emit_work_group_id_setup(ntb);
299 break;
300
301 case nir_intrinsic_load_helper_invocation:
302 assert(s.stage == MESA_SHADER_FRAGMENT);
303 reg = &ntb.system_values[SYSTEM_VALUE_HELPER_INVOCATION];
304 if (reg->file == BAD_FILE) {
305 const fs_builder abld =
306 ntb.bld.annotate("gl_HelperInvocation");
307
308 /* On Gfx6+ (gl_HelperInvocation is only exposed on Gfx7+) the
309 * pixel mask is in g1.7 of the thread payload.
310 *
311 * We move the per-channel pixel enable bit to the low bit of each
312 * channel by shifting the byte containing the pixel mask by the
313 * vector immediate 0x76543210UV.
314 *
315 * The region of <1,8,0> reads only 1 byte (the pixel masks for
316 * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
317 * masks for 2 and 3) in SIMD16.
318 */
319 brw_reg shifted = abld.vgrf(BRW_TYPE_UW);
320
321 for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
322 const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
323 /* According to the "PS Thread Payload for Normal
324 * Dispatch" pages on the BSpec, the dispatch mask is
325 * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on
326 * gfx6+.
327 */
328 const struct brw_reg reg = s.devinfo->ver >= 20 ?
329 xe2_vec1_grf(i, 15) : brw_vec1_grf(i + 1, 7);
330 hbld.SHR(offset(shifted, hbld, i),
331 stride(retype(reg, BRW_TYPE_UB), 1, 8, 0),
332 brw_imm_v(0x76543210));
333 }
334
335 /* A set bit in the pixel mask means the channel is enabled, but
336 * that is the opposite of gl_HelperInvocation so we need to invert
337 * the mask.
338 *
339 * The negate source-modifier bit of logical instructions on Gfx8+
340 * performs 1's complement negation, so we can use that instead of
341 * a NOT instruction.
342 */
343 brw_reg inverted = negate(shifted);
344
345 /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
346 * with 1 and negating.
347 */
348 brw_reg anded = abld.vgrf(BRW_TYPE_UD);
349 abld.AND(anded, inverted, brw_imm_uw(1));
350
351 *reg = abld.MOV(negate(retype(anded, BRW_TYPE_D)));
352 }
353 break;
354
355 case nir_intrinsic_load_frag_shading_rate:
356 reg = &ntb.system_values[SYSTEM_VALUE_FRAG_SHADING_RATE];
357 if (reg->file == BAD_FILE)
358 *reg = emit_shading_rate_setup(ntb);
359 break;
360
361 default:
362 break;
363 }
364 }
365
366 return true;
367 }
368
369 static void
370 fs_nir_emit_system_values(nir_to_brw_state &ntb)
371 {
372 fs_visitor &s = ntb.s;
373
374 ntb.system_values = ralloc_array(ntb.mem_ctx, brw_reg, SYSTEM_VALUE_MAX);
375 for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
376 ntb.system_values[i] = brw_reg();
377 }
378
379 nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)s.nir);
380 nir_foreach_block(block, impl)
381 emit_system_values_block(ntb, block);
382 }
383
384 static void
385 fs_nir_emit_impl(nir_to_brw_state &ntb, nir_function_impl *impl)
386 {
387 ntb.ssa_values = rzalloc_array(ntb.mem_ctx, brw_reg, impl->ssa_alloc);
388 ntb.ssa_bind_infos = rzalloc_array(ntb.mem_ctx, struct brw_fs_bind_info, impl->ssa_alloc);
389
390 fs_nir_emit_cf_list(ntb, &impl->body);
391 }
392
393 static void
394 fs_nir_emit_cf_list(nir_to_brw_state &ntb, exec_list *list)
395 {
396 exec_list_validate(list);
397 foreach_list_typed(nir_cf_node, node, node, list) {
398 switch (node->type) {
399 case nir_cf_node_if:
400 fs_nir_emit_if(ntb, nir_cf_node_as_if(node));
401 break;
402
403 case nir_cf_node_loop:
404 fs_nir_emit_loop(ntb, nir_cf_node_as_loop(node));
405 break;
406
407 case nir_cf_node_block:
408 fs_nir_emit_block(ntb, nir_cf_node_as_block(node));
409 break;
410
411 default:
412 unreachable("Invalid CFG node block");
413 }
414 }
415 }
416
417 static void
418 fs_nir_emit_if(nir_to_brw_state &ntb, nir_if *if_stmt)
419 {
420 const fs_builder &bld = ntb.bld;
421
422 bool invert;
423 brw_reg cond_reg;
424
425 /* If the condition has the form !other_condition, use other_condition as
426 * the source, but invert the predicate on the if instruction.
427 */
428 nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition);
429 if (cond != NULL && cond->op == nir_op_inot) {
430 invert = true;
431 cond_reg = get_nir_src(ntb, cond->src[0].src, cond->src[0].swizzle[0]);
432 } else {
433 invert = false;
434 cond_reg = get_nir_src(ntb, if_stmt->condition);
435 }
436
437 /* first, put the condition into f0 */
438 fs_inst *inst = bld.MOV(bld.null_reg_d(),
439 retype(cond_reg, BRW_TYPE_D));
440 inst->conditional_mod = BRW_CONDITIONAL_NZ;
441
442 fs_inst *iff = bld.IF(BRW_PREDICATE_NORMAL);
443 iff->predicate_inverse = invert;
444
445 fs_nir_emit_cf_list(ntb, &if_stmt->then_list);
446
447 if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
448 bld.emit(BRW_OPCODE_ELSE);
449 fs_nir_emit_cf_list(ntb, &if_stmt->else_list);
450 }
451
452 fs_inst *endif = bld.emit(BRW_OPCODE_ENDIF);
453
454 /* Peephole: replace IF-JUMP-ENDIF with predicated jump */
455 if (endif->prev->prev == iff) {
456 fs_inst *jump = (fs_inst *) endif->prev;
457 if (jump->predicate == BRW_PREDICATE_NONE &&
458 (jump->opcode == BRW_OPCODE_BREAK ||
459 jump->opcode == BRW_OPCODE_CONTINUE)) {
460 jump->predicate = iff->predicate;
461 jump->predicate_inverse = iff->predicate_inverse;
462 iff->exec_node::remove();
463 endif->exec_node::remove();
464 }
465 }
466 }
467
468 static void
469 fs_nir_emit_loop(nir_to_brw_state &ntb, nir_loop *loop)
470 {
471 const fs_builder &bld = ntb.bld;
472
473 assert(!nir_loop_has_continue_construct(loop));
474 bld.emit(BRW_OPCODE_DO);
475
476 fs_nir_emit_cf_list(ntb, &loop->body);
477
478 fs_inst *peep_while = bld.emit(BRW_OPCODE_WHILE);
479
480 /* Peephole: replace (+f0) break; while with (-f0) while */
481 fs_inst *peep_break = (fs_inst *) peep_while->prev;
482
483 if (peep_break->opcode == BRW_OPCODE_BREAK &&
484 peep_break->predicate != BRW_PREDICATE_NONE) {
485 peep_while->predicate = peep_break->predicate;
486 peep_while->predicate_inverse = !peep_break->predicate_inverse;
487 peep_break->exec_node::remove();
488 }
489 }
490
491 static void
492 fs_nir_emit_block(nir_to_brw_state &ntb, nir_block *block)
493 {
494 fs_builder bld = ntb.bld;
495
496 nir_foreach_instr(instr, block) {
497 fs_nir_emit_instr(ntb, instr);
498 }
499
500 ntb.bld = bld;
501 }
502
503 /**
504 * Recognizes a parent instruction of nir_op_extract_* and changes the type to
505 * match instr.
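 *
 * For example, i2f32(extract_u8(a, 2)) can then be emitted as a single MOV
 * that reads byte 2 of the source instead of an extract followed by a
 * separate conversion.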
506 */
507 static bool
508 optimize_extract_to_float(nir_to_brw_state &ntb, const fs_builder &bld,
509 nir_alu_instr *instr, const brw_reg &result)
510 {
511 const intel_device_info *devinfo = ntb.devinfo;
512
513 /* No fast path for f16 (yet) or f64. */
514 assert(instr->op == nir_op_i2f32 || instr->op == nir_op_u2f32);
515
516 if (!instr->src[0].src.ssa->parent_instr)
517 return false;
518
519 if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
520 return false;
521
522 nir_alu_instr *src0 =
523 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
524
525 unsigned bytes;
526 bool is_signed;
527
528 switch (src0->op) {
529 case nir_op_extract_u8:
530 case nir_op_extract_u16:
531 bytes = src0->op == nir_op_extract_u8 ? 1 : 2;
532
533 /* i2f(extract_u8(a, b)) and u2f(extract_u8(a, b)) produce the same
534 * result. Ditto for extract_u16.
535 */
536 is_signed = false;
537 break;
538
539 case nir_op_extract_i8:
540 case nir_op_extract_i16:
541 bytes = src0->op == nir_op_extract_i8 ? 1 : 2;
542
543 /* The fast path can't handle u2f(extract_i8(a, b)) because the implicit
544 * sign extension of the extract_i8 is lost. For example,
545 * u2f(extract_i8(0x0000ff00, 1)) should produce 4294967295.0, but a
546 * fast path could either give 255.0 (by implementing the fast path as
547 * u2f(extract_u8(x))) or -1.0 (by implementing the fast path as
548 * i2f(extract_i8(x))). At one point in time, we incorrectly implemented
549 * the former.
550 */
551 if (instr->op != nir_op_i2f32)
552 return false;
553
554 is_signed = true;
555 break;
556
557 default:
558 return false;
559 }
560
561 unsigned element = nir_src_as_uint(src0->src[1].src);
562
563    /* Element type to extract. */
564 const brw_reg_type type = brw_int_type(bytes, is_signed);
565
566 brw_reg op0 = get_nir_src(ntb, src0->src[0].src, -1);
567 op0.type = brw_type_for_nir_type(devinfo,
568 (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
569 nir_src_bit_size(src0->src[0].src)));
570
571 /* It is not documented in the Bspec, but DG2 and newer platforms cannot do
572 * direct byte-to-float conversions from scalars. MR !30140 has more
573 * details. If the optimization is applied in cases that would require
574 * lower_regioning to do some lowering, the code generated will be much,
575 * much worse.
576 */
577 if (devinfo->verx10 >= 125 && bytes == 1) {
578       /* If the source is truly scalar, for example from the UNIFORM file, skip
579 * the optimize_extract_to_float optimization.
580 *
581 * Note: is_scalar values won't have zero stride until after the call to
582 * offset() below that applies the swizzle.
583 */
584 if (is_uniform(op0))
585 return false;
586
587 /* If the dispatch width matches the scalar allocation width, then
588 * is_scalar can be demoted to non-is_scalar. This prevents offset() and
589 * component() (both called below) from setting the stride to zero, and
590 * that avoids the awful code generated by lower_regioning.
591 */
592 if (op0.is_scalar) {
593 const unsigned allocation_width = 8 * reg_unit(ntb.devinfo);
594 if (ntb.bld.dispatch_width() != allocation_width)
595 return false;
596
597 assert(bld.dispatch_width() == allocation_width);
598 op0.is_scalar = false;
599 }
600 }
601
602 op0 = offset(op0, bld, src0->src[0].swizzle[0]);
603
604 /* If the dispatch width matches the scalar allocation width, offset() will
605 * not modify the stride, but having source stride <0;1,0> is advantageous.
606 */
607 if (op0.is_scalar)
608 op0 = component(op0, 0);
609
610 /* Bspec "Register Region Restrictions" for Xe says:
611 *
612 * "In case of all float point data types used in destination
613 *
614 * 1. Register Regioning patterns where register data bit location of
615 * the LSB of the channels are changed between source and destination
616 * are not supported on Src0 and Src1 except for broadcast of a
617 * scalar."
618 *
619     * This restriction is enforced in brw_lower_regioning. There is no
620 * reason to generate an optimized instruction that brw_lower_regioning
621 * will have to break up later.
622 */
623 if (devinfo->verx10 >= 125 && element != 0 && !is_uniform(op0))
624 return false;
625
626 bld.MOV(result, subscript(op0, type, element));
627 return true;
628 }
629
630 static bool
631 optimize_frontfacing_ternary(nir_to_brw_state &ntb,
632 nir_alu_instr *instr,
633 const brw_reg &result)
634 {
635 const intel_device_info *devinfo = ntb.devinfo;
636 fs_visitor &s = ntb.s;
637
638 nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src);
639 if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face)
640 return false;
641
642 if (!nir_src_is_const(instr->src[1].src) ||
643 !nir_src_is_const(instr->src[2].src))
644 return false;
645
646 const float value1 = nir_src_as_float(instr->src[1].src);
647 const float value2 = nir_src_as_float(instr->src[2].src);
648 if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
649 return false;
650
651 /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
652 assert(value1 == -value2);
653
654 brw_reg tmp = ntb.bld.vgrf(BRW_TYPE_D);
655
656 if (devinfo->ver >= 20) {
657 /* Gfx20+ has separate back-facing bits for each pair of
658 * subspans in order to support multiple polygons, so we need to
659 * use a <1;8,0> region in order to select the correct word for
660 * each channel. Unfortunately they're no longer aligned to the
661 * sign bit of a 16-bit word, so a left shift is necessary.
662 */
663 brw_reg ff = ntb.bld.vgrf(BRW_TYPE_UW);
664
665 for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
666 const fs_builder hbld = ntb.bld.group(16, i);
667 const struct brw_reg gi_uw = retype(xe2_vec1_grf(i, 9),
668 BRW_TYPE_UW);
669 hbld.SHL(offset(ff, hbld, i), stride(gi_uw, 1, 8, 0), brw_imm_ud(4));
670 }
671
672 if (value1 == -1.0f)
673 ff.negate = true;
674
675 ntb.bld.OR(subscript(tmp, BRW_TYPE_UW, 1), ff,
676 brw_imm_uw(0x3f80));
677
678 } else if (devinfo->ver >= 12 && s.max_polygons == 2) {
679 /* According to the BSpec "PS Thread Payload for Normal
680 * Dispatch", the front/back facing interpolation bit is stored
681 * as bit 15 of either the R1.1 or R1.6 poly info field, for the
682 * first and second polygons respectively in multipolygon PS
683 * dispatch mode.
684 */
685 assert(s.dispatch_width == 16);
686
687 for (unsigned i = 0; i < s.max_polygons; i++) {
688 const fs_builder hbld = ntb.bld.group(8, i);
689 struct brw_reg g1 = retype(brw_vec1_grf(1, 1 + 5 * i),
690 BRW_TYPE_UW);
691
692 if (value1 == -1.0f)
693 g1.negate = true;
694
695 hbld.OR(subscript(offset(tmp, hbld, i), BRW_TYPE_UW, 1),
696 g1, brw_imm_uw(0x3f80));
697 }
698
699 } else if (devinfo->ver >= 12) {
700 /* Bit 15 of g1.1 is 0 if the polygon is front facing. */
701 brw_reg g1 = brw_reg(retype(brw_vec1_grf(1, 1), BRW_TYPE_W));
702
703 /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
704 *
705 * or(8) tmp.1<2>W g1.1<0,1,0>W 0x00003f80W
706 * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
707 *
708 * and negate g1.1<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
709 */
710 if (value1 == -1.0f)
711 g1.negate = true;
712
713 ntb.bld.OR(subscript(tmp, BRW_TYPE_W, 1),
714 g1, brw_imm_uw(0x3f80));
715 } else {
716 /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
717 brw_reg g0 = brw_reg(retype(brw_vec1_grf(0, 0), BRW_TYPE_W));
718
719 /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
720 *
721 * or(8) tmp.1<2>W g0.0<0,1,0>W 0x00003f80W
722 * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
723 *
724 * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
725 *
726 * This negation looks like it's safe in practice, because bits 0:4 will
727 * surely be TRIANGLES
728 */
729
730 if (value1 == -1.0f) {
731 g0.negate = true;
732 }
733
734 ntb.bld.OR(subscript(tmp, BRW_TYPE_W, 1),
735 g0, brw_imm_uw(0x3f80));
736 }
737 ntb.bld.AND(retype(result, BRW_TYPE_D), tmp, brw_imm_d(0xbf800000));
738
739 return true;
740 }
741
742 static brw_rnd_mode
743 brw_rnd_mode_from_nir_op(const nir_op op) {
744 switch (op) {
745 case nir_op_f2f16_rtz:
746 return BRW_RND_MODE_RTZ;
747 case nir_op_f2f16_rtne:
748 return BRW_RND_MODE_RTNE;
749 default:
750 unreachable("Operation doesn't support rounding mode");
751 }
752 }
753
754 static brw_rnd_mode
755 brw_rnd_mode_from_execution_mode(unsigned execution_mode)
756 {
757 if (nir_has_any_rounding_mode_rtne(execution_mode))
758 return BRW_RND_MODE_RTNE;
759 if (nir_has_any_rounding_mode_rtz(execution_mode))
760 return BRW_RND_MODE_RTZ;
761 return BRW_RND_MODE_UNSPECIFIED;
762 }
763
764 static brw_reg
765 prepare_alu_destination_and_sources(nir_to_brw_state &ntb,
766 const fs_builder &bld,
767 nir_alu_instr *instr,
768 brw_reg *op,
769 bool need_dest)
770 {
771 const intel_device_info *devinfo = ntb.devinfo;
772
773 bool all_sources_uniform = true;
774 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
775 op[i] = get_nir_src(ntb, instr->src[i].src, -1);
776 op[i].type = brw_type_for_nir_type(devinfo,
777 (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
778 nir_src_bit_size(instr->src[i].src)));
779
780 /* is_scalar sources won't be is_uniform because get_nir_src was passed
781 * -1 as the channel.
782 */
783 if (!is_uniform(op[i]) && !op[i].is_scalar)
784 all_sources_uniform = false;
785 }
786
787 brw_reg result =
788 need_dest ? get_nir_def(ntb, instr->def, all_sources_uniform) : bld.null_reg_ud();
789
790 result.type = brw_type_for_nir_type(devinfo,
791 (nir_alu_type)(nir_op_infos[instr->op].output_type |
792 instr->def.bit_size));
793
794    /* Move and vecN instructions may still be vectorized. Return the raw,
795     * vectorized source and destination so that fs_visitor::nir_emit_alu can
796 * handle it. Other callers should not have to handle these kinds of
797 * instructions.
798 */
799 switch (instr->op) {
800 case nir_op_mov:
801 case nir_op_vec2:
802 case nir_op_vec3:
803 case nir_op_vec4:
804 case nir_op_vec8:
805 case nir_op_vec16:
806 return result;
807 default:
808 break;
809 }
810
811 const bool is_scalar = result.is_scalar || (!need_dest && all_sources_uniform);
812 const fs_builder xbld = is_scalar ? bld.scalar_group() : bld;
813
814 /* At this point, we have dealt with any instruction that operates on
815 * more than a single channel. Therefore, we can just adjust the source
816 * and destination registers for that channel and emit the instruction.
817 */
818 unsigned channel = 0;
819 if (nir_op_infos[instr->op].output_size == 0) {
820 /* Since NIR is doing the scalarizing for us, we should only ever see
821 * vectorized operations with a single channel.
822 */
823 nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
824 assert(util_bitcount(write_mask) == 1);
825 channel = ffs(write_mask) - 1;
826
827 result = offset(result, xbld, channel);
828 }
829
830 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
831 assert(nir_op_infos[instr->op].input_sizes[i] < 2);
832 op[i] = offset(op[i], xbld, instr->src[i].swizzle[channel]);
833
834 /* If the dispatch width matches the scalar allocation width, offset()
835 * won't set the stride to zero. Force that here.
836 */
837 if (op[i].is_scalar)
838 op[i] = component(op[i], 0);
839 }
840
841 return result;
842 }
843
844 static brw_reg
845 resolve_source_modifiers(const fs_builder &bld, const brw_reg &src)
846 {
847 return (src.abs || src.negate) ? bld.MOV(src) : src;
848 }
849
850 static void
851 resolve_inot_sources(nir_to_brw_state &ntb, const fs_builder &bld, nir_alu_instr *instr,
852 brw_reg *op)
853 {
854 for (unsigned i = 0; i < 2; i++) {
855 nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src);
856
857 if (inot_instr != NULL && inot_instr->op == nir_op_inot) {
858 /* The source of the inot is now the source of instr. */
859 prepare_alu_destination_and_sources(ntb, bld, inot_instr, &op[i], false);
860
861 assert(!op[i].negate);
862 op[i].negate = true;
863 } else {
864 op[i] = resolve_source_modifiers(bld, op[i]);
865 }
866 }
867 }
868
869 static bool
870 try_emit_b2fi_of_inot(nir_to_brw_state &ntb, const fs_builder &bld,
871 brw_reg result,
872 nir_alu_instr *instr)
873 {
874 const intel_device_info *devinfo = bld.shader->devinfo;
875
876 if (devinfo->verx10 >= 125)
877 return false;
878
879 nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src);
880
881 if (inot_instr == NULL || inot_instr->op != nir_op_inot)
882 return false;
883
884 /* HF is also possible as a destination on BDW+. For nir_op_b2i, the set
885 * of valid size-changing combinations is a bit more complex.
886 *
887 * The source restriction is just because I was lazy about generating the
888 * constant below.
889 */
890 if (instr->def.bit_size != 32 ||
891 nir_src_bit_size(inot_instr->src[0].src) != 32)
892 return false;
893
894 /* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0. Since a can only be 0 or -1,
895 * this is float(1 + a).
896 */
897 brw_reg op;
898
899 prepare_alu_destination_and_sources(ntb, bld, inot_instr, &op, false);
900
901 /* Ignore the saturate modifier, if there is one. The result of the
902 * arithmetic can only be 0 or 1, so the clamping will do nothing anyway.
903 */
904 bld.ADD(result, op, brw_imm_d(1));
905
906 return true;
907 }
908
909 static bool
910 is_const_zero(const nir_src &src)
911 {
912 return nir_src_is_const(src) && nir_src_as_int(src) == 0;
913 }
914
915 static void
916 fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
917 bool need_dest)
918 {
919 const intel_device_info *devinfo = ntb.devinfo;
920
921 fs_inst *inst;
922 unsigned execution_mode =
923 ntb.bld.shader->nir->info.float_controls_execution_mode;
924
925 brw_reg op[NIR_MAX_VEC_COMPONENTS];
926 brw_reg result = prepare_alu_destination_and_sources(ntb, ntb.bld, instr, op, need_dest);
927
928 #ifndef NDEBUG
929 /* Everything except raw moves, some type conversions, iabs, and ineg
930 * should have 8-bit sources lowered by nir_lower_bit_size in
931 * brw_preprocess_nir or by brw_nir_lower_conversions in
932 * brw_postprocess_nir.
933 */
934 switch (instr->op) {
935 case nir_op_mov:
936 case nir_op_vec2:
937 case nir_op_vec3:
938 case nir_op_vec4:
939 case nir_op_vec8:
940 case nir_op_vec16:
941 case nir_op_i2f16:
942 case nir_op_i2f32:
943 case nir_op_i2i16:
944 case nir_op_i2i32:
945 case nir_op_u2f16:
946 case nir_op_u2f32:
947 case nir_op_u2u16:
948 case nir_op_u2u32:
949 case nir_op_iabs:
950 case nir_op_ineg:
951 case nir_op_pack_32_4x8_split:
952 break;
953
954 default:
955 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
956 assert(brw_type_size_bytes(op[i].type) > 1);
957 }
958 }
959 #endif
960
961 const fs_builder &bld = result.is_scalar ? ntb.bld.scalar_group() : ntb.bld;
962
963 switch (instr->op) {
964 case nir_op_mov:
965 case nir_op_vec2:
966 case nir_op_vec3:
967 case nir_op_vec4:
968 case nir_op_vec8:
969 case nir_op_vec16: {
970 brw_reg temp = result;
971 bool need_extra_copy = false;
972
973 nir_intrinsic_instr *store_reg =
974 nir_store_reg_for_def(&instr->def);
975 if (store_reg != NULL) {
976 nir_def *dest_reg = store_reg->src[1].ssa;
977 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
978 nir_intrinsic_instr *load_reg =
979 nir_load_reg_for_def(instr->src[i].src.ssa);
980 if (load_reg == NULL)
981 continue;
982
983 if (load_reg->src[0].ssa == dest_reg) {
984 need_extra_copy = true;
985 temp = bld.vgrf(result.type, 4);
986 break;
987 }
988 }
989 }
990
991 nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
992 unsigned last_bit = util_last_bit(write_mask);
993
994 assert(last_bit <= NIR_MAX_VEC_COMPONENTS);
995 brw_reg comps[NIR_MAX_VEC_COMPONENTS];
996
997 for (unsigned i = 0; i < last_bit; i++) {
998 if (instr->op == nir_op_mov)
999 comps[i] = offset(op[0], bld, instr->src[0].swizzle[i]);
1000 else
1001 comps[i] = offset(op[i], bld, instr->src[i].swizzle[0]);
1002 }
1003
1004 if (write_mask == (1u << last_bit) - 1) {
1005 bld.VEC(temp, comps, last_bit);
1006 } else {
1007 for (unsigned i = 0; i < last_bit; i++) {
1008 if (write_mask & (1 << i))
1009 bld.MOV(offset(temp, bld, i), comps[i]);
1010 }
1011 }
1012
1013 /* In this case the source and destination registers were the same,
1014 * so we need to insert an extra set of moves in order to deal with
1015 * any swizzling.
1016 */
1017 if (need_extra_copy) {
1018 for (unsigned i = 0; i < last_bit; i++) {
1019 if (!(write_mask & (1 << i)))
1020 continue;
1021
1022 bld.MOV(offset(result, bld, i), offset(temp, bld, i));
1023 }
1024 }
1025 return;
1026 }
1027
1028 case nir_op_i2f32:
1029 case nir_op_u2f32:
1030 if (optimize_extract_to_float(ntb, bld, instr, result))
1031 return;
1032 bld.MOV(result, op[0]);
1033 break;
1034
1035 case nir_op_f2f16_rtne:
1036 case nir_op_f2f16_rtz:
1037 case nir_op_f2f16: {
1038 brw_rnd_mode rnd = BRW_RND_MODE_UNSPECIFIED;
1039
1040 if (nir_op_f2f16 == instr->op)
1041 rnd = brw_rnd_mode_from_execution_mode(execution_mode);
1042 else
1043 rnd = brw_rnd_mode_from_nir_op(instr->op);
1044
1045 if (BRW_RND_MODE_UNSPECIFIED != rnd)
1046 bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), brw_imm_d(rnd));
1047
1048 assert(brw_type_size_bytes(op[0].type) < 8); /* brw_nir_lower_conversions */
1049 bld.MOV(result, op[0]);
1050 break;
1051 }
1052
1053 case nir_op_b2i8:
1054 case nir_op_b2i16:
1055 case nir_op_b2i32:
1056 case nir_op_b2i64:
1057 case nir_op_b2f16:
1058 case nir_op_b2f32:
1059 case nir_op_b2f64:
1060 if (try_emit_b2fi_of_inot(ntb, bld, result, instr))
1061 break;
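      /* NIR booleans are 0/~0, so reinterpreting the source as signed D and
       * negating it yields 0/1, which the conversion MOV below then widens
       * or converts to the destination type.
       */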
1062 op[0].type = BRW_TYPE_D;
1063 op[0].negate = !op[0].negate;
1064 FALLTHROUGH;
1065 case nir_op_i2f64:
1066 case nir_op_i2i64:
1067 case nir_op_u2f64:
1068 case nir_op_u2u64:
1069 case nir_op_f2f64:
1070 case nir_op_f2i64:
1071 case nir_op_f2u64:
1072 case nir_op_i2i32:
1073 case nir_op_u2u32:
1074 case nir_op_f2i32:
1075 case nir_op_f2u32:
1076 case nir_op_i2f16:
1077 case nir_op_u2f16:
1078 case nir_op_f2i16:
1079 case nir_op_f2u16:
1080 case nir_op_f2i8:
1081 case nir_op_f2u8:
1082 if (result.type == BRW_TYPE_B ||
1083 result.type == BRW_TYPE_UB ||
1084 result.type == BRW_TYPE_HF)
1085 assert(brw_type_size_bytes(op[0].type) < 8); /* brw_nir_lower_conversions */
1086
1087 if (op[0].type == BRW_TYPE_B ||
1088 op[0].type == BRW_TYPE_UB ||
1089 op[0].type == BRW_TYPE_HF)
1090 assert(brw_type_size_bytes(result.type) < 8); /* brw_nir_lower_conversions */
1091
1092 bld.MOV(result, op[0]);
1093 break;
1094
1095 case nir_op_i2i8:
1096 case nir_op_u2u8:
1097 assert(brw_type_size_bytes(op[0].type) < 8); /* brw_nir_lower_conversions */
1098 FALLTHROUGH;
1099 case nir_op_i2i16:
1100 case nir_op_u2u16: {
1101 /* Emit better code for u2u8(extract_u8(a, b)) and similar patterns.
1102 * Emitting the instructions one by one results in two MOV instructions
1103 * that won't be propagated. By handling both instructions here, a
1104 * single MOV is emitted.
1105 */
1106 nir_alu_instr *extract_instr = nir_src_as_alu_instr(instr->src[0].src);
1107 if (extract_instr != NULL) {
1108 if (extract_instr->op == nir_op_extract_u8 ||
1109 extract_instr->op == nir_op_extract_i8) {
1110 prepare_alu_destination_and_sources(ntb, ntb.bld, extract_instr, op, false);
1111
1112 const unsigned byte = nir_src_as_uint(extract_instr->src[1].src);
1113 const brw_reg_type type =
1114 brw_int_type(1, extract_instr->op == nir_op_extract_i8);
1115
1116 op[0] = subscript(op[0], type, byte);
1117 } else if (extract_instr->op == nir_op_extract_u16 ||
1118 extract_instr->op == nir_op_extract_i16) {
1119 prepare_alu_destination_and_sources(ntb, ntb.bld, extract_instr, op, false);
1120
1121 const unsigned word = nir_src_as_uint(extract_instr->src[1].src);
1122 const brw_reg_type type =
1123 brw_int_type(2, extract_instr->op == nir_op_extract_i16);
1124
1125 op[0] = subscript(op[0], type, word);
1126 }
1127 }
1128
1129 bld.MOV(result, op[0]);
1130 break;
1131 }
1132
1133 case nir_op_fsat:
1134 inst = bld.MOV(result, op[0]);
1135 inst->saturate = true;
1136 break;
1137
1138 case nir_op_fneg:
1139 case nir_op_ineg:
1140 op[0].negate = true;
1141 bld.MOV(result, op[0]);
1142 break;
1143
1144 case nir_op_fabs:
1145 case nir_op_iabs:
1146 op[0].negate = false;
1147 op[0].abs = true;
1148 bld.MOV(result, op[0]);
1149 break;
1150
1151 case nir_op_f2f32:
1152 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1153 brw_rnd_mode rnd =
1154 brw_rnd_mode_from_execution_mode(execution_mode);
1155 bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1156 brw_imm_d(rnd));
1157 }
1158
1159 if (op[0].type == BRW_TYPE_HF)
1160 assert(brw_type_size_bytes(result.type) < 8); /* brw_nir_lower_conversions */
1161
1162 bld.MOV(result, op[0]);
1163 break;
1164
1165 case nir_op_fsign:
1166 unreachable("Should have been lowered by brw_nir_lower_fsign.");
1167
1168 case nir_op_frcp:
1169 bld.RCP(result, op[0]);
1170 break;
1171
1172 case nir_op_fexp2:
1173 bld.EXP2(result, op[0]);
1174 break;
1175
1176 case nir_op_flog2:
1177 bld.LOG2(result, op[0]);
1178 break;
1179
1180 case nir_op_fsin:
1181 bld.SIN(result, op[0]);
1182 break;
1183
1184 case nir_op_fcos:
1185 bld.COS(result, op[0]);
1186 break;
1187
1188 case nir_op_fadd:
1189 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1190 brw_rnd_mode rnd =
1191 brw_rnd_mode_from_execution_mode(execution_mode);
1192 bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1193 brw_imm_d(rnd));
1194 }
1195 FALLTHROUGH;
1196 case nir_op_iadd:
1197 bld.ADD(result, op[0], op[1]);
1198 break;
1199
1200 case nir_op_iadd3:
1201 assert(instr->def.bit_size < 64);
1202 bld.ADD3(result, op[0], op[1], op[2]);
1203 break;
1204
1205 case nir_op_iadd_sat:
1206 case nir_op_uadd_sat:
1207 inst = bld.ADD(result, op[0], op[1]);
1208 inst->saturate = true;
1209 break;
1210
1211 case nir_op_isub_sat:
1212 bld.emit(SHADER_OPCODE_ISUB_SAT, result, op[0], op[1]);
1213 break;
1214
1215 case nir_op_usub_sat:
1216 bld.emit(SHADER_OPCODE_USUB_SAT, result, op[0], op[1]);
1217 break;
1218
1219 case nir_op_irhadd:
1220 case nir_op_urhadd:
1221 assert(instr->def.bit_size < 64);
1222 bld.AVG(result, op[0], op[1]);
1223 break;
1224
1225 case nir_op_ihadd:
1226 case nir_op_uhadd: {
1227 assert(instr->def.bit_size < 64);
1228
1229 op[0] = resolve_source_modifiers(bld, op[0]);
1230 op[1] = resolve_source_modifiers(bld, op[1]);
1231
1232 /* AVG(x, y) - ((x ^ y) & 1) */
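      /* AVG rounds up, computing (x + y + 1) >> 1, so subtracting
       * ((x ^ y) & 1) when the low bits of the sources differ gives the
       * floored halving add that ihadd/uhadd require.
       */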
1233 brw_reg one = retype(brw_imm_ud(1), result.type);
1234 bld.ADD(result, bld.AVG(op[0], op[1]),
1235 negate(bld.AND(bld.XOR(op[0], op[1]), one)));
1236 break;
1237 }
1238
1239 case nir_op_fmul:
1240 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1241 brw_rnd_mode rnd =
1242 brw_rnd_mode_from_execution_mode(execution_mode);
1243 bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1244 brw_imm_d(rnd));
1245 }
1246
1247 bld.MUL(result, op[0], op[1]);
1248 break;
1249
1250 case nir_op_imul_2x32_64:
1251 case nir_op_umul_2x32_64:
1252 bld.MUL(result, op[0], op[1]);
1253 break;
1254
1255 case nir_op_imul_32x16:
1256 case nir_op_umul_32x16: {
1257 const bool ud = instr->op == nir_op_umul_32x16;
1258 const enum brw_reg_type word_type = ud ? BRW_TYPE_UW : BRW_TYPE_W;
1259 const enum brw_reg_type dword_type = ud ? BRW_TYPE_UD : BRW_TYPE_D;
1260
1261 assert(instr->def.bit_size == 32);
1262
1263 /* Before copy propagation there are no immediate values. */
1264 assert(op[0].file != IMM && op[1].file != IMM);
1265
1266 op[1] = subscript(op[1], word_type, 0);
1267
1268 bld.MUL(result, retype(op[0], dword_type), op[1]);
1269
1270 break;
1271 }
1272
1273 case nir_op_imul:
1274 assert(instr->def.bit_size < 64);
1275 bld.MUL(result, op[0], op[1]);
1276 break;
1277
1278 case nir_op_imul_high:
1279 case nir_op_umul_high:
1280 assert(instr->def.bit_size < 64);
1281 if (instr->def.bit_size == 32) {
1282 bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
1283 } else {
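         /* For 16-bit sources the full product fits in 32 bits, so do a
          * 32-bit multiply and take the high half of each result.
          */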
1284 brw_reg tmp = bld.vgrf(brw_type_with_size(op[0].type, 32));
1285 bld.MUL(tmp, op[0], op[1]);
1286 bld.MOV(result, subscript(tmp, result.type, 1));
1287 }
1288 break;
1289
1290 case nir_op_idiv:
1291 case nir_op_udiv:
1292 assert(instr->def.bit_size < 64);
1293 bld.INT_QUOTIENT(result, op[0], op[1]);
1294 break;
1295
1296 case nir_op_uadd_carry:
1297 unreachable("Should have been lowered by carry_to_arith().");
1298
1299 case nir_op_usub_borrow:
1300 unreachable("Should have been lowered by borrow_to_arith().");
1301
1302 case nir_op_umod:
1303 case nir_op_irem:
1304 /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
1305 * appears that our hardware just does the right thing for signed
1306 * remainder.
1307 */
1308 assert(instr->def.bit_size < 64);
1309 bld.INT_REMAINDER(result, op[0], op[1]);
1310 break;
1311
1312 case nir_op_imod: {
1313 /* Get a regular C-style remainder. If a % b == 0, set the predicate. */
1314 bld.INT_REMAINDER(result, op[0], op[1]);
1315
1316 /* Math instructions don't support conditional mod */
1317 inst = bld.MOV(bld.null_reg_d(), result);
1318 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1319
1320 /* Now, we need to determine if signs of the sources are different.
1321 * When we XOR the sources, the top bit is 0 if they are the same and 1
1322 * if they are different. We can then use a conditional modifier to
1323 * turn that into a predicate. This leads us to an XOR.l instruction.
1324 *
1325 * Technically, according to the PRM, you're not allowed to use .l on a
1326 * XOR instruction. However, empirical experiments and Curro's reading
1327 * of the simulator source both indicate that it's safe.
1328 */
1329 bld.XOR(op[0], op[1], &inst);
1330 inst->predicate = BRW_PREDICATE_NORMAL;
1331 inst->conditional_mod = BRW_CONDITIONAL_L;
1332
1333 /* If the result of the initial remainder operation is non-zero and the
1334 * two sources have different signs, add in a copy of op[1] to get the
1335 * final integer modulus value.
1336 */
1337 inst = bld.ADD(result, result, op[1]);
1338 inst->predicate = BRW_PREDICATE_NORMAL;
1339 break;
1340 }
1341
1342 case nir_op_flt32:
1343 case nir_op_fge32:
1344 case nir_op_feq32:
1345 case nir_op_fneu32: {
1346 brw_reg dest = result;
1347
1348 const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1349 if (bit_size != 32) {
1350 dest = bld.vgrf(op[0].type);
1351 bld.UNDEF(dest);
1352 }
1353
1354 bld.CMP(dest, op[0], op[1], brw_cmod_for_nir_comparison(instr->op));
1355
1356 /* The destination will now be used as a source, so select component 0
1357 * if it's is_scalar (as is done in get_nir_src).
1358 */
1359 if (bit_size != 32 && result.is_scalar)
1360 dest = component(dest, 0);
1361
1362 if (bit_size > 32) {
1363 bld.MOV(result, subscript(dest, BRW_TYPE_UD, 0));
1364       } else if (bit_size < 32) {
1365 /* When we convert the result to 32-bit we need to be careful and do
1366 * it as a signed conversion to get sign extension (for 32-bit true)
1367 */
1368 const brw_reg_type src_type =
1369 brw_type_with_size(BRW_TYPE_D, bit_size);
1370
1371 bld.MOV(retype(result, BRW_TYPE_D), retype(dest, src_type));
1372 }
1373 break;
1374 }
1375
1376 case nir_op_ilt32:
1377 case nir_op_ult32:
1378 case nir_op_ige32:
1379 case nir_op_uge32:
1380 case nir_op_ieq32:
1381 case nir_op_ine32: {
1382 brw_reg dest = result;
1383
1384 const uint32_t bit_size = brw_type_size_bits(op[0].type);
1385 if (bit_size != 32) {
1386 dest = bld.vgrf(op[0].type);
1387 bld.UNDEF(dest);
1388 }
1389
1390 bld.CMP(dest, op[0], op[1],
1391 brw_cmod_for_nir_comparison(instr->op));
1392
1393 /* The destination will now be used as a source, so select component 0
1394 * if it's is_scalar (as is done in get_nir_src).
1395 */
1396 if (bit_size != 32 && result.is_scalar)
1397 dest = component(dest, 0);
1398
1399 if (bit_size > 32) {
1400 bld.MOV(result, subscript(dest, BRW_TYPE_UD, 0));
1401 } else if (bit_size < 32) {
1402 /* When we convert the result to 32-bit we need to be careful and do
1403 * it as a signed conversion to get sign extension (for 32-bit true)
1404 */
1405 const brw_reg_type src_type =
1406 brw_type_with_size(BRW_TYPE_D, bit_size);
1407
1408 bld.MOV(retype(result, BRW_TYPE_D), retype(dest, src_type));
1409 }
1410 break;
1411 }
1412
1413 case nir_op_inot: {
1414 nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src);
1415
1416 if (inot_src_instr != NULL &&
1417 (inot_src_instr->op == nir_op_ior ||
1418 inot_src_instr->op == nir_op_ixor ||
1419 inot_src_instr->op == nir_op_iand)) {
1420 /* The sources of the source logical instruction are now the
1421 * sources of the instruction that will be generated.
1422 */
1423 prepare_alu_destination_and_sources(ntb, ntb.bld, inot_src_instr, op, false);
1424 resolve_inot_sources(ntb, bld, inot_src_instr, op);
1425
1426 /* Smash all of the sources and destination to be signed. This
1427 * doesn't matter for the operation of the instruction, but cmod
1428 * propagation fails on unsigned sources with negation (due to
1429 * fs_inst::can_do_cmod returning false).
1430 */
1431 result.type =
1432 brw_type_for_nir_type(devinfo,
1433 (nir_alu_type)(nir_type_int |
1434 instr->def.bit_size));
1435 op[0].type =
1436 brw_type_for_nir_type(devinfo,
1437 (nir_alu_type)(nir_type_int |
1438 nir_src_bit_size(inot_src_instr->src[0].src)));
1439 op[1].type =
1440 brw_type_for_nir_type(devinfo,
1441 (nir_alu_type)(nir_type_int |
1442 nir_src_bit_size(inot_src_instr->src[1].src)));
1443
1444 /* For XOR, only invert one of the sources. Arbitrarily choose
1445 * the first source.
1446 */
1447 op[0].negate = !op[0].negate;
1448 if (inot_src_instr->op != nir_op_ixor)
1449 op[1].negate = !op[1].negate;
1450
1451 switch (inot_src_instr->op) {
1452 case nir_op_ior:
1453 bld.AND(result, op[0], op[1]);
1454 return;
1455
1456 case nir_op_iand:
1457 bld.OR(result, op[0], op[1]);
1458 return;
1459
1460 case nir_op_ixor:
1461 bld.XOR(result, op[0], op[1]);
1462 return;
1463
1464 default:
1465 unreachable("impossible opcode");
1466 }
1467 }
1468 op[0] = resolve_source_modifiers(bld, op[0]);
1469 bld.NOT(result, op[0]);
1470 break;
1471 }
1472
1473 case nir_op_ixor:
1474 resolve_inot_sources(ntb, bld, instr, op);
1475 bld.XOR(result, op[0], op[1]);
1476 break;
1477 case nir_op_ior:
1478 resolve_inot_sources(ntb, bld, instr, op);
1479 bld.OR(result, op[0], op[1]);
1480 break;
1481 case nir_op_iand:
1482 resolve_inot_sources(ntb, bld, instr, op);
1483 bld.AND(result, op[0], op[1]);
1484 break;
1485
1486 case nir_op_fdot2:
1487 case nir_op_fdot3:
1488 case nir_op_fdot4:
1489 case nir_op_b32all_fequal2:
1490 case nir_op_b32all_iequal2:
1491 case nir_op_b32all_fequal3:
1492 case nir_op_b32all_iequal3:
1493 case nir_op_b32all_fequal4:
1494 case nir_op_b32all_iequal4:
1495 case nir_op_b32any_fnequal2:
1496 case nir_op_b32any_inequal2:
1497 case nir_op_b32any_fnequal3:
1498 case nir_op_b32any_inequal3:
1499 case nir_op_b32any_fnequal4:
1500 case nir_op_b32any_inequal4:
1501 unreachable("Lowered by nir_lower_alu_reductions");
1502
1503 case nir_op_ldexp:
1504 unreachable("not reached: should be handled by ldexp_to_arith()");
1505
1506 case nir_op_fsqrt:
1507 bld.SQRT(result, op[0]);
1508 break;
1509
1510 case nir_op_frsq:
1511 bld.RSQ(result, op[0]);
1512 break;
1513
1514 case nir_op_ftrunc:
1515 bld.RNDZ(result, op[0]);
1516 break;
1517
1518 case nir_op_fceil:
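      /* ceil(x) = -floor(-x); RNDD rounds toward negative infinity. */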
1519 bld.MOV(result, negate(bld.RNDD(negate(op[0]))));
1520 break;
1521 case nir_op_ffloor:
1522 bld.RNDD(result, op[0]);
1523 break;
1524 case nir_op_ffract:
1525 bld.FRC(result, op[0]);
1526 break;
1527 case nir_op_fround_even:
1528 bld.RNDE(result, op[0]);
1529 break;
1530
1531 case nir_op_fquantize2f16: {
1532 brw_reg tmp16 = bld.vgrf(BRW_TYPE_D);
1533 brw_reg tmp32 = bld.vgrf(BRW_TYPE_F);
1534
1535 /* The destination stride must be at least as big as the source stride. */
1536 tmp16 = subscript(tmp16, BRW_TYPE_HF, 0);
1537
1538 /* Check for denormal */
1539 brw_reg abs_src0 = op[0];
1540 abs_src0.abs = true;
1541 bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
1542 BRW_CONDITIONAL_L);
1543 /* Get the appropriately signed zero */
1544 brw_reg zero = retype(bld.AND(retype(op[0], BRW_TYPE_UD),
1545 brw_imm_ud(0x80000000)), BRW_TYPE_F);
1546 /* Do the actual F32 -> F16 -> F32 conversion */
1547 bld.MOV(tmp16, op[0]);
1548 bld.MOV(tmp32, tmp16);
1549 /* Select that or zero based on normal status */
1550 inst = bld.SEL(result, zero, tmp32);
1551 inst->predicate = BRW_PREDICATE_NORMAL;
1552 break;
1553 }
1554
1555 case nir_op_imin:
1556 case nir_op_umin:
1557 case nir_op_fmin:
1558 bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
1559 break;
1560
1561 case nir_op_imax:
1562 case nir_op_umax:
1563 case nir_op_fmax:
1564 bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
1565 break;
1566
1567 case nir_op_pack_snorm_2x16:
1568 case nir_op_pack_snorm_4x8:
1569 case nir_op_pack_unorm_2x16:
1570 case nir_op_pack_unorm_4x8:
1571 case nir_op_unpack_snorm_2x16:
1572 case nir_op_unpack_snorm_4x8:
1573 case nir_op_unpack_unorm_2x16:
1574 case nir_op_unpack_unorm_4x8:
1575 case nir_op_unpack_half_2x16:
1576 case nir_op_pack_half_2x16:
1577 unreachable("not reached: should be handled by lower_packing_builtins");
1578
1579 case nir_op_unpack_half_2x16_split_x:
1580 bld.MOV(result, subscript(op[0], BRW_TYPE_HF, 0));
1581 break;
1582
1583 case nir_op_unpack_half_2x16_split_y:
1584 bld.MOV(result, subscript(op[0], BRW_TYPE_HF, 1));
1585 break;
1586
1587 case nir_op_pack_64_2x32_split:
1588 case nir_op_pack_32_2x16_split:
1589 bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
1590 break;
1591
1592 case nir_op_pack_32_4x8_split:
1593 bld.emit(FS_OPCODE_PACK, result, op, 4);
1594 break;
1595
1596 case nir_op_unpack_64_2x32_split_x:
1597 case nir_op_unpack_64_2x32_split_y: {
1598 if (instr->op == nir_op_unpack_64_2x32_split_x)
1599 bld.MOV(result, subscript(op[0], BRW_TYPE_UD, 0));
1600 else
1601 bld.MOV(result, subscript(op[0], BRW_TYPE_UD, 1));
1602 break;
1603 }
1604
1605 case nir_op_unpack_32_2x16_split_x:
1606 case nir_op_unpack_32_2x16_split_y: {
1607 if (instr->op == nir_op_unpack_32_2x16_split_x)
1608 bld.MOV(result, subscript(op[0], BRW_TYPE_UW, 0));
1609 else
1610 bld.MOV(result, subscript(op[0], BRW_TYPE_UW, 1));
1611 break;
1612 }
1613
1614 case nir_op_fpow:
1615 bld.POW(result, op[0], op[1]);
1616 break;
1617
1618 case nir_op_bitfield_reverse:
1619 assert(instr->def.bit_size == 32);
1620 assert(nir_src_bit_size(instr->src[0].src) == 32);
1621 bld.BFREV(result, op[0]);
1622 break;
1623
1624 case nir_op_bit_count:
1625 assert(instr->def.bit_size == 32);
1626 assert(nir_src_bit_size(instr->src[0].src) < 64);
1627 bld.CBIT(result, op[0]);
1628 break;
1629
1630 case nir_op_uclz:
1631 assert(instr->def.bit_size == 32);
1632 assert(nir_src_bit_size(instr->src[0].src) == 32);
1633 bld.LZD(retype(result, BRW_TYPE_UD), op[0]);
1634 break;
1635
1636 case nir_op_ifind_msb: {
1637 assert(instr->def.bit_size == 32);
1638 assert(nir_src_bit_size(instr->src[0].src) == 32);
1639
1640 brw_reg tmp = bld.FBH(retype(op[0], BRW_TYPE_D));
1641
1642 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1643 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1644 * subtract the result from 31 to convert the MSB count into an LSB
1645 * count.
1646 */
1647 brw_reg count_from_lsb = bld.ADD(negate(tmp), brw_imm_w(31));
1648
1649 /* The high word of the FBH result will be 0xffff or 0x0000. After
1650 * calculating 31 - fbh, we can obtain the correct result for
1651 * ifind_msb(0) by ORing the (sign extended) upper word of the
1652 * intermediate result.
1653 */
1654 bld.OR(result, count_from_lsb, subscript(tmp, BRW_TYPE_W, 1));
1655 break;
1656 }
1657
1658 case nir_op_find_lsb:
1659 assert(instr->def.bit_size == 32);
1660 assert(nir_src_bit_size(instr->src[0].src) == 32);
1661 bld.FBL(result, op[0]);
1662 break;
1663
1664 case nir_op_ubitfield_extract:
1665 case nir_op_ibitfield_extract:
1666 unreachable("should have been lowered");
1667 case nir_op_ubfe:
1668 case nir_op_ibfe:
1669 assert(instr->def.bit_size < 64);
1670 bld.BFE(result, op[2], op[1], op[0]);
1671 break;
1672 case nir_op_bfm:
1673 assert(instr->def.bit_size < 64);
1674 bld.BFI1(result, op[0], op[1]);
1675 break;
1676 case nir_op_bfi:
1677 assert(instr->def.bit_size < 64);
1678
1679 /* bfi is ((...) | (~src0 & src2)). The second part is zero when src2 is
1680 * either 0 or src0. Replacing the 0 with another value can eliminate a
1681 * temporary register.
1682 */
1683 if (is_const_zero(instr->src[2].src))
1684 bld.BFI2(result, op[0], op[1], op[0]);
1685 else
1686 bld.BFI2(result, op[0], op[1], op[2]);
1687
1688 break;
1689
1690 case nir_op_bitfield_insert:
1691 unreachable("not reached: should have been lowered");
1692
1693 /* With regard to implicit masking of the shift counts for 8- and 16-bit
1694  * types, the PRMs are **incorrect**. They falsely state that on Gen9+ only
1695  * the low bits of src1 matching the size of src0 (e.g., 4-bits for W or UW
1696  * src0) are used. The Bspec (backed by data from experimentation) states
1697  * that 0x3f is used for Q and UQ types, and 0x1f is used for **all** other
1698  * types.
1699  *
1700  * To match the behavior expected for the NIR opcodes, explicit masks for
1701  * 8- and 16-bit types must be added.
1702 */
1703 case nir_op_ishl:
1704 if (instr->def.bit_size < 32) {
1705 bld.SHL(result,
1706 op[0],
1707 bld.AND(op[1], brw_imm_ud(instr->def.bit_size - 1)));
1708 } else {
1709 bld.SHL(result, op[0], op[1]);
1710 }
1711
1712 break;
1713 case nir_op_ishr:
1714 if (instr->def.bit_size < 32) {
1715 bld.ASR(result,
1716 op[0],
1717 bld.AND(op[1], brw_imm_ud(instr->def.bit_size - 1)));
1718 } else {
1719 bld.ASR(result, op[0], op[1]);
1720 }
1721
1722 break;
1723 case nir_op_ushr:
1724 if (instr->def.bit_size < 32) {
1725 bld.SHR(result,
1726 op[0],
1727 bld.AND(op[1], brw_imm_ud(instr->def.bit_size - 1)));
1728 } else {
1729 bld.SHR(result, op[0], op[1]);
1730 }
1731
1732 break;
1733
1734 case nir_op_urol:
1735 bld.ROL(result, op[0], op[1]);
1736 break;
1737 case nir_op_uror:
1738 bld.ROR(result, op[0], op[1]);
1739 break;
1740
1741 case nir_op_pack_half_2x16_split:
1742 bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
1743 break;
1744
1745 case nir_op_sdot_4x8_iadd:
1746 case nir_op_sdot_4x8_iadd_sat:
1747 inst = bld.DP4A(retype(result, BRW_TYPE_D),
1748 retype(op[2], BRW_TYPE_D),
1749 retype(op[0], BRW_TYPE_D),
1750 retype(op[1], BRW_TYPE_D));
1751
1752 if (instr->op == nir_op_sdot_4x8_iadd_sat)
1753 inst->saturate = true;
1754 break;
1755
1756 case nir_op_udot_4x8_uadd:
1757 case nir_op_udot_4x8_uadd_sat:
1758 inst = bld.DP4A(retype(result, BRW_TYPE_UD),
1759 retype(op[2], BRW_TYPE_UD),
1760 retype(op[0], BRW_TYPE_UD),
1761 retype(op[1], BRW_TYPE_UD));
1762
1763 if (instr->op == nir_op_udot_4x8_uadd_sat)
1764 inst->saturate = true;
1765 break;
1766
1767 case nir_op_sudot_4x8_iadd:
1768 case nir_op_sudot_4x8_iadd_sat:
1769 inst = bld.DP4A(retype(result, BRW_TYPE_D),
1770 retype(op[2], BRW_TYPE_D),
1771 retype(op[0], BRW_TYPE_D),
1772 retype(op[1], BRW_TYPE_UD));
1773
1774 if (instr->op == nir_op_sudot_4x8_iadd_sat)
1775 inst->saturate = true;
1776 break;
1777
1778 case nir_op_ffma:
1779 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1780 brw_rnd_mode rnd =
1781 brw_rnd_mode_from_execution_mode(execution_mode);
1782 bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1783 brw_imm_d(rnd));
1784 }
1785
1786 bld.MAD(result, op[2], op[1], op[0]);
1787 break;
1788
1789 case nir_op_flrp:
1790 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1791 brw_rnd_mode rnd =
1792 brw_rnd_mode_from_execution_mode(execution_mode);
1793 bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1794 brw_imm_d(rnd));
1795 }
1796
1797 bld.LRP(result, op[0], op[1], op[2]);
1798 break;
1799
1800 case nir_op_b32csel:
1801 if (optimize_frontfacing_ternary(ntb, instr, result))
1802 return;
1803
1804 bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
1805 inst = bld.SEL(result, op[1], op[2]);
1806 inst->predicate = BRW_PREDICATE_NORMAL;
1807 break;
1808
1809 case nir_op_fcsel:
1810 bld.CSEL(result, op[1], op[2], op[0], BRW_CONDITIONAL_NZ);
1811 break;
1812
1813 case nir_op_fcsel_gt:
1814 bld.CSEL(result, op[1], op[2], op[0], BRW_CONDITIONAL_G);
1815 break;
1816
1817 case nir_op_fcsel_ge:
1818 bld.CSEL(result, op[1], op[2], op[0], BRW_CONDITIONAL_GE);
1819 break;
1820
1821 case nir_op_extract_u8:
1822 case nir_op_extract_i8: {
1823 const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
1824 unsigned byte = nir_src_as_uint(instr->src[1].src);
1825
1826 /* The PRMs say:
1827 *
1828 * BDW+
1829 * There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
1830 * Use two instructions and a word or DWord intermediate integer type.
1831 */
1832 if (instr->def.bit_size == 64) {
1833 if (instr->op == nir_op_extract_i8) {
1834 /* If we need to sign extend, extract to a word first */
1835 brw_reg w_temp = bld.vgrf(BRW_TYPE_W);
1836 bld.MOV(w_temp, subscript(op[0], type, byte));
1837 bld.MOV(result, w_temp);
1838 } else if (byte & 1) {
1839 /* Extract the high byte from the word containing the desired byte
1840 * offset.
1841 */
1842 bld.SHR(result,
1843 subscript(op[0], BRW_TYPE_UW, byte / 2),
1844 brw_imm_uw(8));
1845 } else {
1846 /* Otherwise use an AND with 0xff and a word type */
1847 bld.AND(result,
1848 subscript(op[0], BRW_TYPE_UW, byte / 2),
1849 brw_imm_uw(0xff));
1850 }
1851 } else {
1852 bld.MOV(result, subscript(op[0], type, byte));
1853 }
1854 break;
1855 }
1856
1857 case nir_op_extract_u16:
1858 case nir_op_extract_i16: {
1859 const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
1860 unsigned word = nir_src_as_uint(instr->src[1].src);
1861 bld.MOV(result, subscript(op[0], type, word));
1862 break;
1863 }
1864
1865 default:
1866 unreachable("unhandled instruction");
1867 }
1868 }
1869
1870 static void
1871 fs_nir_emit_load_const(nir_to_brw_state &ntb,
1872 nir_load_const_instr *instr)
1873 {
1874 const intel_device_info *devinfo = ntb.devinfo;
1875 const fs_builder &bld = ntb.bld.scalar_group();
1876
1877 const brw_reg_type reg_type =
1878 brw_type_with_size(BRW_TYPE_D, instr->def.bit_size);
1879 brw_reg reg = bld.vgrf(reg_type, instr->def.num_components);
1880
1881 reg.is_scalar = true;
1882
1883 brw_reg comps[NIR_MAX_VEC_COMPONENTS];
1884
1885 switch (instr->def.bit_size) {
1886 case 8:
1887 for (unsigned i = 0; i < instr->def.num_components; i++)
1888 comps[i] = setup_imm_b(bld, instr->value[i].i8);
1889 break;
1890
1891 case 16:
1892 for (unsigned i = 0; i < instr->def.num_components; i++)
1893 comps[i] = brw_imm_w(instr->value[i].i16);
1894 break;
1895
1896 case 32:
1897 for (unsigned i = 0; i < instr->def.num_components; i++)
1898 comps[i] = brw_imm_d(instr->value[i].i32);
1899 break;
1900
1901 case 64:
1902 if (!devinfo->has_64bit_int) {
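      /* Without 64-bit integer support, emit the constant as a DF-typed
       * immediate instead; value[i].f64 aliases the same 64 bits, so the
       * register still receives the intended bit pattern.
       */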
1903 reg.type = BRW_TYPE_DF;
1904 for (unsigned i = 0; i < instr->def.num_components; i++)
1905 comps[i] = brw_imm_df(instr->value[i].f64);
1906 } else {
1907 for (unsigned i = 0; i < instr->def.num_components; i++)
1908 comps[i] = brw_imm_q(instr->value[i].i64);
1909 }
1910 break;
1911
1912 default:
1913 unreachable("Invalid bit size");
1914 }
1915
1916 bld.VEC(reg, comps, instr->def.num_components);
1917
1918 ntb.ssa_values[instr->def.index] = reg;
1919 }
1920
1921 static bool
1922 get_nir_src_bindless(nir_to_brw_state &ntb, const nir_src &src)
1923 {
1924 return ntb.ssa_bind_infos[src.ssa->index].bindless;
1925 }
1926
1927 /**
1928 * Specifying -1 for channel indicates that no channel selection should be applied.
1929 */
1930 static brw_reg
1931 get_nir_src(nir_to_brw_state &ntb, const nir_src &src, int channel)
1932 {
1933 nir_intrinsic_instr *load_reg = nir_load_reg_for_def(src.ssa);
1934
1935 brw_reg reg;
1936 if (!load_reg) {
1937 if (nir_src_is_undef(src)) {
1938 const brw_reg_type reg_type =
1939 brw_type_with_size(BRW_TYPE_D, src.ssa->bit_size);
1940 reg = ntb.bld.vgrf(reg_type, src.ssa->num_components);
1941 } else {
1942 reg = ntb.ssa_values[src.ssa->index];
1943 }
1944 } else {
1945 nir_intrinsic_instr *decl_reg = nir_reg_get_decl(load_reg->src[0].ssa);
1946 /* We don't handle indirects on locals */
1947 assert(nir_intrinsic_base(load_reg) == 0);
1948 assert(load_reg->intrinsic != nir_intrinsic_load_reg_indirect);
1949 reg = ntb.ssa_values[decl_reg->def.index];
1950 }
1951
1952 /* To avoid floating-point denorm flushing problems, set the type by
1953  * default to an integer type - instructions that need floating-point
1954  * semantics will set this to F if they need to.
1955 */
1956 reg.type = brw_type_with_size(BRW_TYPE_D, nir_src_bit_size(src));
1957
1958 if (channel >= 0) {
1959 reg = offset(reg, ntb.bld, channel);
1960
1961 /* If the dispatch width matches the scalar allocation width, offset()
1962 * won't set the stride to zero. Force that here.
1963 */
1964 if (reg.is_scalar)
1965 reg = component(reg, 0);
1966 }
1967
1968 return reg;
1969 }
1970
1971 /**
1972 * Return an IMM for 32-bit constants; otherwise call get_nir_src() as normal.
1973 */
1974 static brw_reg
1975 get_nir_src_imm(nir_to_brw_state &ntb, const nir_src &src)
1976 {
1977 return nir_src_is_const(src) && nir_src_bit_size(src) == 32 ?
1978 brw_reg(brw_imm_d(nir_src_as_int(src))) : get_nir_src(ntb, src);
1979 }
1980
1981 static brw_reg
1982 get_nir_def(nir_to_brw_state &ntb, const nir_def &def, bool all_sources_uniform)
1983 {
1984 nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
1985 bool is_scalar = false;
1986
1987 if (def.parent_instr->type == nir_instr_type_intrinsic &&
1988 store_reg == NULL) {
1989 const nir_intrinsic_instr *instr =
1990 nir_instr_as_intrinsic(def.parent_instr);
1991
1992 switch (instr->intrinsic) {
1993 case nir_intrinsic_load_btd_global_arg_addr_intel:
1994 case nir_intrinsic_load_btd_local_arg_addr_intel:
1995 case nir_intrinsic_load_btd_shader_type_intel:
1996 case nir_intrinsic_load_global_constant_uniform_block_intel:
1997 case nir_intrinsic_load_inline_data_intel:
1998 case nir_intrinsic_load_reloc_const_intel:
1999 case nir_intrinsic_load_ssbo_uniform_block_intel:
2000 case nir_intrinsic_load_ubo_uniform_block_intel:
2001 case nir_intrinsic_load_workgroup_id:
2002 is_scalar = true;
2003 break;
2004
2005 case nir_intrinsic_load_ubo:
2006 is_scalar = get_nir_src(ntb, instr->src[1]).is_scalar;
2007 break;
2008
2009 case nir_intrinsic_load_uniform:
2010 is_scalar = get_nir_src(ntb, instr->src[0]).is_scalar;
2011 break;
2012
2013 case nir_intrinsic_ballot:
2014 case nir_intrinsic_resource_intel:
2015 is_scalar = !def.divergent;
2016 break;
2017
2018 default:
2019 break;
2020 }
2021
2022 /* This cannot be is_scalar if NIR thought it was divergent. */
2023 assert(!(is_scalar && def.divergent));
2024 } else if (def.parent_instr->type == nir_instr_type_alu) {
2025 is_scalar = store_reg == NULL && all_sources_uniform && !def.divergent;
2026 }
2027
2028 const fs_builder &bld = is_scalar ? ntb.bld.scalar_group() : ntb.bld;
2029
2030 if (!store_reg) {
2031 const brw_reg_type reg_type =
2032 brw_type_with_size(def.bit_size == 8 ? BRW_TYPE_D : BRW_TYPE_F,
2033 def.bit_size);
2034 ntb.ssa_values[def.index] =
2035 bld.vgrf(reg_type, def.num_components);
2036
2037 ntb.ssa_values[def.index].is_scalar = is_scalar;
2038
2039 if (def.bit_size * bld.dispatch_width() < 8 * REG_SIZE)
2040 bld.UNDEF(ntb.ssa_values[def.index]);
2041
2042 return ntb.ssa_values[def.index];
2043 } else {
2044 nir_intrinsic_instr *decl_reg =
2045 nir_reg_get_decl(store_reg->src[1].ssa);
2046 /* We don't handle indirects on locals */
2047 assert(nir_intrinsic_base(store_reg) == 0);
2048 assert(store_reg->intrinsic != nir_intrinsic_store_reg_indirect);
2049 assert(!is_scalar);
2050 return ntb.ssa_values[decl_reg->def.index];
2051 }
2052 }
2053
2054 static nir_component_mask_t
2055 get_nir_write_mask(const nir_def &def)
2056 {
2057 nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
2058 if (!store_reg) {
2059 return nir_component_mask(def.num_components);
2060 } else {
2061 return nir_intrinsic_write_mask(store_reg);
2062 }
2063 }
2064
2065 static fs_inst *
2066 emit_pixel_interpolater_send(const fs_builder &bld,
2067 enum opcode opcode,
2068 const brw_reg &dst,
2069 const brw_reg &src,
2070 const brw_reg &desc,
2071 const brw_reg &flag_reg,
2072 glsl_interp_mode interpolation)
2073 {
2074 struct brw_wm_prog_data *wm_prog_data =
2075 brw_wm_prog_data(bld.shader->prog_data);
2076
2077 brw_reg srcs[INTERP_NUM_SRCS];
2078
2079 if (src.is_scalar) {
2080 srcs[INTERP_SRC_OFFSET] = bld.vgrf(src.type, 2);
2081 brw_combine_with_vec(bld, srcs[INTERP_SRC_OFFSET], src, 2);
2082 } else {
2083 srcs[INTERP_SRC_OFFSET] = src;
2084 }
2085
2086 srcs[INTERP_SRC_MSG_DESC] = desc;
2087 srcs[INTERP_SRC_DYNAMIC_MODE] = flag_reg;
2088
2089 fs_inst *inst = bld.emit(opcode, dst, srcs, INTERP_NUM_SRCS);
2090 /* 2 floats per slot returned */
2091 inst->size_written = 2 * dst.component_size(inst->exec_size);
2092 if (interpolation == INTERP_MODE_NOPERSPECTIVE) {
2093 inst->pi_noperspective = true;
2094 /* TGL BSpec says:
2095 * This field cannot be set to "Linear Interpolation"
2096 * unless Non-Perspective Barycentric Enable in 3DSTATE_CLIP is enabled"
2097 */
2098 wm_prog_data->uses_nonperspective_interp_modes = true;
2099 }
2100
2101 wm_prog_data->pulls_bary = true;
2102
2103 return inst;
2104 }
2105
2106 /**
2107 * Return the specified component \p subreg of a per-polygon PS
2108 * payload register for the polygon corresponding to each channel
2109 * specified in the provided \p bld.
2110 *
2111 * \p reg specifies the payload register in REG_SIZE units for the
2112 * first polygon dispatched to the thread. This function requires
2113 * that subsequent registers on the payload contain the corresponding
2114 * register for subsequent polygons, one GRF register per polygon, if
2115 * multiple polygons are being processed by the same PS thread.
2116 *
2117 * This can be used to access the value of a "Source Depth and/or W
2118 * Attribute Vertex Deltas", "Perspective Bary Planes" or
2119 * "Non-Perspective Bary Planes" payload field conveniently for
2120 * multiple polygons as a single brw_reg.
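 *
 * For example, in a SIMD16 shader shading two polygons (poly_width == 8)
 * with 32-byte GRFs, a builder group starting at channel 8 selects
 * poly_idx == 1 and reads GRF (reg + 1); a builder spanning both polygons
 * instead uses the <8;8,0> region below, so channels 0-7 read (reg + 0)
 * and channels 8-15 read (reg + 1).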
2121 */
2122 static brw_reg
2123 fetch_polygon_reg(const fs_builder &bld, unsigned reg, unsigned subreg)
2124 {
2125 const fs_visitor *shader = bld.shader;
2126 assert(shader->stage == MESA_SHADER_FRAGMENT);
2127
2128 const struct intel_device_info *devinfo = shader->devinfo;
2129 const unsigned poly_width = shader->dispatch_width / shader->max_polygons;
2130 const unsigned poly_idx = bld.group() / poly_width;
2131 assert(bld.group() % poly_width == 0);
2132
2133 if (bld.dispatch_width() > poly_width) {
2134 assert(bld.dispatch_width() <= 2 * poly_width);
2135 const unsigned reg_size = reg_unit(devinfo) * REG_SIZE;
2136 const unsigned vstride = reg_size / brw_type_size_bytes(BRW_TYPE_F);
2137 return stride(brw_vec1_grf(reg + reg_unit(devinfo) * poly_idx, subreg),
2138 vstride, poly_width, 0);
2139 } else {
2140 return brw_vec1_grf(reg + reg_unit(devinfo) * poly_idx, subreg);
2141 }
2142 }
2143
2144 /**
2145 * Interpolate per-polygon barycentrics at a specific offset relative
2146 * to each channel fragment coordinates, optionally using
2147 * perspective-correct interpolation if requested. This is mostly
2148  * useful as a replacement for the PI shared function that existed on
2149 * platforms prior to Xe2, but is expected to work on earlier
2150 * platforms since we can get the required polygon setup information
2151 * from the thread payload as far back as ICL.
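 *
 * The code below evaluates the plane equations supplied in the thread
 * payload,
 *
 *    bary1 = bary1_c0 + bary1_cx * delta_x + bary1_cy * delta_y
 *    bary2 = bary2_c0 + bary2_cx * delta_x + bary2_cy * delta_y
 *    rhw   = rhw_c0   + rhw_cx   * delta_x + rhw_cy   * delta_y
 *
 * and, for perspective-correct interpolation, scales the barycentrics by
 * 1/rhw.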
2152 */
2153 static void
2154 emit_pixel_interpolater_alu_at_offset(const fs_builder &bld,
2155 const brw_reg &dst,
2156 const brw_reg &offs,
2157 glsl_interp_mode interpolation)
2158 {
2159 const fs_visitor *shader = bld.shader;
2160 assert(shader->stage == MESA_SHADER_FRAGMENT);
2161
2162 const intel_device_info *devinfo = shader->devinfo;
2163 assert(devinfo->ver >= 11);
2164
2165 const fs_thread_payload &payload = shader->fs_payload();
2166 const struct brw_wm_prog_data *wm_prog_data =
2167 brw_wm_prog_data(shader->prog_data);
2168
2169 if (interpolation == INTERP_MODE_NOPERSPECTIVE) {
2170 assert(wm_prog_data->uses_npc_bary_coefficients &&
2171 wm_prog_data->uses_nonperspective_interp_modes);
2172 } else {
2173 assert(interpolation == INTERP_MODE_SMOOTH);
2174 assert(wm_prog_data->uses_pc_bary_coefficients &&
2175 wm_prog_data->uses_depth_w_coefficients);
2176 }
2177
2178 /* Account for half-pixel X/Y coordinate offset. */
2179 const brw_reg off_x = bld.vgrf(BRW_TYPE_F);
2180 bld.ADD(off_x, offs, brw_imm_f(0.5));
2181
2182 const brw_reg off_y = bld.vgrf(BRW_TYPE_F);
2183 bld.ADD(off_y, offset(offs, bld, 1), brw_imm_f(0.5));
2184
2185 /* Process no more than two polygons at a time to avoid hitting
2186 * regioning restrictions.
2187 */
2188 const unsigned poly_width = shader->dispatch_width / shader->max_polygons;
2189
2190 for (unsigned i = 0; i < DIV_ROUND_UP(shader->max_polygons, 2); i++) {
2191 const fs_builder ibld = bld.group(MIN2(bld.dispatch_width(), 2 * poly_width), i);
2192
2193 /* Fetch needed parameters from the thread payload. */
2194 const unsigned bary_coef_reg = interpolation == INTERP_MODE_NOPERSPECTIVE ?
2195 payload.npc_bary_coef_reg : payload.pc_bary_coef_reg;
2196 const brw_reg start_x = devinfo->ver < 12 ? fetch_polygon_reg(ibld, 1, 1) :
2197 fetch_polygon_reg(ibld, bary_coef_reg,
2198 devinfo->ver >= 20 ? 6 : 2);
2199 const brw_reg start_y = devinfo->ver < 12 ? fetch_polygon_reg(ibld, 1, 6) :
2200 fetch_polygon_reg(ibld, bary_coef_reg,
2201 devinfo->ver >= 20 ? 7 : 6);
2202
2203 const brw_reg bary1_c0 = fetch_polygon_reg(ibld, bary_coef_reg,
2204 devinfo->ver >= 20 ? 2 : 3);
2205 const brw_reg bary1_cx = fetch_polygon_reg(ibld, bary_coef_reg, 1);
2206 const brw_reg bary1_cy = fetch_polygon_reg(ibld, bary_coef_reg, 0);
2207
2208 const brw_reg bary2_c0 = fetch_polygon_reg(ibld, bary_coef_reg,
2209 devinfo->ver >= 20 ? 5 : 7);
2210 const brw_reg bary2_cx = fetch_polygon_reg(ibld, bary_coef_reg,
2211 devinfo->ver >= 20 ? 4 : 5);
2212 const brw_reg bary2_cy = fetch_polygon_reg(ibld, bary_coef_reg,
2213 devinfo->ver >= 20 ? 3 : 4);
2214
2215 const brw_reg rhw_c0 = devinfo->ver >= 20 ?
2216 fetch_polygon_reg(ibld, payload.depth_w_coef_reg + 1, 5) :
2217 fetch_polygon_reg(ibld, payload.depth_w_coef_reg, 7);
2218 const brw_reg rhw_cx = devinfo->ver >= 20 ?
2219 fetch_polygon_reg(ibld, payload.depth_w_coef_reg + 1, 4) :
2220 fetch_polygon_reg(ibld, payload.depth_w_coef_reg, 5);
2221 const brw_reg rhw_cy = devinfo->ver >= 20 ?
2222 fetch_polygon_reg(ibld, payload.depth_w_coef_reg + 1, 3) :
2223 fetch_polygon_reg(ibld, payload.depth_w_coef_reg, 4);
2224
2225 /* Compute X/Y coordinate deltas relative to the origin of the polygon. */
2226 const brw_reg delta_x = ibld.vgrf(BRW_TYPE_F);
2227 ibld.ADD(delta_x, offset(shader->pixel_x, ibld, i), negate(start_x));
2228 ibld.ADD(delta_x, delta_x, offset(off_x, ibld, i));
2229
2230 const brw_reg delta_y = ibld.vgrf(BRW_TYPE_F);
2231 ibld.ADD(delta_y, offset(shader->pixel_y, ibld, i), negate(start_y));
2232 ibld.ADD(delta_y, delta_y, offset(off_y, ibld, i));
2233
2234 /* Evaluate the plane equations obtained above for the
2235 * barycentrics and RHW coordinate at the offset specified for
2236 * each channel. Limit arithmetic to acc_width in order to
2237 * allow the accumulator to be used for linear interpolation.
2238 */
2239 const unsigned acc_width = 16 * reg_unit(devinfo);
2240 const brw_reg rhw = ibld.vgrf(BRW_TYPE_F);
2241 const brw_reg bary1 = ibld.vgrf(BRW_TYPE_F);
2242 const brw_reg bary2 = ibld.vgrf(BRW_TYPE_F);
2243
2244 for (unsigned j = 0; j < DIV_ROUND_UP(ibld.dispatch_width(), acc_width); j++) {
2245 const fs_builder jbld = ibld.group(MIN2(ibld.dispatch_width(), acc_width), j);
2246 const brw_reg acc = suboffset(brw_acc_reg(16), jbld.group() % acc_width);
2247
2248 if (interpolation != INTERP_MODE_NOPERSPECTIVE) {
2249 jbld.MAD(acc, horiz_offset(rhw_c0, acc_width * j),
2250 horiz_offset(rhw_cx, acc_width * j), offset(delta_x, jbld, j));
2251 jbld.MAC(offset(rhw, jbld, j),
2252 horiz_offset(rhw_cy, acc_width * j), offset(delta_y, jbld, j));
2253 }
2254
2255 jbld.MAD(acc, horiz_offset(bary1_c0, acc_width * j),
2256 horiz_offset(bary1_cx, acc_width * j), offset(delta_x, jbld, j));
2257 jbld.MAC(offset(bary1, jbld, j),
2258 horiz_offset(bary1_cy, acc_width * j), offset(delta_y, jbld, j));
2259
2260 jbld.MAD(acc, horiz_offset(bary2_c0, acc_width * j),
2261 horiz_offset(bary2_cx, acc_width * j), offset(delta_x, jbld, j));
2262 jbld.MAC(offset(bary2, jbld, j),
2263 horiz_offset(bary2_cy, acc_width * j), offset(delta_y, jbld, j));
2264 }
2265
2266 /* Scale the results dividing by the interpolated RHW coordinate
2267 * if the interpolation is required to be perspective-correct.
2268 */
2269 if (interpolation == INTERP_MODE_NOPERSPECTIVE) {
2270 ibld.MOV(offset(dst, ibld, i), bary1);
2271 ibld.MOV(offset(offset(dst, bld, 1), ibld, i), bary2);
2272 } else {
2273 const brw_reg w = ibld.vgrf(BRW_TYPE_F);
2274 ibld.emit(SHADER_OPCODE_RCP, w, rhw);
2275 ibld.MUL(offset(dst, ibld, i), bary1, w);
2276 ibld.MUL(offset(offset(dst, bld, 1), ibld, i), bary2, w);
2277 }
2278 }
2279 }
2280
2281 /**
2282 * Interpolate per-polygon barycentrics at a specified sample index,
2283 * optionally using perspective-correct interpolation if requested.
2284  * This is mostly useful as a replacement for the PI shared function
2285 * that existed on platforms prior to Xe2, but is expected to work on
2286 * earlier platforms since we can get the required polygon setup
2287 * information from the thread payload as far back as ICL.
2288 */
2289 static void
2290 emit_pixel_interpolater_alu_at_sample(const fs_builder &bld,
2291 const brw_reg &dst,
2292 const brw_reg &idx,
2293 glsl_interp_mode interpolation)
2294 {
2295 const fs_thread_payload &payload = bld.shader->fs_payload();
2296 const struct brw_wm_prog_data *wm_prog_data =
2297 brw_wm_prog_data(bld.shader->prog_data);
2298 const fs_builder ubld = bld.exec_all().group(16, 0);
2299 const brw_reg sample_offs_xy = ubld.vgrf(BRW_TYPE_UD);
2300 assert(wm_prog_data->uses_sample_offsets);
2301
2302 /* Interleave the X/Y coordinates of each sample in order to allow
2303 * a single indirect look-up, by using a MOV for the 16 X
2304 * coordinates, then another MOV for the 16 Y coordinates.
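 *
 * After the two MOVs, DWord n of sample_offs_xy holds sample n's X offset
 * in its low word and its Y offset in its high word, which is the layout
 * the MOV_INDIRECT below expects.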
2305 */
2306 for (unsigned i = 0; i < 2; i++) {
2307 const brw_reg reg = retype(brw_vec16_grf(payload.sample_offsets_reg, 4 * i),
2308 BRW_TYPE_UB);
2309 ubld.MOV(subscript(sample_offs_xy, BRW_TYPE_UW, i), reg);
2310 }
2311
2312 /* Use indirect addressing to fetch the X/Y offsets of the sample
2313 * index provided for each channel.
2314 */
2315 const brw_reg idx_b = bld.vgrf(BRW_TYPE_UD);
2316 bld.MUL(idx_b, idx, brw_imm_ud(brw_type_size_bytes(BRW_TYPE_UD)));
2317
2318 const brw_reg off_xy = bld.vgrf(BRW_TYPE_UD);
2319 bld.emit(SHADER_OPCODE_MOV_INDIRECT, off_xy, component(sample_offs_xy, 0),
2320 idx_b, brw_imm_ud(16 * brw_type_size_bytes(BRW_TYPE_UD)));
2321
2322 /* Convert the selected fixed-point offsets to floating-point
2323 * offsets.
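 *
 * Each stored offset is in 1/16-pixel units, so the MUL by 0.0625
 * converts it to pixels and the ADD of -0.5 re-expresses it relative to
 * the pixel center (cancelling the +0.5 added back in
 * emit_pixel_interpolater_alu_at_offset); e.g. a stored value of 8 maps
 * to 0.0.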
2324 */
2325 const brw_reg offs = bld.vgrf(BRW_TYPE_F, 2);
2326
2327 for (unsigned i = 0; i < 2; i++) {
2328 const brw_reg tmp = bld.vgrf(BRW_TYPE_F);
2329 bld.MOV(tmp, subscript(off_xy, BRW_TYPE_UW, i));
2330 bld.MUL(tmp, tmp, brw_imm_f(0.0625));
2331 bld.ADD(offset(offs, bld, i), tmp, brw_imm_f(-0.5));
2332 }
2333
2334 /* Interpolate at the resulting offsets. */
2335 emit_pixel_interpolater_alu_at_offset(bld, dst, offs, interpolation);
2336 }
2337
2338 /**
2339 * Computes 1 << x, given a D/UD register containing some value x.
2340 */
2341 static brw_reg
2342 intexp2(const fs_builder &bld, const brw_reg &x)
2343 {
2344 assert(x.type == BRW_TYPE_UD || x.type == BRW_TYPE_D);
2345
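   /* Materialize the 1 in a register first: the EU only allows an
    * immediate in the last source of a two-source instruction, so SHL
    * cannot take an immediate directly as src0.
    */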
2346 return bld.SHL(bld.MOV(retype(brw_imm_d(1), x.type)), x);
2347 }
2348
2349 static void
2350 emit_gs_end_primitive(nir_to_brw_state &ntb, const nir_src &vertex_count_nir_src)
2351 {
2352 fs_visitor &s = ntb.s;
2353 assert(s.stage == MESA_SHADER_GEOMETRY);
2354
2355 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data);
2356
2357 if (s.gs_compile->control_data_header_size_bits == 0)
2358 return;
2359
2360 /* We can only do EndPrimitive() functionality when the control data
2361 * consists of cut bits. Fortunately, the only time it isn't is when the
2362 * output type is points, in which case EndPrimitive() is a no-op.
2363 */
2364 if (gs_prog_data->control_data_format !=
2365 GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
2366 return;
2367 }
2368
2369 /* Cut bits use one bit per vertex. */
2370 assert(s.gs_compile->control_data_bits_per_vertex == 1);
2371
2372 brw_reg vertex_count = get_nir_src(ntb, vertex_count_nir_src);
2373 vertex_count.type = BRW_TYPE_UD;
2374
2375 /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
2376 * vertex n, 0 otherwise. So all we need to do here is mark bit
2377 * (vertex_count - 1) % 32 in the cut_bits register to indicate that
2378 * EndPrimitive() was called after emitting vertex (vertex_count - 1);
2379 * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
2380 *
2381 * Note that if EndPrimitive() is called before emitting any vertices, this
2382 * will cause us to set bit 31 of the control_data_bits register to 1.
2383 * That's fine because:
2384 *
2385 * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
2386 * output, so the hardware will ignore cut bit 31.
2387 *
2388 * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
2389 * last vertex, so setting cut bit 31 has no effect (since the primitive
2390 * is automatically ended when the GS terminates).
2391 *
2392 * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
2393 * control_data_bits register to 0 when the first vertex is emitted.
2394 */
2395
2396 const fs_builder abld = ntb.bld.annotate("end primitive");
2397
2398 /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
2399 brw_reg prev_count = abld.ADD(vertex_count, brw_imm_ud(0xffffffffu));
2400 brw_reg mask = intexp2(abld, prev_count);
2401 /* Note: we're relying on the fact that the GEN SHL instruction only pays
2402 * attention to the lower 5 bits of its second source argument, so on this
2403 * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
2404 * ((vertex_count - 1) % 32).
2405 */
2406 abld.OR(s.control_data_bits, s.control_data_bits, mask);
2407 }
2408
2409 brw_reg
2410 fs_visitor::gs_urb_per_slot_dword_index(const brw_reg &vertex_count)
2411 {
2412 /* We use a single UD register to accumulate control data bits (32 bits
2413 * for each of the SIMD8 channels). So we need to write a DWord (32 bits)
2414 * at a time.
2415 *
2416 * On platforms < Xe2:
2417  * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord)
2418  * offsets. We have to select a 128-bit group via the Global and Per-Slot
2419 * Offsets, then use the Channel Mask phase to enable/disable which DWord
2420 * within that group to write. (Remember, different SIMD8 channels may
2421 * have emitted different numbers of vertices, so we may need per-slot
2422 * offsets.)
2423 *
2424 * Channel masking presents an annoying problem: we may have to replicate
2425 * the data up to 4 times:
2426 *
2427 * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data,
2428 * Data.
2429 *
2430 * To avoid penalizing shaders that emit a small number of vertices, we
2431 * can avoid these sometimes: if the size of the control data header is
2432  * <= 128 bits, then there is only 1 OWord. All SIMD8 channels will
2433  * land in the same 128-bit group, so we can skip per-slot offsets.
2434 *
2435 * Similarly, if the control data header is <= 32 bits, there is only one
2436 * DWord, so we can skip channel masks.
2437 */
2438 const fs_builder bld = fs_builder(this).at_end();
2439 const fs_builder abld = bld.annotate("urb per slot offset");
2440
2441 /* Figure out which DWord we're trying to write to using the formula:
2442 *
2443 * dword_index = (vertex_count - 1) * bits_per_vertex / 32
2444 *
2445 * Since bits_per_vertex is a power of two, and is known at compile
2446 * time, this can be optimized to:
2447 *
2448  *    dword_index = (vertex_count - 1) >> (5 - log2(bits_per_vertex))
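 *
 * (The SHR below shifts by 6 - util_last_bit(bits_per_vertex), and
 * util_last_bit of a power of two is log2 + 1, so the two agree.)
 *
 * For example, with bits_per_vertex == 2 and vertex_count == 40, the
 * target DWord is 39 * 2 / 32 == 2, and 39 >> 4 == 2.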
2449 */
2450 brw_reg prev_count = abld.ADD(vertex_count, brw_imm_ud(0xffffffffu));
2451 unsigned log2_bits_per_vertex =
2452 util_last_bit(gs_compile->control_data_bits_per_vertex);
2453 return abld.SHR(prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
2454 }
2455
2456 brw_reg
2457 fs_visitor::gs_urb_channel_mask(const brw_reg &dword_index)
2458 {
2459 brw_reg channel_mask;
2460
2461 /* Xe2+ can do URB writes with a byte offset, so we don't need to
2462 * construct a channel mask.
2463 */
2464 if (devinfo->ver >= 20)
2465 return channel_mask;
2466
2467 /* Channel masking presents an annoying problem: we may have to replicate
2468 * the data up to 4 times:
2469 *
2470 * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
2471 *
2472 * To avoid penalizing shaders that emit a small number of vertices, we
2473 * can avoid these sometimes: if the size of the control data header is
2474  * <= 128 bits, then there is only 1 OWord. All SIMD8 channels will
2475  * land in the same 128-bit group, so we can skip per-slot offsets.
2476 *
2477 * Similarly, if the control data header is <= 32 bits, there is only one
2478 * DWord, so we can skip channel masks.
2479 */
2480 if (gs_compile->control_data_header_size_bits <= 32)
2481 return channel_mask;
2482
2483 const fs_builder bld = fs_builder(this).at_end();
2484 const fs_builder ubld = bld.exec_all();
2485
2486 /* Set the channel masks to 1 << (dword_index % 4), so that we'll
2487 * write to the appropriate DWORD within the OWORD.
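 *
 * For example, dword_index == 6 selects channel 2, and after the shift
 * to bits 23:16 the resulting mask value is (1 << 2) << 16 == 0x40000.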
2488 */
2489 brw_reg channel = ubld.AND(dword_index, brw_imm_ud(3u));
2490 /* Then the channel masks need to be in bits 23:16. */
2491 return ubld.SHL(intexp2(ubld, channel), brw_imm_ud(16u));
2492 }
2493
2494 void
2495 fs_visitor::emit_gs_control_data_bits(const brw_reg &vertex_count)
2496 {
2497 assert(stage == MESA_SHADER_GEOMETRY);
2498 assert(gs_compile->control_data_bits_per_vertex != 0);
2499
2500 const struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2501
2502 const fs_builder bld = fs_builder(this).at_end();
2503 const fs_builder abld = bld.annotate("emit control data bits");
2504
2505 brw_reg dword_index = gs_urb_per_slot_dword_index(vertex_count);
2506 brw_reg channel_mask = gs_urb_channel_mask(dword_index);
2507 brw_reg per_slot_offset;
2508
2509 const unsigned max_control_data_header_size_bits =
2510 devinfo->ver >= 20 ? 32 : 128;
2511
2512 if (gs_compile->control_data_header_size_bits > max_control_data_header_size_bits) {
2513 /* Convert dword_index to bytes on Xe2+ since LSC can operate on byte
2514 * offset granularity.
2515 */
2516 if (devinfo->ver >= 20) {
2517 per_slot_offset = abld.SHL(dword_index, brw_imm_ud(2u));
2518 } else {
2519 /* Set the per-slot offset to dword_index / 4, so that we'll write to
2520 * the appropriate OWord within the control data header.
2521 */
2522 per_slot_offset = abld.SHR(dword_index, brw_imm_ud(2u));
2523 }
2524 }
2525
2526 /* If there are channel masks, add 3 extra copies of the data. */
2527 const unsigned length = 1 + 3 * unsigned(channel_mask.file != BAD_FILE);
2528 assert(length <= 4);
2529 brw_reg sources[4];
2530
2531 for (unsigned i = 0; i < length; i++)
2532 sources[i] = this->control_data_bits;
2533
2534 brw_reg srcs[URB_LOGICAL_NUM_SRCS];
2535 srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
2536 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offset;
2537 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = channel_mask;
2538 srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(BRW_TYPE_F, length);
2539 srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length);
2540 abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
2541
2542 fs_inst *inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
2543 srcs, ARRAY_SIZE(srcs));
2544
2545 /* We need to increment Global Offset by 256-bits to make room for
2546 * Broadwell's extra "Vertex Count" payload at the beginning of the
2547 * URB entry. Since this is an OWord message, Global Offset is counted
2548 * in 128-bit units, so we must set it to 2.
2549 */
2550 if (gs_prog_data->static_vertex_count == -1)
2551 inst->offset = 2;
2552 }
2553
2554 static void
2555 set_gs_stream_control_data_bits(nir_to_brw_state &ntb, const brw_reg &vertex_count,
2556 unsigned stream_id)
2557 {
2558 fs_visitor &s = ntb.s;
2559
2560 /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
2561
2562 /* Note: this is called *before* vertex_count is increased, so the
2563  * value passed in here equals (vertex_count - 1) in the formula above.
2564  */
2565
2566 /* Stream mode uses 2 bits per vertex */
2567 assert(s.gs_compile->control_data_bits_per_vertex == 2);
2568
2569 /* Must be a valid stream */
2570 assert(stream_id < 4); /* MAX_VERTEX_STREAMS */
2571
2572 /* Control data bits are initialized to 0 so we don't have to set any
2573 * bits when sending vertices to stream 0.
2574 */
2575 if (stream_id == 0)
2576 return;
2577
2578 const fs_builder abld = ntb.bld.annotate("set stream control data bits");
2579
2580 /* reg::sid = stream_id */
2581 brw_reg sid = abld.MOV(brw_imm_ud(stream_id));
2582
2583 /* reg:shift_count = 2 * (vertex_count - 1) */
2584 brw_reg shift_count = abld.SHL(vertex_count, brw_imm_ud(1u));
2585
2586 /* Note: we're relying on the fact that the GEN SHL instruction only pays
2587 * attention to the lower 5 bits of its second source argument, so on this
2588 * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
2589 * stream_id << ((2 * (vertex_count - 1)) % 32).
2590 */
2591 brw_reg mask = abld.SHL(sid, shift_count);
2592 abld.OR(s.control_data_bits, s.control_data_bits, mask);
2593 }
2594
2595 static void
2596 emit_gs_vertex(nir_to_brw_state &ntb, const nir_src &vertex_count_nir_src,
2597 unsigned stream_id)
2598 {
2599 fs_visitor &s = ntb.s;
2600
2601 assert(s.stage == MESA_SHADER_GEOMETRY);
2602
2603 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data);
2604
2605 brw_reg vertex_count = get_nir_src(ntb, vertex_count_nir_src);
2606 vertex_count.type = BRW_TYPE_UD;
2607
2608 /* Haswell and later hardware ignores the "Render Stream Select" bits
2609 * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
2610 * and instead sends all primitives down the pipeline for rasterization.
2611 * If the SOL stage is enabled, "Render Stream Select" is honored and
2612 * primitives bound to non-zero streams are discarded after stream output.
2613 *
2614  * Since the only purpose of primitives sent to non-zero streams is to
2615 * be recorded by transform feedback, we can simply discard all geometry
2616 * bound to these streams when transform feedback is disabled.
2617 */
2618 if (stream_id > 0 && !s.nir->info.has_transform_feedback_varyings)
2619 return;
2620
2621 /* If we're outputting 32 control data bits or less, then we can wait
2622 * until the shader is over to output them all. Otherwise we need to
2623 * output them as we go. Now is the time to do it, since we're about to
2624 * output the vertex_count'th vertex, so it's guaranteed that the
2625 * control data bits associated with the (vertex_count - 1)th vertex are
2626 * correct.
2627 */
2628 if (s.gs_compile->control_data_header_size_bits > 32) {
2629 const fs_builder abld =
2630 ntb.bld.annotate("emit vertex: emit control data bits");
2631
2632 /* Only emit control data bits if we've finished accumulating a batch
2633 * of 32 bits. This is the case when:
2634 *
2635 * (vertex_count * bits_per_vertex) % 32 == 0
2636 *
2637 * (in other words, when the last 5 bits of vertex_count *
2638 * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some
2639 * integer n (which is always the case, since bits_per_vertex is
2640 * always 1 or 2), this is equivalent to requiring that the last 5-n
2641 * bits of vertex_count are 0:
2642 *
2643 * vertex_count & (2^(5-n) - 1) == 0
2644 *
2645 * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
2646 * equivalent to:
2647 *
2648 * vertex_count & (32 / bits_per_vertex - 1) == 0
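 *
 * For example, with bits_per_vertex == 2 the test below is
 * vertex_count & 15, so the accumulated bits are flushed once every 16
 * vertices.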
2649 *
2650 * TODO: If vertex_count is an immediate, we could do some of this math
2651 * at compile time...
2652 */
2653 fs_inst *inst =
2654 abld.AND(ntb.bld.null_reg_d(), vertex_count,
2655 brw_imm_ud(32u / s.gs_compile->control_data_bits_per_vertex - 1u));
2656 inst->conditional_mod = BRW_CONDITIONAL_Z;
2657
2658 abld.IF(BRW_PREDICATE_NORMAL);
2659 /* If vertex_count is 0, then no control data bits have been
2660 * accumulated yet, so we can skip emitting them.
2661 */
2662 abld.CMP(ntb.bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
2663 BRW_CONDITIONAL_NEQ);
2664 abld.IF(BRW_PREDICATE_NORMAL);
2665 s.emit_gs_control_data_bits(vertex_count);
2666 abld.emit(BRW_OPCODE_ENDIF);
2667
2668 /* Reset control_data_bits to 0 so we can start accumulating a new
2669 * batch.
2670 *
2671 * Note: in the case where vertex_count == 0, this neutralizes the
2672 * effect of any call to EndPrimitive() that the shader may have
2673 * made before outputting its first vertex.
2674 */
2675 abld.exec_all().MOV(s.control_data_bits, brw_imm_ud(0u));
2676 abld.emit(BRW_OPCODE_ENDIF);
2677 }
2678
2679 s.emit_urb_writes(vertex_count);
2680
2681 /* In stream mode we have to set control data bits for all vertices
2682  * unless we have disabled control data bits completely (which we
2683 * do for MESA_PRIM_POINTS outputs that don't use streams).
2684 */
2685 if (s.gs_compile->control_data_header_size_bits > 0 &&
2686 gs_prog_data->control_data_format ==
2687 GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
2688 set_gs_stream_control_data_bits(ntb, vertex_count, stream_id);
2689 }
2690 }
2691
2692 static void
2693 brw_combine_with_vec(const fs_builder &bld, const brw_reg &dst,
2694 const brw_reg &src, unsigned n)
2695 {
2696 assert(n <= NIR_MAX_VEC_COMPONENTS);
2697 brw_reg comps[NIR_MAX_VEC_COMPONENTS];
2698 for (unsigned i = 0; i < n; i++)
2699 comps[i] = offset(src, bld, i);
2700 bld.VEC(dst, comps, n);
2701 }
2702
2703 static void
2704 emit_gs_input_load(nir_to_brw_state &ntb, const brw_reg &dst,
2705 const nir_src &vertex_src,
2706 unsigned base_offset,
2707 const nir_src &offset_src,
2708 unsigned num_components,
2709 unsigned first_component)
2710 {
2711 const fs_builder &bld = ntb.bld;
2712 const struct intel_device_info *devinfo = ntb.devinfo;
2713
2714 fs_visitor &s = ntb.s;
2715
2716 assert(brw_type_size_bytes(dst.type) == 4);
2717 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data);
2718 const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
2719
2720 /* TODO: figure out push input layout for invocations == 1 */
2721 if (gs_prog_data->invocations == 1 &&
2722 nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) &&
2723 4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) {
2724 int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 +
2725 nir_src_as_uint(vertex_src) * push_reg_count;
2726
2727 const brw_reg attr = offset(brw_attr_reg(0, dst.type), bld,
2728 first_component + imm_offset);
2729 brw_combine_with_vec(bld, dst, attr, num_components);
2730 return;
2731 }
2732
2733 /* Resort to the pull model. Ensure the VUE handles are provided. */
2734 assert(gs_prog_data->base.include_vue_handles);
2735
2736 brw_reg start = s.gs_payload().icp_handle_start;
2737 brw_reg icp_handle = ntb.bld.vgrf(BRW_TYPE_UD);
2738 const unsigned grf_size_bytes = REG_SIZE * reg_unit(devinfo);
2739
2740 if (gs_prog_data->invocations == 1) {
2741 if (nir_src_is_const(vertex_src)) {
2742 /* The vertex index is constant; just select the proper URB handle. */
2743 icp_handle =
2744 byte_offset(start, nir_src_as_uint(vertex_src) * grf_size_bytes);
2745 } else {
2746 /* The vertex index is non-constant. We need to use indirect
2747 * addressing to fetch the proper URB handle.
2748 *
2749 * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
2750 * indicating that channel <n> should read the handle from
2751 * DWord <n>. We convert that to bytes by multiplying by 4.
2752 *
2753 * Next, we convert the vertex index to bytes by multiplying
2754 * by 32/64 (shifting by 5/6), and add the two together. This is
2755 * the final indirect byte offset.
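 *
 * For example, with 32-byte GRFs, vertex index 2 gives a base byte
 * offset of 2 << 5 == 64, and channel 3 adds 3 * 4 == 12, so that
 * channel reads its handle from byte 76 of the ICP handle block.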
2756 */
2757 brw_reg sequence = bld.LOAD_SUBGROUP_INVOCATION();
2758
2759 /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
2760 brw_reg channel_offsets = bld.SHL(sequence, brw_imm_ud(2u));
2761 /* Convert vertex_index to bytes (multiply by 32/64) */
2762 assert(util_is_power_of_two_nonzero(grf_size_bytes)); /* for ffs() */
2763 brw_reg vertex_offset_bytes =
2764 bld.SHL(retype(get_nir_src(ntb, vertex_src), BRW_TYPE_UD),
2765 brw_imm_ud(ffs(grf_size_bytes) - 1));
2766 brw_reg icp_offset_bytes =
2767 bld.ADD(vertex_offset_bytes, channel_offsets);
2768
2769 /* Use the start of the ICP handles as the base offset. There is one register
2770 * of URB handles per vertex, so inform the register allocator that
2771 * we might read up to nir->info.gs.vertices_in registers.
2772 */
2773 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
2774 brw_reg(icp_offset_bytes),
2775 brw_imm_ud(s.nir->info.gs.vertices_in * grf_size_bytes));
2776 }
2777 } else {
2778 assert(gs_prog_data->invocations > 1);
2779
2780 if (nir_src_is_const(vertex_src)) {
2781 unsigned vertex = nir_src_as_uint(vertex_src);
2782 bld.MOV(icp_handle, component(start, vertex));
2783 } else {
2784 /* The vertex index is non-constant. We need to use indirect
2785 * addressing to fetch the proper URB handle.
2786 *
2787 * Convert vertex_index to bytes (multiply by 4)
2788 */
2789 brw_reg icp_offset_bytes =
2790 bld.SHL(retype(get_nir_src(ntb, vertex_src), BRW_TYPE_UD),
2791 brw_imm_ud(2u));
2792
2793 /* Use the start of the ICP handles as the base offset. There is one DWord
2794 * of URB handles per vertex, so inform the register allocator that
2795 * we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
2796 */
2797 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
2798 brw_reg(icp_offset_bytes),
2799 brw_imm_ud(DIV_ROUND_UP(s.nir->info.gs.vertices_in, 8) *
2800 grf_size_bytes));
2801 }
2802 }
2803
2804 fs_inst *inst;
2805 brw_reg indirect_offset = get_nir_src(ntb, offset_src);
2806
2807 if (nir_src_is_const(offset_src)) {
2808 brw_reg srcs[URB_LOGICAL_NUM_SRCS];
2809 srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2810
2811 /* Constant indexing - use global offset. */
2812 if (first_component != 0) {
2813 unsigned read_components = num_components + first_component;
2814 brw_reg tmp = bld.vgrf(dst.type, read_components);
2815 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs,
2816 ARRAY_SIZE(srcs));
2817 inst->size_written = read_components *
2818 tmp.component_size(inst->exec_size);
2819 brw_combine_with_vec(bld, dst, offset(tmp, bld, first_component),
2820 num_components);
2821 } else {
2822 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs,
2823 ARRAY_SIZE(srcs));
2824 inst->size_written = num_components *
2825 dst.component_size(inst->exec_size);
2826 }
2827 inst->offset = base_offset + nir_src_as_uint(offset_src);
2828 } else {
2829 /* Indirect indexing - use per-slot offsets as well. */
2830 unsigned read_components = num_components + first_component;
2831 brw_reg tmp = bld.vgrf(dst.type, read_components);
2832
2833 /* Convert oword offset to bytes on Xe2+ */
2834 if (devinfo->ver >= 20)
2835 indirect_offset = bld.SHL(indirect_offset, brw_imm_ud(4u));
2836
2837 brw_reg srcs[URB_LOGICAL_NUM_SRCS];
2838 srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2839 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
2840
2841 if (first_component != 0) {
2842 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2843 srcs, ARRAY_SIZE(srcs));
2844 inst->size_written = read_components *
2845 tmp.component_size(inst->exec_size);
2846 brw_combine_with_vec(bld, dst, offset(tmp, bld, first_component),
2847 num_components);
2848 } else {
2849 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst,
2850 srcs, ARRAY_SIZE(srcs));
2851 inst->size_written = num_components *
2852 dst.component_size(inst->exec_size);
2853 }
2854 inst->offset = base_offset;
2855 }
2856 }
2857
2858 static brw_reg
2859 get_indirect_offset(nir_to_brw_state &ntb, nir_intrinsic_instr *instr)
2860 {
2861 const intel_device_info *devinfo = ntb.devinfo;
2862 nir_src *offset_src = nir_get_io_offset_src(instr);
2863
2864 if (nir_src_is_const(*offset_src)) {
2865 /* The only constant offset we should find is 0. brw_nir.c's
2866 * add_const_offset_to_base() will fold other constant offsets
2867 * into the "base" index.
2868 */
2869 assert(nir_src_as_uint(*offset_src) == 0);
2870 return brw_reg();
2871 }
2872
2873 brw_reg offset = get_nir_src(ntb, *offset_src);
2874
2875 if (devinfo->ver < 20)
2876 return offset;
2877
2878 /* Convert OWords (16 bytes) to bytes */
2879 return ntb.bld.SHL(retype(offset, BRW_TYPE_UD), brw_imm_ud(4u));
2880 }
2881
2882 static void
2883 fs_nir_emit_vs_intrinsic(nir_to_brw_state &ntb,
2884 nir_intrinsic_instr *instr)
2885 {
2886 const fs_builder &bld = ntb.bld;
2887 fs_visitor &s = ntb.s;
2888 assert(s.stage == MESA_SHADER_VERTEX);
2889
2890 brw_reg dest;
2891 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2892 dest = get_nir_def(ntb, instr->def);
2893
2894 switch (instr->intrinsic) {
2895 case nir_intrinsic_load_vertex_id:
2896 case nir_intrinsic_load_base_vertex:
2897 unreachable("should be lowered by nir_lower_system_values()");
2898
2899 case nir_intrinsic_load_input: {
2900 assert(instr->def.bit_size == 32);
2901 const brw_reg src = offset(brw_attr_reg(0, dest.type), bld,
2902 nir_intrinsic_base(instr) * 4 +
2903 nir_intrinsic_component(instr) +
2904 nir_src_as_uint(instr->src[0]));
2905 brw_combine_with_vec(bld, dest, src, instr->num_components);
2906 break;
2907 }
2908
2909 case nir_intrinsic_load_vertex_id_zero_base:
2910 case nir_intrinsic_load_instance_id:
2911 case nir_intrinsic_load_base_instance:
2912 case nir_intrinsic_load_draw_id:
2913 case nir_intrinsic_load_first_vertex:
2914 case nir_intrinsic_load_is_indexed_draw:
2915 unreachable("lowered by brw_nir_lower_vs_inputs");
2916
2917 default:
2918 fs_nir_emit_intrinsic(ntb, bld, instr);
2919 break;
2920 }
2921 }
2922
2923 static brw_reg
2924 get_tcs_single_patch_icp_handle(nir_to_brw_state &ntb, const fs_builder &bld,
2925 nir_intrinsic_instr *instr)
2926 {
2927 fs_visitor &s = ntb.s;
2928
2929 struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(s.prog_data);
2930 const nir_src &vertex_src = instr->src[0];
2931 nir_intrinsic_instr *vertex_intrin = nir_src_as_intrinsic(vertex_src);
2932
2933 const brw_reg start = s.tcs_payload().icp_handle_start;
2934
2935 brw_reg icp_handle;
2936
2937 if (nir_src_is_const(vertex_src)) {
2938 /* Emit a MOV to resolve <0,1,0> regioning. */
2939 unsigned vertex = nir_src_as_uint(vertex_src);
2940 icp_handle = bld.MOV(component(start, vertex));
2941 } else if (tcs_prog_data->instances == 1 && vertex_intrin &&
2942 vertex_intrin->intrinsic == nir_intrinsic_load_invocation_id) {
2943 /* For the common case of only 1 instance, an array index of
2944 * gl_InvocationID means reading the handles from the start. Skip all
2945 * the indirect work.
2946 */
2947 icp_handle = start;
2948 } else {
2949 /* The vertex index is non-constant. We need to use indirect
2950 * addressing to fetch the proper URB handle.
2951 */
2952 icp_handle = bld.vgrf(BRW_TYPE_UD);
2953
2954 /* Each ICP handle is a single DWord (4 bytes) */
2955 brw_reg vertex_offset_bytes =
2956 bld.SHL(retype(get_nir_src(ntb, vertex_src), BRW_TYPE_UD),
2957 brw_imm_ud(2u));
2958
2959 /* We might read up to 4 registers. */
2960 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2961 start, vertex_offset_bytes,
2962 brw_imm_ud(4 * REG_SIZE));
2963 }
2964
2965 return icp_handle;
2966 }
2967
2968 static brw_reg
2969 get_tcs_multi_patch_icp_handle(nir_to_brw_state &ntb, const fs_builder &bld,
2970 nir_intrinsic_instr *instr)
2971 {
2972 fs_visitor &s = ntb.s;
2973 const intel_device_info *devinfo = s.devinfo;
2974
2975 struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) s.key;
2976 const nir_src &vertex_src = instr->src[0];
2977 const unsigned grf_size_bytes = REG_SIZE * reg_unit(devinfo);
2978
2979 const brw_reg start = s.tcs_payload().icp_handle_start;
2980
2981 if (nir_src_is_const(vertex_src))
2982 return byte_offset(start, nir_src_as_uint(vertex_src) * grf_size_bytes);
2983
2984 /* The vertex index is non-constant. We need to use indirect
2985 * addressing to fetch the proper URB handle.
2986 *
2987 * First, we start with the sequence indicating that channel <n>
2988 * should read the handle from DWord <n>. We convert that to bytes
2989 * by multiplying by 4.
2990 *
2991 * Next, we convert the vertex index to bytes by multiplying
2992 * by the GRF size (by shifting), and add the two together. This is
2993 * the final indirect byte offset.
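 *
 * On Xe2, for instance, reg_unit is 2, so grf_size_bytes is 64 and the
 * vertex index is shifted left by 6 rather than 5.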
2994 */
2995 brw_reg sequence = bld.LOAD_SUBGROUP_INVOCATION();
2996
2997 /* Offsets will be 0, 4, 8, ... */
2998 brw_reg channel_offsets = bld.SHL(sequence, brw_imm_ud(2u));
2999 /* Convert vertex_index to bytes (multiply by 32/64) */
3000 assert(util_is_power_of_two_nonzero(grf_size_bytes)); /* for ffs() */
3001 brw_reg vertex_offset_bytes =
3002 bld.SHL(retype(get_nir_src(ntb, vertex_src), BRW_TYPE_UD),
3003 brw_imm_ud(ffs(grf_size_bytes) - 1));
3004 brw_reg icp_offset_bytes =
3005 bld.ADD(vertex_offset_bytes, channel_offsets);
3006
3007 /* Use start of ICP handles as the base offset. There is one register
3008 * of URB handles per vertex, so inform the register allocator that
3009  * we might read up to brw_tcs_prog_key_input_vertices(tcs_key) registers.
3010 */
3011 brw_reg icp_handle = bld.vgrf(BRW_TYPE_UD);
3012 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
3013 icp_offset_bytes,
3014 brw_imm_ud(brw_tcs_prog_key_input_vertices(tcs_key) *
3015 grf_size_bytes));
3016
3017 return icp_handle;
3018 }
3019
3020 static void
3021 setup_barrier_message_payload_gfx125(const fs_builder &bld,
3022 const brw_reg &msg_payload)
3023 {
3024 const fs_builder ubld = bld.exec_all().group(1, 0);
3025 const struct intel_device_info *devinfo = bld.shader->devinfo;
3026 assert(devinfo->verx10 >= 125);
3027
3028 /* From BSpec: 54006, mov r0.2[31:24] into m0.2[31:24] and m0.2[23:16] */
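   /* The 2-wide MOV below reads r0 byte 11 (bits 31:24 of r0.2) with a
    * <0,1,0> region and writes payload bytes 10 and 11, i.e. bits 23:16
    * and 31:24 of m0.2, replicating that byte into both fields.
    */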
3029 brw_reg m0_10ub = horiz_offset(retype(msg_payload, BRW_TYPE_UB), 10);
3030 brw_reg r0_11ub =
3031 stride(suboffset(retype(brw_vec1_grf(0, 0), BRW_TYPE_UB), 11),
3032 0, 1, 0);
3033 ubld.group(2, 0).MOV(m0_10ub, r0_11ub);
3034
3035 if (devinfo->ver >= 20) {
3036 /* Use an active threads barrier. */
3037 const brw_reg m0_2ud = component(retype(msg_payload, BRW_TYPE_UD), 2);
3038 ubld.OR(m0_2ud, m0_2ud, brw_imm_ud(1u << 8));
3039 }
3040 }
3041
3042 static void
3043 emit_barrier(nir_to_brw_state &ntb)
3044 {
3045 const intel_device_info *devinfo = ntb.devinfo;
3046 const fs_builder &bld = ntb.bld;
3047 const fs_builder ubld = bld.exec_all();
3048 const fs_builder hbld = ubld.group(8 * reg_unit(devinfo), 0);
3049 fs_visitor &s = ntb.s;
3050
3051 /* We are getting the barrier ID from the compute shader header */
3052 assert(gl_shader_stage_uses_workgroup(s.stage));
3053
3054 /* Zero-initialize the payload */
3055 brw_reg payload = hbld.MOV(brw_imm_ud(0u));
3056
3057 if (devinfo->verx10 >= 125) {
3058 setup_barrier_message_payload_gfx125(bld, payload);
3059 } else {
3060 assert(gl_shader_stage_is_compute(s.stage));
3061
3062 brw_reg barrier_id_mask =
3063 brw_imm_ud(devinfo->ver == 9 ? 0x8f000000u : 0x7f000000u);
3064
3065 /* Copy the barrier id from r0.2 to the message payload reg.2 */
3066 brw_reg r0_2 = brw_reg(retype(brw_vec1_grf(0, 2), BRW_TYPE_UD));
3067 ubld.group(1, 0).AND(component(payload, 2), r0_2, barrier_id_mask);
3068 }
3069
3070 /* Emit a gateway "barrier" message using the payload we set up, followed
3071 * by a wait instruction.
3072 */
3073 ubld.emit(SHADER_OPCODE_BARRIER, reg_undef, payload);
3074 }
3075
3076 static void
3077 emit_tcs_barrier(nir_to_brw_state &ntb)
3078 {
3079 const intel_device_info *devinfo = ntb.devinfo;
3080 const fs_builder &bld = ntb.bld;
3081 fs_visitor &s = ntb.s;
3082
3083 assert(s.stage == MESA_SHADER_TESS_CTRL);
3084 struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(s.prog_data);
3085
3086 brw_reg m0 = bld.vgrf(BRW_TYPE_UD);
3087 brw_reg m0_2 = component(m0, 2);
3088
3089 const fs_builder chanbld = bld.exec_all().group(1, 0);
3090
3091 /* Zero the message header */
3092 bld.exec_all().MOV(m0, brw_imm_ud(0u));
3093
3094 if (devinfo->verx10 >= 125) {
3095 setup_barrier_message_payload_gfx125(bld, m0);
3096 } else if (devinfo->ver >= 11) {
3097 chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_TYPE_UD),
3098 brw_imm_ud(INTEL_MASK(30, 24)));
3099
3100 /* Set the Barrier Count and the enable bit */
3101 chanbld.OR(m0_2, m0_2,
3102 brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15)));
3103 } else {
3104 /* Copy "Barrier ID" from r0.2, bits 16:13 */
3105 chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_TYPE_UD),
3106 brw_imm_ud(INTEL_MASK(16, 13)));
3107
3108 /* Shift it up to bits 27:24. */
3109 chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));
3110
3111 /* Set the Barrier Count and the enable bit */
3112 chanbld.OR(m0_2, m0_2,
3113 brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
3114 }
3115
3116 bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
3117 }
3118
3119 static void
3120 fs_nir_emit_tcs_intrinsic(nir_to_brw_state &ntb,
3121 nir_intrinsic_instr *instr)
3122 {
3123 const intel_device_info *devinfo = ntb.devinfo;
3124 const fs_builder &bld = ntb.bld;
3125 fs_visitor &s = ntb.s;
3126
3127 assert(s.stage == MESA_SHADER_TESS_CTRL);
3128 struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(s.prog_data);
3129 struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
3130
3131 brw_reg dst;
3132 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3133 dst = get_nir_def(ntb, instr->def);
3134
3135 switch (instr->intrinsic) {
3136 case nir_intrinsic_load_primitive_id:
3137 bld.MOV(dst, s.tcs_payload().primitive_id);
3138 break;
3139 case nir_intrinsic_load_invocation_id:
3140 bld.MOV(retype(dst, s.invocation_id.type), s.invocation_id);
3141 break;
3142
3143 case nir_intrinsic_barrier:
3144 if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
3145 fs_nir_emit_intrinsic(ntb, bld, instr);
3146 if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
3147 if (tcs_prog_data->instances != 1)
3148 emit_tcs_barrier(ntb);
3149 }
3150 break;
3151
3152 case nir_intrinsic_load_input:
3153 unreachable("nir_lower_io should never give us these.");
3154 break;
3155
3156 case nir_intrinsic_load_per_vertex_input: {
3157 assert(instr->def.bit_size == 32);
3158 brw_reg indirect_offset = get_indirect_offset(ntb, instr);
3159 unsigned imm_offset = nir_intrinsic_base(instr);
3160 fs_inst *inst;
3161
3162 const bool multi_patch =
3163 vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH;
3164
3165 brw_reg icp_handle = multi_patch ?
3166 get_tcs_multi_patch_icp_handle(ntb, bld, instr) :
3167 get_tcs_single_patch_icp_handle(ntb, bld, instr);
3168
3169 /* We can only read two double components with each URB read, so
3170 * we send two read messages in that case, each one loading up to
3171 * two double components.
3172 */
3173 unsigned num_components = instr->num_components;
3174 unsigned first_component = nir_intrinsic_component(instr);
3175
3176 brw_reg srcs[URB_LOGICAL_NUM_SRCS];
3177 srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
3178
3179 if (indirect_offset.file == BAD_FILE) {
3180 /* Constant indexing - use global offset. */
3181 if (first_component != 0) {
3182 unsigned read_components = num_components + first_component;
3183 brw_reg tmp = bld.vgrf(dst.type, read_components);
3184 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs,
3185 ARRAY_SIZE(srcs));
3186 brw_combine_with_vec(bld, dst, offset(tmp, bld, first_component),
3187 num_components);
3188 } else {
3189 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs,
3190 ARRAY_SIZE(srcs));
3191 }
3192 inst->offset = imm_offset;
3193 } else {
3194 /* Indirect indexing - use per-slot offsets as well. */
3195 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
3196
3197 if (first_component != 0) {
3198 unsigned read_components = num_components + first_component;
3199 brw_reg tmp = bld.vgrf(dst.type, read_components);
3200 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3201 srcs, ARRAY_SIZE(srcs));
3202 brw_combine_with_vec(bld, dst, offset(tmp, bld, first_component),
3203 num_components);
3204 } else {
3205 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst,
3206 srcs, ARRAY_SIZE(srcs));
3207 }
3208 inst->offset = imm_offset;
3209 }
3210 inst->size_written = (num_components + first_component) *
3211 inst->dst.component_size(inst->exec_size);
3212
3213 /* Copy the temporary to the destination to deal with writemasking.
3214 *
3215 * Also attempt to deal with gl_PointSize being in the .w component.
3216 */
3217 if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
3218 assert(brw_type_size_bytes(dst.type) == 4);
3219 inst->dst = bld.vgrf(dst.type, 4);
3220 inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
3221 bld.MOV(dst, offset(inst->dst, bld, 3));
3222 }
3223 break;
3224 }
3225
3226 case nir_intrinsic_load_output:
3227 case nir_intrinsic_load_per_vertex_output: {
3228 assert(instr->def.bit_size == 32);
3229 brw_reg indirect_offset = get_indirect_offset(ntb, instr);
3230 unsigned imm_offset = nir_intrinsic_base(instr);
3231 unsigned first_component = nir_intrinsic_component(instr);
3232
3233 fs_inst *inst;
3234 if (indirect_offset.file == BAD_FILE) {
3235 /* This MOV replicates the output handle to all enabled channels
3236  * in SINGLE_PATCH mode.
3237 */
3238 brw_reg patch_handle = bld.MOV(s.tcs_payload().patch_urb_output);
3239
3240 {
3241 brw_reg srcs[URB_LOGICAL_NUM_SRCS];
3242 srcs[URB_LOGICAL_SRC_HANDLE] = patch_handle;
3243
3244 if (first_component != 0) {
3245 unsigned read_components =
3246 instr->num_components + first_component;
3247 brw_reg tmp = bld.vgrf(dst.type, read_components);
3248 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3249 srcs, ARRAY_SIZE(srcs));
3250 inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
3251 brw_combine_with_vec(bld, dst, offset(tmp, bld, first_component),
3252 instr->num_components);
3253 } else {
3254 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst,
3255 srcs, ARRAY_SIZE(srcs));
3256 inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
3257 }
3258 inst->offset = imm_offset;
3259 }
3260 } else {
3261 /* Indirect indexing - use per-slot offsets as well. */
3262 brw_reg srcs[URB_LOGICAL_NUM_SRCS];
3263 srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output;
3264 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
3265
3266 if (first_component != 0) {
3267 unsigned read_components =
3268 instr->num_components + first_component;
3269 brw_reg tmp = bld.vgrf(dst.type, read_components);
3270 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3271 srcs, ARRAY_SIZE(srcs));
3272 inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
3273 brw_combine_with_vec(bld, dst, offset(tmp, bld, first_component),
3274 instr->num_components);
3275 } else {
3276 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst,
3277 srcs, ARRAY_SIZE(srcs));
3278 inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
3279 }
3280 inst->offset = imm_offset;
3281 }
3282 break;
3283 }
3284
3285 case nir_intrinsic_store_output:
3286 case nir_intrinsic_store_per_vertex_output: {
3287 assert(nir_src_bit_size(instr->src[0]) == 32);
3288 brw_reg value = get_nir_src(ntb, instr->src[0], -1);
3289 brw_reg indirect_offset = get_indirect_offset(ntb, instr);
3290 unsigned imm_offset = nir_intrinsic_base(instr);
3291 unsigned mask = nir_intrinsic_write_mask(instr);
3292
3293 if (mask == 0)
3294 break;
3295
3296 unsigned num_components = util_last_bit(mask);
3297 unsigned first_component = nir_intrinsic_component(instr);
3298 assert((first_component + num_components) <= 4);
3299
3300 mask = mask << first_component;
3301
3302 const bool has_urb_lsc = devinfo->ver >= 20;
3303
3304 brw_reg mask_reg;
3305 if (mask != WRITEMASK_XYZW)
3306 mask_reg = brw_imm_ud(mask << 16);
3307
3308 brw_reg sources[4];
3309
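/* Gather the data payload. With LSC URB messages (Gfx20+) only the enabled
 * components are sent, packed densely starting at slot 0; on earlier
 * platforms the payload is positional, so masked-off components simply
 * leave their slots unwritten.
 */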
3310 unsigned m = has_urb_lsc ? 0 : first_component;
3311 for (unsigned i = 0; i < num_components; i++) {
3312 int c = i + first_component;
3313 if (mask & (1 << c)) {
3314 sources[m++] = offset(value, bld, i);
3315 } else if (devinfo->ver < 20) {
3316 m++;
3317 }
3318 }
3319
3320 assert(has_urb_lsc || m == (first_component + num_components));
3321
3322 brw_reg srcs[URB_LOGICAL_NUM_SRCS];
3323 srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output;
3324 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
3325 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask_reg;
3326 srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(BRW_TYPE_F, m);
3327 srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(m);
3328 bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, m, 0);
3329
3330 fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
3331 srcs, ARRAY_SIZE(srcs));
3332 inst->offset = imm_offset;
3333 break;
3334 }
3335
3336 default:
3337 fs_nir_emit_intrinsic(ntb, bld, instr);
3338 break;
3339 }
3340 }
3341
3342 static void
3343 fs_nir_emit_tes_intrinsic(nir_to_brw_state &ntb,
3344 nir_intrinsic_instr *instr)
3345 {
3346 const intel_device_info *devinfo = ntb.devinfo;
3347 const fs_builder &bld = ntb.bld;
3348 fs_visitor &s = ntb.s;
3349
3350 assert(s.stage == MESA_SHADER_TESS_EVAL);
3351 struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(s.prog_data);
3352
3353 brw_reg dest;
3354 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3355 dest = get_nir_def(ntb, instr->def);
3356
3357 switch (instr->intrinsic) {
3358 case nir_intrinsic_load_primitive_id:
3359 bld.MOV(dest, s.tes_payload().primitive_id);
3360 break;
3361
3362 case nir_intrinsic_load_tess_coord:
3363 for (unsigned i = 0; i < 3; i++)
3364 bld.MOV(offset(dest, bld, i), s.tes_payload().coords[i]);
3365 break;
3366
3367 case nir_intrinsic_load_input:
3368 case nir_intrinsic_load_per_vertex_input: {
3369 assert(instr->def.bit_size == 32);
3370 brw_reg indirect_offset = get_indirect_offset(ntb, instr);
3371 unsigned imm_offset = nir_intrinsic_base(instr);
3372 unsigned first_component = nir_intrinsic_component(instr);
3373
3374 fs_inst *inst;
3375 if (indirect_offset.file == BAD_FILE) {
3376 /* Arbitrarily only push up to 32 vec4 slots worth of data,
3377 * which is 16 registers (since each holds 2 vec4 slots).
3378 */
3379 const unsigned max_push_slots = 32;
3380 if (imm_offset < max_push_slots) {
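/* Each pushed vec4 URB slot occupies four consecutive attribute
 * components, hence the 4 * imm_offset addressing below.
 */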
3381 const brw_reg src = horiz_offset(brw_attr_reg(0, dest.type),
3382 4 * imm_offset + first_component);
3383 brw_reg comps[NIR_MAX_VEC_COMPONENTS];
3384 for (unsigned i = 0; i < instr->num_components; i++) {
3385 comps[i] = component(src, i);
3386 }
3387 bld.VEC(dest, comps, instr->num_components);
3388
3389 tes_prog_data->base.urb_read_length =
3390 MAX2(tes_prog_data->base.urb_read_length,
3391 (imm_offset / 2) + 1);
3392 } else {
3393 /* Replicate the patch handle to all enabled channels */
3394 brw_reg srcs[URB_LOGICAL_NUM_SRCS];
3395 srcs[URB_LOGICAL_SRC_HANDLE] = s.tes_payload().patch_urb_input;
3396
3397 if (first_component != 0) {
3398 unsigned read_components =
3399 instr->num_components + first_component;
3400 brw_reg tmp = bld.vgrf(dest.type, read_components);
3401 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3402 srcs, ARRAY_SIZE(srcs));
3403 inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
3404 brw_combine_with_vec(bld, dest, offset(tmp, bld, first_component),
3405 instr->num_components);
3406 } else {
3407 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dest,
3408 srcs, ARRAY_SIZE(srcs));
3409 inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
3410 }
3411 inst->offset = imm_offset;
3412 }
3413 } else {
3414 /* Indirect indexing - use per-slot offsets as well. */
3415
3416 /* We can only read two double components with each URB read, so
3417 * we send two read messages in that case, each one loading up to
3418 * two double components.
3419 */
3420 unsigned num_components = instr->num_components;
3421
3422 brw_reg srcs[URB_LOGICAL_NUM_SRCS];
3423 srcs[URB_LOGICAL_SRC_HANDLE] = s.tes_payload().patch_urb_input;
3424 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
3425
3426 if (first_component != 0) {
3427 unsigned read_components =
3428 num_components + first_component;
3429 brw_reg tmp = bld.vgrf(dest.type, read_components);
3430 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3431 srcs, ARRAY_SIZE(srcs));
3432 brw_combine_with_vec(bld, dest, offset(tmp, bld, first_component),
3433 num_components);
3434 } else {
3435 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dest,
3436 srcs, ARRAY_SIZE(srcs));
3437 }
3438 inst->offset = imm_offset;
3439 inst->size_written = (num_components + first_component) *
3440 inst->dst.component_size(inst->exec_size);
3441 }
3442 break;
3443 }
3444 default:
3445 fs_nir_emit_intrinsic(ntb, bld, instr);
3446 break;
3447 }
3448 }
3449
3450 static void
3451 fs_nir_emit_gs_intrinsic(nir_to_brw_state &ntb,
3452 nir_intrinsic_instr *instr)
3453 {
3454 const fs_builder &bld = ntb.bld;
3455 fs_visitor &s = ntb.s;
3456
3457 assert(s.stage == MESA_SHADER_GEOMETRY);
3458
3459 brw_reg dest;
3460 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3461 dest = get_nir_def(ntb, instr->def);
3462
3463 switch (instr->intrinsic) {
3464 case nir_intrinsic_load_primitive_id:
3465 assert(s.stage == MESA_SHADER_GEOMETRY);
3466 assert(brw_gs_prog_data(s.prog_data)->include_primitive_id);
3467 bld.MOV(retype(dest, BRW_TYPE_UD), s.gs_payload().primitive_id);
3468 break;
3469
3470 case nir_intrinsic_load_input:
3471 unreachable("load_input intrinsics are invalid for the GS stage");
3472
3473 case nir_intrinsic_load_per_vertex_input:
3474 emit_gs_input_load(ntb, dest, instr->src[0], nir_intrinsic_base(instr),
3475 instr->src[1], instr->num_components,
3476 nir_intrinsic_component(instr));
3477 break;
3478
3479 case nir_intrinsic_emit_vertex_with_counter:
3480 emit_gs_vertex(ntb, instr->src[0], nir_intrinsic_stream_id(instr));
3481
3482 /* After an EmitVertex() call, the values of all outputs are undefined.
3483 * If this is not in control flow, recreate a fresh set of output
3484 * registers to keep their live ranges separate.
3485 */
3486 if (instr->instr.block->cf_node.parent->type == nir_cf_node_function)
3487 fs_nir_setup_outputs(ntb);
3488 break;
3489
3490 case nir_intrinsic_end_primitive_with_counter:
3491 emit_gs_end_primitive(ntb, instr->src[0]);
3492 break;
3493
3494 case nir_intrinsic_set_vertex_and_primitive_count:
3495 bld.MOV(s.final_gs_vertex_count, get_nir_src(ntb, instr->src[0]));
3496 break;
3497
3498 case nir_intrinsic_load_invocation_id: {
3499 brw_reg val = ntb.system_values[SYSTEM_VALUE_INVOCATION_ID];
3500 assert(val.file != BAD_FILE);
3501 dest.type = val.type;
3502 bld.MOV(dest, val);
3503 break;
3504 }
3505
3506 default:
3507 fs_nir_emit_intrinsic(ntb, bld, instr);
3508 break;
3509 }
3510 }
3511
3512 /**
3513 * Fetch the current render target layer index.
3514 */
3515 static brw_reg
3516 fetch_render_target_array_index(const fs_builder &bld)
3517 {
3518 const fs_visitor *v = bld.shader;
3519
3520 if (bld.shader->devinfo->ver >= 20) {
3521 /* Gfx20+ has separate Render Target Array indices for each pair
3522 * of subspans in order to support multiple polygons, so we need
3523 * to use a <1;8,0> region in order to select the correct word
3524 * for each channel.
3525 */
3526 const brw_reg idx = bld.vgrf(BRW_TYPE_UD);
3527
3528 for (unsigned i = 0; i < DIV_ROUND_UP(bld.dispatch_width(), 16); i++) {
3529 const fs_builder hbld = bld.group(16, i);
3530 const struct brw_reg reg = retype(brw_vec1_grf(2 * i + 1, 1),
3531 BRW_TYPE_UW);
3532 hbld.AND(offset(idx, hbld, i), stride(reg, 1, 8, 0),
3533 brw_imm_uw(0x7ff));
3534 }
3535
3536 return idx;
3537 } else if (bld.shader->devinfo->ver >= 12 && v->max_polygons == 2) {
3538 /* According to the BSpec "PS Thread Payload for Normal
3539 * Dispatch", the render target array index is stored as bits
3540 * 26:16 of either the R1.1 or R1.6 poly info dwords, for the
3541 * first and second polygons respectively in multipolygon PS
3542 * dispatch mode.
3543 */
3544 assert(bld.dispatch_width() == 16);
3545 const brw_reg idx = bld.vgrf(BRW_TYPE_UD);
3546
3547 for (unsigned i = 0; i < v->max_polygons; i++) {
3548 const fs_builder hbld = bld.group(8, i);
3549 const struct brw_reg g1 = brw_uw1_reg(FIXED_GRF, 1, 3 + 10 * i);
3550 hbld.AND(offset(idx, hbld, i), g1, brw_imm_uw(0x7ff));
3551 }
3552
3553 return idx;
3554 } else if (bld.shader->devinfo->ver >= 12) {
3555 /* The render target array index is provided in the thread payload as
3556 * bits 26:16 of r1.1.
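* Reading it as the high word of that dword (UW subregister 3) and masking
* with 0x7ff extracts the field.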
3557 */
3558 const brw_reg idx = bld.vgrf(BRW_TYPE_UD);
3559 bld.AND(idx, brw_uw1_reg(FIXED_GRF, 1, 3),
3560 brw_imm_uw(0x7ff));
3561 return idx;
3562 } else {
3563 /* The render target array index is provided in the thread payload as
3564 * bits 26:16 of r0.0.
3565 */
3566 const brw_reg idx = bld.vgrf(BRW_TYPE_UD);
3567 bld.AND(idx, brw_uw1_reg(FIXED_GRF, 0, 1),
3568 brw_imm_uw(0x7ff));
3569 return idx;
3570 }
3571 }
3572
3573 static brw_reg
3574 fetch_viewport_index(const fs_builder &bld)
3575 {
3576 const fs_visitor *v = bld.shader;
3577
3578 if (bld.shader->devinfo->ver >= 20) {
3579 /* Gfx20+ has separate viewport indices for each pair
3580 * of subspans in order to support multiple polygons, so we need
3581 * to use a <1;8,0> region in order to select the correct word
3582 * for each channel.
3583 */
3584 const brw_reg idx = bld.vgrf(BRW_TYPE_UD);
3585
3586 for (unsigned i = 0; i < DIV_ROUND_UP(bld.dispatch_width(), 16); i++) {
3587 const fs_builder hbld = bld.group(16, i);
3588 const struct brw_reg reg = retype(xe2_vec1_grf(i, 9),
3589 BRW_TYPE_UW);
3590 hbld.AND(offset(idx, hbld, i), stride(reg, 1, 8, 0),
3591 brw_imm_uw(0xf000));
3592 }
3593
3594 bld.SHR(idx, idx, brw_imm_ud(12));
3595 return idx;
3596 } else if (bld.shader->devinfo->ver >= 12 && v->max_polygons == 2) {
3597 /* According to the BSpec "PS Thread Payload for Normal
3598 * Dispatch", the viewport index is stored as bits
3599 * 30:27 of either the R1.1 or R1.6 poly info dwords, for the
3600 * first and second polygons respectively in multipolygon PS
3601 * dispatch mode.
3602 */
3603 assert(bld.dispatch_width() == 16);
3604 const brw_reg idx = bld.vgrf(BRW_TYPE_UD);
3605 brw_reg vp_idx_per_poly_dw[2] = {
3606 brw_ud1_reg(FIXED_GRF, 1, 1), /* R1.1 bits 30:27 */
3607 brw_ud1_reg(FIXED_GRF, 1, 6), /* R1.6 bits 30:27 */
3608 };
3609
3610 for (unsigned i = 0; i < v->max_polygons; i++) {
3611 const fs_builder hbld = bld.group(8, i);
3612 hbld.SHR(offset(idx, hbld, i), vp_idx_per_poly_dw[i], brw_imm_ud(27));
3613 }
3614
3615 return bld.AND(idx, brw_imm_ud(0xf));
3616 } else if (bld.shader->devinfo->ver >= 12) {
3617 /* The viewport index is provided in the thread payload as
3618 * bits 30:27 of r1.1.
3619 */
3620 const brw_reg idx = bld.vgrf(BRW_TYPE_UD);
3621 bld.SHR(idx,
3622 bld.AND(brw_uw1_reg(FIXED_GRF, 1, 3),
3623 brw_imm_uw(0x7800)),
3624 brw_imm_ud(11));
3625 return idx;
3626 } else {
3627 /* The viewport index is provided in the thread payload as
3628 * bits 30:27 of r0.0.
3629 */
3630 const brw_reg idx = bld.vgrf(BRW_TYPE_UD);
3631 bld.SHR(idx,
3632 bld.AND(brw_uw1_reg(FIXED_GRF, 0, 1),
3633 brw_imm_uw(0x7800)),
3634 brw_imm_ud(11));
3635 return idx;
3636 }
3637 }
3638
3639 /* Sample from the MCS surface attached to this multisample texture. */
3640 static brw_reg
3641 emit_mcs_fetch(nir_to_brw_state &ntb, const brw_reg &coordinate, unsigned components,
3642 const brw_reg &texture,
3643 const brw_reg &texture_handle)
3644 {
3645 const fs_builder &bld = ntb.bld;
3646
3647 const brw_reg dest = bld.vgrf(BRW_TYPE_UD, 4);
3648
3649 brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
3650 srcs[TEX_LOGICAL_SRC_COORDINATE] = coordinate;
3651 srcs[TEX_LOGICAL_SRC_SURFACE] = texture;
3652 srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0);
3653 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = texture_handle;
3654 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(components);
3655 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);
3656 srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_d(0);
3657
3658 fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs,
3659 ARRAY_SIZE(srcs));
3660
3661 /* We only care about one or two regs of response, but the sampler always
3662 * writes 4/8.
3663 */
3664 inst->size_written = 4 * dest.component_size(inst->exec_size);
3665
3666 return dest;
3667 }
3668
3669 /**
3670 * Fake non-coherent framebuffer read implemented using TXF to fetch from the
3671 * framebuffer at the current fragment coordinates and sample index.
3672 */
3673 static fs_inst *
3674 emit_non_coherent_fb_read(nir_to_brw_state &ntb, const fs_builder &bld, const brw_reg &dst,
3675 unsigned target)
3676 {
3677 fs_visitor &s = ntb.s;
3678 const struct intel_device_info *devinfo = s.devinfo;
3679
3680 assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
3681 const brw_wm_prog_key *wm_key =
3682 reinterpret_cast<const brw_wm_prog_key *>(s.key);
3683 assert(!wm_key->coherent_fb_fetch);
3684
3685 /* Calculate the fragment coordinates. */
3686 const brw_reg coords = bld.vgrf(BRW_TYPE_UD, 3);
3687 bld.MOV(offset(coords, bld, 0), s.pixel_x);
3688 bld.MOV(offset(coords, bld, 1), s.pixel_y);
3689 bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
3690
3691 /* Calculate the sample index and MCS payload when multisampling. Luckily
3692 * the MCS fetch message behaves deterministically for UMS surfaces, so it
3693 * shouldn't be necessary to recompile based on whether the framebuffer is
3694 * CMS or UMS.
3695 */
3696 assert(wm_key->multisample_fbo == INTEL_ALWAYS ||
3697 wm_key->multisample_fbo == INTEL_NEVER);
3698 if (wm_key->multisample_fbo &&
3699 ntb.system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
3700 ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(ntb);
3701
3702 const brw_reg sample = ntb.system_values[SYSTEM_VALUE_SAMPLE_ID];
3703 const brw_reg mcs = wm_key->multisample_fbo ?
3704 emit_mcs_fetch(ntb, coords, 3, brw_imm_ud(target), brw_reg()) : brw_reg();
3705
3706 /* Use either a normal or a CMS texel fetch message depending on whether
3707 * the framebuffer is single or multisample. On SKL+ use the wide CMS
3708 * message just in case the framebuffer uses 16x multisampling, it should
3709 * be equivalent to the normal CMS fetch for lower multisampling modes.
3710 */
3711 opcode op;
3712 if (wm_key->multisample_fbo) {
3713 /* On SKL+ use the wide CMS message just in case the framebuffer uses 16x
3714 * multisampling, it should be equivalent to the normal CMS fetch for
3715 * lower multisampling modes.
3716 *
3717 * On Gfx12HP, only the CMS_W variant is available.
3718 */
3719 if (devinfo->verx10 >= 125)
3720 op = SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL;
3721 else
3722 op = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
3723 } else {
3724 op = SHADER_OPCODE_TXF_LOGICAL;
3725 }
3726
3727 /* Emit the instruction. */
3728 brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
3729 srcs[TEX_LOGICAL_SRC_COORDINATE] = coords;
3730 srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_ud(0);
3731 srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = sample;
3732 srcs[TEX_LOGICAL_SRC_MCS] = mcs;
3733 srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(target);
3734 srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0);
3735 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(3);
3736 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_ud(0);
3737 srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_ud(0);
3738
3739 fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
3740 inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3741
3742 return inst;
3743 }
3744
3745 /**
3746 * Actual coherent framebuffer read implemented using the native render target
3747 * read message. Requires SKL+.
3748 */
3749 static fs_inst *
3750 emit_coherent_fb_read(const fs_builder &bld, const brw_reg &dst, unsigned target)
3751 {
3752 fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst);
3753 inst->target = target;
3754 inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3755
3756 return inst;
3757 }
3758
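/**
 * Return the output temporary already recorded in regs[0] if one exists;
 * otherwise allocate a new VGRF and record it in all n slots so later
 * lookups reuse the same register.
 */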
3759 static brw_reg
3760 alloc_temporary(const fs_builder &bld, unsigned size, brw_reg *regs, unsigned n)
3761 {
3762 if (n && regs[0].file != BAD_FILE) {
3763 return regs[0];
3764
3765 } else {
3766 const brw_reg tmp = bld.vgrf(BRW_TYPE_F, size);
3767
3768 for (unsigned i = 0; i < n; i++)
3769 regs[i] = tmp;
3770
3771 return tmp;
3772 }
3773 }
3774
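/**
 * Look up (allocating on first use) the temporary backing the fragment
 * output described by the packed BRW_NIR_FRAG_OUTPUT_LOCATION/INDEX pair.
 */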
3775 static brw_reg
3776 alloc_frag_output(nir_to_brw_state &ntb, unsigned location)
3777 {
3778 fs_visitor &s = ntb.s;
3779
3780 assert(s.stage == MESA_SHADER_FRAGMENT);
3781 const brw_wm_prog_key *const key =
3782 reinterpret_cast<const brw_wm_prog_key *>(s.key);
3783 const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION);
3784 const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX);
3785
3786 if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
3787 return alloc_temporary(ntb.bld, 4, &s.dual_src_output, 1);
3788
3789 else if (l == FRAG_RESULT_COLOR)
3790 return alloc_temporary(ntb.bld, 4, s.outputs,
3791 MAX2(key->nr_color_regions, 1));
3792
3793 else if (l == FRAG_RESULT_DEPTH)
3794 return alloc_temporary(ntb.bld, 1, &s.frag_depth, 1);
3795
3796 else if (l == FRAG_RESULT_STENCIL)
3797 return alloc_temporary(ntb.bld, 1, &s.frag_stencil, 1);
3798
3799 else if (l == FRAG_RESULT_SAMPLE_MASK)
3800 return alloc_temporary(ntb.bld, 1, &s.sample_mask, 1);
3801
3802 else if (l >= FRAG_RESULT_DATA0 &&
3803 l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS)
3804 return alloc_temporary(ntb.bld, 4,
3805 &s.outputs[l - FRAG_RESULT_DATA0], 1);
3806
3807 else
3808 unreachable("Invalid location");
3809 }
3810
3811 static void
3812 emit_is_helper_invocation(nir_to_brw_state &ntb, brw_reg result)
3813 {
3814 const fs_builder &bld = ntb.bld;
3815
3816 /* Unlike the regular gl_HelperInvocation, which is defined at dispatch
3817 * time, helperInvocationEXT() (aka SpvOpIsHelperInvocationEXT) takes
3818 * demoted invocations into consideration.
3819 */
3820 result.type = BRW_TYPE_UD;
3821
3822 bld.MOV(result, brw_imm_ud(0));
3823
3824 /* See brw_sample_mask_reg() for why we split SIMD32 into SIMD16 here. */
3825 unsigned width = bld.dispatch_width();
3826 for (unsigned i = 0; i < DIV_ROUND_UP(width, 16); i++) {
3827 const fs_builder b = bld.group(MIN2(width, 16), i);
3828
3829 fs_inst *mov = b.MOV(offset(result, b, i), brw_imm_ud(~0));
3830
3831 /* The at() ensures that any code emitted to get the predicate happens
3832 * before the mov right above. This is not an issue elsewhere because
3833 * lowering code already set up the builder this way.
3834 */
3835 brw_emit_predicate_on_sample_mask(b.at(NULL, mov), mov);
3836 mov->predicate_inverse = true;
3837 }
3838 }
3839
3840 static brw_reg
3841 emit_frontfacing_interpolation(nir_to_brw_state &ntb)
3842 {
3843 const intel_device_info *devinfo = ntb.devinfo;
3844 const fs_builder &bld = ntb.bld;
3845 fs_visitor &s = ntb.s;
3846
3847 brw_reg ff = bld.vgrf(BRW_TYPE_D);
3848
3849 if (devinfo->ver >= 20) {
3850 /* Gfx20+ has separate back-facing bits for each pair of
3851 * subspans in order to support multiple polygons, so we need to
3852 * use a <1;8,0> region in order to select the correct word for
3853 * each channel.
3854 */
3855 const brw_reg tmp = bld.vgrf(BRW_TYPE_UW);
3856
3857 for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
3858 const fs_builder hbld = bld.group(16, i);
3859 const struct brw_reg gi_uw = retype(xe2_vec1_grf(i, 9),
3860 BRW_TYPE_UW);
3861 hbld.AND(offset(tmp, hbld, i), gi_uw, brw_imm_uw(0x800));
3862 }
3863
3864 bld.CMP(ff, tmp, brw_imm_uw(0), BRW_CONDITIONAL_Z);
3865
3866 } else if (devinfo->ver >= 12 && s.max_polygons == 2) {
3867 /* According to the BSpec "PS Thread Payload for Normal
3868 * Dispatch", the front/back facing interpolation bit is stored
3869 * as bit 15 of either the R1.1 or R1.6 poly info field, for the
3870 * first and second polygons respectively in multipolygon PS
3871 * dispatch mode.
3872 */
3873 assert(s.dispatch_width == 16);
3874 brw_reg tmp = bld.vgrf(BRW_TYPE_W);
3875
3876 for (unsigned i = 0; i < s.max_polygons; i++) {
3877 const fs_builder hbld = bld.group(8, i);
3878 const struct brw_reg g1 = retype(brw_vec1_grf(1, 1 + 5 * i),
3879 BRW_TYPE_W);
3880 hbld.ASR(offset(tmp, hbld, i), g1, brw_imm_d(15));
3881 }
3882
3883 bld.NOT(ff, tmp);
3884
3885 } else if (devinfo->ver >= 12) {
3886 brw_reg g1 = brw_reg(retype(brw_vec1_grf(1, 1), BRW_TYPE_W));
3887
3888 brw_reg tmp = bld.vgrf(BRW_TYPE_W);
3889 bld.ASR(tmp, g1, brw_imm_d(15));
3890 bld.NOT(ff, tmp);
3891 } else {
3892 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
3893 * a boolean result from this (~0/true or 0/false).
3894 *
3895 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
3896 * this task in only one instruction:
3897 * - a negation source modifier will flip the bit; and
3898 * - a W -> D type conversion will sign extend the bit into the high
3899 * word of the destination.
3900 *
3901 * An ASR 15 fills the low word of the destination.
3902 */
3903 brw_reg g0 = brw_reg(retype(brw_vec1_grf(0, 0), BRW_TYPE_W));
3904
3905 bld.ASR(ff, negate(g0), brw_imm_d(15));
3906 }
3907
3908 return ff;
3909 }
3910
3911 static brw_reg
3912 emit_samplepos_setup(nir_to_brw_state &ntb)
3913 {
3914 const fs_builder &bld = ntb.bld;
3915 fs_visitor &s = ntb.s;
3916
3917 assert(s.stage == MESA_SHADER_FRAGMENT);
3918 struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
3919
3920 const fs_builder abld = bld.annotate("compute sample position");
3921 brw_reg pos = abld.vgrf(BRW_TYPE_F, 2);
3922
3923 if (wm_prog_data->persample_dispatch == INTEL_NEVER) {
3924 /* From ARB_sample_shading specification:
3925 * "When rendering to a non-multisample buffer, or if multisample
3926 * rasterization is disabled, gl_SamplePosition will always be
3927 * (0.5, 0.5)."
3928 */
3929 bld.MOV(offset(pos, bld, 0), brw_imm_f(0.5f));
3930 bld.MOV(offset(pos, bld, 1), brw_imm_f(0.5f));
3931 return pos;
3932 }
3933
3934 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
3935 * mode will be enabled.
3936 *
3937 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
3938 * R31.1:0 Position Offset X/Y for Slot[3:0]
3939 * R31.3:2 Position Offset X/Y for Slot[7:4]
3940 * .....
3941 *
3942 * The X, Y sample positions come in as bytes in thread payload. So, read
3943 * the positions using vstride=16, width=8, hstride=2.
3944 */
3945 const brw_reg sample_pos_reg =
3946 fetch_payload_reg(abld, s.fs_payload().sample_pos_reg, BRW_TYPE_W);
3947
3948 for (unsigned i = 0; i < 2; i++) {
3949 brw_reg tmp_d = bld.vgrf(BRW_TYPE_D);
3950 abld.MOV(tmp_d, subscript(sample_pos_reg, BRW_TYPE_B, i));
3951 /* Convert int_sample_pos to floating point */
3952 brw_reg tmp_f = bld.vgrf(BRW_TYPE_F);
3953 abld.MOV(tmp_f, tmp_d);
3954 /* Scale to the range [0, 1] */
3955 abld.MUL(offset(pos, abld, i), tmp_f, brw_imm_f(1 / 16.0f));
3956 }
3957
3958 if (wm_prog_data->persample_dispatch == INTEL_SOMETIMES) {
3959 check_dynamic_msaa_flag(abld, wm_prog_data,
3960 INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
3961 for (unsigned i = 0; i < 2; i++) {
3962 set_predicate(BRW_PREDICATE_NORMAL,
3963 bld.SEL(offset(pos, abld, i), offset(pos, abld, i),
3964 brw_imm_f(0.5f)));
3965 }
3966 }
3967
3968 return pos;
3969 }
3970
3971 static brw_reg
3972 emit_sampleid_setup(nir_to_brw_state &ntb)
3973 {
3974 const intel_device_info *devinfo = ntb.devinfo;
3975 const fs_builder &bld = ntb.bld;
3976 fs_visitor &s = ntb.s;
3977
3978 assert(s.stage == MESA_SHADER_FRAGMENT);
3979 ASSERTED brw_wm_prog_key *key = (brw_wm_prog_key*) s.key;
3980 struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
3981
3982 const fs_builder abld = bld.annotate("compute sample id");
3983 brw_reg sample_id = abld.vgrf(BRW_TYPE_UD);
3984
3985 assert(key->multisample_fbo != INTEL_NEVER);
3986
3987 /* Sample ID comes in as 4-bit numbers in g1.0:
3988 *
3989 * 15:12 Slot 3 SampleID (only used in SIMD16)
3990 * 11:8 Slot 2 SampleID (only used in SIMD16)
3991 * 7:4 Slot 1 SampleID
3992 * 3:0 Slot 0 SampleID
3993 *
3994 * Each slot corresponds to four channels, so we want to replicate each
3995 * half-byte value to 4 channels in a row:
3996 *
3997 * dst+0: .7 .6 .5 .4 .3 .2 .1 .0
3998 * 7:4 7:4 7:4 7:4 3:0 3:0 3:0 3:0
3999 *
4000 * dst+1: .7 .6 .5 .4 .3 .2 .1 .0 (if SIMD16)
4001 * 15:12 15:12 15:12 15:12 11:8 11:8 11:8 11:8
4002 *
4003 * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
4004 * channels to read the first byte (7:0), and the second group of 8
4005 * channels to read the second byte (15:8). Then, we shift right by
4006 * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
4007 * values into place. Finally, we AND with 0xf to keep the low nibble.
4008 *
4009 * shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
4010 * and(16) dst<1>D tmp<8,8,1>W 0xf:W
4011 *
4012 * TODO: These payload bits exist on Gfx7 too, but they appear to always
4013 * be zero, so this code fails to work. We should find out why.
4014 */
4015 const brw_reg tmp = abld.vgrf(BRW_TYPE_UW);
4016
4017 for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
4018 const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
4019 /* According to the "PS Thread Payload for Normal Dispatch"
4020 * pages on the BSpec, the sample ids are stored in R0.8/R1.8
4021 * on gfx20+ and in R1.0/R2.0 on gfx8+.
4022 */
4023 const struct brw_reg id_reg = devinfo->ver >= 20 ? xe2_vec1_grf(i, 8) :
4024 brw_vec1_grf(i + 1, 0);
4025 hbld.SHR(offset(tmp, hbld, i),
4026 stride(retype(id_reg, BRW_TYPE_UB), 1, 8, 0),
4027 brw_imm_v(0x44440000));
4028 }
4029
4030 abld.AND(sample_id, tmp, brw_imm_w(0xf));
4031
4032 if (key->multisample_fbo == INTEL_SOMETIMES) {
4033 check_dynamic_msaa_flag(abld, wm_prog_data,
4034 INTEL_MSAA_FLAG_MULTISAMPLE_FBO);
4035 set_predicate(BRW_PREDICATE_NORMAL,
4036 abld.SEL(sample_id, sample_id, brw_imm_ud(0)));
4037 }
4038
4039 return sample_id;
4040 }
4041
4042 static brw_reg
4043 emit_samplemaskin_setup(nir_to_brw_state &ntb)
4044 {
4045 const fs_builder &bld = ntb.bld;
4046 fs_visitor &s = ntb.s;
4047
4048 assert(s.stage == MESA_SHADER_FRAGMENT);
4049 struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
4050
4051 /* The HW doesn't provide us with expected values. */
4052 assert(wm_prog_data->coarse_pixel_dispatch != INTEL_ALWAYS);
4053
4054 brw_reg coverage_mask =
4055 fetch_payload_reg(bld, s.fs_payload().sample_mask_in_reg, BRW_TYPE_UD);
4056
4057 if (wm_prog_data->persample_dispatch == INTEL_NEVER)
4058 return coverage_mask;
4059
4060 /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
4061 * and a mask representing which sample is being processed by the
4062 * current shader invocation.
4063 *
4064 * From the OES_sample_variables specification:
4065 * "When per-sample shading is active due to the use of a fragment input
4066 * qualified by "sample" or due to the use of the gl_SampleID or
4067 * gl_SamplePosition variables, only the bit for the current sample is
4068 * set in gl_SampleMaskIn."
4069 */
4070 const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
4071
4072 if (ntb.system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
4073 ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(ntb);
4074
4075 brw_reg one = abld.MOV(brw_imm_ud(1));
4076 brw_reg enabled_mask = abld.SHL(one, ntb.system_values[SYSTEM_VALUE_SAMPLE_ID]);
4077 brw_reg mask = abld.AND(enabled_mask, coverage_mask);
4078
4079 if (wm_prog_data->persample_dispatch == INTEL_ALWAYS)
4080 return mask;
4081
4082 check_dynamic_msaa_flag(abld, wm_prog_data,
4083 INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
4084 set_predicate(BRW_PREDICATE_NORMAL, abld.SEL(mask, mask, coverage_mask));
4085
4086 return mask;
4087 }
4088
4089 static brw_reg
4090 emit_shading_rate_setup(nir_to_brw_state &ntb)
4091 {
4092 const intel_device_info *devinfo = ntb.devinfo;
4093 const fs_builder &bld = ntb.bld;
4094
4095 assert(devinfo->ver >= 11);
4096
4097 struct brw_wm_prog_data *wm_prog_data =
4098 brw_wm_prog_data(bld.shader->prog_data);
4099
4100 /* The coarse pixel shading size fields overlap with other payload fields
4101 * when not in coarse pixel dispatch mode, so report 0 in that case.
4102 */
4103 if (wm_prog_data->coarse_pixel_dispatch == INTEL_NEVER)
4104 return brw_imm_ud(0);
4105
4106 const fs_builder abld = bld.annotate("compute fragment shading rate");
4107
4108 /* The shading rates provided in the shader are the actual 2D shading
4109 * rate while the SPIR-V built-in is the enum value that has the shading
4110 * rate encoded as a bitfield. Fortunately, the bitfield value is just
4111 * the shading rate divided by two and shifted.
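*
* For example, the computation below turns a 4x2 coarse pixel size into
* (4/2 << 2) | (2/2) = 0x9.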
4112 */
4113
4114 /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */
4115 brw_reg actual_x = brw_reg(retype(brw_vec1_grf(1, 0), BRW_TYPE_UB));
4116 /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
4117 brw_reg actual_y = byte_offset(actual_x, 1);
4118
4119 brw_reg int_rate_y = abld.SHR(actual_y, brw_imm_ud(1));
4120 brw_reg int_rate_x = abld.SHR(actual_x, brw_imm_ud(1));
4121
4122 brw_reg rate = abld.OR(abld.SHL(int_rate_x, brw_imm_ud(2)), int_rate_y);
4123
4124 if (wm_prog_data->coarse_pixel_dispatch == INTEL_ALWAYS)
4125 return rate;
4126
4127 check_dynamic_msaa_flag(abld, wm_prog_data,
4128 INTEL_MSAA_FLAG_COARSE_RT_WRITES);
4129 set_predicate(BRW_PREDICATE_NORMAL, abld.SEL(rate, rate, brw_imm_ud(0)));
4130
4131 return rate;
4132 }
4133
4134 /* Input data is organized with the per-primitive values first, followed
4135 * by the per-vertex values. The per-vertex values have interpolation
4136 * information associated with them, so use 4 components for each value.
4137 */
4138
4139 /* The register location here is relative to the start of the URB
4140 * data. It will get adjusted to be a real location before
4141 * generate_code() time.
4142 */
4143 static brw_reg
4144 brw_interp_reg(const fs_builder &bld, unsigned location,
4145 unsigned channel, unsigned comp)
4146 {
4147 fs_visitor &s = *bld.shader;
4148 assert(s.stage == MESA_SHADER_FRAGMENT);
4149 assert(BITFIELD64_BIT(location) & ~s.nir->info.per_primitive_inputs);
4150
4151 const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data);
4152
4153 assert(prog_data->urb_setup[location] >= 0);
4154 unsigned nr = prog_data->urb_setup[location];
4155 channel += prog_data->urb_setup_channel[location];
4156
4157 /* Adjust so we start counting from the first per_vertex input. */
4158 assert(nr >= prog_data->num_per_primitive_inputs);
4159 nr -= prog_data->num_per_primitive_inputs;
4160
4161 const unsigned per_vertex_start = prog_data->num_per_primitive_inputs;
4162 const unsigned regnr = per_vertex_start + (nr * 4) + channel;
4163
4164 if (s.max_polygons > 1) {
4165 /* In multipolygon dispatch each plane parameter is a
4166 * dispatch_width-wide SIMD vector (see comment in
4167 * assign_urb_setup()), so we need to use offset() instead of
4168 * component() to select the specified parameter.
4169 */
4170 const brw_reg tmp = bld.vgrf(BRW_TYPE_UD);
4171 bld.MOV(tmp, offset(brw_attr_reg(regnr, BRW_TYPE_UD),
4172 s.dispatch_width, comp));
4173 return retype(tmp, BRW_TYPE_F);
4174 } else {
4175 return component(brw_attr_reg(regnr, BRW_TYPE_F), comp);
4176 }
4177 }
4178
4179 /* The register location here is relative to the start of the URB
4180 * data. It will get adjusted to be a real location before
4181 * generate_code() time.
4182 */
4183 static brw_reg
4184 brw_per_primitive_reg(const fs_builder &bld, int location, unsigned comp)
4185 {
4186 fs_visitor &s = *bld.shader;
4187 assert(s.stage == MESA_SHADER_FRAGMENT);
4188 assert(BITFIELD64_BIT(location) & s.nir->info.per_primitive_inputs);
4189
4190 const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data);
4191
4192 comp += prog_data->urb_setup_channel[location];
4193
4194 assert(prog_data->urb_setup[location] >= 0);
4195
4196 const unsigned regnr = prog_data->urb_setup[location] + comp / 4;
4197
4198 assert(regnr < prog_data->num_per_primitive_inputs);
4199
4200 if (s.max_polygons > 1) {
4201 /* In multipolygon dispatch each primitive constant is a
4202 * dispatch_width-wide SIMD vector (see comment in
4203 * assign_urb_setup()), so we need to use offset() instead of
4204 * component() to select the specified parameter.
4205 */
4206 const brw_reg tmp = bld.vgrf(BRW_TYPE_UD);
4207 bld.MOV(tmp, offset(brw_attr_reg(regnr, BRW_TYPE_UD),
4208 s.dispatch_width, comp % 4));
4209 return retype(tmp, BRW_TYPE_F);
4210 } else {
4211 return component(brw_attr_reg(regnr, BRW_TYPE_F), comp % 4);
4212 }
4213 }
4214
4215 static void
4216 fs_nir_emit_fs_intrinsic(nir_to_brw_state &ntb,
4217 nir_intrinsic_instr *instr)
4218 {
4219 const intel_device_info *devinfo = ntb.devinfo;
4220 const fs_builder &bld = ntb.bld;
4221 fs_visitor &s = ntb.s;
4222
4223 assert(s.stage == MESA_SHADER_FRAGMENT);
4224
4225 brw_reg dest;
4226 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4227 dest = get_nir_def(ntb, instr->def);
4228
4229 switch (instr->intrinsic) {
4230 case nir_intrinsic_load_front_face:
4231 bld.MOV(retype(dest, BRW_TYPE_D), emit_frontfacing_interpolation(ntb));
4232 break;
4233
4234 case nir_intrinsic_load_sample_pos:
4235 case nir_intrinsic_load_sample_pos_or_center: {
4236 brw_reg sample_pos = ntb.system_values[SYSTEM_VALUE_SAMPLE_POS];
4237 assert(sample_pos.file != BAD_FILE);
4238 dest.type = sample_pos.type;
4239 bld.MOV(dest, sample_pos);
4240 bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
4241 break;
4242 }
4243
4244 case nir_intrinsic_load_layer_id:
4245 dest.type = BRW_TYPE_UD;
4246 bld.MOV(dest, fetch_render_target_array_index(bld));
4247 break;
4248
4249 case nir_intrinsic_is_helper_invocation:
4250 emit_is_helper_invocation(ntb, dest);
4251 break;
4252
4253 case nir_intrinsic_load_helper_invocation:
4254 case nir_intrinsic_load_sample_mask_in:
4255 case nir_intrinsic_load_sample_id:
4256 case nir_intrinsic_load_frag_shading_rate: {
4257 gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
4258 brw_reg val = ntb.system_values[sv];
4259 assert(val.file != BAD_FILE);
4260 dest.type = val.type;
4261 bld.MOV(dest, val);
4262 break;
4263 }
4264
4265 case nir_intrinsic_store_output: {
4266 const brw_reg src = get_nir_src(ntb, instr->src[0], -1);
4267 const unsigned store_offset = nir_src_as_uint(instr->src[1]);
4268 const unsigned location = nir_intrinsic_base(instr) +
4269 SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION);
4270 const brw_reg new_dest =
4271 offset(retype(alloc_frag_output(ntb, location), src.type),
4272 bld, nir_intrinsic_component(instr));
4273
4274 brw_combine_with_vec(bld, new_dest, src, instr->num_components);
4275 break;
4276 }
4277
4278 case nir_intrinsic_load_output: {
4279 const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
4280 BRW_NIR_FRAG_OUTPUT_LOCATION);
4281 assert(l >= FRAG_RESULT_DATA0);
4282 const unsigned load_offset = nir_src_as_uint(instr->src[0]);
4283 const unsigned target = l - FRAG_RESULT_DATA0 + load_offset;
4284 const brw_reg tmp = bld.vgrf(dest.type, 4);
4285
4286 if (reinterpret_cast<const brw_wm_prog_key *>(s.key)->coherent_fb_fetch)
4287 emit_coherent_fb_read(bld, tmp, target);
4288 else
4289 emit_non_coherent_fb_read(ntb, bld, tmp, target);
4290
4291 brw_combine_with_vec(bld, dest,
4292 offset(tmp, bld, nir_intrinsic_component(instr)),
4293 instr->num_components);
4294 break;
4295 }
4296
4297 case nir_intrinsic_demote:
4298 case nir_intrinsic_terminate:
4299 case nir_intrinsic_demote_if:
4300 case nir_intrinsic_terminate_if: {
4301 /* We track our discarded pixels in f0.1/f1.0. By predicating on it, we
4302 * can update just the flag bits that aren't yet discarded. If there's
4303 * no condition, we emit a CMP of g0 != g0, so all currently executing
4304 * channels will get turned off.
4305 */
4306 fs_inst *cmp = NULL;
4307 if (instr->intrinsic == nir_intrinsic_demote_if ||
4308 instr->intrinsic == nir_intrinsic_terminate_if) {
4309 nir_alu_instr *alu = nir_src_as_alu_instr(instr->src[0]);
4310
4311 if (alu != NULL &&
4312 alu->op != nir_op_bcsel) {
4313 /* Re-emit the instruction that generated the Boolean value, but
4314 * do not store it. Since this instruction will be conditional,
4315 * other instructions that want to use the real Boolean value may
4316 * get garbage. This was a problem for piglit's fs-discard-exit-2
4317 * test.
4318 *
4319 * Ideally we'd detect that the instruction cannot have a
4320 * conditional modifier before emitting the instructions. Alas,
4321 * that is nigh impossible. Instead, we're going to assume the
4322 * instruction (or last instruction) generated can have a
4323 * conditional modifier. If it cannot, fallback to the old-style
4324 * compare, and hope dead code elimination will clean up the
4325 * extra instructions generated.
4326 */
4327 fs_nir_emit_alu(ntb, alu, false);
4328
4329 cmp = (fs_inst *) s.instructions.get_tail();
4330 if (cmp->conditional_mod == BRW_CONDITIONAL_NONE) {
4331 if (cmp->can_do_cmod())
4332 cmp->conditional_mod = BRW_CONDITIONAL_Z;
4333 else
4334 cmp = NULL;
4335 } else {
4336 /* The old sequence that would have been generated is,
4337 * basically, bool_result == false. This is equivalent to
4338 * !bool_result, so negate the old modifier.
4339 *
4340 * Unfortunately, we can't do this to most float comparisons
4341 * because of NaN, so we'll have to fallback to the old-style
4342 * compare.
4343 *
4344 * For example, this code (after negation):
4345 * (+f1.0) cmp.ge.f1.0(8) null<1>F g30<8,8,1>F 0x0F
4346 * will provide different results from this:
4347 * cmp.l.f0.0(8) g31<1>F g30<1,1,0>F 0x0F
4348 * (+f1.0) cmp.z.f1.0(8) null<1>D g31<8,8,1>D 0D
4349 * because both (NaN >= 0) == false and (NaN < 0) == false.
4350 *
4351 * It will still work for == and != though, because
4352 * (NaN == x) == false and (NaN != x) == true.
4353 */
4354 if (brw_type_is_float(cmp->src[0].type) &&
4355 cmp->conditional_mod != BRW_CONDITIONAL_EQ &&
4356 cmp->conditional_mod != BRW_CONDITIONAL_NEQ) {
4357 cmp = NULL;
4358 } else {
4359 cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod);
4360 }
4361 }
4362 }
4363
4364 if (cmp == NULL) {
4365 cmp = bld.CMP(bld.null_reg_f(), get_nir_src(ntb, instr->src[0]),
4366 brw_imm_d(0), BRW_CONDITIONAL_Z);
4367 }
4368 } else {
4369 brw_reg some_reg = brw_reg(retype(brw_vec8_grf(0, 0), BRW_TYPE_UW));
4370 cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
4371 }
4372
4373 cmp->predicate = BRW_PREDICATE_NORMAL;
4374 cmp->flag_subreg = sample_mask_flag_subreg(s);
4375
4376 fs_inst *jump = bld.emit(BRW_OPCODE_HALT);
4377 jump->flag_subreg = sample_mask_flag_subreg(s);
4378 jump->predicate_inverse = true;
4379
4380 if (instr->intrinsic == nir_intrinsic_terminate ||
4381 instr->intrinsic == nir_intrinsic_terminate_if) {
4382 jump->predicate = BRW_PREDICATE_NORMAL;
4383 } else {
4384 /* Only jump when the whole quad is demoted. For historical
4385 * reasons this is also used for discard.
4386 */
4387 jump->predicate = (devinfo->ver >= 20 ? XE2_PREDICATE_ANY :
4388 BRW_PREDICATE_ALIGN1_ANY4H);
4389 }
4390 break;
4391 }
4392
4393 case nir_intrinsic_load_input:
4394 case nir_intrinsic_load_per_primitive_input: {
4395 /* In Fragment Shaders load_input is used either for flat inputs or
4396 * per-primitive inputs.
4397 */
4398 assert(instr->def.bit_size == 32);
4399 unsigned base = nir_intrinsic_base(instr);
4400 unsigned comp = nir_intrinsic_component(instr);
4401 unsigned num_components = instr->num_components;
4402
4403 /* TODO(mesh): Multiview. Verify and handle these special cases for Mesh. */
4404
4405 if (base == VARYING_SLOT_LAYER) {
4406 dest.type = BRW_TYPE_UD;
4407 bld.MOV(dest, fetch_render_target_array_index(bld));
4408 break;
4409 } else if (base == VARYING_SLOT_VIEWPORT) {
4410 dest.type = BRW_TYPE_UD;
4411 bld.MOV(dest, fetch_viewport_index(bld));
4412 break;
4413 }
4414
4415 if (BITFIELD64_BIT(base) & s.nir->info.per_primitive_inputs) {
4416 assert(base != VARYING_SLOT_PRIMITIVE_INDICES);
4417 for (unsigned int i = 0; i < num_components; i++) {
4418 bld.MOV(offset(dest, bld, i),
4419 retype(brw_per_primitive_reg(bld, base, comp + i), dest.type));
4420 }
4421 } else {
4422 /* Gfx20+ packs the plane parameters of a single logical
4423 * input in a vec3 format instead of the previously used vec4
4424 * format.
4425 */
4426 const unsigned k = devinfo->ver >= 20 ? 0 : 3;
4427 for (unsigned int i = 0; i < num_components; i++) {
4428 bld.MOV(offset(dest, bld, i),
4429 retype(brw_interp_reg(bld, base, comp + i, k), dest.type));
4430 }
4431 }
4432 break;
4433 }
4434
4435 case nir_intrinsic_load_fs_input_interp_deltas: {
4436 assert(s.stage == MESA_SHADER_FRAGMENT);
4437 assert(nir_src_as_uint(instr->src[0]) == 0);
4438 const unsigned base = nir_intrinsic_base(instr);
4439 const unsigned comp = nir_intrinsic_component(instr);
4440 dest.type = BRW_TYPE_F;
4441
4442 /* Gfx20+ packs the plane parameters of a single logical
4443 * input in a vec3 format instead of the previously used vec4
4444 * format.
4445 */
4446 if (devinfo->ver >= 20) {
4447 bld.MOV(offset(dest, bld, 0), brw_interp_reg(bld, base, comp, 0));
4448 bld.MOV(offset(dest, bld, 1), brw_interp_reg(bld, base, comp, 2));
4449 bld.MOV(offset(dest, bld, 2), brw_interp_reg(bld, base, comp, 1));
4450 } else {
4451 bld.MOV(offset(dest, bld, 0), brw_interp_reg(bld, base, comp, 3));
4452 bld.MOV(offset(dest, bld, 1), brw_interp_reg(bld, base, comp, 1));
4453 bld.MOV(offset(dest, bld, 2), brw_interp_reg(bld, base, comp, 0));
4454 }
4455
4456 break;
4457 }
4458
4459 case nir_intrinsic_load_barycentric_pixel:
4460 case nir_intrinsic_load_barycentric_centroid:
4461 case nir_intrinsic_load_barycentric_sample: {
4462 /* Use the delta_xy values computed from the payload */
4463 enum intel_barycentric_mode bary = brw_barycentric_mode(
4464 reinterpret_cast<const brw_wm_prog_key *>(s.key), instr);
4465 const brw_reg srcs[] = { offset(s.delta_xy[bary], bld, 0),
4466 offset(s.delta_xy[bary], bld, 1) };
4467 bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
4468 break;
4469 }
4470
4471 case nir_intrinsic_load_barycentric_at_sample: {
4472 const glsl_interp_mode interpolation =
4473 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
4474
4475 if (devinfo->ver >= 20) {
4476 emit_pixel_interpolater_alu_at_sample(
4477 bld, dest, retype(get_nir_src(ntb, instr->src[0]),
4478 BRW_TYPE_UD),
4479 interpolation);
4480
4481 } else {
4482 const brw_reg sample_src = retype(get_nir_src(ntb, instr->src[0]),
4483 BRW_TYPE_UD);
4484 const brw_reg sample_id = bld.emit_uniformize(sample_src);
4485 const brw_reg msg_data = component(bld.group(8, 0).vgrf(BRW_TYPE_UD), 0);
4486
4487 bld.exec_all().group(1, 0).SHL(msg_data, sample_id, brw_imm_ud(4u));
4488
4489 brw_reg flag_reg;
4490 struct brw_wm_prog_key *wm_prog_key = (struct brw_wm_prog_key *) s.key;
4491 if (wm_prog_key->multisample_fbo == INTEL_SOMETIMES) {
4492 struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
4493
4494 check_dynamic_msaa_flag(bld.exec_all().group(8, 0),
4495 wm_prog_data,
4496 INTEL_MSAA_FLAG_MULTISAMPLE_FBO);
4497 flag_reg = brw_flag_reg(0, 0);
4498 }
4499
4500 emit_pixel_interpolater_send(bld,
4501 FS_OPCODE_INTERPOLATE_AT_SAMPLE,
4502 dest,
4503 brw_reg(), /* src */
4504 msg_data,
4505 flag_reg,
4506 interpolation);
4507 }
4508 break;
4509 }
4510
4511 case nir_intrinsic_load_barycentric_at_offset: {
4512 const glsl_interp_mode interpolation =
4513 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
4514
4515 if (devinfo->ver >= 20) {
4516 emit_pixel_interpolater_alu_at_offset(
4517 bld, dest,
4518 retype(get_nir_src(ntb, instr->src[0]), BRW_TYPE_F),
4519 interpolation);
4520
4521 } else if (nir_const_value *const_offset = nir_src_as_const_value(instr->src[0])) {
4522 assert(nir_src_bit_size(instr->src[0]) == 32);
4523 unsigned off_x = const_offset[0].u32 & 0xf;
4524 unsigned off_y = const_offset[1].u32 & 0xf;
4525
4526 emit_pixel_interpolater_send(bld,
4527 FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
4528 dest,
4529 brw_reg(), /* src */
4530 brw_imm_ud(off_x | (off_y << 4)),
4531 brw_reg(), /* flag_reg */
4532 interpolation);
4533 } else {
4534 brw_reg src = retype(get_nir_src(ntb, instr->src[0], -1), BRW_TYPE_D);
4535 const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
4536 emit_pixel_interpolater_send(bld,
4537 opcode,
4538 dest,
4539 src,
4540 brw_imm_ud(0u),
4541 brw_reg(), /* flag_reg */
4542 interpolation);
4543 }
4544 break;
4545 }
4546
4547 case nir_intrinsic_load_frag_coord: {
4548 brw_reg comps[4] = { s.pixel_x, s.pixel_y, s.pixel_z, s.wpos_w };
4549 bld.VEC(dest, comps, 4);
4550 break;
4551 }
4552
4553 case nir_intrinsic_load_interpolated_input: {
4554 assert(instr->src[0].ssa &&
4555 instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
4556 nir_intrinsic_instr *bary_intrinsic =
4557 nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
4558 nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
4559 brw_reg dst_xy;
4560
4561 if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
4562 bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
4563 /* Use the result of the PI message. */
4564 dst_xy = retype(get_nir_src(ntb, instr->src[0]), BRW_TYPE_F);
4565 } else {
4566 /* Use the delta_xy values computed from the payload */
4567 enum intel_barycentric_mode bary = brw_barycentric_mode(
4568 reinterpret_cast<const brw_wm_prog_key *>(s.key), bary_intrinsic);
4569 dst_xy = s.delta_xy[bary];
4570 }
4571
4572 for (unsigned int i = 0; i < instr->num_components; i++) {
4573 brw_reg interp =
4574 brw_interp_reg(bld, nir_intrinsic_base(instr),
4575 nir_intrinsic_component(instr) + i, 0);
4576 interp.type = BRW_TYPE_F;
4577 dest.type = BRW_TYPE_F;
4578
4579 bld.PLN(offset(dest, bld, i), interp, dst_xy);
4580 }
4581 break;
4582 }
4583
4584 default:
4585 fs_nir_emit_intrinsic(ntb, bld, instr);
4586 break;
4587 }
4588 }
4589
4590 static unsigned
4591 brw_workgroup_size(fs_visitor &s)
4592 {
4593 assert(gl_shader_stage_uses_workgroup(s.stage));
4594 assert(!s.nir->info.workgroup_size_variable);
4595 const struct brw_cs_prog_data *cs = brw_cs_prog_data(s.prog_data);
4596 return cs->local_size[0] * cs->local_size[1] * cs->local_size[2];
4597 }
4598
4599 static void
4600 fs_nir_emit_cs_intrinsic(nir_to_brw_state &ntb,
4601 nir_intrinsic_instr *instr)
4602 {
4603 const intel_device_info *devinfo = ntb.devinfo;
4604 const fs_builder &bld = ntb.bld;
4605 fs_visitor &s = ntb.s;
4606
4607 assert(gl_shader_stage_uses_workgroup(s.stage));
4608 struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(s.prog_data);
4609
4610 brw_reg dest;
4611 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4612 dest = get_nir_def(ntb, instr->def);
4613
4614 const fs_builder xbld = dest.is_scalar ? bld.scalar_group() : bld;
4615
4616 switch (instr->intrinsic) {
4617 case nir_intrinsic_barrier:
4618 if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
4619 fs_nir_emit_intrinsic(ntb, bld, instr);
4620 if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
4621 /* The whole workgroup fits in a single HW thread, so all the
4622 * invocations are already executed lock-step. Instead of an actual
4623 * barrier just emit a scheduling fence, that will generate no code.
4624 */
4625 if (!s.nir->info.workgroup_size_variable &&
4626 brw_workgroup_size(s) <= s.dispatch_width) {
4627 bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE);
4628 break;
4629 }
4630
4631 emit_barrier(ntb);
4632 cs_prog_data->uses_barrier = true;
4633 }
4634 break;
4635
4636 case nir_intrinsic_load_inline_data_intel: {
4637 const cs_thread_payload &payload = s.cs_payload();
4638 unsigned inline_stride = brw_type_size_bytes(dest.type);
4639 for (unsigned c = 0; c < instr->def.num_components; c++) {
4640 xbld.MOV(offset(dest, xbld, c),
4641 retype(
4642 byte_offset(payload.inline_parameter,
4643 nir_intrinsic_base(instr) +
4644 c * inline_stride),
4645 dest.type));
4646 }
4647 break;
4648 }
4649
4650 case nir_intrinsic_load_subgroup_id:
4651 s.cs_payload().load_subgroup_id(bld, dest);
4652 break;
4653
4654 case nir_intrinsic_load_local_invocation_id:
4655 /* This is only used for hardware generated local IDs. */
4656 assert(cs_prog_data->generate_local_id);
4657
4658 dest.type = BRW_TYPE_UD;
4659
4660 for (unsigned i = 0; i < 3; i++)
4661 bld.MOV(offset(dest, bld, i), s.cs_payload().local_invocation_id[i]);
4662 break;
4663
4664 case nir_intrinsic_load_workgroup_id: {
4665 brw_reg val = ntb.system_values[SYSTEM_VALUE_WORKGROUP_ID];
4666 const fs_builder ubld = bld.scalar_group();
4667
4668 assert(val.file != BAD_FILE);
4669 assert(val.is_scalar);
4670
4671 dest.type = val.type;
4672 for (unsigned i = 0; i < 3; i++)
4673 ubld.MOV(offset(dest, ubld, i), offset(val, ubld, i));
4674 break;
4675 }
4676
4677 case nir_intrinsic_load_num_workgroups: {
4678 assert(instr->def.bit_size == 32);
4679
4680 cs_prog_data->uses_num_work_groups = true;
4681
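/* The workgroup counts are expected in a buffer the driver binds at
 * binding table index 0 (hence uses_num_work_groups); fetch the three
 * dwords with an untyped load.
 */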
4682 brw_reg srcs[MEMORY_LOGICAL_NUM_SRCS];
4683 srcs[MEMORY_LOGICAL_OPCODE] = brw_imm_ud(LSC_OP_LOAD);
4684 srcs[MEMORY_LOGICAL_MODE] = brw_imm_ud(MEMORY_MODE_UNTYPED);
4685 srcs[MEMORY_LOGICAL_BINDING_TYPE] = brw_imm_ud(LSC_ADDR_SURFTYPE_BTI);
4686 srcs[MEMORY_LOGICAL_BINDING] = brw_imm_ud(0);
4687 srcs[MEMORY_LOGICAL_ADDRESS] = brw_imm_ud(0);
4688 srcs[MEMORY_LOGICAL_COORD_COMPONENTS] = brw_imm_ud(1);
4689 srcs[MEMORY_LOGICAL_ALIGNMENT] = brw_imm_ud(4);
4690 srcs[MEMORY_LOGICAL_DATA_SIZE] = brw_imm_ud(LSC_DATA_SIZE_D32);
4691 srcs[MEMORY_LOGICAL_COMPONENTS] = brw_imm_ud(3);
4692 srcs[MEMORY_LOGICAL_FLAGS] = brw_imm_ud(0);
4693
4694 fs_inst *inst =
4695 bld.emit(SHADER_OPCODE_MEMORY_LOAD_LOGICAL,
4696 dest, srcs, MEMORY_LOGICAL_NUM_SRCS);
4697 inst->size_written = 3 * s.dispatch_width * 4;
4698 break;
4699 }
4700
4701 case nir_intrinsic_load_workgroup_size: {
4702 /* Should have been lowered by brw_nir_lower_cs_intrinsics() or
4703 * iris_setup_uniforms() for the variable group size case.
4704 */
4705 unreachable("Should have been lowered");
4706 break;
4707 }
4708
4709 case nir_intrinsic_dpas_intel: {
4710 const unsigned sdepth = nir_intrinsic_systolic_depth(instr);
4711 const unsigned rcount = nir_intrinsic_repeat_count(instr);
4712
4713 const brw_reg_type dest_type =
4714 brw_type_for_nir_type(devinfo, nir_intrinsic_dest_type(instr));
4715 const brw_reg_type src_type =
4716 brw_type_for_nir_type(devinfo, nir_intrinsic_src_type(instr));
4717
4718 dest = retype(dest, dest_type);
4719 brw_reg src0 = retype(get_nir_src(ntb, instr->src[0]), dest_type);
4720
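/* DPAS executes at a fixed execution size regardless of the dispatch
 * width: SIMD16 on Xe2+ and SIMD8 on earlier platforms.
 */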
4721 fs_builder bld16 = bld.exec_all().group(16, 0);
4722 fs_builder bldn = devinfo->ver >= 20 ? bld16 : bld.exec_all().group(8, 0);
4723
4724 bldn.DPAS(dest,
4725 src0,
4726 retype(get_nir_src(ntb, instr->src[2]), src_type),
4727 retype(get_nir_src(ntb, instr->src[1]), src_type),
4728 sdepth,
4729 rcount)
4730 ->saturate = nir_intrinsic_saturate(instr);
4731
4732 cs_prog_data->uses_systolic = true;
4733 break;
4734 }
4735
4736 default:
4737 fs_nir_emit_intrinsic(ntb, bld, instr);
4738 break;
4739 }
4740 }
4741
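/* Emit an LSC fence on the UGM interface plus a scheduling fence that
 * consumes the fence's writeback, so subsequent instructions are ordered
 * after the flush.
 */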
4742 static void
4743 emit_rt_lsc_fence(const fs_builder &bld,
4744 enum lsc_fence_scope scope,
4745 enum lsc_flush_type flush_type)
4746 {
4747 const intel_device_info *devinfo = bld.shader->devinfo;
4748
4749 const fs_builder ubld = bld.exec_all().group(8, 0);
4750 brw_reg tmp = ubld.vgrf(BRW_TYPE_UD);
4751 fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, tmp,
4752 brw_imm_ud(0) /* desc */,
4753 brw_imm_ud(0) /* ex_desc */,
4754 brw_vec8_grf(0, 0) /* payload */);
4755 send->sfid = GFX12_SFID_UGM;
4756 send->desc = lsc_fence_msg_desc(devinfo, scope, flush_type, true);
4757 send->mlen = reg_unit(devinfo); /* g0 header */
4758 send->ex_mlen = 0;
4759 /* Temp write for scheduling */
4760 send->size_written = REG_SIZE * reg_unit(devinfo);
4761 send->send_has_side_effects = true;
4762
4763 ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), tmp);
4764 }
4765
4766
4767 static void
4768 fs_nir_emit_bs_intrinsic(nir_to_brw_state &ntb,
4769 nir_intrinsic_instr *instr)
4770 {
4771 const fs_builder &bld = ntb.bld;
4772 fs_visitor &s = ntb.s;
4773
4774 assert(brw_shader_stage_is_bindless(s.stage));
4775 const bs_thread_payload &payload = s.bs_payload();
4776
4777 brw_reg dest;
4778 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4779 dest = get_nir_def(ntb, instr->def);
4780
4781 const fs_builder xbld = dest.is_scalar ? bld.scalar_group() : bld;
4782
4783 switch (instr->intrinsic) {
4784 case nir_intrinsic_load_btd_global_arg_addr_intel:
4785 xbld.MOV(dest, retype(payload.global_arg_ptr, dest.type));
4786 break;
4787
4788 case nir_intrinsic_load_btd_local_arg_addr_intel:
4789 xbld.MOV(dest, retype(payload.local_arg_ptr, dest.type));
4790 break;
4791
4792 case nir_intrinsic_load_btd_shader_type_intel:
4793 payload.load_shader_type(xbld, dest);
4794 break;
4795
4796 default:
4797 fs_nir_emit_intrinsic(ntb, bld, instr);
4798 break;
4799 }
4800 }
4801
4802 static brw_reduce_op
4803 brw_reduce_op_for_nir_reduction_op(nir_op op)
4804 {
4805 switch (op) {
4806 case nir_op_iadd: return BRW_REDUCE_OP_ADD;
4807 case nir_op_fadd: return BRW_REDUCE_OP_ADD;
4808 case nir_op_imul: return BRW_REDUCE_OP_MUL;
4809 case nir_op_fmul: return BRW_REDUCE_OP_MUL;
4810 case nir_op_imin: return BRW_REDUCE_OP_MIN;
4811 case nir_op_umin: return BRW_REDUCE_OP_MIN;
4812 case nir_op_fmin: return BRW_REDUCE_OP_MIN;
4813 case nir_op_imax: return BRW_REDUCE_OP_MAX;
4814 case nir_op_umax: return BRW_REDUCE_OP_MAX;
4815 case nir_op_fmax: return BRW_REDUCE_OP_MAX;
4816 case nir_op_iand: return BRW_REDUCE_OP_AND;
4817 case nir_op_ior: return BRW_REDUCE_OP_OR;
4818 case nir_op_ixor: return BRW_REDUCE_OP_XOR;
4819 default:
4820 unreachable("Invalid reduction operation");
4821 }
4822 }
4823
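/* Resolve the surface source of an image intrinsic into a uniformized
 * unsigned value usable as a surface or surface-handle source.
 */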
4824 static brw_reg
4825 get_nir_image_intrinsic_image(nir_to_brw_state &ntb, const brw::fs_builder &bld,
4826 nir_intrinsic_instr *instr)
4827 {
4828 brw_reg surf_index = get_nir_src_imm(ntb, instr->src[0]);
4829 enum brw_reg_type type = brw_type_with_size(BRW_TYPE_UD,
4830 brw_type_size_bits(surf_index.type));
4831
4832 return bld.emit_uniformize(retype(surf_index, type));
4833 }
4834
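/* Resolve the buffer index source of an SSBO/UBO intrinsic into a
 * uniformized unsigned value. If no_mask_handle is provided, it is set to
 * true when the index is already a scalar or an immediate.
 */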
4835 static brw_reg
4836 get_nir_buffer_intrinsic_index(nir_to_brw_state &ntb, const brw::fs_builder &bld,
4837 nir_intrinsic_instr *instr, bool *no_mask_handle = NULL)
4838 {
4839 /* SSBO stores are weird in that their index is in src[1] */
4840 const bool is_store =
4841 instr->intrinsic == nir_intrinsic_store_ssbo ||
4842 instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
4843 nir_src src = is_store ? instr->src[1] : instr->src[0];
4844
4845 brw_reg surf_index = get_nir_src_imm(ntb, src);
4846
4847 if (no_mask_handle)
4848 *no_mask_handle = surf_index.is_scalar || surf_index.file == IMM;
4849
4850 enum brw_reg_type type = brw_type_with_size(BRW_TYPE_UD,
4851 brw_type_size_bits(surf_index.type));
4852
4853 return bld.emit_uniformize(retype(surf_index, type));
4854 }
4855
4856 /**
4857 * The offsets we get from NIR act as if each SIMD channel has its own blob
4858 * of contiguous space. However, if we actually place each SIMD channel in
4859 * its own space, we end up with terrible cache performance because each SIMD
4860 * channel accesses a different cache line even when they're all accessing the
4861 * same byte offset. To deal with this problem, we swizzle the address using
4862 * a simple algorithm which ensures that any time a SIMD message reads or
4863 * writes the same address, it's all in the same cache line. We have to keep
4864 * the bottom two bits fixed so that we can read/write up to a dword at a time
4865 * and the individual element is contiguous. We do this by splitting the
4866 * address as follows:
4867 *
4868 *    31                             4-6                      2          0
4869 *    +-------------------------------+------------+----------+
4870 *    |        Hi address bits        | chan index | addr low |
4871 *    +-------------------------------+------------+----------+
4872 *
4873 * In other words, the bottom two address bits stay, and the top 30 get
4874 * shifted up so that we can stick the SIMD channel index in the middle. This
4875 * way, we can access 8, 16, or 32-bit elements and, when accessing a 32-bit
4876 * element at the same logical offset, the scratch read/write instruction acts
4877 * on contiguous elements and we get good cache locality.
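 *
 * For example, in SIMD16 (chan_index_bits == 4), a byte offset of 0x35 in
 * channel 7 becomes ((0x35 & ~3) << 4) | (7 << 2) | (0x35 & 3) = 0x35d.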
4878 */
4879 static brw_reg
4880 swizzle_nir_scratch_addr(nir_to_brw_state &ntb,
4881 const brw::fs_builder &bld,
4882 const nir_src &nir_addr_src,
4883 bool in_dwords)
4884 {
4885 fs_visitor &s = ntb.s;
4886
4887 const brw_reg chan_index = bld.LOAD_SUBGROUP_INVOCATION();
4888 const unsigned chan_index_bits = ffs(s.dispatch_width) - 1;
4889
4890 if (nir_src_is_const(nir_addr_src)) {
4891 unsigned nir_addr = nir_src_as_uint(nir_addr_src);
4892 if (in_dwords) {
4893 /* In this case, we know the address is aligned to a DWORD and we want
4894 * the final address in DWORDs.
4895 */
4896 return bld.OR(chan_index,
4897 brw_imm_ud(nir_addr << (chan_index_bits - 2)));
4898 } else {
4899 /* This case is substantially more annoying because we have to pay
4900 * attention to those pesky two bottom bits.
4901 */
4902 unsigned addr_hi = (nir_addr & ~0x3u) << chan_index_bits;
4903 unsigned addr_lo = (nir_addr & 0x3u);
4904
4905 return bld.OR(bld.SHL(chan_index, brw_imm_ud(2)),
4906 brw_imm_ud(addr_lo | addr_hi));
4907 }
4908 }
4909
4910 const brw_reg nir_addr =
4911 retype(get_nir_src(ntb, nir_addr_src), BRW_TYPE_UD);
4912
4913 if (in_dwords) {
4914 /* In this case, we know the address is aligned to a DWORD and we want
4915 * the final address in DWORDs.
4916 */
4917 return bld.OR(bld.SHL(nir_addr, brw_imm_ud(chan_index_bits - 2)),
4918 chan_index);
4919 } else {
4920 /* This case is substantially more annoying because we have to pay
4921 * attention to those pesky two bottom bits.
4922 */
4923 brw_reg chan_addr = bld.SHL(chan_index, brw_imm_ud(2));
4924 brw_reg addr_bits =
4925 bld.OR(bld.AND(nir_addr, brw_imm_ud(0x3u)),
4926 bld.SHL(bld.AND(nir_addr, brw_imm_ud(~0x3u)),
4927 brw_imm_ud(chan_index_bits)));
4928 return bld.OR(addr_bits, chan_addr);
4929 }
4930 }
4931
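/* Pick a block message size (in dwords): the largest power of two not
 * exceeding the requested dword count, clamped to the range used here
 * (8..64 with LSC, 8..32 without).
 */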
4932 static unsigned
4933 choose_block_size_dwords(const intel_device_info *devinfo, unsigned dwords)
4934 {
4935 const unsigned min_block = 8;
4936 const unsigned max_block = devinfo->has_lsc ? 64 : 32;
4937
4938 const unsigned block = 1 << util_logbase2(dwords);
4939
4940 return CLAMP(block, min_block, max_block);
4941 }
4942
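/* Add an immediate byte offset to a 64-bit (A64) address. On platforms
 * without 64-bit integer support, the addition is split into a 32-bit add
 * of the low dword with a conditional carry into the high dword.
 */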
4943 static brw_reg
4944 increment_a64_address(const fs_builder &_bld, brw_reg address, uint32_t v, bool use_no_mask)
4945 {
4946 const fs_builder bld = use_no_mask ? _bld.exec_all().group(8, 0) : _bld;
4947
4948 if (bld.shader->devinfo->has_64bit_int) {
4949 struct brw_reg imm = brw_imm_reg(address.type);
4950 imm.u64 = v;
4951 return bld.ADD(address, imm);
4952 } else {
4953 brw_reg dst = bld.vgrf(BRW_TYPE_UQ);
4954 brw_reg dst_low = subscript(dst, BRW_TYPE_UD, 0);
4955 brw_reg dst_high = subscript(dst, BRW_TYPE_UD, 1);
4956 brw_reg src_low = subscript(address, BRW_TYPE_UD, 0);
4957 brw_reg src_high = subscript(address, BRW_TYPE_UD, 1);
4958
4959 /* Add low and if that overflows, add carry to high. */
4960 bld.ADD(dst_low, src_low, brw_imm_ud(v))->conditional_mod = BRW_CONDITIONAL_O;
4961 bld.ADD(dst_high, src_high, brw_imm_ud(0x1))->predicate = BRW_PREDICATE_NORMAL;
4962 return dst_low;
4963 }
4964 }
4965
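/* Emit a memory fence or interlock message on the given shared function
 * and return the register that receives the fence completion write-back.
 */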
4966 static brw_reg
4967 emit_fence(const fs_builder &bld, enum opcode opcode,
4968 uint8_t sfid, uint32_t desc,
4969 bool commit_enable, uint8_t bti)
4970 {
4971 assert(opcode == SHADER_OPCODE_INTERLOCK ||
4972 opcode == SHADER_OPCODE_MEMORY_FENCE);
4973
4974 brw_reg dst = bld.vgrf(BRW_TYPE_UD);
4975 fs_inst *fence = bld.emit(opcode, dst, brw_vec8_grf(0, 0),
4976 brw_imm_ud(commit_enable),
4977 brw_imm_ud(bti));
4978 fence->sfid = sfid;
4979 fence->desc = desc;
4980
4981 return dst;
4982 }
4983
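/* Translate the NIR memory scope of a barrier intrinsic into an LSC fence
 * message descriptor. Device/queue-family scopes (and barriers without an
 * explicit scope) use a tile-scoped fence with cache eviction, workgroup
 * scope uses a threadgroup fence, and narrower scopes keep the default
 * local fence with no flush.
 */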
4984 static uint32_t
4985 lsc_fence_descriptor_for_intrinsic(const struct intel_device_info *devinfo,
4986 nir_intrinsic_instr *instr)
4987 {
4988 assert(devinfo->has_lsc);
4989
4990 enum lsc_fence_scope scope = LSC_FENCE_LOCAL;
4991 enum lsc_flush_type flush_type = LSC_FLUSH_TYPE_NONE;
4992
4993 if (nir_intrinsic_has_memory_scope(instr)) {
4994 switch (nir_intrinsic_memory_scope(instr)) {
4995 case SCOPE_DEVICE:
4996 case SCOPE_QUEUE_FAMILY:
4997 scope = LSC_FENCE_TILE;
4998 flush_type = LSC_FLUSH_TYPE_EVICT;
4999 break;
5000 case SCOPE_WORKGROUP:
5001 scope = LSC_FENCE_THREADGROUP;
5002 break;
5003 case SCOPE_SHADER_CALL:
5004 case SCOPE_INVOCATION:
5005 case SCOPE_SUBGROUP:
5006 case SCOPE_NONE:
5007 break;
5008 }
5009 } else {
5010 /* No scope defined. */
5011 scope = LSC_FENCE_TILE;
5012 flush_type = LSC_FLUSH_TYPE_EVICT;
5013 }
5014 return lsc_fence_msg_desc(devinfo, scope, flush_type, true);
5015 }
5016
5017 /**
5018 * Create a MOV to read the timestamp register.
5019 */
5020 static brw_reg
5021 get_timestamp(const fs_builder &bld)
5022 {
5023 fs_visitor &s = *bld.shader;
5024
5025 brw_reg ts = brw_reg(retype(brw_vec4_reg(ARF,
5026 BRW_ARF_TIMESTAMP, 0), BRW_TYPE_UD));
5027
5028 brw_reg dst = brw_vgrf(s.alloc.allocate(1), BRW_TYPE_UD);
5029
5030 /* We want to read the 3 fields we care about even if some channels are
5031 * not enabled in the dispatch.
5032 */
5033 bld.group(4, 0).exec_all().MOV(dst, ts);
5034
5035 return dst;
5036 }
5037
5038 static unsigned
5039 component_from_intrinsic(nir_intrinsic_instr *instr)
5040 {
5041 if (nir_intrinsic_has_component(instr))
5042 return nir_intrinsic_component(instr);
5043 else
5044 return 0;
5045 }
5046
5047 static void
5048 adjust_handle_and_offset(const fs_builder &bld,
5049 brw_reg &urb_handle,
5050 unsigned &urb_global_offset)
5051 {
5052 /* Make sure that the URB global offset is below 2048 (2^11), because
5053 * that's the maximum value that can be encoded in the Message Descriptor.
5054 */
5055 unsigned adjustment = (urb_global_offset >> 11) << 11;
5056
5057 if (adjustment) {
5058 fs_builder ubld8 = bld.group(8, 0).exec_all();
5059 /* Allocate new register to not overwrite the shared URB handle. */
5060 urb_handle = ubld8.ADD(urb_handle, brw_imm_ud(adjustment));
5061 urb_global_offset -= adjustment;
5062 }
5063 }
5064
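/* Emit a constant-offset vec4 URB write, one SIMD8 quarter of the dispatch
 * at a time. Undefined payload slots shift the data up to the destination
 * component offset and the channel mask selects the components actually
 * written.
 */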
5065 static void
5066 emit_urb_direct_vec4_write(const fs_builder &bld,
5067 unsigned urb_global_offset,
5068 const brw_reg &src,
5069 brw_reg urb_handle,
5070 unsigned dst_comp_offset,
5071 unsigned comps,
5072 unsigned mask)
5073 {
5074 for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
5075 fs_builder bld8 = bld.group(8, q);
5076
5077 brw_reg payload_srcs[8];
5078 unsigned length = 0;
5079
5080 for (unsigned i = 0; i < dst_comp_offset; i++)
5081 payload_srcs[length++] = reg_undef;
5082
5083 for (unsigned c = 0; c < comps; c++)
5084 payload_srcs[length++] = quarter(offset(src, bld, c), q);
5085
5086 brw_reg srcs[URB_LOGICAL_NUM_SRCS];
5087 srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
5088 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16);
5089 srcs[URB_LOGICAL_SRC_DATA] = brw_vgrf(bld.shader->alloc.allocate(length),
5090 BRW_TYPE_F);
5091 srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length);
5092 bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
5093
5094 fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
5095 reg_undef, srcs, ARRAY_SIZE(srcs));
5096 inst->offset = urb_global_offset;
5097 assert(inst->offset < 2048);
5098 }
5099 }
5100
5101 static void
5102 emit_urb_direct_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
5103 const brw_reg &src, brw_reg urb_handle)
5104 {
5105 assert(nir_src_bit_size(instr->src[0]) == 32);
5106
5107 nir_src *offset_nir_src = nir_get_io_offset_src(instr);
5108 assert(nir_src_is_const(*offset_nir_src));
5109
5110 const unsigned comps = nir_src_num_components(instr->src[0]);
5111 assert(comps <= 4);
5112
5113 const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
5114 nir_src_as_uint(*offset_nir_src) +
5115 component_from_intrinsic(instr);
5116
5117 /* URB writes are vec4 aligned but the intrinsic offsets are in dwords.
5118 * We can write up to 8 dwords, so a single vec4 write is enough.
5119 */
5120 const unsigned comp_shift = offset_in_dwords % 4;
5121 const unsigned mask = nir_intrinsic_write_mask(instr) << comp_shift;
5122
5123 unsigned urb_global_offset = offset_in_dwords / 4;
5124 adjust_handle_and_offset(bld, urb_handle, urb_global_offset);
5125
5126 emit_urb_direct_vec4_write(bld, urb_global_offset, src, urb_handle,
5127 comp_shift, comps, mask);
5128 }
5129
5130 static void
5131 emit_urb_direct_vec4_write_xe2(const fs_builder &bld,
5132 unsigned offset_in_bytes,
5133 const brw_reg &src,
5134 brw_reg urb_handle,
5135 unsigned comps,
5136 unsigned mask)
5137 {
5138 const struct intel_device_info *devinfo = bld.shader->devinfo;
5139 const unsigned runit = reg_unit(devinfo);
5140 const unsigned write_size = 8 * runit;
5141
5142 if (offset_in_bytes > 0) {
5143 fs_builder bldall = bld.group(write_size, 0).exec_all();
5144 urb_handle = bldall.ADD(urb_handle, brw_imm_ud(offset_in_bytes));
5145 }
5146
5147 for (unsigned q = 0; q < bld.dispatch_width() / write_size; q++) {
5148 fs_builder hbld = bld.group(write_size, q);
5149
5150 assert(comps <= 4);
5151 brw_reg payload_srcs[4];
5152
5153 for (unsigned c = 0; c < comps; c++)
5154 payload_srcs[c] = horiz_offset(offset(src, bld, c), write_size * q);
5155
5156 brw_reg srcs[URB_LOGICAL_NUM_SRCS];
5157 srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
5158 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16);
5159 int nr = bld.shader->alloc.allocate(comps * runit);
5160 srcs[URB_LOGICAL_SRC_DATA] = brw_vgrf(nr, BRW_TYPE_F);
5161 srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(comps);
5162 hbld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, comps, 0);
5163
5164 hbld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
5165 reg_undef, srcs, ARRAY_SIZE(srcs));
5166 }
5167 }
5168
5169 static void
5170 emit_urb_direct_writes_xe2(const fs_builder &bld, nir_intrinsic_instr *instr,
5171 const brw_reg &src, brw_reg urb_handle)
5172 {
5173 assert(nir_src_bit_size(instr->src[0]) == 32);
5174
5175 nir_src *offset_nir_src = nir_get_io_offset_src(instr);
5176 assert(nir_src_is_const(*offset_nir_src));
5177
5178 const unsigned comps = nir_src_num_components(instr->src[0]);
5179 assert(comps <= 4);
5180
5181 const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
5182 nir_src_as_uint(*offset_nir_src) +
5183 component_from_intrinsic(instr);
5184
5185 const unsigned mask = nir_intrinsic_write_mask(instr);
5186
5187 emit_urb_direct_vec4_write_xe2(bld, offset_in_dwords * 4, src,
5188 urb_handle, comps, mask);
5189 }
5190
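/* Emit an indirect vec4 URB write using per-slot offsets: the per-channel
 * dword offset is converted to a vec4 slot offset and passed through
 * URB_LOGICAL_SRC_PER_SLOT_OFFSETS, again one SIMD8 quarter at a time.
 */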
5191 static void
5192 emit_urb_indirect_vec4_write(const fs_builder &bld,
5193 const brw_reg &offset_src,
5194 unsigned base,
5195 const brw_reg &src,
5196 brw_reg urb_handle,
5197 unsigned dst_comp_offset,
5198 unsigned comps,
5199 unsigned mask)
5200 {
5201 for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
5202 fs_builder bld8 = bld.group(8, q);
5203
5204 /* offset is always positive, so signedness doesn't matter */
5205 assert(offset_src.type == BRW_TYPE_D || offset_src.type == BRW_TYPE_UD);
5206 brw_reg qtr = bld8.MOV(quarter(retype(offset_src, BRW_TYPE_UD), q));
5207 brw_reg off = bld8.SHR(bld8.ADD(qtr, brw_imm_ud(base)), brw_imm_ud(2));
5208
5209 brw_reg payload_srcs[8];
5210 unsigned length = 0;
5211
5212 for (unsigned i = 0; i < dst_comp_offset; i++)
5213 payload_srcs[length++] = reg_undef;
5214
5215 for (unsigned c = 0; c < comps; c++)
5216 payload_srcs[length++] = quarter(offset(src, bld, c), q);
5217
5218 brw_reg srcs[URB_LOGICAL_NUM_SRCS];
5219 srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
5220 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
5221 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16);
5222 srcs[URB_LOGICAL_SRC_DATA] = brw_vgrf(bld.shader->alloc.allocate(length),
5223 BRW_TYPE_F);
5224 srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length);
5225 bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
5226
5227 fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
5228 reg_undef, srcs, ARRAY_SIZE(srcs));
5229 inst->offset = 0;
5230 }
5231 }
5232
5233 static void
5234 emit_urb_indirect_writes_mod(const fs_builder &bld, nir_intrinsic_instr *instr,
5235 const brw_reg &src, const brw_reg &offset_src,
5236 brw_reg urb_handle, unsigned mod)
5237 {
5238 assert(nir_src_bit_size(instr->src[0]) == 32);
5239
5240 const unsigned comps = nir_src_num_components(instr->src[0]);
5241 assert(comps <= 4);
5242
5243 const unsigned base_in_dwords = nir_intrinsic_base(instr) +
5244 component_from_intrinsic(instr);
5245
5246 const unsigned comp_shift = mod;
5247 const unsigned mask = nir_intrinsic_write_mask(instr) << comp_shift;
5248
5249 emit_urb_indirect_vec4_write(bld, offset_src, base_in_dwords, src,
5250 urb_handle, comp_shift, comps, mask);
5251 }
5252
5253 static void
5254 emit_urb_indirect_writes_xe2(const fs_builder &bld, nir_intrinsic_instr *instr,
5255 const brw_reg &src, const brw_reg &offset_src,
5256 brw_reg urb_handle)
5257 {
5258 assert(nir_src_bit_size(instr->src[0]) == 32);
5259
5260 const struct intel_device_info *devinfo = bld.shader->devinfo;
5261 const unsigned runit = reg_unit(devinfo);
5262 const unsigned write_size = 8 * runit;
5263
5264 const unsigned comps = nir_src_num_components(instr->src[0]);
5265 assert(comps <= 4);
5266
5267 const unsigned base_in_dwords = nir_intrinsic_base(instr) +
5268 component_from_intrinsic(instr);
5269
5270 if (base_in_dwords > 0) {
5271 fs_builder bldall = bld.group(write_size, 0).exec_all();
5272 urb_handle = bldall.ADD(urb_handle, brw_imm_ud(base_in_dwords * 4));
5273 }
5274
5275 const unsigned mask = nir_intrinsic_write_mask(instr);
5276
5277 for (unsigned q = 0; q < bld.dispatch_width() / write_size; q++) {
5278 fs_builder wbld = bld.group(write_size, q);
5279
5280 brw_reg payload_srcs[4];
5281
5282 for (unsigned c = 0; c < comps; c++)
5283 payload_srcs[c] = horiz_offset(offset(src, bld, c), write_size * q);
5284
5285 brw_reg addr =
5286 wbld.ADD(wbld.SHL(retype(horiz_offset(offset_src, write_size * q),
5287 BRW_TYPE_UD),
5288 brw_imm_ud(2)), urb_handle);
5289
5290 brw_reg srcs[URB_LOGICAL_NUM_SRCS];
5291 srcs[URB_LOGICAL_SRC_HANDLE] = addr;
5292 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16);
5293 int nr = bld.shader->alloc.allocate(comps * runit);
5294 srcs[URB_LOGICAL_SRC_DATA] = brw_vgrf(nr, BRW_TYPE_F);
5295 srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(comps);
5296 wbld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, comps, 0);
5297
5298 wbld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
5299 reg_undef, srcs, ARRAY_SIZE(srcs));
5300 }
5301 }
5302
5303 static void
5304 emit_urb_indirect_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
5305 const brw_reg &src, const brw_reg &offset_src,
5306 brw_reg urb_handle)
5307 {
5308 assert(nir_src_bit_size(instr->src[0]) == 32);
5309
5310 const unsigned comps = nir_src_num_components(instr->src[0]);
5311 assert(comps <= 4);
5312
5313 const unsigned base_in_dwords = nir_intrinsic_base(instr) +
5314 component_from_intrinsic(instr);
5315
5316 /* Use the URB write message that allows different offsets per slot. The
5317 * offset is in units of vec4s (128 bits), so we emit a write for each
5318 * component, replicating it in the sources and applying the appropriate
5319 * mask based on the dword offset.
5320 */
5321
5322 for (unsigned c = 0; c < comps; c++) {
5323 if (((1 << c) & nir_intrinsic_write_mask(instr)) == 0)
5324 continue;
5325
5326 brw_reg src_comp = offset(src, bld, c);
5327
5328 for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
5329 fs_builder bld8 = bld.group(8, q);
5330
5331 /* offset is always positive, so signedness doesn't matter */
5332 assert(offset_src.type == BRW_TYPE_D ||
5333 offset_src.type == BRW_TYPE_UD);
5334
5335 brw_reg off =
5336 bld8.ADD(quarter(retype(offset_src, BRW_TYPE_UD), q),
5337 brw_imm_ud(c + base_in_dwords));
5338 brw_reg m = bld8.AND(off, brw_imm_ud(0x3));
5339 brw_reg t = bld8.SHL(bld8.MOV(brw_imm_ud(1)), m);
5340 brw_reg mask = bld8.SHL(t, brw_imm_ud(16));
5341 brw_reg final_offset = bld8.SHR(off, brw_imm_ud(2));
5342
5343 brw_reg payload_srcs[4];
5344 unsigned length = 0;
5345
5346 for (unsigned j = 0; j < 4; j++)
5347 payload_srcs[length++] = quarter(src_comp, q);
5348
5349 brw_reg srcs[URB_LOGICAL_NUM_SRCS];
5350 srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
5351 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = final_offset;
5352 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask;
5353 srcs[URB_LOGICAL_SRC_DATA] = brw_vgrf(bld.shader->alloc.allocate(length),
5354 BRW_TYPE_F);
5355 srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length);
5356 bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
5357
5358 fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
5359 reg_undef, srcs, ARRAY_SIZE(srcs));
5360 inst->offset = 0;
5361 }
5362 }
5363 }
5364
5365 static void
5366 emit_urb_direct_reads(const fs_builder &bld, nir_intrinsic_instr *instr,
5367 const brw_reg &dest, brw_reg urb_handle)
5368 {
5369 assert(instr->def.bit_size == 32);
5370
5371 unsigned comps = instr->def.num_components;
5372 if (comps == 0)
5373 return;
5374
5375 nir_src *offset_nir_src = nir_get_io_offset_src(instr);
5376 assert(nir_src_is_const(*offset_nir_src));
5377
5378 const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
5379 nir_src_as_uint(*offset_nir_src) +
5380 component_from_intrinsic(instr);
5381
5382 unsigned urb_global_offset = offset_in_dwords / 4;
5383 adjust_handle_and_offset(bld, urb_handle, urb_global_offset);
5384
5385 const unsigned comp_offset = offset_in_dwords % 4;
5386 const unsigned num_regs = comp_offset + comps;
5387
5388 fs_builder ubld8 = bld.group(8, 0).exec_all();
5389 brw_reg data = ubld8.vgrf(BRW_TYPE_UD, num_regs);
5390 brw_reg srcs[URB_LOGICAL_NUM_SRCS];
5391 srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
5392
5393 fs_inst *inst = ubld8.emit(SHADER_OPCODE_URB_READ_LOGICAL, data,
5394 srcs, ARRAY_SIZE(srcs));
5395 inst->offset = urb_global_offset;
5396 assert(inst->offset < 2048);
5397 inst->size_written = num_regs * REG_SIZE;
5398
5399 for (unsigned c = 0; c < comps; c++) {
5400 brw_reg dest_comp = offset(dest, bld, c);
5401 brw_reg data_comp = horiz_stride(offset(data, ubld8, comp_offset + c), 0);
5402 bld.MOV(retype(dest_comp, BRW_TYPE_UD), data_comp);
5403 }
5404 }
5405
5406 static void
5407 emit_urb_direct_reads_xe2(const fs_builder &bld, nir_intrinsic_instr *instr,
5408 const brw_reg &dest, brw_reg urb_handle)
5409 {
5410 assert(instr->def.bit_size == 32);
5411
5412 unsigned comps = instr->def.num_components;
5413 if (comps == 0)
5414 return;
5415
5416 nir_src *offset_nir_src = nir_get_io_offset_src(instr);
5417 assert(nir_src_is_const(*offset_nir_src));
5418
5419 fs_builder ubld16 = bld.group(16, 0).exec_all();
5420
5421 const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
5422 nir_src_as_uint(*offset_nir_src) +
5423 component_from_intrinsic(instr);
5424
5425 if (offset_in_dwords > 0)
5426 urb_handle = ubld16.ADD(urb_handle, brw_imm_ud(offset_in_dwords * 4));
5427
5428 brw_reg data = ubld16.vgrf(BRW_TYPE_UD, comps);
5429 brw_reg srcs[URB_LOGICAL_NUM_SRCS];
5430 srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
5431
5432 fs_inst *inst = ubld16.emit(SHADER_OPCODE_URB_READ_LOGICAL,
5433 data, srcs, ARRAY_SIZE(srcs));
5434 inst->size_written = 2 * comps * REG_SIZE;
5435
5436 for (unsigned c = 0; c < comps; c++) {
5437 brw_reg dest_comp = offset(dest, bld, c);
5438 brw_reg data_comp = horiz_stride(offset(data, ubld16, c), 0);
5439 bld.MOV(retype(dest_comp, BRW_TYPE_UD), data_comp);
5440 }
5441 }
5442
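/* Indirect URB reads for pre-Xe2 platforms. Each component is read with a
 * per-slot vec4 offset, and the desired dword within the returned vec4 is
 * then selected with an indirect MOV driven by the 0..7 channel sequence.
 */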
5443 static void
5444 emit_urb_indirect_reads(const fs_builder &bld, nir_intrinsic_instr *instr,
5445 const brw_reg &dest, const brw_reg &offset_src, brw_reg urb_handle)
5446 {
5447 assert(instr->def.bit_size == 32);
5448
5449 unsigned comps = instr->def.num_components;
5450 if (comps == 0)
5451 return;
5452
5453 brw_reg seq_ud;
5454 {
5455 fs_builder ubld8 = bld.group(8, 0).exec_all();
5456 seq_ud = ubld8.vgrf(BRW_TYPE_UD, 1);
5457 brw_reg seq_uw = ubld8.vgrf(BRW_TYPE_UW, 1);
5458 ubld8.MOV(seq_uw, brw_reg(brw_imm_v(0x76543210)));
5459 ubld8.MOV(seq_ud, seq_uw);
5460 seq_ud = ubld8.SHL(seq_ud, brw_imm_ud(2));
5461 }
5462
5463 const unsigned base_in_dwords = nir_intrinsic_base(instr) +
5464 component_from_intrinsic(instr);
5465
5466 for (unsigned c = 0; c < comps; c++) {
5467 for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
5468 fs_builder bld8 = bld.group(8, q);
5469
5470 /* offset is always positive, so signedness doesn't matter */
5471 assert(offset_src.type == BRW_TYPE_D ||
5472 offset_src.type == BRW_TYPE_UD);
5473 brw_reg off =
5474 bld8.ADD(bld8.MOV(quarter(retype(offset_src, BRW_TYPE_UD), q)),
5475 brw_imm_ud(base_in_dwords + c));
5476
5477 STATIC_ASSERT(IS_POT(REG_SIZE) && REG_SIZE > 1);
5478
5479 brw_reg comp;
5480 comp = bld8.AND(off, brw_imm_ud(0x3));
5481 comp = bld8.SHL(comp, brw_imm_ud(ffs(REG_SIZE) - 1));
5482 comp = bld8.ADD(comp, seq_ud);
5483
5484 off = bld8.SHR(off, brw_imm_ud(2));
5485
5486 brw_reg srcs[URB_LOGICAL_NUM_SRCS];
5487 srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
5488 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
5489
5490 brw_reg data = bld8.vgrf(BRW_TYPE_UD, 4);
5491
5492 fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_READ_LOGICAL,
5493 data, srcs, ARRAY_SIZE(srcs));
5494 inst->offset = 0;
5495 inst->size_written = 4 * REG_SIZE;
5496
5497 brw_reg dest_comp = offset(dest, bld, c);
5498 bld8.emit(SHADER_OPCODE_MOV_INDIRECT,
5499 retype(quarter(dest_comp, q), BRW_TYPE_UD),
5500 data,
5501 comp,
5502 brw_imm_ud(4 * REG_SIZE));
5503 }
5504 }
5505 }
5506
5507 static void
5508 emit_urb_indirect_reads_xe2(const fs_builder &bld, nir_intrinsic_instr *instr,
5509 const brw_reg &dest, const brw_reg &offset_src,
5510 brw_reg urb_handle)
5511 {
5512 assert(instr->def.bit_size == 32);
5513
5514 unsigned comps = instr->def.num_components;
5515 if (comps == 0)
5516 return;
5517
5518 fs_builder ubld16 = bld.group(16, 0).exec_all();
5519
5520 const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
5521 component_from_intrinsic(instr);
5522
5523 if (offset_in_dwords > 0)
5524 urb_handle = ubld16.ADD(urb_handle, brw_imm_ud(offset_in_dwords * 4));
5525
5526 brw_reg data = ubld16.vgrf(BRW_TYPE_UD, comps);
5527
5528 for (unsigned q = 0; q < bld.dispatch_width() / 16; q++) {
5529 fs_builder wbld = bld.group(16, q);
5530
5531 brw_reg addr = wbld.SHL(retype(horiz_offset(offset_src, 16 * q),
5532 BRW_TYPE_UD),
5533 brw_imm_ud(2));
5534
5535 brw_reg srcs[URB_LOGICAL_NUM_SRCS];
5536 srcs[URB_LOGICAL_SRC_HANDLE] = wbld.ADD(addr, urb_handle);
5537
5538 fs_inst *inst = wbld.emit(SHADER_OPCODE_URB_READ_LOGICAL,
5539 data, srcs, ARRAY_SIZE(srcs));
5540 inst->size_written = 2 * comps * REG_SIZE;
5541
5542 for (unsigned c = 0; c < comps; c++) {
5543 brw_reg dest_comp = horiz_offset(offset(dest, bld, c), 16 * q);
5544 brw_reg data_comp = offset(data, wbld, c);
5545 wbld.MOV(retype(dest_comp, BRW_TYPE_UD), data_comp);
5546 }
5547 }
5548 }
5549
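/* Store a task/mesh output to the URB, picking the direct or indirect path
 * and the pre-Xe2 or Xe2+ variant based on whether the offset source is
 * constant and on the device generation.
 */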
5550 static void
5551 emit_task_mesh_store(nir_to_brw_state &ntb,
5552 const fs_builder &bld, nir_intrinsic_instr *instr,
5553 const brw_reg &urb_handle)
5554 {
5555 brw_reg src = get_nir_src(ntb, instr->src[0], -1);
5556 nir_src *offset_nir_src = nir_get_io_offset_src(instr);
5557
5558 if (nir_src_is_const(*offset_nir_src)) {
5559 if (bld.shader->devinfo->ver >= 20)
5560 emit_urb_direct_writes_xe2(bld, instr, src, urb_handle);
5561 else
5562 emit_urb_direct_writes(bld, instr, src, urb_handle);
5563 } else {
5564 if (bld.shader->devinfo->ver >= 20) {
5565 emit_urb_indirect_writes_xe2(bld, instr, src, get_nir_src(ntb, *offset_nir_src), urb_handle);
5566 return;
5567 }
5568 bool use_mod = false;
5569 unsigned mod;
5570
5571 /* Try to calculate the value of (offset + base) % 4. If we can do
5572 * this, then we can do indirect writes using only 1 URB write.
5573 */
5574 use_mod = nir_mod_analysis(nir_get_scalar(offset_nir_src->ssa, 0), nir_type_uint, 4, &mod);
5575 if (use_mod) {
5576 mod += nir_intrinsic_base(instr) + component_from_intrinsic(instr);
5577 mod %= 4;
5578 }
5579
5580 if (use_mod) {
5581 emit_urb_indirect_writes_mod(bld, instr, src, get_nir_src(ntb, *offset_nir_src), urb_handle, mod);
5582 } else {
5583 emit_urb_indirect_writes(bld, instr, src, get_nir_src(ntb, *offset_nir_src), urb_handle);
5584 }
5585 }
5586 }
5587
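/* Load a task/mesh input or output from the URB, mirroring the path
 * selection of the store case above (direct vs. indirect, pre-Xe2 vs. Xe2+).
 */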
5588 static void
5589 emit_task_mesh_load(nir_to_brw_state &ntb,
5590 const fs_builder &bld, nir_intrinsic_instr *instr,
5591 const brw_reg &urb_handle)
5592 {
5593 brw_reg dest = get_nir_def(ntb, instr->def);
5594 nir_src *offset_nir_src = nir_get_io_offset_src(instr);
5595
5596 /* TODO(mesh): for per_vertex and per_primitive, if we could keep around
5597 * the non-array-index offset, we could use it to decide whether we can
5598 * perform a single large aligned read instead of one per component.
5599 */
5600
5601 if (nir_src_is_const(*offset_nir_src)) {
5602 if (bld.shader->devinfo->ver >= 20)
5603 emit_urb_direct_reads_xe2(bld, instr, dest, urb_handle);
5604 else
5605 emit_urb_direct_reads(bld, instr, dest, urb_handle);
5606 } else {
5607 if (bld.shader->devinfo->ver >= 20)
5608 emit_urb_indirect_reads_xe2(bld, instr, dest, get_nir_src(ntb, *offset_nir_src), urb_handle);
5609 else
5610 emit_urb_indirect_reads(bld, instr, dest, get_nir_src(ntb, *offset_nir_src), urb_handle);
5611 }
5612 }
5613
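/* Intrinsics common to the TASK and MESH stages; anything not handled here
 * falls through to the compute-shader intrinsic handler.
 */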
5614 static void
5615 fs_nir_emit_task_mesh_intrinsic(nir_to_brw_state &ntb, const fs_builder &bld,
5616 nir_intrinsic_instr *instr)
5617 {
5618 fs_visitor &s = ntb.s;
5619
5620 assert(s.stage == MESA_SHADER_MESH || s.stage == MESA_SHADER_TASK);
5621 const task_mesh_thread_payload &payload = s.task_mesh_payload();
5622
5623 brw_reg dest;
5624 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5625 dest = get_nir_def(ntb, instr->def);
5626
5627 switch (instr->intrinsic) {
5628 case nir_intrinsic_load_draw_id:
5629 dest = retype(dest, BRW_TYPE_UD);
5630 bld.MOV(dest, payload.extended_parameter_0);
5631 break;
5632
5633 case nir_intrinsic_load_local_invocation_id:
5634 unreachable("local invocation id should have been lowered earlier");
5635 break;
5636
5637 case nir_intrinsic_load_local_invocation_index:
5638 dest = retype(dest, BRW_TYPE_UD);
5639 bld.MOV(dest, payload.local_index);
5640 break;
5641
5642 case nir_intrinsic_load_num_workgroups:
5643 dest = retype(dest, BRW_TYPE_UD);
5644 bld.MOV(offset(dest, bld, 0), brw_uw1_grf(0, 13)); /* g0.6 >> 16 */
5645 bld.MOV(offset(dest, bld, 1), brw_uw1_grf(0, 8)); /* g0.4 & 0xffff */
5646 bld.MOV(offset(dest, bld, 2), brw_uw1_grf(0, 9)); /* g0.4 >> 16 */
5647 break;
5648
5649 case nir_intrinsic_load_workgroup_index:
5650 dest = retype(dest, BRW_TYPE_UD);
5651 bld.MOV(dest, retype(brw_vec1_grf(0, 1), BRW_TYPE_UD));
5652 break;
5653
5654 default:
5655 fs_nir_emit_cs_intrinsic(ntb, instr);
5656 break;
5657 }
5658 }
5659
5660 static void
5661 fs_nir_emit_task_intrinsic(nir_to_brw_state &ntb,
5662 nir_intrinsic_instr *instr)
5663 {
5664 const fs_builder &bld = ntb.bld;
5665 fs_visitor &s = ntb.s;
5666
5667 assert(s.stage == MESA_SHADER_TASK);
5668 const task_mesh_thread_payload &payload = s.task_mesh_payload();
5669
5670 switch (instr->intrinsic) {
5671 case nir_intrinsic_store_output:
5672 case nir_intrinsic_store_task_payload:
5673 emit_task_mesh_store(ntb, bld, instr, payload.urb_output);
5674 break;
5675
5676 case nir_intrinsic_load_output:
5677 case nir_intrinsic_load_task_payload:
5678 emit_task_mesh_load(ntb, bld, instr, payload.urb_output);
5679 break;
5680
5681 default:
5682 fs_nir_emit_task_mesh_intrinsic(ntb, bld, instr);
5683 break;
5684 }
5685 }
5686
5687 static void
5688 fs_nir_emit_mesh_intrinsic(nir_to_brw_state &ntb,
5689 nir_intrinsic_instr *instr)
5690 {
5691 const fs_builder &bld = ntb.bld;
5692 fs_visitor &s = ntb.s;
5693
5694 assert(s.stage == MESA_SHADER_MESH);
5695 const task_mesh_thread_payload &payload = s.task_mesh_payload();
5696
5697 switch (instr->intrinsic) {
5698 case nir_intrinsic_store_per_primitive_output:
5699 case nir_intrinsic_store_per_vertex_output:
5700 case nir_intrinsic_store_output:
5701 emit_task_mesh_store(ntb, bld, instr, payload.urb_output);
5702 break;
5703
5704 case nir_intrinsic_load_per_vertex_output:
5705 case nir_intrinsic_load_per_primitive_output:
5706 case nir_intrinsic_load_output:
5707 emit_task_mesh_load(ntb, bld, instr, payload.urb_output);
5708 break;
5709
5710 case nir_intrinsic_load_task_payload:
5711 emit_task_mesh_load(ntb, bld, instr, payload.task_urb_input);
5712 break;
5713
5714 default:
5715 fs_nir_emit_task_mesh_intrinsic(ntb, bld, instr);
5716 break;
5717 }
5718 }
5719
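/* Generic intrinsic handler shared by all stages. Stage-specific handlers
 * fall through to this one for intrinsics they don't special-case.
 */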
5720 static void
5721 fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
5722 const fs_builder &bld, nir_intrinsic_instr *instr)
5723 {
5724 const intel_device_info *devinfo = ntb.devinfo;
5725 fs_visitor &s = ntb.s;
5726
5727 /* We handle this as a special case */
5728 if (instr->intrinsic == nir_intrinsic_decl_reg) {
5729 assert(nir_intrinsic_num_array_elems(instr) == 0);
5730 unsigned bit_size = nir_intrinsic_bit_size(instr);
5731 unsigned num_components = nir_intrinsic_num_components(instr);
5732 const brw_reg_type reg_type =
5733 brw_type_with_size(bit_size == 8 ? BRW_TYPE_D : BRW_TYPE_F,
5734 bit_size);
5735
5736 /* Re-use the destination's slot in the table for the register */
5737 ntb.ssa_values[instr->def.index] =
5738 bld.vgrf(reg_type, num_components);
5739 return;
5740 }
5741
5742 brw_reg dest;
5743 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5744 dest = get_nir_def(ntb, instr->def);
5745
5746 const fs_builder xbld = dest.is_scalar ? bld.scalar_group() : bld;
5747
5748 switch (instr->intrinsic) {
5749 case nir_intrinsic_resource_intel: {
5750 ntb.ssa_bind_infos[instr->def.index].valid = true;
5751 ntb.ssa_bind_infos[instr->def.index].bindless =
5752 (nir_intrinsic_resource_access_intel(instr) &
5753 nir_resource_intel_bindless) != 0;
5754 ntb.ssa_bind_infos[instr->def.index].block =
5755 nir_intrinsic_resource_block_intel(instr);
5756 ntb.ssa_bind_infos[instr->def.index].set =
5757 nir_intrinsic_desc_set(instr);
5758 ntb.ssa_bind_infos[instr->def.index].binding =
5759 nir_intrinsic_binding(instr);
5760
5761 dest = retype(dest, BRW_TYPE_UD);
5762 ntb.ssa_values[instr->def.index] = dest;
5763
5764 xbld.MOV(dest,
5765 bld.emit_uniformize(get_nir_src(ntb, instr->src[1])));
5766 break;
5767 }
5768
5769 case nir_intrinsic_load_reg:
5770 case nir_intrinsic_store_reg:
5771 /* Nothing to do with these. */
5772 break;
5773
5774 case nir_intrinsic_load_global_constant_uniform_block_intel:
5775 case nir_intrinsic_load_ssbo_uniform_block_intel:
5776 case nir_intrinsic_load_shared_uniform_block_intel:
5777 case nir_intrinsic_load_global_block_intel:
5778 case nir_intrinsic_store_global_block_intel:
5779 case nir_intrinsic_load_shared_block_intel:
5780 case nir_intrinsic_store_shared_block_intel:
5781 case nir_intrinsic_load_ssbo_block_intel:
5782 case nir_intrinsic_store_ssbo_block_intel:
5783 case nir_intrinsic_image_load:
5784 case nir_intrinsic_image_store:
5785 case nir_intrinsic_image_atomic:
5786 case nir_intrinsic_image_atomic_swap:
5787 case nir_intrinsic_bindless_image_load:
5788 case nir_intrinsic_bindless_image_store:
5789 case nir_intrinsic_bindless_image_atomic:
5790 case nir_intrinsic_bindless_image_atomic_swap:
5791 case nir_intrinsic_load_shared:
5792 case nir_intrinsic_store_shared:
5793 case nir_intrinsic_shared_atomic:
5794 case nir_intrinsic_shared_atomic_swap:
5795 case nir_intrinsic_load_ssbo:
5796 case nir_intrinsic_store_ssbo:
5797 case nir_intrinsic_ssbo_atomic:
5798 case nir_intrinsic_ssbo_atomic_swap:
5799 case nir_intrinsic_load_global:
5800 case nir_intrinsic_load_global_constant:
5801 case nir_intrinsic_store_global:
5802 case nir_intrinsic_global_atomic:
5803 case nir_intrinsic_global_atomic_swap:
5804 case nir_intrinsic_load_scratch:
5805 case nir_intrinsic_store_scratch:
5806 fs_nir_emit_memory_access(ntb, bld, xbld, instr);
5807 break;
5808
5809 case nir_intrinsic_image_size:
5810 case nir_intrinsic_bindless_image_size: {
5811 /* Cube image sizes should have previously been lowered to a 2D array */
5812 assert(nir_intrinsic_image_dim(instr) != GLSL_SAMPLER_DIM_CUBE);
5813
5814 /* Unlike the [un]typed load and store opcodes, the TXS that this turns
5815 * into will handle the binding table index for us in the generator.
5816 * Incidentally, this means that we can handle bindless with exactly the
5817 * same code.
5818 */
5819 brw_reg image = retype(get_nir_src_imm(ntb, instr->src[0]), BRW_TYPE_UD);
5820 image = bld.emit_uniformize(image);
5821
5822 assert(nir_src_as_uint(instr->src[1]) == 0);
5823
5824 brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
5825 if (instr->intrinsic == nir_intrinsic_image_size)
5826 srcs[TEX_LOGICAL_SRC_SURFACE] = image;
5827 else
5828 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image;
5829 srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
5830 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(0);
5831 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);
5832 srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_d(0);
5833
5834 /* Since the image size is always uniform, we can just emit a SIMD8
5835 * query instruction and splat the result out.
5836 */
5837 const fs_builder ubld = bld.scalar_group();
5838
5839 brw_reg tmp = ubld.vgrf(BRW_TYPE_UD, 4);
5840 fs_inst *inst = ubld.emit(SHADER_OPCODE_IMAGE_SIZE_LOGICAL,
5841 tmp, srcs, ARRAY_SIZE(srcs));
5842 inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
5843
5844 for (unsigned c = 0; c < instr->def.num_components; ++c) {
5845 bld.MOV(offset(retype(dest, tmp.type), bld, c),
5846 component(offset(tmp, ubld, c), 0));
5847 }
5848 break;
5849 }
5850
5851 case nir_intrinsic_barrier:
5852 case nir_intrinsic_begin_invocation_interlock:
5853 case nir_intrinsic_end_invocation_interlock: {
5854 bool ugm_fence, slm_fence, tgm_fence, urb_fence;
5855 enum opcode opcode = BRW_OPCODE_NOP;
5856
5857 /* Handling interlock intrinsics here will allow the logic for IVB
5858 * render cache (see below) to be reused.
5859 */
5860
5861 switch (instr->intrinsic) {
5862 case nir_intrinsic_barrier: {
5863 /* Note we only care about the memory part of the
5864 * barrier. The execution part will be taken care
5865 * of by the stage specific intrinsic handler functions.
5866 */
5867 nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
5868 ugm_fence = modes & (nir_var_mem_ssbo | nir_var_mem_global);
5869 slm_fence = modes & nir_var_mem_shared;
5870 tgm_fence = modes & nir_var_image;
5871 urb_fence = modes & (nir_var_shader_out | nir_var_mem_task_payload);
5872 if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
5873 opcode = SHADER_OPCODE_MEMORY_FENCE;
5874 break;
5875 }
5876
5877 case nir_intrinsic_begin_invocation_interlock:
5878 /* For beginInvocationInterlockARB(), we will generate a memory fence
5879 * but with a different opcode so that generator can pick SENDC
5880 * instead of SEND.
5881 */
5882 assert(s.stage == MESA_SHADER_FRAGMENT);
5883 ugm_fence = tgm_fence = true;
5884 slm_fence = urb_fence = false;
5885 opcode = SHADER_OPCODE_INTERLOCK;
5886 break;
5887
5888 case nir_intrinsic_end_invocation_interlock:
5889 /* For endInvocationInterlockARB(), we need to insert a memory fence which
5890 * stalls in the shader until the memory transactions prior to that
5891 * fence are complete. This ensures that the shader does not end before
5892 * any writes from its critical section have landed. Otherwise, you can
5893 * end up with a case where the next invocation on that pixel properly
5894 * stalls for previous FS invocation on its pixel to complete but
5895 * doesn't actually wait for the dataport memory transactions from that
5896 * thread to land before submitting its own.
5897 */
5898 assert(s.stage == MESA_SHADER_FRAGMENT);
5899 ugm_fence = tgm_fence = true;
5900 slm_fence = urb_fence = false;
5901 opcode = SHADER_OPCODE_MEMORY_FENCE;
5902 break;
5903
5904 default:
5905 unreachable("invalid intrinsic");
5906 }
5907
5908 if (opcode == BRW_OPCODE_NOP)
5909 break;
5910
5911 if (s.nir->info.shared_size > 0) {
5912 assert(gl_shader_stage_uses_workgroup(s.stage));
5913 } else {
5914 slm_fence = false;
5915 }
5916
5917 /* If the workgroup fits in a single HW thread, the messages for SLM are
5918 * processed in-order and the shader itself is already synchronized so
5919 * the memory fence is not necessary.
5920 *
5921 * TODO: Check if this applies when many HW threads share the same Data Port.
5922 */
5923 if (!s.nir->info.workgroup_size_variable &&
5924 slm_fence && brw_workgroup_size(s) <= s.dispatch_width)
5925 slm_fence = false;
5926
5927 switch (s.stage) {
5928 case MESA_SHADER_TESS_CTRL:
5929 case MESA_SHADER_TASK:
5930 case MESA_SHADER_MESH:
5931 break;
5932 default:
5933 urb_fence = false;
5934 break;
5935 }
5936
5937 unsigned fence_regs_count = 0;
5938 brw_reg fence_regs[4] = {};
5939
5940 const fs_builder ubld = bld.group(8, 0);
5941
5942 /* A memory barrier with acquire semantics requires us to
5943 * guarantee that memory operations of the specified storage
5944 * class sequenced-after the barrier aren't reordered before the
5945 * barrier, nor before any previous atomic operation
5946 * sequenced-before the barrier which may be synchronizing this
5947 * acquire barrier with a prior release sequence.
5948 *
5949 * In order to guarantee the latter we must make sure that any
5950 * such previous operation has completed execution before
5951 * invalidating the relevant caches, since otherwise some cache
5952 * could be polluted by a concurrent thread after its
5953 * invalidation but before the previous atomic completes, which
5954 * could lead to a violation of the expected memory ordering if
5955 * a subsequent memory read hits the polluted cacheline, which
5956 * would return a stale value read from memory before the
5957 * completion of the atomic sequenced-before the barrier.
5958 *
5959 * This ordering inversion can be avoided trivially if the
5960 * operations we need to order are all handled by a single
5961 * in-order cache, since the flush implied by the memory fence
5962 * occurs after any pending operations have completed, however
5963 * that doesn't help us when dealing with multiple caches
5964 * processing requests out of order, in which case we need to
5965 * explicitly stall the EU until any pending memory operations
5966 * have executed.
5967 *
5968 * Note that that might be somewhat heavy handed in some cases.
5969 * In particular when this memory fence was inserted by
5970 * spirv_to_nir() lowering an atomic with acquire semantics into
5971 * an atomic+barrier sequence we could do a better job by
5972 * synchronizing with respect to that one atomic *only*, but
5973 * that would require additional information not currently
5974 * available to the backend.
5975 *
5976 * XXX - Use an alternative workaround on IVB and ICL, since
5977 * SYNC.ALLWR is only available on Gfx12+.
5978 */
5979 if (devinfo->ver >= 12 &&
5980 (!nir_intrinsic_has_memory_scope(instr) ||
5981 (nir_intrinsic_memory_semantics(instr) & NIR_MEMORY_ACQUIRE))) {
5982 ubld.exec_all().group(1, 0).SYNC(TGL_SYNC_ALLWR);
5983 }
5984
5985 if (devinfo->has_lsc) {
5986 assert(devinfo->verx10 >= 125);
5987 uint32_t desc =
5988 lsc_fence_descriptor_for_intrinsic(devinfo, instr);
5989 if (ugm_fence) {
5990 fence_regs[fence_regs_count++] =
5991 emit_fence(ubld, opcode, GFX12_SFID_UGM, desc,
5992 true /* commit_enable */,
5993 0 /* bti; ignored for LSC */);
5994 }
5995
5996 if (tgm_fence) {
5997 fence_regs[fence_regs_count++] =
5998 emit_fence(ubld, opcode, GFX12_SFID_TGM, desc,
5999 true /* commit_enable */,
6000 0 /* bti; ignored for LSC */);
6001 }
6002
6003 if (slm_fence) {
6004 assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
6005 if (intel_needs_workaround(devinfo, 14014063774)) {
6006 /* Wa_14014063774
6007 *
6008 * Before SLM fence compiler needs to insert SYNC.ALLWR in order
6009 * to avoid the SLM data race.
6010 */
6011 ubld.exec_all().group(1, 0).SYNC(TGL_SYNC_ALLWR);
6012 }
6013 fence_regs[fence_regs_count++] =
6014 emit_fence(ubld, opcode, GFX12_SFID_SLM, desc,
6015 true /* commit_enable */,
6016 0 /* BTI; ignored for LSC */);
6017 }
6018
6019 if (urb_fence) {
6020 assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
6021 fence_regs[fence_regs_count++] =
6022 emit_fence(ubld, opcode, BRW_SFID_URB, desc,
6023 true /* commit_enable */,
6024 0 /* BTI; ignored for LSC */);
6025 }
6026 } else if (devinfo->ver >= 11) {
6027 if (tgm_fence || ugm_fence || urb_fence) {
6028 fence_regs[fence_regs_count++] =
6029 emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
6030 true /* commit_enable HSD ES # 1404612949 */,
6031 0 /* BTI = 0 means data cache */);
6032 }
6033
6034 if (slm_fence) {
6035 assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
6036 fence_regs[fence_regs_count++] =
6037 emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
6038 true /* commit_enable HSD ES # 1404612949 */,
6039 GFX7_BTI_SLM);
6040 }
6041 } else {
6042 /* Simulation also complains on Gfx9 if we do not enable commit.
6043 */
6044 const bool commit_enable =
6045 instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
6046 devinfo->ver == 9;
6047
6048 if (tgm_fence || ugm_fence || slm_fence || urb_fence) {
6049 fence_regs[fence_regs_count++] =
6050 emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
6051 commit_enable, 0 /* BTI */);
6052 }
6053 }
6054
6055 assert(fence_regs_count <= ARRAY_SIZE(fence_regs));
6056
6057 /* Be conservative on Gen11+ and always stall in a fence, since there
6058 * are two different fences and the shader might want to synchronize
6059 * between them.
6060 *
6061 * TODO: Use scope and visibility information for the barriers from NIR
6062 * to make a better decision on whether we need to stall.
6063 */
6064 bool force_stall = devinfo->ver >= 11;
6065
6066 /* There are four cases where we want to insert a stall:
6067 *
6068 * 1. If we're a nir_intrinsic_end_invocation_interlock. This is
6069 * required to ensure that the shader EOT doesn't happen until
6070 * after the fence returns. Otherwise, we might end up with the
6071 * next shader invocation for that pixel not respecting our fence
6072 * because it may happen on a different HW thread.
6073 *
6074 * 2. If we have multiple fences. This is required to ensure that
6075 * they all complete and nothing gets weirdly out-of-order.
6076 *
6077 * 3. If we have no fences. In this case, we need at least a
6078 * scheduling barrier to keep the compiler from moving things
6079 * around in an invalid way.
6080 *
6081 * 4. On Gen11+ and platforms with LSC, we have multiple fence types;
6082 * without further information about the fence, we need to force a
6083 * stall.
6084 */
6085 if (instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
6086 fence_regs_count != 1 || devinfo->has_lsc || force_stall) {
6087 ubld.exec_all().group(1, 0).emit(
6088 FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
6089 fence_regs, fence_regs_count);
6090 }
6091
6092 break;
6093 }
6094
6095 case nir_intrinsic_shader_clock: {
6096 /* We cannot do anything if there is an event, so ignore it for now */
6097 const brw_reg shader_clock = get_timestamp(bld);
6098 const brw_reg srcs[] = { component(shader_clock, 0),
6099 component(shader_clock, 1) };
6100 bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
6101 break;
6102 }
6103
6104 case nir_intrinsic_load_reloc_const_intel: {
6105 uint32_t id = nir_intrinsic_param_idx(instr);
6106 uint32_t base = nir_intrinsic_base(instr);
6107
6108 assert(dest.is_scalar);
6109
6110 xbld.emit(SHADER_OPCODE_MOV_RELOC_IMM, retype(dest, BRW_TYPE_D),
6111 brw_imm_ud(id), brw_imm_ud(base));
6112 break;
6113 }
6114
6115 case nir_intrinsic_load_uniform: {
6116 /* Offsets are in bytes but they should always be aligned to
6117 * the type size
6118 */
6119 unsigned base_offset = nir_intrinsic_base(instr);
6120 assert(base_offset % 4 == 0 || base_offset % brw_type_size_bytes(dest.type) == 0);
6121
6122 brw_reg src = brw_uniform_reg(base_offset / 4, dest.type);
6123
6124 if (nir_src_is_const(instr->src[0])) {
6125 unsigned load_offset = nir_src_as_uint(instr->src[0]);
6126 assert(load_offset % brw_type_size_bytes(dest.type) == 0);
6127 /* The base offset can only handle 32-bit units, so for 16-bit
6128 * data take the modulo of the offset with 4 bytes and add it to
6129 * the offset to read from within the source register.
6130 */
6131 src.offset = load_offset + base_offset % 4;
6132
6133 for (unsigned j = 0; j < instr->num_components; j++) {
6134 xbld.MOV(offset(dest, xbld, j), offset(src, xbld, j));
6135 }
6136 } else {
6137 brw_reg indirect = retype(get_nir_src(ntb, instr->src[0]),
6138 BRW_TYPE_UD);
6139
6140 /* We need to pass a size to the MOV_INDIRECT but we don't want it to
6141 * go past the end of the uniform. In order to keep the n'th
6142 * component from running past, we subtract off the size of all but
6143 * one component of the vector.
6144 */
6145 assert(nir_intrinsic_range(instr) >=
6146 instr->num_components * brw_type_size_bytes(dest.type));
6147 unsigned read_size = nir_intrinsic_range(instr) -
6148 (instr->num_components - 1) * brw_type_size_bytes(dest.type);
6149
6150 bool supports_64bit_indirects = !intel_device_info_is_9lp(devinfo);
6151
6152 if (brw_type_size_bytes(dest.type) != 8 || supports_64bit_indirects) {
6153 for (unsigned j = 0; j < instr->num_components; j++) {
6154 xbld.emit(SHADER_OPCODE_MOV_INDIRECT,
6155 offset(dest, xbld, j), offset(src, xbld, j),
6156 indirect, brw_imm_ud(read_size));
6157 }
6158 } else {
6159 const unsigned num_mov_indirects =
6160 brw_type_size_bytes(dest.type) / brw_type_size_bytes(BRW_TYPE_UD);
6161 /* We read a little bit less per MOV INDIRECT, as they are now
6162 * 32-bit ones instead of 64-bit. Adjust read_size accordingly.
6163 */
6164 const unsigned read_size_32bit = read_size -
6165 (num_mov_indirects - 1) * brw_type_size_bytes(BRW_TYPE_UD);
6166 for (unsigned j = 0; j < instr->num_components; j++) {
6167 for (unsigned i = 0; i < num_mov_indirects; i++) {
6168 xbld.emit(SHADER_OPCODE_MOV_INDIRECT,
6169 subscript(offset(dest, xbld, j), BRW_TYPE_UD, i),
6170 subscript(offset(src, xbld, j), BRW_TYPE_UD, i),
6171 indirect, brw_imm_ud(read_size_32bit));
6172 }
6173 }
6174 }
6175 }
6176 break;
6177 }
6178
6179 case nir_intrinsic_load_ubo:
6180 case nir_intrinsic_load_ubo_uniform_block_intel: {
6181 brw_reg surface, surface_handle;
6182 bool no_mask_handle = false;
6183
6184 if (get_nir_src_bindless(ntb, instr->src[0]))
6185 surface_handle = get_nir_buffer_intrinsic_index(ntb, bld, instr, &no_mask_handle);
6186 else
6187 surface = get_nir_buffer_intrinsic_index(ntb, bld, instr, &no_mask_handle);
6188
6189 const unsigned first_component =
6190 nir_def_first_component_read(&instr->def);
6191 const unsigned last_component =
6192 nir_def_last_component_read(&instr->def);
6193 const unsigned num_components = last_component - first_component + 1;
6194
6195 if (!nir_src_is_const(instr->src[1])) {
6196 s.prog_data->has_ubo_pull = true;
6197
6198 if (instr->intrinsic == nir_intrinsic_load_ubo) {
6199 /* load_ubo with non-constant offset. The offset might still be
6200 * uniform on non-LSC platforms when loading fewer than 4
6201 * components.
6202 */
6203 brw_reg base_offset = retype(get_nir_src(ntb, instr->src[1]),
6204 BRW_TYPE_UD);
6205
6206 const unsigned comps_per_load = brw_type_size_bytes(dest.type) == 8 ? 2 : 4;
6207
6208 for (unsigned i = first_component;
6209 i <= last_component;
6210 i += comps_per_load) {
6211 const unsigned remaining = last_component + 1 - i;
6212 xbld.VARYING_PULL_CONSTANT_LOAD(offset(dest, xbld, i),
6213 surface, surface_handle,
6214 base_offset,
6215 i * brw_type_size_bytes(dest.type),
6216 instr->def.bit_size / 8,
6217 MIN2(remaining, comps_per_load));
6218 }
6219 } else {
6220 /* load_ubo_uniform_block_intel with non-constant offset */
6221 fs_nir_emit_memory_access(ntb, bld, xbld, instr);
6222 }
6223 } else {
6224 /* Even if we are loading doubles, a pull constant load will load
6225 * a 32-bit vec4, so we should only reserve vgrf space for that. If we
6226 * need to load a full dvec4 we will have to emit 2 loads. This is
6227 * similar to demote_pull_constants(), except that in that case we
6228 * see individual accesses to each component of the vector and then
6229 * we let CSE deal with duplicate loads. Here we see a vector access
6230 * and we have to split it if necessary.
6231 */
6232 const unsigned type_size = brw_type_size_bytes(dest.type);
6233 const unsigned load_offset =
6234 nir_src_as_uint(instr->src[1]) + first_component * type_size;
6235 const unsigned end_offset = load_offset + num_components * type_size;
6236 const unsigned ubo_block =
6237 brw_nir_ubo_surface_index_get_push_block(instr->src[0]);
6238 const unsigned offset_256b = load_offset / 32;
6239 const unsigned end_256b = DIV_ROUND_UP(end_offset, 32);
6240
6241 /* See if we've selected this as a push constant candidate */
6242 brw_reg push_reg;
6243 for (int i = 0; i < 4; i++) {
6244 const struct brw_ubo_range *range = &s.prog_data->ubo_ranges[i];
6245 if (range->block == ubo_block &&
6246 offset_256b >= range->start &&
6247 end_256b <= range->start + range->length) {
6248
6249 push_reg = brw_uniform_reg(UBO_START + i, dest.type);
6250 push_reg.offset = load_offset - 32 * range->start;
6251 break;
6252 }
6253 }
6254
6255 if (push_reg.file != BAD_FILE) {
6256 for (unsigned i = first_component; i <= last_component; i++) {
6257 xbld.MOV(offset(dest, xbld, i),
6258 byte_offset(push_reg,
6259 (i - first_component) * type_size));
6260 }
6261 break;
6262 }
6263
6264 s.prog_data->has_ubo_pull = true;
6265
6266 if (instr->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel) {
6267 fs_nir_emit_memory_access(ntb, bld, xbld, instr);
6268 break;
6269 }
6270
6271 const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
6272 const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
6273
6274 for (unsigned c = 0; c < num_components;) {
6275 const unsigned base = load_offset + c * type_size;
6276 /* Number of usable components in the next block-aligned load. */
6277 const unsigned count = MIN2(num_components - c,
6278 (block_sz - base % block_sz) / type_size);
6279
6280 const brw_reg packed_consts = ubld.vgrf(BRW_TYPE_UD);
6281 brw_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
6282 srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = surface;
6283 srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
6284 srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1));
6285 srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz);
6286
6287 ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
6288 srcs, PULL_UNIFORM_CONSTANT_SRCS);
6289
6290 const brw_reg consts =
6291 retype(byte_offset(packed_consts, base & (block_sz - 1)),
6292 dest.type);
6293
6294 for (unsigned d = 0; d < count; d++) {
6295 xbld.MOV(offset(dest, xbld, first_component + c + d),
6296 component(consts, d));
6297 }
6298
6299 c += count;
6300 }
6301 }
6302 break;
6303 }
6304
6305 case nir_intrinsic_store_output: {
6306 assert(nir_src_bit_size(instr->src[0]) == 32);
6307 brw_reg src = get_nir_src(ntb, instr->src[0], -1);
6308
6309 unsigned store_offset = nir_src_as_uint(instr->src[1]);
6310 unsigned num_components = instr->num_components;
6311 unsigned first_component = nir_intrinsic_component(instr);
6312
6313 brw_reg new_dest = retype(offset(s.outputs[instr->const_index[0]], bld,
6314 4 * store_offset), src.type);
6315
6316 brw_combine_with_vec(bld, offset(new_dest, bld, first_component),
6317 src, num_components);
6318 break;
6319 }
6320
6321 case nir_intrinsic_get_ssbo_size: {
6322 assert(nir_src_num_components(instr->src[0]) == 1);
6323
6324       /* A resinfo sampler message is used to get the buffer size. The
6325        * SIMD8 writeback message consists of four registers and the SIMD16
6326        * writeback message consists of 8 destination registers (two per
6327        * component). Because we are only interested in the first channel of
6328        * the first returned component, where resinfo returns the buffer size
6329        * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
6330        * the dispatch width.
6331        */
6332 const fs_builder ubld = bld.scalar_group();
6333 brw_reg ret_payload = ubld.vgrf(BRW_TYPE_UD, 4);
6334
6335 /* Set LOD = 0 */
6336 brw_reg src_payload = ubld.MOV(brw_imm_ud(0));
6337
6338 brw_reg srcs[GET_BUFFER_SIZE_SRCS];
6339 srcs[get_nir_src_bindless(ntb, instr->src[0]) ?
6340 GET_BUFFER_SIZE_SRC_SURFACE_HANDLE :
6341 GET_BUFFER_SIZE_SRC_SURFACE] =
6342 get_nir_buffer_intrinsic_index(ntb, bld, instr);
6343 srcs[GET_BUFFER_SIZE_SRC_LOD] = src_payload;
6344 fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
6345 srcs, GET_BUFFER_SIZE_SRCS);
6346 inst->header_size = 0;
6347 inst->mlen = reg_unit(devinfo);
6348 inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
6349
6350 /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
6351 *
6352 * "Out-of-bounds checking is always performed at a DWord granularity. If
6353 * any part of the DWord is out-of-bounds then the whole DWord is
6354 * considered out-of-bounds."
6355 *
6356        * This implies that types smaller than 4 bytes need to be padded if
6357        * they don't complete the last dword of the buffer. But since we need
6358        * to report the original size (e.g. to compute the number of elements
6359        * of an unsized array), we have to reverse the padding calculation.
6360        * Because the padding needed for the buffer was stored in the last two
6361        * bits of the surface size, we can recover the original buffer_size by
6362        * reversing the surface_size calculation:
6363        *
6364        *   surface_size = isl_align(buffer_size, 4) +
6365        *                  (isl_align(buffer_size, 4) - buffer_size)
6366        *
6367        *   buffer_size = (surface_size & ~3) - (surface_size & 3)
6368        */
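       /* Illustrative example: for an original buffer_size of 6 bytes the
        * driver programs surface_size = 8 + (8 - 6) = 10, and here we recover
        * buffer_size = (10 & ~3) - (10 & 3) = 8 - 2 = 6.
        */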
6369 brw_reg size_padding = ubld.AND(ret_payload, brw_imm_ud(3));
6370 brw_reg size_aligned4 = ubld.AND(ret_payload, brw_imm_ud(~3));
6371 brw_reg buffer_size = ubld.ADD(size_aligned4, negate(size_padding));
6372
6373 bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
6374 break;
6375 }
6376
6377 case nir_intrinsic_load_subgroup_size:
6378       /* This should only happen for fragment shaders, because every other
6379        * case is lowered in NIR so we can optimize based on it.
6380 */
6381 assert(s.stage == MESA_SHADER_FRAGMENT);
6382 bld.MOV(retype(dest, BRW_TYPE_D), brw_imm_d(s.dispatch_width));
6383 break;
6384
6385 case nir_intrinsic_load_subgroup_invocation:
6386 bld.MOV(retype(dest, BRW_TYPE_UD), bld.LOAD_SUBGROUP_INVOCATION());
6387 break;
6388
6389 case nir_intrinsic_load_subgroup_eq_mask:
6390 case nir_intrinsic_load_subgroup_ge_mask:
6391 case nir_intrinsic_load_subgroup_gt_mask:
6392 case nir_intrinsic_load_subgroup_le_mask:
6393 case nir_intrinsic_load_subgroup_lt_mask:
6394 unreachable("not reached");
6395
6396 case nir_intrinsic_ddx_fine:
6397 bld.emit(FS_OPCODE_DDX_FINE, retype(dest, BRW_TYPE_F),
6398 retype(get_nir_src(ntb, instr->src[0]), BRW_TYPE_F));
6399 break;
6400 case nir_intrinsic_ddx:
6401 case nir_intrinsic_ddx_coarse:
6402 bld.emit(FS_OPCODE_DDX_COARSE, retype(dest, BRW_TYPE_F),
6403 retype(get_nir_src(ntb, instr->src[0]), BRW_TYPE_F));
6404 break;
6405 case nir_intrinsic_ddy_fine:
6406 bld.emit(FS_OPCODE_DDY_FINE, retype(dest, BRW_TYPE_F),
6407 retype(get_nir_src(ntb, instr->src[0]), BRW_TYPE_F));
6408 break;
6409 case nir_intrinsic_ddy:
6410 case nir_intrinsic_ddy_coarse:
6411 bld.emit(FS_OPCODE_DDY_COARSE, retype(dest, BRW_TYPE_F),
6412 retype(get_nir_src(ntb, instr->src[0]), BRW_TYPE_F));
6413 break;
6414
6415 case nir_intrinsic_vote_any:
6416 case nir_intrinsic_vote_all:
6417 case nir_intrinsic_quad_vote_any:
6418 case nir_intrinsic_quad_vote_all: {
6419 const bool any = instr->intrinsic == nir_intrinsic_vote_any ||
6420 instr->intrinsic == nir_intrinsic_quad_vote_any;
6421 const bool quad = instr->intrinsic == nir_intrinsic_quad_vote_any ||
6422 instr->intrinsic == nir_intrinsic_quad_vote_all;
6423
6424 brw_reg cond = get_nir_src(ntb, instr->src[0]);
6425 const unsigned cluster_size = quad ? 4 : s.dispatch_width;
6426
6427 bld.emit(any ? SHADER_OPCODE_VOTE_ANY : SHADER_OPCODE_VOTE_ALL,
6428 retype(dest, BRW_TYPE_UD), cond, brw_imm_ud(cluster_size));
6429
6430 break;
6431 }
6432
6433 case nir_intrinsic_vote_feq:
6434 case nir_intrinsic_vote_ieq: {
6435 brw_reg value = get_nir_src(ntb, instr->src[0]);
6436 if (instr->intrinsic == nir_intrinsic_vote_feq) {
6437 const unsigned bit_size = nir_src_bit_size(instr->src[0]);
6438 value.type = bit_size == 8 ? BRW_TYPE_B :
6439 brw_type_with_size(BRW_TYPE_F, bit_size);
6440 }
6441 bld.emit(SHADER_OPCODE_VOTE_EQUAL, retype(dest, BRW_TYPE_D), value);
6442 break;
6443 }
6444
6445 case nir_intrinsic_ballot: {
6446 if (instr->def.bit_size > 32) {
6447 dest.type = BRW_TYPE_UQ;
6448 } else {
6449 dest.type = BRW_TYPE_UD;
6450 }
6451
6452 brw_reg value = get_nir_src(ntb, instr->src[0]);
6453
6454 /* A ballot will always be at the full dispatch width even if the
6455 * use of the ballot result is smaller. If the source is_scalar,
6456 * it may be allocated at less than the full dispatch width (e.g.,
6457 * allocated at SIMD8 with SIMD32 dispatch). The input may or may
6458 * not be stride=0. If it is not, the generated ballot
6459 *
6460 * ballot(32) dst, value<1>
6461 *
6462 * is invalid because it will read out of bounds from value.
6463 *
6464 * To account for this, modify the stride of an is_scalar input to be
6465 * zero.
6466 */
6467 if (value.is_scalar)
6468 value = component(value, 0);
6469
6470 /* Note the use of bld here instead of xbld. As mentioned above, the
6471 * ballot must execute on all SIMD lanes regardless of the amount of
6472 * data (i.e., scalar or not scalar) generated.
6473 */
6474 fs_inst *inst = bld.emit(SHADER_OPCODE_BALLOT, dest, value);
6475
6476 if (dest.is_scalar)
6477 inst->size_written = dest.component_size(xbld.dispatch_width());
6478
6479 break;
6480 }
6481
6482 case nir_intrinsic_read_invocation: {
6483 const brw_reg value = get_nir_src(ntb, instr->src[0]);
6484 const brw_reg invocation = get_nir_src_imm(ntb, instr->src[1]);
6485
6486 bld.emit(SHADER_OPCODE_READ_FROM_CHANNEL, retype(dest, value.type),
6487 value, invocation);
6488 break;
6489 }
6490
6491 case nir_intrinsic_read_first_invocation: {
6492 const brw_reg value = get_nir_src(ntb, instr->src[0]);
6493
6494 bld.emit(SHADER_OPCODE_READ_FROM_LIVE_CHANNEL, retype(dest, value.type), value);
6495 break;
6496 }
6497
6498 case nir_intrinsic_shuffle: {
6499 const brw_reg value = get_nir_src(ntb, instr->src[0]);
6500 const brw_reg index = get_nir_src(ntb, instr->src[1]);
6501
6502 bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
6503 break;
6504 }
6505
6506 case nir_intrinsic_first_invocation: {
6507 brw_reg tmp = bld.vgrf(BRW_TYPE_UD);
6508 bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
6509 bld.MOV(retype(dest, BRW_TYPE_UD),
6510 brw_reg(component(tmp, 0)));
6511 break;
6512 }
6513
6514 case nir_intrinsic_last_invocation: {
6515 brw_reg tmp = bld.vgrf(BRW_TYPE_UD);
6516 bld.exec_all().emit(SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL, tmp);
6517 bld.MOV(retype(dest, BRW_TYPE_UD),
6518 brw_reg(component(tmp, 0)));
6519 break;
6520 }
6521
6522 case nir_intrinsic_quad_broadcast: {
6523 const brw_reg value = get_nir_src(ntb, instr->src[0]);
6524 const unsigned index = nir_src_as_uint(instr->src[1]);
6525
6526 bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),
6527 value, brw_imm_ud(index), brw_imm_ud(4));
6528 break;
6529 }
6530
6531 case nir_intrinsic_quad_swap_horizontal:
6532 case nir_intrinsic_quad_swap_vertical:
6533 case nir_intrinsic_quad_swap_diagonal: {
6534 const brw_reg value = get_nir_src(ntb, instr->src[0]);
6535
6536 enum brw_swap_direction dir;
6537 switch (instr->intrinsic) {
6538 case nir_intrinsic_quad_swap_horizontal: dir = BRW_SWAP_HORIZONTAL; break;
6539 case nir_intrinsic_quad_swap_vertical: dir = BRW_SWAP_VERTICAL; break;
6540 case nir_intrinsic_quad_swap_diagonal: dir = BRW_SWAP_DIAGONAL; break;
6541 default: unreachable("invalid quad swap");
6542 }
6543
6544 bld.emit(SHADER_OPCODE_QUAD_SWAP, retype(dest, value.type),
6545 value, brw_imm_ud(dir));
6546 break;
6547 }
6548
6549 case nir_intrinsic_reduce: {
6550 brw_reg src = get_nir_src(ntb, instr->src[0]);
6551 nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
6552 enum brw_reduce_op brw_op = brw_reduce_op_for_nir_reduction_op(op);
6553 unsigned cluster_size = nir_intrinsic_cluster_size(instr);
6554 if (cluster_size == 0 || cluster_size > s.dispatch_width)
6555 cluster_size = s.dispatch_width;
6556
6557 /* Figure out the source type */
6558 src.type = brw_type_for_nir_type(devinfo,
6559 (nir_alu_type)(nir_op_infos[op].input_types[0] |
6560 nir_src_bit_size(instr->src[0])));
6561
6562 bld.emit(SHADER_OPCODE_REDUCE, retype(dest, src.type), src,
6563 brw_imm_ud(brw_op), brw_imm_ud(cluster_size));
6564 break;
6565 }
6566
6567 case nir_intrinsic_inclusive_scan:
6568 case nir_intrinsic_exclusive_scan: {
6569 brw_reg src = get_nir_src(ntb, instr->src[0]);
6570 nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
6571 enum brw_reduce_op brw_op = brw_reduce_op_for_nir_reduction_op(op);
6572
6573 /* Figure out the source type */
6574 src.type = brw_type_for_nir_type(devinfo,
6575 (nir_alu_type)(nir_op_infos[op].input_types[0] |
6576 nir_src_bit_size(instr->src[0])));
6577
6578 enum opcode opcode = instr->intrinsic == nir_intrinsic_exclusive_scan ?
6579 SHADER_OPCODE_EXCLUSIVE_SCAN : SHADER_OPCODE_INCLUSIVE_SCAN;
6580
6581 bld.emit(opcode, retype(dest, src.type), src, brw_imm_ud(brw_op));
6582 break;
6583 }
6584
6585 case nir_intrinsic_load_topology_id_intel: {
6586 /* These move around basically every hardware generation, so don't
6587 * do any unbounded checks and fail if the platform hasn't explicitly
6588 * been enabled here.
6589 */
6590 assert(devinfo->ver >= 12 && devinfo->ver <= 30);
6591
6592 /* Here is what the layout of SR0 looks like on Gfx12
6593 * https://gfxspecs.intel.com/Predator/Home/Index/47256
6594 * [13:11] : Slice ID.
6595 * [10:9] : Dual-SubSlice ID
6596 * [8] : SubSlice ID
6597 * [7] : EUID[2] (aka EU Row ID)
6598 * [6] : Reserved
6599 * [5:4] : EUID[1:0]
6600 * [2:0] : Thread ID
6601 *
6602 * Xe2: Engine 3D and GPGPU Programs, EU Overview, Registers and
6603 * Register Regions, ARF Registers, State Register,
6604 * https://gfxspecs.intel.com/Predator/Home/Index/56623
6605 * [15:11] : Slice ID.
6606 * [9:8] : SubSlice ID
6607 * [6:4] : EUID
6608 * [2:0] : Thread ID
6609 *
6610 * Xe3: Engine 3D and GPGPU Programs, EU Overview, Registers and
6611 * Register Regions, ARF Registers, State Register.
6612 * Bspec 56623 (r55736)
6613 *
6614 * [17:14] : Slice ID.
6615 * [11:8] : SubSlice ID
6616 * [6:4] : EUID
6617 * [3:0] : Thread ID
6618 */
6619 brw_reg raw_id = bld.vgrf(BRW_TYPE_UD);
6620 bld.UNDEF(raw_id);
6621 bld.emit(SHADER_OPCODE_READ_ARCH_REG, raw_id, retype(brw_sr0_reg(0),
6622 BRW_TYPE_UD));
6623 switch (nir_intrinsic_base(instr)) {
6624 case BRW_TOPOLOGY_ID_DSS:
6625 if (devinfo->ver >= 20) {
6626 /* Xe2+: 3D and GPGPU Programs, Shared Functions, Ray Tracing:
6627 * https://gfxspecs.intel.com/Predator/Home/Index/56936
6628 *
6629             * Note: DSSID in all formulas below is a logical identifier of an
6630             * XeCore (a value that goes from 0 to (number_of_slices *
6631             * number_of_XeCores_per_slice - 1)). SW can get this value from
6632             * either:
6633             *
6634             * - Message Control Register LogicalSSID field (only in shaders
6635             *   eligible for Mid-Thread Preemption).
6636             * - Calculated based on the State Register with the following
6637             *   formula: DSSID = StateRegister.SliceID * GT_ARCH_SS_PER_SLICE +
6638             *   StateRegister.SubSliceID, where GT_ARCH_SS_PER_SLICE is an
6639             *   architectural parameter defined per product SKU.
6640 *
6641 * We are using the state register to calculate the DSSID.
6642 */
6643 const uint32_t slice_id_mask = devinfo->ver >= 30 ?
6644 INTEL_MASK(17, 14) :
6645 INTEL_MASK(15, 11);
6646 const uint32_t slice_id_shift = devinfo->ver >= 30 ? 14 : 11;
6647
6648 const uint32_t subslice_id_mask = devinfo->ver >= 30 ?
6649 INTEL_MASK(11, 8) :
6650 INTEL_MASK(9, 8);
6651 brw_reg slice_id =
6652 bld.SHR(bld.AND(raw_id, brw_imm_ud(slice_id_mask)),
6653 brw_imm_ud(slice_id_shift));
6654
6655             /* Assert that the maximum subslice count covers at least the 2
6656              * bits we use for the subslice ID.
6657              */
6658 unsigned slice_stride = devinfo->max_subslices_per_slice;
6659 assert(slice_stride >= (1 << 2));
6660 brw_reg subslice_id =
6661 bld.SHR(bld.AND(raw_id, brw_imm_ud(subslice_id_mask)),
6662 brw_imm_ud(8));
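            /* DSSID = SliceID * subslices-per-slice + SubSliceID, i.e. the
             * Bspec formula quoted above with the slice stride taken from
             * devinfo.
             */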
6663 bld.ADD(retype(dest, BRW_TYPE_UD),
6664 bld.MUL(slice_id, brw_imm_ud(slice_stride)), subslice_id);
6665 } else {
6666 /* Get rid of anything below dualsubslice */
6667 bld.SHR(retype(dest, BRW_TYPE_UD),
6668 bld.AND(raw_id, brw_imm_ud(0x3fff)), brw_imm_ud(9));
6669 }
6670 break;
6671 case BRW_TOPOLOGY_ID_EU_THREAD_SIMD: {
6672 s.limit_dispatch_width(16, "Topology helper for Ray queries, "
6673 "not supported in SIMD32 mode.");
6674 brw_reg dst = retype(dest, BRW_TYPE_UD);
6675 brw_reg eu;
6676
6677 if (devinfo->ver >= 20) {
6678 /* Xe2+: Graphics Engine, 3D and GPGPU Programs, Shared Functions
6679 * Ray Tracing,
6680 * https://gfxspecs.intel.com/Predator/Home/Index/56936
6681 *
6682 * SyncStackID = (EUID[2:0] << 8) | (ThreadID[2:0] << 4) |
6683 * SIMDLaneID[3:0];
6684 *
6685 * This section just deals with the EUID part.
6686 *
6687 * The 3bit EU[2:0] we need to build for ray query memory addresses
6688              * computations is a bit odd:
6689 *
6690 * EU[2:0] = raw_id[6:4] (identified as EUID[2:0])
6691 */
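            /* raw_id[6:4] shifted left by 4 places EUID[2:0] at bits 10:8,
             * which is the EUID[2:0] << 8 term of the SyncStackID layout.
             */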
6692 eu = bld.SHL(bld.AND(raw_id, brw_imm_ud(INTEL_MASK(6, 4))),
6693 brw_imm_ud(4));
6694 } else {
6695 /* EU[3:0] << 7
6696 *
6697 * The 4bit EU[3:0] we need to build for ray query memory addresses
6698              * computations is a bit odd:
6699 *
6700 * EU[1:0] = raw_id[5:4] (identified as EUID[1:0])
6701 * EU[2] = raw_id[8] (identified as SubSlice ID)
6702 * EU[3] = raw_id[7] (identified as EUID[2] or Row ID)
6703 */
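            /* Shifting raw_id[5:4] left by 3 lands EU[1:0] at bits 8:7,
             * raw_id[7] left by 3 lands EU[3] at bit 10, and raw_id[8] left
             * by 1 lands EU[2] at bit 9, which together form EU[3:0] << 7.
             */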
6704 brw_reg raw5_4 = bld.AND(raw_id, brw_imm_ud(INTEL_MASK(5, 4)));
6705 brw_reg raw7 = bld.AND(raw_id, brw_imm_ud(INTEL_MASK(7, 7)));
6706 brw_reg raw8 = bld.AND(raw_id, brw_imm_ud(INTEL_MASK(8, 8)));
6707 eu = bld.OR(bld.SHL(raw5_4, brw_imm_ud(3)),
6708 bld.OR(bld.SHL(raw7, brw_imm_ud(3)),
6709 bld.SHL(raw8, brw_imm_ud(1))));
6710 }
6711
6712 brw_reg tid;
6713 /* Xe3: Graphics Engine, 3D and GPGPU Programs, Shared Functions
6714 * Ray Tracing, (Bspec 56936 (r56740))
6715 *
6716 * SyncStackID = (EUID[2:0] << 8) | (ThreadID[3:0] << 4) |
6717 * SIMDLaneID[3:0];
6718 *
6719 * ThreadID[3:0] << 4 (ThreadID comes from raw_id[3:0])
6720 *
6721 * On older platforms (< Xe3):
6722 * ThreadID[2:0] << 4 (ThreadID comes from raw_id[2:0])
6723 */
6724 const uint32_t raw_id_mask = devinfo->ver >= 30 ?
6725 INTEL_MASK(3, 0) :
6726 INTEL_MASK(2, 0);
6727 tid = bld.SHL(bld.AND(raw_id, brw_imm_ud(raw_id_mask)),
6728 brw_imm_ud(4));
6729
6730          /* LaneID[3:0] << 0 (Use subgroup invocation) */
6731 assert(bld.dispatch_width() <= 16); /* Limit to 4 bits */
6732 bld.ADD(dst, bld.OR(eu, tid), bld.LOAD_SUBGROUP_INVOCATION());
6733 break;
6734 }
6735 default:
6736 unreachable("Invalid topology id type");
6737 }
6738 break;
6739 }
6740
6741 case nir_intrinsic_load_btd_stack_id_intel:
6742 if (s.stage == MESA_SHADER_COMPUTE) {
6743 assert(brw_cs_prog_data(s.prog_data)->uses_btd_stack_ids);
6744 } else {
6745 assert(brw_shader_stage_is_bindless(s.stage));
6746 }
6747 /* Stack IDs are always in R1 regardless of whether we're coming from a
6748 * bindless shader or a regular compute shader.
6749 */
6750 bld.MOV(retype(dest, BRW_TYPE_UD),
6751 retype(brw_vec8_grf(1 * reg_unit(devinfo), 0), BRW_TYPE_UW));
6752 break;
6753
6754 case nir_intrinsic_btd_spawn_intel:
6755 if (s.stage == MESA_SHADER_COMPUTE) {
6756 assert(brw_cs_prog_data(s.prog_data)->uses_btd_stack_ids);
6757 } else {
6758 assert(brw_shader_stage_is_bindless(s.stage));
6759 }
6760 /* Make sure all the pointers to resume shaders have landed where other
6761 * threads can see them.
6762 */
6763 emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE);
6764
6765 bld.emit(SHADER_OPCODE_BTD_SPAWN_LOGICAL, bld.null_reg_ud(),
6766 bld.emit_uniformize(get_nir_src(ntb, instr->src[0], -1)),
6767 get_nir_src(ntb, instr->src[1]));
6768 break;
6769
6770 case nir_intrinsic_btd_retire_intel:
6771 if (s.stage == MESA_SHADER_COMPUTE) {
6772 assert(brw_cs_prog_data(s.prog_data)->uses_btd_stack_ids);
6773 } else {
6774 assert(brw_shader_stage_is_bindless(s.stage));
6775 }
6776 /* Make sure all the pointers to resume shaders have landed where other
6777 * threads can see them.
6778 */
6779 emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE);
6780 bld.emit(SHADER_OPCODE_BTD_RETIRE_LOGICAL);
6781 break;
6782
6783 case nir_intrinsic_trace_ray_intel: {
6784 const bool synchronous = nir_intrinsic_synchronous(instr);
6785 assert(brw_shader_stage_is_bindless(s.stage) || synchronous);
6786
6787 /* Make sure all the previous RT structure writes are visible to the RT
6788 * fixed function within the DSS, as well as stack pointers to resume
6789 * shaders.
6790 */
6791 emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE);
6792
6793 brw_reg srcs[RT_LOGICAL_NUM_SRCS];
6794
6795 brw_reg globals = get_nir_src(ntb, instr->src[0], -1);
6796 srcs[RT_LOGICAL_SRC_GLOBALS] = bld.emit_uniformize(globals);
6797 srcs[RT_LOGICAL_SRC_BVH_LEVEL] = get_nir_src(ntb, instr->src[1]);
6798 srcs[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] = get_nir_src(ntb, instr->src[2]);
6799 srcs[RT_LOGICAL_SRC_SYNCHRONOUS] = brw_imm_ud(synchronous);
6800
6801 /* Bspec 57508: Structure_SIMD16TraceRayMessage:: RayQuery Enable
6802 *
6803 * "When this bit is set in the header, Trace Ray Message behaves like
6804 * a Ray Query. This message requires a write-back message indicating
6805 * RayQuery for all valid Rays (SIMD lanes) have completed."
6806 */
6807 brw_reg dst = (devinfo->ver >= 20 && synchronous) ?
6808 bld.vgrf(BRW_TYPE_UD) :
6809 bld.null_reg_ud();
6810
6811 bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL, dst, srcs, RT_LOGICAL_NUM_SRCS);
6812
6813 /* There is no actual value to use in the destination register of the
6814 * synchronous trace instruction. All of the communication with the HW
6815 * unit happens through memory reads/writes. So to ensure that the
6816 * operation has completed before we go read the results in memory, we
6817 * need a barrier followed by an invalidate before accessing memory.
6818 */
6819 if (synchronous) {
6820 bld.SYNC(TGL_SYNC_ALLWR);
6821 emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_INVALIDATE);
6822 }
6823 break;
6824 }
6825
6826 default:
6827 #ifndef NDEBUG
6828 assert(instr->intrinsic < nir_num_intrinsics);
6829 fprintf(stderr, "intrinsic: %s\n", nir_intrinsic_infos[instr->intrinsic].name);
6830 #endif
6831 unreachable("unknown intrinsic");
6832 }
6833 }
6834
6835 static enum lsc_data_size
6836 lsc_bits_to_data_size(unsigned bit_size)
6837 {
6838 switch (bit_size / 8) {
6839 case 1: return LSC_DATA_SIZE_D8U32;
6840 case 2: return LSC_DATA_SIZE_D16U32;
6841 case 4: return LSC_DATA_SIZE_D32;
6842 case 8: return LSC_DATA_SIZE_D64;
6843 default:
6844 unreachable("Unsupported data size.");
6845 }
6846 }
6847
6848 /**
6849  * Emit a logical memory load, store, or atomic for a NIR memory intrinsic.
6850 * \param bld "Normal" builder. This is the full dispatch width of the shader.
6851 *
6852 * \param xbld Builder for the intrinsic. If the intrinsic is convergent, this
6853 * builder will be scalar_group(). Otherwise it will be the same
6854 * as bld.
6855 *
6856  * Some places in the function will also use \c ubld. There are two cases of
6857  * this. Sometimes it is used to generate intermediate values as SIMD1. In
6858  * other places \c ubld needs to be a scalar_group() builder in order to
6859  * operate on intrinsic sources that are is_scalar.
6860 */
6861 static void
6862 fs_nir_emit_memory_access(nir_to_brw_state &ntb,
6863 const fs_builder &bld,
6864 const fs_builder &xbld,
6865 nir_intrinsic_instr *instr)
6866 {
6867 const intel_device_info *devinfo = ntb.devinfo;
6868 fs_visitor &s = ntb.s;
6869
6870 brw_reg srcs[MEMORY_LOGICAL_NUM_SRCS];
6871
6872 /* Start with some default values for most cases */
6873
6874 enum lsc_opcode op = lsc_op_for_nir_intrinsic(instr);
6875 const bool is_store = !nir_intrinsic_infos[instr->intrinsic].has_dest;
6876 const bool is_atomic = lsc_opcode_is_atomic(op);
6877 const bool is_load = !is_store && !is_atomic;
6878 const bool include_helpers = nir_intrinsic_has_access(instr) &&
6879 (nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS);
6880 const unsigned align =
6881 nir_intrinsic_has_align(instr) ? nir_intrinsic_align(instr) : 0;
6882 bool no_mask_handle = false;
6883 int data_src = -1;
6884
6885 srcs[MEMORY_LOGICAL_OPCODE] = brw_imm_ud(op);
6886 /* BINDING_TYPE, BINDING, and ADDRESS are handled in the switch */
6887 srcs[MEMORY_LOGICAL_COORD_COMPONENTS] = brw_imm_ud(1);
6888 srcs[MEMORY_LOGICAL_ALIGNMENT] = brw_imm_ud(align);
6889 /* DATA_SIZE and CHANNELS are handled below the switch */
6890 srcs[MEMORY_LOGICAL_FLAGS] =
6891 brw_imm_ud(include_helpers ? MEMORY_FLAG_INCLUDE_HELPERS : 0);
6892 /* DATA0 and DATA1 are handled below */
6893
6894 switch (instr->intrinsic) {
6895 case nir_intrinsic_bindless_image_load:
6896 case nir_intrinsic_bindless_image_store:
6897 case nir_intrinsic_bindless_image_atomic:
6898 case nir_intrinsic_bindless_image_atomic_swap:
6899 srcs[MEMORY_LOGICAL_BINDING_TYPE] = brw_imm_ud(LSC_ADDR_SURFTYPE_BSS);
6900 FALLTHROUGH;
6901 case nir_intrinsic_image_load:
6902 case nir_intrinsic_image_store:
6903 case nir_intrinsic_image_atomic:
6904 case nir_intrinsic_image_atomic_swap:
6905 srcs[MEMORY_LOGICAL_MODE] = brw_imm_ud(MEMORY_MODE_TYPED);
6906 srcs[MEMORY_LOGICAL_BINDING] =
6907 get_nir_image_intrinsic_image(ntb, bld, instr);
6908
6909 if (srcs[MEMORY_LOGICAL_BINDING_TYPE].file == BAD_FILE)
6910 srcs[MEMORY_LOGICAL_BINDING_TYPE] = brw_imm_ud(LSC_ADDR_SURFTYPE_BTI);
6911
6912 srcs[MEMORY_LOGICAL_ADDRESS] = get_nir_src(ntb, instr->src[1]);
6913 srcs[MEMORY_LOGICAL_COORD_COMPONENTS] =
6914 brw_imm_ud(nir_image_intrinsic_coord_components(instr));
6915
6916 data_src = 3;
6917 break;
6918
6919 case nir_intrinsic_load_ubo_uniform_block_intel:
6920 srcs[MEMORY_LOGICAL_MODE] = brw_imm_ud(MEMORY_MODE_CONSTANT);
6921 FALLTHROUGH;
6922 case nir_intrinsic_load_ssbo:
6923 case nir_intrinsic_store_ssbo:
6924 case nir_intrinsic_ssbo_atomic:
6925 case nir_intrinsic_ssbo_atomic_swap:
6926 case nir_intrinsic_load_ssbo_block_intel:
6927 case nir_intrinsic_store_ssbo_block_intel:
6928 case nir_intrinsic_load_ssbo_uniform_block_intel:
6929 if (srcs[MEMORY_LOGICAL_MODE].file == BAD_FILE)
6930 srcs[MEMORY_LOGICAL_MODE] = brw_imm_ud(MEMORY_MODE_UNTYPED);
6931 srcs[MEMORY_LOGICAL_BINDING_TYPE] =
6932 brw_imm_ud(get_nir_src_bindless(ntb, instr->src[is_store ? 1 : 0]) ?
6933 LSC_ADDR_SURFTYPE_BSS : LSC_ADDR_SURFTYPE_BTI);
6934 srcs[MEMORY_LOGICAL_BINDING] =
6935 get_nir_buffer_intrinsic_index(ntb, bld, instr, &no_mask_handle);
6936 srcs[MEMORY_LOGICAL_ADDRESS] =
6937 get_nir_src_imm(ntb, instr->src[is_store ? 2 : 1]);
6938
6939 data_src = is_atomic ? 2 : 0;
6940 break;
6941 case nir_intrinsic_load_shared:
6942 case nir_intrinsic_store_shared:
6943 case nir_intrinsic_shared_atomic:
6944 case nir_intrinsic_shared_atomic_swap:
6945 case nir_intrinsic_load_shared_block_intel:
6946 case nir_intrinsic_store_shared_block_intel:
6947 case nir_intrinsic_load_shared_uniform_block_intel: {
6948 srcs[MEMORY_LOGICAL_MODE] = brw_imm_ud(MEMORY_MODE_SHARED_LOCAL);
6949 srcs[MEMORY_LOGICAL_BINDING_TYPE] = brw_imm_ud(LSC_ADDR_SURFTYPE_FLAT);
6950
6951 const brw_reg nir_src = get_nir_src(ntb, instr->src[is_store ? 1 : 0]);
6952 const fs_builder ubld = nir_src.is_scalar ? bld.scalar_group() : bld;
6953
6954 /* If the logical address is not uniform, a call to emit_uniformize
6955 * below will fix it up.
6956 */
6957 srcs[MEMORY_LOGICAL_ADDRESS] =
6958 ubld.ADD(retype(nir_src, BRW_TYPE_UD),
6959 brw_imm_ud(nir_intrinsic_base(instr)));
6960
6961 /* If nir_src is_scalar, the MEMORY_LOGICAL_ADDRESS will be allocated at
6962 * scalar_group() size and will have every component the same
6963 * value. This is the definition of is_scalar. Much more importantly,
6964 * setting is_scalar properly also ensures that emit_uniformize (below)
6965 * will handle the value as scalar_group() size instead of full dispatch
6966 * width.
6967 */
6968 srcs[MEMORY_LOGICAL_ADDRESS].is_scalar = nir_src.is_scalar;
6969
6970 data_src = is_atomic ? 1 : 0;
6971 no_mask_handle = true;
6972 break;
6973 }
6974 case nir_intrinsic_load_scratch:
6975 case nir_intrinsic_store_scratch: {
6976 srcs[MEMORY_LOGICAL_MODE] = brw_imm_ud(MEMORY_MODE_SCRATCH);
6977
6978 const nir_src &addr = instr->src[is_store ? 1 : 0];
6979
6980 if (devinfo->verx10 >= 125) {
6981 srcs[MEMORY_LOGICAL_BINDING_TYPE] = brw_imm_ud(LSC_ADDR_SURFTYPE_SS);
6982
6983 const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
6984 brw_reg bind = ubld.AND(retype(brw_vec1_grf(0, 5), BRW_TYPE_UD),
6985 brw_imm_ud(INTEL_MASK(31, 10)));
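         /* The scratch surface state offset comes from bits 31:10 of r0.5 in
          * the thread payload; on Xe2+ it is additionally shifted right by 4
          * below before being used as the SS binding.
          */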
6986 if (devinfo->ver >= 20)
6987 bind = ubld.SHR(bind, brw_imm_ud(4));
6988
6989 /* load_scratch / store_scratch cannot be is_scalar yet. */
6990 assert(xbld.dispatch_width() == bld.dispatch_width());
6991
6992 srcs[MEMORY_LOGICAL_BINDING] = component(bind, 0);
6993 srcs[MEMORY_LOGICAL_ADDRESS] =
6994 swizzle_nir_scratch_addr(ntb, bld, addr, false);
6995 } else {
6996 unsigned bit_size =
6997 is_store ? nir_src_bit_size(instr->src[0]) : instr->def.bit_size;
6998 bool dword_aligned = align >= 4 && bit_size == 32;
6999
7000 /* load_scratch / store_scratch cannot be is_scalar yet. */
7001 assert(xbld.dispatch_width() == bld.dispatch_width());
7002
7003 srcs[MEMORY_LOGICAL_BINDING_TYPE] =
7004 brw_imm_ud(LSC_ADDR_SURFTYPE_FLAT);
7005 srcs[MEMORY_LOGICAL_ADDRESS] =
7006 swizzle_nir_scratch_addr(ntb, bld, addr, dword_aligned);
7007 }
7008
7009 if (is_store)
7010 ++s.shader_stats.spill_count;
7011 else
7012 ++s.shader_stats.fill_count;
7013
7014 data_src = 0;
7015 break;
7016 }
7017
7018 case nir_intrinsic_load_global_constant_uniform_block_intel:
7019 case nir_intrinsic_load_global:
7020 case nir_intrinsic_load_global_constant:
7021 case nir_intrinsic_store_global:
7022 case nir_intrinsic_global_atomic:
7023 case nir_intrinsic_global_atomic_swap:
7024 case nir_intrinsic_load_global_block_intel:
7025 case nir_intrinsic_store_global_block_intel:
7026 srcs[MEMORY_LOGICAL_MODE] = brw_imm_ud(MEMORY_MODE_UNTYPED);
7027 srcs[MEMORY_LOGICAL_BINDING_TYPE] = brw_imm_ud(LSC_ADDR_SURFTYPE_FLAT);
7028 srcs[MEMORY_LOGICAL_ADDRESS] = get_nir_src(ntb, instr->src[is_store ? 1 : 0]);
7029 no_mask_handle = srcs[MEMORY_LOGICAL_ADDRESS].is_scalar;
7030
7031 data_src = is_atomic ? 1 : 0;
7032 break;
7033
7034 default:
7035 unreachable("unknown memory intrinsic");
7036 }
7037
7038 unsigned components = is_store ? instr->src[data_src].ssa->num_components
7039 : instr->def.num_components;
7040 if (components == 0)
7041 components = instr->num_components;
7042
7043 srcs[MEMORY_LOGICAL_COMPONENTS] = brw_imm_ud(components);
7044
7045 const unsigned nir_bit_size =
7046 is_store ? instr->src[data_src].ssa->bit_size : instr->def.bit_size;
7047 enum lsc_data_size data_size = lsc_bits_to_data_size(nir_bit_size);
7048 uint32_t data_bit_size = lsc_data_size_bytes(data_size) * 8;
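   /* data_bit_size may exceed nir_bit_size: 8-bit and 16-bit values travel in
    * 32-bit containers (D8U32/D16U32), so stores are widened and load results
    * narrowed below.
    */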
7049
7050 srcs[MEMORY_LOGICAL_DATA_SIZE] = brw_imm_ud(data_size);
7051
7052 const brw_reg_type data_type =
7053 brw_type_with_size(BRW_TYPE_UD, data_bit_size);
7054 const brw_reg_type nir_data_type =
7055 brw_type_with_size(BRW_TYPE_UD, nir_bit_size);
7056 assert(data_bit_size >= nir_bit_size);
7057
7058 if (!is_load) {
7059 for (unsigned i = 0; i < lsc_op_num_data_values(op); i++) {
7060 brw_reg nir_src =
7061 retype(get_nir_src(ntb, instr->src[data_src + i], -1), nir_data_type);
7062
7063 if (data_bit_size > nir_bit_size) {
7064 /* Expand e.g. D16 to D16U32 */
7065 srcs[MEMORY_LOGICAL_DATA0 + i] = xbld.vgrf(data_type, components);
7066 for (unsigned c = 0; c < components; c++) {
7067 xbld.MOV(offset(srcs[MEMORY_LOGICAL_DATA0 + i], xbld, c),
7068 offset(nir_src, xbld, c));
7069 }
7070 } else {
7071 srcs[MEMORY_LOGICAL_DATA0 + i] = nir_src;
7072 }
7073 }
7074 }
7075
7076 brw_reg dest, nir_dest;
7077 if (!is_store) {
7078 nir_dest = retype(get_nir_def(ntb, instr->def), nir_data_type);
7079 dest = data_bit_size > nir_bit_size ? xbld.vgrf(data_type, components)
7080 : nir_dest;
7081 }
7082
7083 enum opcode opcode = is_load ? SHADER_OPCODE_MEMORY_LOAD_LOGICAL :
7084 is_store ? SHADER_OPCODE_MEMORY_STORE_LOGICAL :
7085 SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL;
7086
7087 const bool convergent_block_load =
7088 instr->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
7089 instr->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel ||
7090 instr->intrinsic == nir_intrinsic_load_shared_uniform_block_intel ||
7091 instr->intrinsic == nir_intrinsic_load_global_constant_uniform_block_intel;
7092 const bool block = convergent_block_load ||
7093 instr->intrinsic == nir_intrinsic_load_global_block_intel ||
7094 instr->intrinsic == nir_intrinsic_load_shared_block_intel ||
7095 instr->intrinsic == nir_intrinsic_load_ssbo_block_intel ||
7096 instr->intrinsic == nir_intrinsic_store_global_block_intel ||
7097 instr->intrinsic == nir_intrinsic_store_shared_block_intel ||
7098 instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
7099
7100 fs_inst *inst;
7101
7102 if (!block) {
7103 inst = xbld.emit(opcode, dest, srcs, MEMORY_LOGICAL_NUM_SRCS);
7104 inst->size_written *= components;
7105
7106 if (dest.file != BAD_FILE && data_bit_size > nir_bit_size) {
7107 /* Shrink e.g. D16U32 result back to D16 */
7108 for (unsigned i = 0; i < components; i++) {
7109 xbld.MOV(offset(nir_dest, xbld, i),
7110 subscript(offset(dest, xbld, i), nir_dest.type, 0));
7111 }
7112 }
7113 } else {
7114 assert(nir_bit_size == 32);
7115
7116 srcs[MEMORY_LOGICAL_FLAGS] =
7117 brw_imm_ud(MEMORY_FLAG_TRANSPOSE | srcs[MEMORY_LOGICAL_FLAGS].ud);
7118 srcs[MEMORY_LOGICAL_ADDRESS] =
7119 bld.emit_uniformize(srcs[MEMORY_LOGICAL_ADDRESS]);
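      /* Block (transposed) messages read or write a contiguous run of dwords
       * starting at a single address, so the address must be uniform across
       * the dispatch.
       */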
7120
7121 const fs_builder ubld = bld.exec_all().group(1, 0);
7122 unsigned total, done;
7123 unsigned first_read_component = 0;
7124
7125 if (convergent_block_load) {
7126 /* If the address is a constant and alignment permits, skip unread
7127 * leading and trailing components. (It's probably not worth the
7128 * extra address math for non-constant addresses.)
7129 *
7130 * Note that SLM block loads on HDC platforms need to be 16B aligned.
7131 */
7132 if (srcs[MEMORY_LOGICAL_ADDRESS].file == IMM &&
7133 align >= data_bit_size / 8 &&
7134 (devinfo->has_lsc ||
7135 srcs[MEMORY_LOGICAL_MODE].ud != MEMORY_MODE_SHARED_LOCAL)) {
7136 first_read_component = nir_def_first_component_read(&instr->def);
7137 unsigned last_component = nir_def_last_component_read(&instr->def);
7138 srcs[MEMORY_LOGICAL_ADDRESS].u64 +=
7139 first_read_component * (data_bit_size / 8);
7140 components = last_component - first_read_component + 1;
7141 }
7142
7143 total = ALIGN(components, REG_SIZE * reg_unit(devinfo) / 4);
7144 dest = ubld.vgrf(BRW_TYPE_UD, total);
7145 } else {
7146 total = components * bld.dispatch_width();
7147 dest = nir_dest;
7148 }
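      /* From here on "total" and "done" count dwords: convergent loads round
       * the dword count up to a whole physical register, while divergent
       * block messages transfer components * dispatch_width dwords.
       */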
7149
7150 brw_reg src = srcs[MEMORY_LOGICAL_DATA0];
7151
7152 unsigned block_comps = components;
7153
7154 for (done = 0; done < total; done += block_comps) {
7155 block_comps = choose_block_size_dwords(devinfo, total - done);
7156 const unsigned block_bytes = block_comps * (nir_bit_size / 8);
7157
7158 srcs[MEMORY_LOGICAL_COMPONENTS] = brw_imm_ud(block_comps);
7159
7160 brw_reg dst_offset = is_store ? brw_reg() :
7161 retype(byte_offset(dest, done * 4), BRW_TYPE_UD);
7162 if (is_store) {
7163 srcs[MEMORY_LOGICAL_DATA0] =
7164 retype(byte_offset(src, done * 4), BRW_TYPE_UD);
7165 }
7166
7167 inst = ubld.emit(opcode, dst_offset, srcs, MEMORY_LOGICAL_NUM_SRCS);
7168 inst->has_no_mask_send_params = no_mask_handle;
7169 if (is_load)
7170 inst->size_written = block_bytes;
7171
7172 if (brw_type_size_bits(srcs[MEMORY_LOGICAL_ADDRESS].type) == 64) {
7173 increment_a64_address(ubld, srcs[MEMORY_LOGICAL_ADDRESS],
7174 block_bytes, no_mask_handle);
7175 } else {
7176 srcs[MEMORY_LOGICAL_ADDRESS] =
7177 ubld.ADD(retype(srcs[MEMORY_LOGICAL_ADDRESS], BRW_TYPE_UD),
7178 brw_imm_ud(block_bytes));
7179 }
7180 }
7181 assert(done == total);
7182
7183 if (convergent_block_load) {
7184 for (unsigned c = 0; c < components; c++) {
7185 xbld.MOV(retype(offset(nir_dest, xbld, first_read_component + c),
7186 BRW_TYPE_UD),
7187 component(dest, c));
7188 }
7189 }
7190 }
7191 }
7192
7193 static void
7194 fs_nir_emit_texture(nir_to_brw_state &ntb,
7195 nir_tex_instr *instr)
7196 {
7197 const intel_device_info *devinfo = ntb.devinfo;
7198 const fs_builder &bld = ntb.bld;
7199
7200 brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
7201
7202 /* SKL PRMs: Volume 7: 3D-Media-GPGPU:
7203 *
7204 * "The Pixel Null Mask field, when enabled via the Pixel Null Mask
7205 * Enable will be incorect for sample_c when applied to a surface with
7206 * 64-bit per texel format such as R16G16BA16_UNORM. Pixel Null mask
7207 * Enable may incorrectly report pixels as referencing a Null surface."
7208 *
7209 * We'll take care of this in NIR.
7210 */
7211 assert(!instr->is_sparse || srcs[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE);
7212
7213 srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_ud(instr->is_sparse);
7214
7215 int lod_components = 0;
7216
7217 /* The hardware requires a LOD for buffer textures */
7218 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
7219 srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);
7220
7221 ASSERTED bool got_lod = false;
7222 ASSERTED bool got_bias = false;
7223 bool pack_lod_bias_and_offset = false;
7224 uint32_t header_bits = 0;
7225 for (unsigned i = 0; i < instr->num_srcs; i++) {
7226 nir_src nir_src = instr->src[i].src;
7227 brw_reg src = get_nir_src(ntb, nir_src, -1);
7228
7229 /* If the source is not a vector (e.g., a 1D texture coordinate), then
7230 * the eventual LOAD_PAYLOAD lowering will not properly adjust the
7231 * stride, etc., so do it now.
7232 */
7233 if (nir_tex_instr_src_size(instr, i) == 1)
7234 src = offset(src, bld, 0);
7235
7236 switch (instr->src[i].src_type) {
7237 case nir_tex_src_bias:
7238 assert(!got_lod);
7239 got_bias = true;
7240
7241 srcs[TEX_LOGICAL_SRC_LOD] =
7242 retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_TYPE_F);
7243 break;
7244 case nir_tex_src_comparator:
7245 srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_TYPE_F);
7246 break;
7247 case nir_tex_src_coord:
7248 switch (instr->op) {
7249 case nir_texop_txf:
7250 case nir_texop_txf_ms:
7251 case nir_texop_txf_ms_mcs_intel:
7252 case nir_texop_samples_identical:
7253 srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_TYPE_D);
7254 break;
7255 default:
7256 srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_TYPE_F);
7257 break;
7258 }
7259 break;
7260 case nir_tex_src_ddx:
7261 srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_TYPE_F);
7262 lod_components = nir_tex_instr_src_size(instr, i);
7263 break;
7264 case nir_tex_src_ddy:
7265 srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_TYPE_F);
7266 break;
7267 case nir_tex_src_lod:
7268 assert(!got_bias);
7269 got_lod = true;
7270
7271 switch (instr->op) {
7272 case nir_texop_txs:
7273 srcs[TEX_LOGICAL_SRC_LOD] =
7274 retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_TYPE_UD);
7275 break;
7276 case nir_texop_txf:
7277 srcs[TEX_LOGICAL_SRC_LOD] =
7278 retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_TYPE_D);
7279 break;
7280 default:
7281 srcs[TEX_LOGICAL_SRC_LOD] =
7282 retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_TYPE_F);
7283 break;
7284 }
7285 break;
7286 case nir_tex_src_min_lod:
7287 srcs[TEX_LOGICAL_SRC_MIN_LOD] =
7288 retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_TYPE_F);
7289 break;
7290 case nir_tex_src_ms_index:
7291 srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_TYPE_UD);
7292 break;
7293
7294 case nir_tex_src_offset: {
7295 uint32_t offset_bits = 0;
7296 if (brw_texture_offset(instr, i, &offset_bits)) {
7297 header_bits |= offset_bits;
7298 } else {
7299 /* On gfx12.5+, if the offsets are not both constant and in the
7300 * {-8,7} range, nir_lower_tex() will have already lowered the
7301 * source offset. So we should never reach this point.
7302 */
7303 assert(devinfo->verx10 < 125);
7304 srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
7305 retype(src, BRW_TYPE_D);
7306 }
7307 break;
7308 }
7309
7310 case nir_tex_src_projector:
7311 unreachable("should be lowered");
7312
7313 case nir_tex_src_texture_offset: {
7314 assert(srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE);
7315 /* Emit code to evaluate the actual indexing expression */
7316 srcs[TEX_LOGICAL_SRC_SURFACE] =
7317 bld.emit_uniformize(bld.ADD(retype(src, BRW_TYPE_UD),
7318 brw_imm_ud(instr->texture_index)));
7319 assert(srcs[TEX_LOGICAL_SRC_SURFACE].file != BAD_FILE);
7320 break;
7321 }
7322
7323 case nir_tex_src_sampler_offset: {
7324 /* Emit code to evaluate the actual indexing expression */
7325 srcs[TEX_LOGICAL_SRC_SAMPLER] =
7326 bld.emit_uniformize(bld.ADD(retype(src, BRW_TYPE_UD),
7327 brw_imm_ud(instr->sampler_index)));
7328 break;
7329 }
7330
7331 case nir_tex_src_texture_handle:
7332 assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1);
7333 srcs[TEX_LOGICAL_SRC_SURFACE] = brw_reg();
7334 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src);
7335 break;
7336
7337 case nir_tex_src_sampler_handle:
7338 assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
7339 srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_reg();
7340 srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src);
7341 break;
7342
7343 case nir_tex_src_ms_mcs_intel:
7344 assert(instr->op == nir_texop_txf_ms);
7345 srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_TYPE_D);
7346 break;
7347
7348 /* If this parameter is present, we are packing offset U, V and LOD/Bias
7349 * into a single (32-bit) value.
7350 */
7351 case nir_tex_src_backend2:
7352 assert(instr->op == nir_texop_tg4);
7353 pack_lod_bias_and_offset = true;
7354 srcs[TEX_LOGICAL_SRC_LOD] =
7355 retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_TYPE_F);
7356 break;
7357
7358 /* If this parameter is present, we are packing either the explicit LOD
7359 * or LOD bias and the array index into a single (32-bit) value when
7360 * 32-bit texture coordinates are used.
7361 */
7362 case nir_tex_src_backend1:
7363 assert(!got_lod && !got_bias);
7364 got_lod = true;
7365 assert(instr->op == nir_texop_txl || instr->op == nir_texop_txb);
7366 srcs[TEX_LOGICAL_SRC_LOD] =
7367 retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_TYPE_F);
7368 break;
7369
7370 default:
7371 unreachable("unknown texture source");
7372 }
7373 }
7374
7375 /* If the surface or sampler were not specified through sources, use the
7376 * instruction index.
7377 */
7378 if (srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE &&
7379 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE].file == BAD_FILE)
7380 srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(instr->texture_index);
7381 if (srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE &&
7382 srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE].file == BAD_FILE)
7383 srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(instr->sampler_index);
7384
7385 if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
7386 (instr->op == nir_texop_txf_ms ||
7387 instr->op == nir_texop_samples_identical)) {
7388 srcs[TEX_LOGICAL_SRC_MCS] =
7389 emit_mcs_fetch(ntb, srcs[TEX_LOGICAL_SRC_COORDINATE],
7390 instr->coord_components,
7391 srcs[TEX_LOGICAL_SRC_SURFACE],
7392 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]);
7393 }
7394
7395 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
7396 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);
7397
7398 enum opcode opcode;
7399 switch (instr->op) {
7400 case nir_texop_tex:
7401 opcode = SHADER_OPCODE_TEX_LOGICAL;
7402 break;
7403 case nir_texop_txb:
7404 opcode = FS_OPCODE_TXB_LOGICAL;
7405 break;
7406 case nir_texop_txl:
7407 opcode = SHADER_OPCODE_TXL_LOGICAL;
7408 break;
7409 case nir_texop_txd:
7410 opcode = SHADER_OPCODE_TXD_LOGICAL;
7411 break;
7412 case nir_texop_txf:
7413 opcode = SHADER_OPCODE_TXF_LOGICAL;
7414 break;
7415 case nir_texop_txf_ms:
7416 /* On Gfx12HP there is only CMS_W available. From the Bspec: Shared
7417 * Functions - 3D Sampler - Messages - Message Format:
7418 *
7419 * ld2dms REMOVEDBY(GEN:HAS:1406788836)
7420 */
7421 if (devinfo->verx10 >= 125)
7422 opcode = SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL;
7423 else
7424 opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
7425 break;
7426 case nir_texop_txf_ms_mcs_intel:
7427 opcode = SHADER_OPCODE_TXF_MCS_LOGICAL;
7428 break;
7429 case nir_texop_query_levels:
7430 case nir_texop_txs:
7431 opcode = SHADER_OPCODE_TXS_LOGICAL;
7432 break;
7433 case nir_texop_lod:
7434 opcode = SHADER_OPCODE_LOD_LOGICAL;
7435 break;
7436 case nir_texop_tg4: {
7437 if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE) {
7438 opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;
7439 } else {
7440 opcode = SHADER_OPCODE_TG4_LOGICAL;
7441 if (devinfo->ver >= 20) {
7442            /* If the SPV_AMD_texture_gather_bias_lod extension is enabled,
7443             * all texture gather functions (i.e. both the ones which take the
7444             * extra bias argument and the ones that do not) fetch texels from
7445             * the implicit LOD in the fragment shader stage. In all other
7446             * shader stages, the base level is used instead.
7447 */
7448 if (instr->is_gather_implicit_lod)
7449 opcode = SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL;
7450
7451 if (got_bias)
7452 opcode = SHADER_OPCODE_TG4_BIAS_LOGICAL;
7453
7454 if (got_lod)
7455 opcode = SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL;
7456
7457 if (pack_lod_bias_and_offset) {
7458 if (got_lod)
7459 opcode = SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL;
7460 if (got_bias)
7461 opcode = SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL;
7462 }
7463 }
7464 }
7465 break;
7466 }
7467 case nir_texop_texture_samples:
7468 opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;
7469 break;
7470 case nir_texop_samples_identical: {
7471 brw_reg dst = retype(get_nir_def(ntb, instr->def), BRW_TYPE_D);
7472
7473 /* If mcs is an immediate value, it means there is no MCS. In that case
7474 * just return false.
7475 */
7476 if (srcs[TEX_LOGICAL_SRC_MCS].file == IMM) {
7477 bld.MOV(dst, brw_imm_ud(0u));
7478 } else {
7479 brw_reg tmp =
7480 bld.OR(srcs[TEX_LOGICAL_SRC_MCS],
7481 offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
7482 bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
7483 }
7484 return;
7485 }
7486 default:
7487 unreachable("unknown texture opcode");
7488 }
7489
7490 if (instr->op == nir_texop_tg4) {
7491 header_bits |= instr->component << 16;
7492 }
7493
7494 brw_reg nir_def_reg = get_nir_def(ntb, instr->def);
7495
7496 const unsigned dest_size = nir_tex_instr_dest_size(instr);
7497 unsigned dest_comp;
7498 if (instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
7499 unsigned write_mask = nir_def_components_read(&instr->def);
7500 assert(write_mask != 0); /* dead code should have been eliminated */
7501
7502 dest_comp = util_last_bit(write_mask) - instr->is_sparse;
7503 } else {
7504 dest_comp = 4;
7505 }
7506
7507 /* Compute the number of physical registers needed to hold a single
7508 * component and round it up to a physical register count.
7509 */
7510 brw_reg_type dst_type = brw_type_for_nir_type(devinfo, instr->dest_type);
7511 const unsigned grf_size = reg_unit(devinfo) * REG_SIZE;
7512 const unsigned per_component_regs =
7513 DIV_ROUND_UP(brw_type_size_bytes(dst_type) * bld.dispatch_width(),
7514 grf_size);
7515 const unsigned total_regs =
7516 dest_comp * per_component_regs + instr->is_sparse;
7517 /* Allocate enough space for the components + one physical register for the
7518 * residency data.
7519 */
7520 brw_reg dst = brw_vgrf(
7521 bld.shader->alloc.allocate(total_regs * reg_unit(devinfo)),
7522 dst_type);
7523
7524 fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
7525 inst->offset = header_bits;
7526 inst->size_written = total_regs * grf_size;
7527
7528 if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
7529 inst->shadow_compare = true;
7530
7531 /* Wa_14012688258:
7532 *
7533 * Don't trim zeros at the end of payload for sample operations
7534 * in cube and cube arrays.
7535 */
7536 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
7537 intel_needs_workaround(devinfo, 14012688258)) {
7538
7539 /* Compiler should send U,V,R parameters even if V,R are 0. */
7540 if (srcs[TEX_LOGICAL_SRC_COORDINATE].file != BAD_FILE)
7541 assert(instr->coord_components >= 3u);
7542
7543 /* See opt_zero_samples(). */
7544 inst->keep_payload_trailing_zeros = true;
7545 }
7546
7547 /* With half-floats returns, the stride into a GRF allocation for each
7548 * component might be different than where the sampler is storing each
7549     * component. For example, in SIMD8 on DG2 the layout of the data returned
7550     * by the sampler is as follows for a 2-component load:
7551 *
7552 * _______________________________________________________________
7553 * g0 : | unused |hf7|hf6|hf5|hf4|hf3|hf2|hf1|hf0|
7554 * g1 : | unused |hf7|hf6|hf5|hf4|hf3|hf2|hf1|hf0|
7555 *
7556 * The same issue also happens in SIMD16 on Xe2 because the physical
7557 * register size has doubled but we're still loading data only on half the
7558 * register.
7559 *
7560 * In those cases we need the special remapping case below.
7561 */
7562 const bool non_aligned_component_stride =
7563 (brw_type_size_bytes(dst_type) * bld.dispatch_width()) % grf_size != 0;
7564 if (instr->op != nir_texop_query_levels && !instr->is_sparse &&
7565 !non_aligned_component_stride) {
7566 /* In most cases we can write directly to the result. */
7567 inst->dst = nir_def_reg;
7568 } else {
7569 /* In other cases, we have to reorganize the sampler message's results
7570 * a bit to match the NIR intrinsic's expectations.
7571 */
7572 brw_reg nir_dest[5];
7573 for (unsigned i = 0; i < dest_comp; i++)
7574 nir_dest[i] = byte_offset(dst, i * per_component_regs * grf_size);
7575
7576 for (unsigned i = dest_comp; i < dest_size; i++)
7577 nir_dest[i].type = dst.type;
7578
7579 if (instr->op == nir_texop_query_levels) {
7580 /* # levels is in .w */
7581 if (devinfo->ver == 9) {
7582 /**
7583 * Wa_1940217:
7584 *
7585 * When a surface of type SURFTYPE_NULL is accessed by resinfo, the
7586 * MIPCount returned is undefined instead of 0.
7587 */
7588 fs_inst *mov = bld.MOV(bld.null_reg_d(), dst);
7589 mov->conditional_mod = BRW_CONDITIONAL_NZ;
7590 nir_dest[0] = bld.vgrf(BRW_TYPE_D);
7591 fs_inst *sel =
7592 bld.SEL(nir_dest[0], offset(dst, bld, 3), brw_imm_d(0));
7593 sel->predicate = BRW_PREDICATE_NORMAL;
7594 } else {
7595 nir_dest[0] = offset(dst, bld, 3);
7596 }
7597 }
7598
7599 /* The residency bits are only in the first component. */
7600 if (instr->is_sparse) {
7601 nir_dest[dest_size - 1] =
7602 component(offset(dst, bld, dest_size - 1), 0);
7603 }
7604
7605 bld.LOAD_PAYLOAD(nir_def_reg, nir_dest, dest_size, 0);
7606 }
7607 }
7608
7609 static void
7610 fs_nir_emit_jump(nir_to_brw_state &ntb, nir_jump_instr *instr)
7611 {
7612 switch (instr->type) {
7613 case nir_jump_break:
7614 ntb.bld.emit(BRW_OPCODE_BREAK);
7615 break;
7616 case nir_jump_continue:
7617 ntb.bld.emit(BRW_OPCODE_CONTINUE);
7618 break;
7619 case nir_jump_halt:
7620 ntb.bld.emit(BRW_OPCODE_HALT);
7621 break;
7622 case nir_jump_return:
7623 default:
7624 unreachable("unknown jump");
7625 }
7626 }
7627
7628 /*
7629 * This helper takes a source register and un/shuffles it into the destination
7630 * register.
7631 *
7632  * If the source type size is smaller than the destination type size, the
7633  * operation needed is a component shuffle. The opposite case would be an
7634  * unshuffle. If the source and destination type sizes are equal, a shuffle
7635  * is done that is equivalent to a simple MOV.
7636  *
7637  * For example, take a 16-bit source type and a 32-bit destination type. A
7638  * 3-component .xyz 16-bit vector on SIMD8 would be:
7639 *
7640 * |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
7641 * |z1|z2|z3|z4|z5|z6|z7|z8| | | | | | | | |
7642 *
7643 * This helper will return the following 2 32-bit components with the 16-bit
7644 * values shuffled:
7645 *
7646 * |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
7647 * |z1 |z2 |z3 |z4 |z5 |z6 |z7 |z8 |
7648 *
7649  * For unshuffle, the example would be the opposite: a 64-bit source type
7650  * and a 32-bit destination. A 2-component .xy 64-bit vector on SIMD8
7651  * would be:
7652 *
7653 * | x1l x1h | x2l x2h | x3l x3h | x4l x4h |
7654 * | x5l x5h | x6l x6h | x7l x7h | x8l x8h |
7655 * | y1l y1h | y2l y2h | y3l y3h | y4l y4h |
7656 * | y5l y5h | y6l y6h | y7l y7h | y8l y8h |
7657 *
7658 * The returned result would be the following 4 32-bit components unshuffled:
7659 *
7660 * | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
7661 * | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
7662 * | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
7663 * | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
7664 *
7665  * - Source and destination registers must not overlap.
7666  * - Component units are measured in terms of the smaller type between
7667  *   source and destination, because we are un/shuffling the smaller
7668  *   components from/into the bigger ones.
7669  * - The first_component parameter allows skipping source components.
7670 */
7671 void
7672 shuffle_src_to_dst(const fs_builder &bld,
7673 const brw_reg &dst,
7674 const brw_reg &src,
7675 uint32_t first_component,
7676 uint32_t components)
7677 {
7678 if (brw_type_size_bytes(src.type) == brw_type_size_bytes(dst.type)) {
7679 assert(!regions_overlap(dst,
7680 brw_type_size_bytes(dst.type) * bld.dispatch_width() * components,
7681 offset(src, bld, first_component),
7682 brw_type_size_bytes(src.type) * bld.dispatch_width() * components));
7683 for (unsigned i = 0; i < components; i++) {
7684 bld.MOV(retype(offset(dst, bld, i), src.type),
7685 offset(src, bld, i + first_component));
7686 }
7687 } else if (brw_type_size_bytes(src.type) < brw_type_size_bytes(dst.type)) {
7688 /* Source is shuffled into destination */
7689 unsigned size_ratio = brw_type_size_bytes(dst.type) / brw_type_size_bytes(src.type);
7690 assert(!regions_overlap(dst,
7691 brw_type_size_bytes(dst.type) * bld.dispatch_width() *
7692 DIV_ROUND_UP(components, size_ratio),
7693 offset(src, bld, first_component),
7694 brw_type_size_bytes(src.type) * bld.dispatch_width() * components));
7695
7696 brw_reg_type shuffle_type =
7697 brw_type_with_size(BRW_TYPE_D, brw_type_size_bits(src.type));
7698 for (unsigned i = 0; i < components; i++) {
7699 brw_reg shuffle_component_i =
7700 subscript(offset(dst, bld, i / size_ratio),
7701 shuffle_type, i % size_ratio);
7702 bld.MOV(shuffle_component_i,
7703 retype(offset(src, bld, i + first_component), shuffle_type));
7704 }
7705 } else {
7706 /* Source is unshuffled into destination */
7707 unsigned size_ratio = brw_type_size_bytes(src.type) / brw_type_size_bytes(dst.type);
7708 assert(!regions_overlap(dst,
7709 brw_type_size_bytes(dst.type) * bld.dispatch_width() * components,
7710 offset(src, bld, first_component / size_ratio),
7711 brw_type_size_bytes(src.type) * bld.dispatch_width() *
7712 DIV_ROUND_UP(components + (first_component % size_ratio),
7713 size_ratio)));
7714
7715 brw_reg_type shuffle_type =
7716 brw_type_with_size(BRW_TYPE_D, brw_type_size_bits(dst.type));
7717 for (unsigned i = 0; i < components; i++) {
7718 brw_reg shuffle_component_i =
7719 subscript(offset(src, bld, (first_component + i) / size_ratio),
7720 shuffle_type, (first_component + i) % size_ratio);
7721 bld.MOV(retype(offset(dst, bld, i), shuffle_type),
7722 shuffle_component_i);
7723 }
7724 }
7725 }
7726
7727 void
7728 shuffle_from_32bit_read(const fs_builder &bld,
7729 const brw_reg &dst,
7730 const brw_reg &src,
7731 uint32_t first_component,
7732 uint32_t components)
7733 {
7734 assert(brw_type_size_bytes(src.type) == 4);
7735
7736 /* This function takes components in units of the destination type while
7737 * shuffle_src_to_dst takes components in units of the smallest type
7738 */
7739 if (brw_type_size_bytes(dst.type) > 4) {
7740 assert(brw_type_size_bytes(dst.type) == 8);
7741 first_component *= 2;
7742 components *= 2;
7743 }
7744
7745 shuffle_src_to_dst(bld, dst, src, first_component, components);
7746 }
7747
7748 static void
7749 fs_nir_emit_instr(nir_to_brw_state &ntb, nir_instr *instr)
7750 {
7751 #ifndef NDEBUG
7752 if (unlikely(ntb.annotate)) {
7753 /* Use shader mem_ctx since annotations outlive the NIR conversion. */
7754 ntb.bld = ntb.bld.annotate(nir_instr_as_str(instr, ntb.s.mem_ctx));
7755 }
7756 #endif
7757
7758 switch (instr->type) {
7759 case nir_instr_type_alu:
7760 fs_nir_emit_alu(ntb, nir_instr_as_alu(instr), true);
7761 break;
7762
7763 case nir_instr_type_deref:
7764 unreachable("All derefs should've been lowered");
7765 break;
7766
7767 case nir_instr_type_intrinsic:
7768 switch (ntb.s.stage) {
7769 case MESA_SHADER_VERTEX:
7770 fs_nir_emit_vs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7771 break;
7772 case MESA_SHADER_TESS_CTRL:
7773 fs_nir_emit_tcs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7774 break;
7775 case MESA_SHADER_TESS_EVAL:
7776 fs_nir_emit_tes_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7777 break;
7778 case MESA_SHADER_GEOMETRY:
7779 fs_nir_emit_gs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7780 break;
7781 case MESA_SHADER_FRAGMENT:
7782 fs_nir_emit_fs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7783 break;
7784 case MESA_SHADER_COMPUTE:
7785 case MESA_SHADER_KERNEL:
7786 fs_nir_emit_cs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7787 break;
7788 case MESA_SHADER_RAYGEN:
7789 case MESA_SHADER_ANY_HIT:
7790 case MESA_SHADER_CLOSEST_HIT:
7791 case MESA_SHADER_MISS:
7792 case MESA_SHADER_INTERSECTION:
7793 case MESA_SHADER_CALLABLE:
7794 fs_nir_emit_bs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7795 break;
7796 case MESA_SHADER_TASK:
7797 fs_nir_emit_task_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7798 break;
7799 case MESA_SHADER_MESH:
7800 fs_nir_emit_mesh_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7801 break;
7802 default:
7803 unreachable("unsupported shader stage");
7804 }
7805 break;
7806
7807 case nir_instr_type_tex:
7808 fs_nir_emit_texture(ntb, nir_instr_as_tex(instr));
7809 break;
7810
7811 case nir_instr_type_load_const:
7812 fs_nir_emit_load_const(ntb, nir_instr_as_load_const(instr));
7813 break;
7814
7815 case nir_instr_type_undef:
7816 /* We create a new VGRF for undefs on every use (by handling
7817 * them in get_nir_src()), rather than for each definition.
7818 * This helps register coalescing eliminate MOVs from undef.
7819 */
7820 break;
7821
7822 case nir_instr_type_jump:
7823 fs_nir_emit_jump(ntb, nir_instr_as_jump(instr));
7824 break;
7825
7826 default:
7827 unreachable("unknown instruction type");
7828 }
7829 }
7830
7831 static unsigned
7832 brw_rnd_mode_from_nir(unsigned mode, unsigned *mask)
7833 {
7834 unsigned brw_mode = 0;
7835 *mask = 0;
7836
7837 if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
7838 FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
7839 FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
7840 mode) {
7841 brw_mode |= BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT;
7842 *mask |= BRW_CR0_RND_MODE_MASK;
7843 }
7844 if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
7845 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
7846 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
7847 mode) {
7848 brw_mode |= BRW_RND_MODE_RTNE << BRW_CR0_RND_MODE_SHIFT;
7849 *mask |= BRW_CR0_RND_MODE_MASK;
7850 }
7851 if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) {
7852 brw_mode |= BRW_CR0_FP16_DENORM_PRESERVE;
7853 *mask |= BRW_CR0_FP16_DENORM_PRESERVE;
7854 }
7855 if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) {
7856 brw_mode |= BRW_CR0_FP32_DENORM_PRESERVE;
7857 *mask |= BRW_CR0_FP32_DENORM_PRESERVE;
7858 }
7859 if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) {
7860 brw_mode |= BRW_CR0_FP64_DENORM_PRESERVE;
7861 *mask |= BRW_CR0_FP64_DENORM_PRESERVE;
7862 }
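/* Flush-to-zero only adds the corresponding bit to the mask; leaving the
 * mode bit clear in brw_mode is what selects flushing.
 */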
7863 if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16)
7864 *mask |= BRW_CR0_FP16_DENORM_PRESERVE;
7865 if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32)
7866 *mask |= BRW_CR0_FP32_DENORM_PRESERVE;
7867 if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64)
7868 *mask |= BRW_CR0_FP64_DENORM_PRESERVE;
7869 if (mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE)
7870 *mask |= BRW_CR0_FP_MODE_MASK;
7871
7872 if (*mask != 0)
7873 assert((*mask & brw_mode) == brw_mode);
7874
7875 return brw_mode;
7876 }
7877
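/* If the shader requests a non-default float controls execution mode, emit
 * a single SIMD1 FLOAT_CONTROL_MODE instruction before the NIR body so the
 * requested rounding/denorm behavior is programmed in cr0 up front.
 */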
7878 static void
7879 emit_shader_float_controls_execution_mode(nir_to_brw_state &ntb)
7880 {
7881 const fs_builder &bld = ntb.bld;
7882 fs_visitor &s = ntb.s;
7883
7884 unsigned execution_mode = s.nir->info.float_controls_execution_mode;
7885 if (execution_mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE)
7886 return;
7887
7888 fs_builder ubld = bld.exec_all().group(1, 0);
7889 fs_builder abld = ubld.annotate("shader float controls execution mode");
7890 unsigned mask, mode = brw_rnd_mode_from_nir(execution_mode, &mask);
7891
7892 if (mask == 0)
7893 return;
7894
7895 abld.emit(SHADER_OPCODE_FLOAT_CONTROL_MODE, bld.null_reg_ud(),
7896 brw_imm_d(mode), brw_imm_d(mask));
7897 }
7898
7899 /**
7900 * Test the dispatch mask packing assumptions of
7901 * brw_stage_has_packed_dispatch(). Call this from e.g. the top of
7902 * nir_to_brw() to cause a GPU hang if any shader invocation is
7903 * executed with an unexpected dispatch mask.
7904 */
7905 static UNUSED void
7906 brw_fs_test_dispatch_packing(const fs_builder &bld)
7907 {
7908 const fs_visitor *shader = bld.shader;
7909 const gl_shader_stage stage = shader->stage;
7910 const bool uses_vmask =
7911 stage == MESA_SHADER_FRAGMENT &&
7912 brw_wm_prog_data(shader->prog_data)->uses_vmask;
7913
7914 if (brw_stage_has_packed_dispatch(shader->devinfo, stage,
7915 shader->max_polygons,
7916 shader->prog_data)) {
7917 const fs_builder ubld = bld.exec_all().group(1, 0);
7918 const brw_reg tmp = component(bld.vgrf(BRW_TYPE_UD), 0);
7919 const brw_reg mask = uses_vmask ? brw_vmask_reg() : brw_dmask_reg();
7920
7921 ubld.ADD(tmp, mask, brw_imm_ud(1));
7922 ubld.AND(tmp, mask, tmp);
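/* tmp = mask & (mask + 1), which is zero exactly when the mask has the
 * packed form 2^n-1.
 */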
7923
7924 /* This will loop forever if the dispatch mask doesn't have the expected
7925 * form '2^n-1', in which case tmp will be non-zero.
7926 */
7927 bld.emit(BRW_OPCODE_DO);
7928 bld.CMP(bld.null_reg_ud(), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
7929 set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE));
7930 }
7931 }
7932
7933 void
7934 nir_to_brw(fs_visitor *s)
7935 {
7936 nir_to_brw_state ntb = {
7937 .s = *s,
7938 .nir = s->nir,
7939 .devinfo = s->devinfo,
7940 .mem_ctx = ralloc_context(NULL),
7941 .bld = fs_builder(s).at_end(),
7942 };
7943
7944 if (INTEL_DEBUG(DEBUG_ANNOTATION))
7945 ntb.annotate = true;
7946
7947 if (ENABLE_FS_TEST_DISPATCH_PACKING)
7948 brw_fs_test_dispatch_packing(ntb.bld);
7949
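/* Record each NIR printf format string in the stage prog_data so it is
 * available when the shader's printf output is decoded.
 */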
7950 for (unsigned i = 0; i < s->nir->printf_info_count; i++) {
7951 brw_stage_prog_data_add_printf(s->prog_data,
7952 s->mem_ctx,
7953 &s->nir->printf_info[i]);
7954 }
7955
7956 emit_shader_float_controls_execution_mode(ntb);
7957
7958 /* Emit the arrays used for inputs and outputs; load/store intrinsics will
7959 * be converted to reads/writes of these arrays.
7960 */
7961 fs_nir_setup_outputs(ntb);
7962 fs_nir_setup_uniforms(ntb.s);
7963 fs_nir_emit_system_values(ntb);
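/* Scratch is allocated per channel: the per-invocation NIR scratch size,
 * aligned to 4 bytes, times the dispatch width.
 */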
7964 ntb.s.last_scratch = ALIGN(ntb.nir->scratch_size, 4) * ntb.s.dispatch_width;
7965
7966 fs_nir_emit_impl(ntb, nir_shader_get_entrypoint((nir_shader *)ntb.nir));
7967
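/* Mark the end of the program as the landing point for HALT (e.g. discard)
 * jumps.
 */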
7968 ntb.bld.emit(SHADER_OPCODE_HALT_TARGET);
7969
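/* Free the temporary conversion state; anything that has to outlive the
 * conversion (e.g. instruction annotations) was allocated from the
 * visitor's mem_ctx instead.
 */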
7970 ralloc_free(ntb.mem_ctx);
7971
7972 brw_shader_phase_update(*s, BRW_SHADER_PHASE_AFTER_NIR);
7973 }
7974