1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_fs.h"
25 #include "brw_fs_builder.h"
26 #include "brw_nir.h"
27 #include "brw_eu.h"
28 #include "nir.h"
29 #include "nir_intrinsics.h"
30 #include "nir_search_helpers.h"
31 #include "util/u_math.h"
32 #include "util/bitscan.h"
33
34 #include <vector>
35
36 using namespace brw;
37
38 struct brw_fs_bind_info {
39 bool valid;
40 bool bindless;
41 unsigned block;
42 unsigned set;
43 unsigned binding;
44 };
45
46 struct nir_to_brw_state {
47 fs_visitor &s;
48 const nir_shader *nir;
49 const intel_device_info *devinfo;
50 void *mem_ctx;
51
52 /* Points to the end of the program. Annotated with the current NIR
53 * instruction when applicable.
54 */
55 fs_builder bld;
56
57 fs_reg *ssa_values;
58 fs_inst **resource_insts;
59 struct brw_fs_bind_info *ssa_bind_infos;
60 fs_reg *resource_values;
61 fs_reg *system_values;
62 };
63
64 static fs_reg get_nir_src(nir_to_brw_state &ntb, const nir_src &src);
65 static fs_reg get_nir_def(nir_to_brw_state &ntb, const nir_def &def);
66 static nir_component_mask_t get_nir_write_mask(const nir_def &def);
67
68 static void fs_nir_emit_intrinsic(nir_to_brw_state &ntb, const fs_builder &bld, nir_intrinsic_instr *instr);
69 static fs_reg emit_samplepos_setup(nir_to_brw_state &ntb);
70 static fs_reg emit_sampleid_setup(nir_to_brw_state &ntb);
71 static fs_reg emit_samplemaskin_setup(nir_to_brw_state &ntb);
72 static fs_reg emit_shading_rate_setup(nir_to_brw_state &ntb);
73
74 static void fs_nir_emit_impl(nir_to_brw_state &ntb, nir_function_impl *impl);
75 static void fs_nir_emit_cf_list(nir_to_brw_state &ntb, exec_list *list);
76 static void fs_nir_emit_if(nir_to_brw_state &ntb, nir_if *if_stmt);
77 static void fs_nir_emit_loop(nir_to_brw_state &ntb, nir_loop *loop);
78 static void fs_nir_emit_block(nir_to_brw_state &ntb, nir_block *block);
79 static void fs_nir_emit_instr(nir_to_brw_state &ntb, nir_instr *instr);
80
81 static void fs_nir_emit_surface_atomic(nir_to_brw_state &ntb,
82 const fs_builder &bld,
83 nir_intrinsic_instr *instr,
84 fs_reg surface,
85 bool bindless);
86 static void fs_nir_emit_global_atomic(nir_to_brw_state &ntb,
87 const fs_builder &bld,
88 nir_intrinsic_instr *instr);
89
90 static fs_reg
setup_imm_b(const fs_builder & bld,int8_t v)91 setup_imm_b(const fs_builder &bld, int8_t v)
92 {
93 const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_B);
94 bld.MOV(tmp, brw_imm_w(v));
95 return tmp;
96 }
97
98 static void
fs_nir_setup_outputs(nir_to_brw_state & ntb)99 fs_nir_setup_outputs(nir_to_brw_state &ntb)
100 {
101 fs_visitor &s = ntb.s;
102
103 if (s.stage == MESA_SHADER_TESS_CTRL ||
104 s.stage == MESA_SHADER_TASK ||
105 s.stage == MESA_SHADER_MESH ||
106 s.stage == MESA_SHADER_FRAGMENT)
107 return;
108
109 unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
110
111 /* Calculate the size of output registers in a separate pass, before
112 * allocating them. With ARB_enhanced_layouts, multiple output variables
113 * may occupy the same slot, but have different type sizes.
114 */
115 nir_foreach_shader_out_variable(var, s.nir) {
116 const int loc = var->data.driver_location;
117 const unsigned var_vec4s = nir_variable_count_slots(var, var->type);
118 vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
119 }
120
121 for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
122 if (vec4s[loc] == 0) {
123 loc++;
124 continue;
125 }
126
127 unsigned reg_size = vec4s[loc];
128
129 /* Check if there are any ranges that start within this range and extend
130 * past it. If so, include them in this allocation.
131 */
132 for (unsigned i = 1; i < reg_size; i++) {
133 assert(i + loc < ARRAY_SIZE(vec4s));
134 reg_size = MAX2(vec4s[i + loc] + i, reg_size);
135 }
136
137 fs_reg reg = ntb.bld.vgrf(BRW_REGISTER_TYPE_F, 4 * reg_size);
138 for (unsigned i = 0; i < reg_size; i++) {
139 assert(loc + i < ARRAY_SIZE(s.outputs));
140 s.outputs[loc + i] = offset(reg, ntb.bld, 4 * i);
141 }
142
143 loc += reg_size;
144 }
145 }
146
147 static void
fs_nir_setup_uniforms(fs_visitor & s)148 fs_nir_setup_uniforms(fs_visitor &s)
149 {
150 const intel_device_info *devinfo = s.devinfo;
151
152 /* Only the first compile gets to set up uniforms. */
153 if (s.push_constant_loc)
154 return;
155
156 s.uniforms = s.nir->num_uniforms / 4;
157
158 if (gl_shader_stage_is_compute(s.stage) && devinfo->verx10 < 125) {
159 /* Add uniforms for builtins after regular NIR uniforms. */
160 assert(s.uniforms == s.prog_data->nr_params);
161
162 /* Subgroup ID must be the last uniform on the list. This will make
163 * easier later to split between cross thread and per thread
164 * uniforms.
165 */
166 uint32_t *param = brw_stage_prog_data_add_params(s.prog_data, 1);
167 *param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
168 s.uniforms++;
169 }
170 }
171
172 static fs_reg
emit_work_group_id_setup(nir_to_brw_state & ntb)173 emit_work_group_id_setup(nir_to_brw_state &ntb)
174 {
175 fs_visitor &s = ntb.s;
176 const fs_builder &bld = ntb.bld;
177
178 assert(gl_shader_stage_is_compute(s.stage));
179
180 fs_reg id = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
181
182 struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
183 bld.MOV(id, r0_1);
184
185 struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD));
186 struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD));
187 bld.MOV(offset(id, bld, 1), r0_6);
188 bld.MOV(offset(id, bld, 2), r0_7);
189
190 return id;
191 }
192
193 static bool
emit_system_values_block(nir_to_brw_state & ntb,nir_block * block)194 emit_system_values_block(nir_to_brw_state &ntb, nir_block *block)
195 {
196 fs_visitor &s = ntb.s;
197 fs_reg *reg;
198
199 nir_foreach_instr(instr, block) {
200 if (instr->type != nir_instr_type_intrinsic)
201 continue;
202
203 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
204 switch (intrin->intrinsic) {
205 case nir_intrinsic_load_vertex_id:
206 case nir_intrinsic_load_base_vertex:
207 unreachable("should be lowered by nir_lower_system_values().");
208
209 case nir_intrinsic_load_vertex_id_zero_base:
210 case nir_intrinsic_load_is_indexed_draw:
211 case nir_intrinsic_load_first_vertex:
212 case nir_intrinsic_load_instance_id:
213 case nir_intrinsic_load_base_instance:
214 unreachable("should be lowered by brw_nir_lower_vs_inputs().");
215 break;
216
217 case nir_intrinsic_load_draw_id:
218 /* For Task/Mesh, draw_id will be handled later in
219 * nir_emit_mesh_task_intrinsic().
220 */
221 if (!gl_shader_stage_is_mesh(s.stage))
222 unreachable("should be lowered by brw_nir_lower_vs_inputs().");
223 break;
224
225 case nir_intrinsic_load_invocation_id:
226 if (s.stage == MESA_SHADER_TESS_CTRL)
227 break;
228 assert(s.stage == MESA_SHADER_GEOMETRY);
229 reg = &ntb.system_values[SYSTEM_VALUE_INVOCATION_ID];
230 if (reg->file == BAD_FILE) {
231 *reg = s.gs_payload().instance_id;
232 }
233 break;
234
235 case nir_intrinsic_load_sample_pos:
236 case nir_intrinsic_load_sample_pos_or_center:
237 assert(s.stage == MESA_SHADER_FRAGMENT);
238 reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_POS];
239 if (reg->file == BAD_FILE)
240 *reg = emit_samplepos_setup(ntb);
241 break;
242
243 case nir_intrinsic_load_sample_id:
244 assert(s.stage == MESA_SHADER_FRAGMENT);
245 reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_ID];
246 if (reg->file == BAD_FILE)
247 *reg = emit_sampleid_setup(ntb);
248 break;
249
250 case nir_intrinsic_load_sample_mask_in:
251 assert(s.stage == MESA_SHADER_FRAGMENT);
252 reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
253 if (reg->file == BAD_FILE)
254 *reg = emit_samplemaskin_setup(ntb);
255 break;
256
257 case nir_intrinsic_load_workgroup_id:
258 case nir_intrinsic_load_workgroup_id_zero_base:
259 if (gl_shader_stage_is_mesh(s.stage))
260 unreachable("should be lowered by nir_lower_compute_system_values().");
261 assert(gl_shader_stage_is_compute(s.stage));
262 reg = &ntb.system_values[SYSTEM_VALUE_WORKGROUP_ID];
263 if (reg->file == BAD_FILE)
264 *reg = emit_work_group_id_setup(ntb);
265 break;
266
267 case nir_intrinsic_load_helper_invocation:
268 assert(s.stage == MESA_SHADER_FRAGMENT);
269 reg = &ntb.system_values[SYSTEM_VALUE_HELPER_INVOCATION];
270 if (reg->file == BAD_FILE) {
271 const fs_builder abld =
272 ntb.bld.annotate("gl_HelperInvocation", NULL);
273
274 /* On Gfx6+ (gl_HelperInvocation is only exposed on Gfx7+) the
275 * pixel mask is in g1.7 of the thread payload.
276 *
277 * We move the per-channel pixel enable bit to the low bit of each
278 * channel by shifting the byte containing the pixel mask by the
279 * vector immediate 0x76543210UV.
280 *
281 * The region of <1,8,0> reads only 1 byte (the pixel masks for
282 * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
283 * masks for 2 and 3) in SIMD16.
284 */
285 fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);
286
287 for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
288 const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
289 /* According to the "PS Thread Payload for Normal
290 * Dispatch" pages on the BSpec, the dispatch mask is
291 * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on
292 * gfx6+.
293 */
294 const struct brw_reg reg = s.devinfo->ver >= 20 ?
295 xe2_vec1_grf(i, 15) : brw_vec1_grf(i + 1, 7);
296 hbld.SHR(offset(shifted, hbld, i),
297 stride(retype(reg, BRW_REGISTER_TYPE_UB), 1, 8, 0),
298 brw_imm_v(0x76543210));
299 }
300
301 /* A set bit in the pixel mask means the channel is enabled, but
302 * that is the opposite of gl_HelperInvocation so we need to invert
303 * the mask.
304 *
305 * The negate source-modifier bit of logical instructions on Gfx8+
306 * performs 1's complement negation, so we can use that instead of
307 * a NOT instruction.
308 */
309 fs_reg inverted = negate(shifted);
310
311 /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
312 * with 1 and negating.
313 */
314 fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
315 abld.AND(anded, inverted, brw_imm_uw(1));
316
317 fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
318 abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
319 *reg = dst;
320 }
321 break;
322
323 case nir_intrinsic_load_frag_shading_rate:
324 reg = &ntb.system_values[SYSTEM_VALUE_FRAG_SHADING_RATE];
325 if (reg->file == BAD_FILE)
326 *reg = emit_shading_rate_setup(ntb);
327 break;
328
329 default:
330 break;
331 }
332 }
333
334 return true;
335 }
336
337 static void
fs_nir_emit_system_values(nir_to_brw_state & ntb)338 fs_nir_emit_system_values(nir_to_brw_state &ntb)
339 {
340 const fs_builder &bld = ntb.bld;
341 fs_visitor &s = ntb.s;
342
343 ntb.system_values = ralloc_array(ntb.mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
344 for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
345 ntb.system_values[i] = fs_reg();
346 }
347
348 /* Always emit SUBGROUP_INVOCATION. Dead code will clean it up if we
349 * never end up using it.
350 */
351 {
352 const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
353 fs_reg ® = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
354 reg = abld.vgrf(BRW_REGISTER_TYPE_UW);
355 abld.UNDEF(reg);
356
357 const fs_builder allbld8 = abld.group(8, 0).exec_all();
358 allbld8.MOV(reg, brw_imm_v(0x76543210));
359 if (s.dispatch_width > 8)
360 allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u));
361 if (s.dispatch_width > 16) {
362 const fs_builder allbld16 = abld.group(16, 0).exec_all();
363 allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u));
364 }
365 }
366
367 nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)s.nir);
368 nir_foreach_block(block, impl)
369 emit_system_values_block(ntb, block);
370 }
371
372 static void
fs_nir_emit_impl(nir_to_brw_state & ntb,nir_function_impl * impl)373 fs_nir_emit_impl(nir_to_brw_state &ntb, nir_function_impl *impl)
374 {
375 ntb.ssa_values = rzalloc_array(ntb.mem_ctx, fs_reg, impl->ssa_alloc);
376 ntb.resource_insts = rzalloc_array(ntb.mem_ctx, fs_inst *, impl->ssa_alloc);
377 ntb.ssa_bind_infos = rzalloc_array(ntb.mem_ctx, struct brw_fs_bind_info, impl->ssa_alloc);
378 ntb.resource_values = rzalloc_array(ntb.mem_ctx, fs_reg, impl->ssa_alloc);
379
380 fs_nir_emit_cf_list(ntb, &impl->body);
381 }
382
383 static void
fs_nir_emit_cf_list(nir_to_brw_state & ntb,exec_list * list)384 fs_nir_emit_cf_list(nir_to_brw_state &ntb, exec_list *list)
385 {
386 exec_list_validate(list);
387 foreach_list_typed(nir_cf_node, node, node, list) {
388 switch (node->type) {
389 case nir_cf_node_if:
390 fs_nir_emit_if(ntb, nir_cf_node_as_if(node));
391 break;
392
393 case nir_cf_node_loop:
394 fs_nir_emit_loop(ntb, nir_cf_node_as_loop(node));
395 break;
396
397 case nir_cf_node_block:
398 fs_nir_emit_block(ntb, nir_cf_node_as_block(node));
399 break;
400
401 default:
402 unreachable("Invalid CFG node block");
403 }
404 }
405 }
406
407 static void
fs_nir_emit_if(nir_to_brw_state & ntb,nir_if * if_stmt)408 fs_nir_emit_if(nir_to_brw_state &ntb, nir_if *if_stmt)
409 {
410 const fs_builder &bld = ntb.bld;
411
412 bool invert;
413 fs_reg cond_reg;
414
415 /* If the condition has the form !other_condition, use other_condition as
416 * the source, but invert the predicate on the if instruction.
417 */
418 nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition);
419 if (cond != NULL && cond->op == nir_op_inot) {
420 invert = true;
421 cond_reg = get_nir_src(ntb, cond->src[0].src);
422 cond_reg = offset(cond_reg, bld, cond->src[0].swizzle[0]);
423 } else {
424 invert = false;
425 cond_reg = get_nir_src(ntb, if_stmt->condition);
426 }
427
428 /* first, put the condition into f0 */
429 fs_inst *inst = bld.MOV(bld.null_reg_d(),
430 retype(cond_reg, BRW_REGISTER_TYPE_D));
431 inst->conditional_mod = BRW_CONDITIONAL_NZ;
432
433 bld.IF(BRW_PREDICATE_NORMAL)->predicate_inverse = invert;
434
435 fs_nir_emit_cf_list(ntb, &if_stmt->then_list);
436
437 if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
438 bld.emit(BRW_OPCODE_ELSE);
439 fs_nir_emit_cf_list(ntb, &if_stmt->else_list);
440 }
441
442 bld.emit(BRW_OPCODE_ENDIF);
443 }
444
445 static void
fs_nir_emit_loop(nir_to_brw_state & ntb,nir_loop * loop)446 fs_nir_emit_loop(nir_to_brw_state &ntb, nir_loop *loop)
447 {
448 const fs_builder &bld = ntb.bld;
449
450 assert(!nir_loop_has_continue_construct(loop));
451 bld.emit(BRW_OPCODE_DO);
452
453 fs_nir_emit_cf_list(ntb, &loop->body);
454
455 bld.emit(BRW_OPCODE_WHILE);
456 }
457
458 static void
fs_nir_emit_block(nir_to_brw_state & ntb,nir_block * block)459 fs_nir_emit_block(nir_to_brw_state &ntb, nir_block *block)
460 {
461 fs_builder bld = ntb.bld;
462
463 nir_foreach_instr(instr, block) {
464 fs_nir_emit_instr(ntb, instr);
465 }
466
467 ntb.bld = bld;
468 }
469
470 /**
471 * Recognizes a parent instruction of nir_op_extract_* and changes the type to
472 * match instr.
473 */
474 static bool
optimize_extract_to_float(nir_to_brw_state & ntb,nir_alu_instr * instr,const fs_reg & result)475 optimize_extract_to_float(nir_to_brw_state &ntb, nir_alu_instr *instr,
476 const fs_reg &result)
477 {
478 const intel_device_info *devinfo = ntb.devinfo;
479 const fs_builder &bld = ntb.bld;
480
481 if (!instr->src[0].src.ssa->parent_instr)
482 return false;
483
484 if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
485 return false;
486
487 nir_alu_instr *src0 =
488 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
489
490 if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
491 src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
492 return false;
493
494 unsigned element = nir_src_as_uint(src0->src[1].src);
495
496 /* Element type to extract.*/
497 const brw_reg_type type = brw_int_type(
498 src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1,
499 src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);
500
501 fs_reg op0 = get_nir_src(ntb, src0->src[0].src);
502 op0.type = brw_type_for_nir_type(devinfo,
503 (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
504 nir_src_bit_size(src0->src[0].src)));
505 op0 = offset(op0, bld, src0->src[0].swizzle[0]);
506
507 bld.MOV(result, subscript(op0, type, element));
508 return true;
509 }
510
511 static bool
optimize_frontfacing_ternary(nir_to_brw_state & ntb,nir_alu_instr * instr,const fs_reg & result)512 optimize_frontfacing_ternary(nir_to_brw_state &ntb,
513 nir_alu_instr *instr,
514 const fs_reg &result)
515 {
516 const intel_device_info *devinfo = ntb.devinfo;
517 fs_visitor &s = ntb.s;
518
519 nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src);
520 if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face)
521 return false;
522
523 if (!nir_src_is_const(instr->src[1].src) ||
524 !nir_src_is_const(instr->src[2].src))
525 return false;
526
527 const float value1 = nir_src_as_float(instr->src[1].src);
528 const float value2 = nir_src_as_float(instr->src[2].src);
529 if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
530 return false;
531
532 /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
533 assert(value1 == -value2);
534
535 fs_reg tmp = s.vgrf(glsl_int_type());
536
537 if (devinfo->ver >= 20) {
538 /* Gfx20+ has separate back-facing bits for each pair of
539 * subspans in order to support multiple polygons, so we need to
540 * use a <1;8,0> region in order to select the correct word for
541 * each channel. Unfortunately they're no longer aligned to the
542 * sign bit of a 16-bit word, so a left shift is necessary.
543 */
544 fs_reg ff = ntb.bld.vgrf(BRW_REGISTER_TYPE_UW);
545
546 for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
547 const fs_builder hbld = ntb.bld.group(16, i);
548 const struct brw_reg gi_uw = retype(xe2_vec1_grf(i, 9),
549 BRW_REGISTER_TYPE_UW);
550 hbld.SHL(offset(ff, hbld, i), stride(gi_uw, 1, 8, 0), brw_imm_ud(4));
551 }
552
553 if (value1 == -1.0f)
554 ff.negate = true;
555
556 ntb.bld.OR(subscript(tmp, BRW_REGISTER_TYPE_UW, 1), ff,
557 brw_imm_uw(0x3f80));
558
559 } else if (devinfo->ver >= 12 && s.max_polygons == 2) {
560 /* According to the BSpec "PS Thread Payload for Normal
561 * Dispatch", the front/back facing interpolation bit is stored
562 * as bit 15 of either the R1.1 or R1.6 poly info field, for the
563 * first and second polygons respectively in multipolygon PS
564 * dispatch mode.
565 */
566 assert(s.dispatch_width == 16);
567
568 for (unsigned i = 0; i < s.max_polygons; i++) {
569 const fs_builder hbld = ntb.bld.group(8, i);
570 struct brw_reg g1 = retype(brw_vec1_grf(1, 1 + 5 * i),
571 BRW_REGISTER_TYPE_UW);
572
573 if (value1 == -1.0f)
574 g1.negate = true;
575
576 hbld.OR(subscript(offset(tmp, hbld, i), BRW_REGISTER_TYPE_UW, 1),
577 g1, brw_imm_uw(0x3f80));
578 }
579
580 } else if (devinfo->ver >= 12) {
581 /* Bit 15 of g1.1 is 0 if the polygon is front facing. */
582 fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W));
583
584 /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
585 *
586 * or(8) tmp.1<2>W g1.1<0,1,0>W 0x00003f80W
587 * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
588 *
589 * and negate g1.1<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
590 */
591 if (value1 == -1.0f)
592 g1.negate = true;
593
594 ntb.bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
595 g1, brw_imm_uw(0x3f80));
596 } else {
597 /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
598 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
599
600 /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
601 *
602 * or(8) tmp.1<2>W g0.0<0,1,0>W 0x00003f80W
603 * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
604 *
605 * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
606 *
607 * This negation looks like it's safe in practice, because bits 0:4 will
608 * surely be TRIANGLES
609 */
610
611 if (value1 == -1.0f) {
612 g0.negate = true;
613 }
614
615 ntb.bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
616 g0, brw_imm_uw(0x3f80));
617 }
618 ntb.bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));
619
620 return true;
621 }
622
623 static brw_rnd_mode
brw_rnd_mode_from_nir_op(const nir_op op)624 brw_rnd_mode_from_nir_op (const nir_op op) {
625 switch (op) {
626 case nir_op_f2f16_rtz:
627 return BRW_RND_MODE_RTZ;
628 case nir_op_f2f16_rtne:
629 return BRW_RND_MODE_RTNE;
630 default:
631 unreachable("Operation doesn't support rounding mode");
632 }
633 }
634
635 static brw_rnd_mode
brw_rnd_mode_from_execution_mode(unsigned execution_mode)636 brw_rnd_mode_from_execution_mode(unsigned execution_mode)
637 {
638 if (nir_has_any_rounding_mode_rtne(execution_mode))
639 return BRW_RND_MODE_RTNE;
640 if (nir_has_any_rounding_mode_rtz(execution_mode))
641 return BRW_RND_MODE_RTZ;
642 return BRW_RND_MODE_UNSPECIFIED;
643 }
644
645 static fs_reg
prepare_alu_destination_and_sources(nir_to_brw_state & ntb,const fs_builder & bld,nir_alu_instr * instr,fs_reg * op,bool need_dest)646 prepare_alu_destination_and_sources(nir_to_brw_state &ntb,
647 const fs_builder &bld,
648 nir_alu_instr *instr,
649 fs_reg *op,
650 bool need_dest)
651 {
652 const intel_device_info *devinfo = ntb.devinfo;
653
654 fs_reg result =
655 need_dest ? get_nir_def(ntb, instr->def) : bld.null_reg_ud();
656
657 result.type = brw_type_for_nir_type(devinfo,
658 (nir_alu_type)(nir_op_infos[instr->op].output_type |
659 instr->def.bit_size));
660
661 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
662 op[i] = get_nir_src(ntb, instr->src[i].src);
663 op[i].type = brw_type_for_nir_type(devinfo,
664 (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
665 nir_src_bit_size(instr->src[i].src)));
666 }
667
668 /* Move and vecN instrutions may still be vectored. Return the raw,
669 * vectored source and destination so that fs_visitor::nir_emit_alu can
670 * handle it. Other callers should not have to handle these kinds of
671 * instructions.
672 */
673 switch (instr->op) {
674 case nir_op_mov:
675 case nir_op_vec2:
676 case nir_op_vec3:
677 case nir_op_vec4:
678 case nir_op_vec8:
679 case nir_op_vec16:
680 return result;
681 default:
682 break;
683 }
684
685 /* At this point, we have dealt with any instruction that operates on
686 * more than a single channel. Therefore, we can just adjust the source
687 * and destination registers for that channel and emit the instruction.
688 */
689 unsigned channel = 0;
690 if (nir_op_infos[instr->op].output_size == 0) {
691 /* Since NIR is doing the scalarizing for us, we should only ever see
692 * vectorized operations with a single channel.
693 */
694 nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
695 assert(util_bitcount(write_mask) == 1);
696 channel = ffs(write_mask) - 1;
697
698 result = offset(result, bld, channel);
699 }
700
701 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
702 assert(nir_op_infos[instr->op].input_sizes[i] < 2);
703 op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
704 }
705
706 return result;
707 }
708
709 static fs_reg
resolve_source_modifiers(const fs_builder & bld,const fs_reg & src)710 resolve_source_modifiers(const fs_builder &bld, const fs_reg &src)
711 {
712 if (!src.abs && !src.negate)
713 return src;
714
715 fs_reg temp = bld.vgrf(src.type);
716 bld.MOV(temp, src);
717
718 return temp;
719 }
720
721 static void
resolve_inot_sources(nir_to_brw_state & ntb,const fs_builder & bld,nir_alu_instr * instr,fs_reg * op)722 resolve_inot_sources(nir_to_brw_state &ntb, const fs_builder &bld, nir_alu_instr *instr,
723 fs_reg *op)
724 {
725 for (unsigned i = 0; i < 2; i++) {
726 nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src);
727
728 if (inot_instr != NULL && inot_instr->op == nir_op_inot) {
729 /* The source of the inot is now the source of instr. */
730 prepare_alu_destination_and_sources(ntb, bld, inot_instr, &op[i], false);
731
732 assert(!op[i].negate);
733 op[i].negate = true;
734 } else {
735 op[i] = resolve_source_modifiers(bld, op[i]);
736 }
737 }
738 }
739
740 static bool
try_emit_b2fi_of_inot(nir_to_brw_state & ntb,const fs_builder & bld,fs_reg result,nir_alu_instr * instr)741 try_emit_b2fi_of_inot(nir_to_brw_state &ntb, const fs_builder &bld,
742 fs_reg result,
743 nir_alu_instr *instr)
744 {
745 const intel_device_info *devinfo = bld.shader->devinfo;
746
747 if (devinfo->verx10 >= 125)
748 return false;
749
750 nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src);
751
752 if (inot_instr == NULL || inot_instr->op != nir_op_inot)
753 return false;
754
755 /* HF is also possible as a destination on BDW+. For nir_op_b2i, the set
756 * of valid size-changing combinations is a bit more complex.
757 *
758 * The source restriction is just because I was lazy about generating the
759 * constant below.
760 */
761 if (instr->def.bit_size != 32 ||
762 nir_src_bit_size(inot_instr->src[0].src) != 32)
763 return false;
764
765 /* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0. Since a can only be 0 or -1,
766 * this is float(1 + a).
767 */
768 fs_reg op;
769
770 prepare_alu_destination_and_sources(ntb, bld, inot_instr, &op, false);
771
772 /* Ignore the saturate modifier, if there is one. The result of the
773 * arithmetic can only be 0 or 1, so the clamping will do nothing anyway.
774 */
775 bld.ADD(result, op, brw_imm_d(1));
776
777 return true;
778 }
779
780 /**
781 * Emit code for nir_op_fsign possibly fused with a nir_op_fmul
782 *
783 * If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of
784 * the source of \c instr that is a \c nir_op_fsign.
785 */
786 static void
emit_fsign(nir_to_brw_state & ntb,const fs_builder & bld,const nir_alu_instr * instr,fs_reg result,fs_reg * op,unsigned fsign_src)787 emit_fsign(nir_to_brw_state &ntb, const fs_builder &bld, const nir_alu_instr *instr,
788 fs_reg result, fs_reg *op, unsigned fsign_src)
789 {
790 fs_visitor &s = ntb.s;
791 const intel_device_info *devinfo = ntb.devinfo;
792
793 fs_inst *inst;
794
795 assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul);
796 assert(fsign_src < nir_op_infos[instr->op].num_inputs);
797
798 if (instr->op != nir_op_fsign) {
799 const nir_alu_instr *const fsign_instr =
800 nir_src_as_alu_instr(instr->src[fsign_src].src);
801
802 /* op[fsign_src] has the nominal result of the fsign, and op[1 -
803 * fsign_src] has the other multiply source. This must be rearranged so
804 * that op[0] is the source of the fsign op[1] is the other multiply
805 * source.
806 */
807 if (fsign_src != 0)
808 op[1] = op[0];
809
810 op[0] = get_nir_src(ntb, fsign_instr->src[0].src);
811
812 const nir_alu_type t =
813 (nir_alu_type)(nir_op_infos[instr->op].input_types[0] |
814 nir_src_bit_size(fsign_instr->src[0].src));
815
816 op[0].type = brw_type_for_nir_type(devinfo, t);
817
818 unsigned channel = 0;
819 if (nir_op_infos[instr->op].output_size == 0) {
820 /* Since NIR is doing the scalarizing for us, we should only ever see
821 * vectorized operations with a single channel.
822 */
823 nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
824 assert(util_bitcount(write_mask) == 1);
825 channel = ffs(write_mask) - 1;
826 }
827
828 op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]);
829 }
830
831 if (type_sz(op[0].type) == 2) {
832 /* AND(val, 0x8000) gives the sign bit.
833 *
834 * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero.
835 */
836 fs_reg zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF);
837 bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ);
838
839 op[0].type = BRW_REGISTER_TYPE_UW;
840 result.type = BRW_REGISTER_TYPE_UW;
841 bld.AND(result, op[0], brw_imm_uw(0x8000u));
842
843 if (instr->op == nir_op_fsign)
844 inst = bld.OR(result, result, brw_imm_uw(0x3c00u));
845 else {
846 /* Use XOR here to get the result sign correct. */
847 inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UW));
848 }
849
850 inst->predicate = BRW_PREDICATE_NORMAL;
851 } else if (type_sz(op[0].type) == 4) {
852 /* AND(val, 0x80000000) gives the sign bit.
853 *
854 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
855 * zero.
856 */
857 bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
858
859 op[0].type = BRW_REGISTER_TYPE_UD;
860 result.type = BRW_REGISTER_TYPE_UD;
861 bld.AND(result, op[0], brw_imm_ud(0x80000000u));
862
863 if (instr->op == nir_op_fsign)
864 inst = bld.OR(result, result, brw_imm_ud(0x3f800000u));
865 else {
866 /* Use XOR here to get the result sign correct. */
867 inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UD));
868 }
869
870 inst->predicate = BRW_PREDICATE_NORMAL;
871 } else {
872 /* For doubles we do the same but we need to consider:
873 *
874 * - 2-src instructions can't operate with 64-bit immediates
875 * - The sign is encoded in the high 32-bit of each DF
876 * - We need to produce a DF result.
877 */
878
879 fs_reg zero = s.vgrf(glsl_double_type());
880 bld.MOV(zero, brw_imm_df(0.0));
881 bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ);
882
883 bld.MOV(result, zero);
884
885 fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1);
886 bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1),
887 brw_imm_ud(0x80000000u));
888
889 if (instr->op == nir_op_fsign) {
890 set_predicate(BRW_PREDICATE_NORMAL,
891 bld.OR(r, r, brw_imm_ud(0x3ff00000u)));
892 } else {
893 if (devinfo->has_64bit_int) {
894 /* This could be done better in some cases. If the scale is an
895 * immediate with the low 32-bits all 0, emitting a separate XOR and
896 * OR would allow an algebraic optimization to remove the OR. There
897 * are currently zero instances of fsign(double(x))*IMM in shader-db
898 * or any test suite, so it is hard to care at this time.
899 */
900 fs_reg result_int64 = retype(result, BRW_REGISTER_TYPE_UQ);
901 inst = bld.XOR(result_int64, result_int64,
902 retype(op[1], BRW_REGISTER_TYPE_UQ));
903 } else {
904 fs_reg result_int64 = retype(result, BRW_REGISTER_TYPE_UQ);
905 bld.MOV(subscript(result_int64, BRW_REGISTER_TYPE_UD, 0),
906 subscript(op[1], BRW_REGISTER_TYPE_UD, 0));
907 bld.XOR(subscript(result_int64, BRW_REGISTER_TYPE_UD, 1),
908 subscript(result_int64, BRW_REGISTER_TYPE_UD, 1),
909 subscript(op[1], BRW_REGISTER_TYPE_UD, 1));
910 }
911 }
912 }
913 }
914
915 /**
916 * Determine whether sources of a nir_op_fmul can be fused with a nir_op_fsign
917 *
918 * Checks the operands of a \c nir_op_fmul to determine whether or not
919 * \c emit_fsign could fuse the multiplication with the \c sign() calculation.
920 *
921 * \param instr The multiplication instruction
922 *
923 * \param fsign_src The source of \c instr that may or may not be a
924 * \c nir_op_fsign
925 */
926 static bool
can_fuse_fmul_fsign(nir_alu_instr * instr,unsigned fsign_src)927 can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src)
928 {
929 assert(instr->op == nir_op_fmul);
930
931 nir_alu_instr *const fsign_instr =
932 nir_src_as_alu_instr(instr->src[fsign_src].src);
933
934 /* Rules:
935 *
936 * 1. instr->src[fsign_src] must be a nir_op_fsign.
937 * 2. The nir_op_fsign can only be used by this multiplication.
938 * 3. The source that is the nir_op_fsign does not have source modifiers.
939 * \c emit_fsign only examines the source modifiers of the source of the
940 * \c nir_op_fsign.
941 *
942 * The nir_op_fsign must also not have the saturate modifier, but steps
943 * have already been taken (in nir_opt_algebraic) to ensure that.
944 */
945 return fsign_instr != NULL && fsign_instr->op == nir_op_fsign &&
946 is_used_once(fsign_instr);
947 }
948
949 static bool
is_const_zero(const nir_src & src)950 is_const_zero(const nir_src &src)
951 {
952 return nir_src_is_const(src) && nir_src_as_int(src) == 0;
953 }
954
955 static void
fs_nir_emit_alu(nir_to_brw_state & ntb,nir_alu_instr * instr,bool need_dest)956 fs_nir_emit_alu(nir_to_brw_state &ntb, nir_alu_instr *instr,
957 bool need_dest)
958 {
959 const intel_device_info *devinfo = ntb.devinfo;
960 const fs_builder &bld = ntb.bld;
961 fs_visitor &s = ntb.s;
962
963 fs_inst *inst;
964 unsigned execution_mode =
965 bld.shader->nir->info.float_controls_execution_mode;
966
967 fs_reg op[NIR_MAX_VEC_COMPONENTS];
968 fs_reg result = prepare_alu_destination_and_sources(ntb, bld, instr, op, need_dest);
969
970 #ifndef NDEBUG
971 /* Everything except raw moves, some type conversions, iabs, and ineg
972 * should have 8-bit sources lowered by nir_lower_bit_size in
973 * brw_preprocess_nir or by brw_nir_lower_conversions in
974 * brw_postprocess_nir.
975 */
976 switch (instr->op) {
977 case nir_op_mov:
978 case nir_op_vec2:
979 case nir_op_vec3:
980 case nir_op_vec4:
981 case nir_op_vec8:
982 case nir_op_vec16:
983 case nir_op_i2f16:
984 case nir_op_i2f32:
985 case nir_op_i2i16:
986 case nir_op_i2i32:
987 case nir_op_u2f16:
988 case nir_op_u2f32:
989 case nir_op_u2u16:
990 case nir_op_u2u32:
991 case nir_op_iabs:
992 case nir_op_ineg:
993 case nir_op_pack_32_4x8_split:
994 break;
995
996 default:
997 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
998 assert(type_sz(op[i].type) > 1);
999 }
1000 }
1001 #endif
1002
1003 switch (instr->op) {
1004 case nir_op_mov:
1005 case nir_op_vec2:
1006 case nir_op_vec3:
1007 case nir_op_vec4:
1008 case nir_op_vec8:
1009 case nir_op_vec16: {
1010 fs_reg temp = result;
1011 bool need_extra_copy = false;
1012
1013 nir_intrinsic_instr *store_reg =
1014 nir_store_reg_for_def(&instr->def);
1015 if (store_reg != NULL) {
1016 nir_def *dest_reg = store_reg->src[1].ssa;
1017 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1018 nir_intrinsic_instr *load_reg =
1019 nir_load_reg_for_def(instr->src[i].src.ssa);
1020 if (load_reg == NULL)
1021 continue;
1022
1023 if (load_reg->src[0].ssa == dest_reg) {
1024 need_extra_copy = true;
1025 temp = bld.vgrf(result.type, 4);
1026 break;
1027 }
1028 }
1029 }
1030
1031 nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
1032 unsigned last_bit = util_last_bit(write_mask);
1033
1034 for (unsigned i = 0; i < last_bit; i++) {
1035 if (!(write_mask & (1 << i)))
1036 continue;
1037
1038 if (instr->op == nir_op_mov) {
1039 bld.MOV(offset(temp, bld, i),
1040 offset(op[0], bld, instr->src[0].swizzle[i]));
1041 } else {
1042 bld.MOV(offset(temp, bld, i),
1043 offset(op[i], bld, instr->src[i].swizzle[0]));
1044 }
1045 }
1046
1047 /* In this case the source and destination registers were the same,
1048 * so we need to insert an extra set of moves in order to deal with
1049 * any swizzling.
1050 */
1051 if (need_extra_copy) {
1052 for (unsigned i = 0; i < last_bit; i++) {
1053 if (!(write_mask & (1 << i)))
1054 continue;
1055
1056 bld.MOV(offset(result, bld, i), offset(temp, bld, i));
1057 }
1058 }
1059 return;
1060 }
1061
1062 case nir_op_i2f32:
1063 case nir_op_u2f32:
1064 if (optimize_extract_to_float(ntb, instr, result))
1065 return;
1066 inst = bld.MOV(result, op[0]);
1067 break;
1068
1069 case nir_op_f2f16_rtne:
1070 case nir_op_f2f16_rtz:
1071 case nir_op_f2f16: {
1072 brw_rnd_mode rnd = BRW_RND_MODE_UNSPECIFIED;
1073
1074 if (nir_op_f2f16 == instr->op)
1075 rnd = brw_rnd_mode_from_execution_mode(execution_mode);
1076 else
1077 rnd = brw_rnd_mode_from_nir_op(instr->op);
1078
1079 if (BRW_RND_MODE_UNSPECIFIED != rnd)
1080 bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), brw_imm_d(rnd));
1081
1082 assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
1083 inst = bld.MOV(result, op[0]);
1084 break;
1085 }
1086
1087 case nir_op_b2i8:
1088 case nir_op_b2i16:
1089 case nir_op_b2i32:
1090 case nir_op_b2i64:
1091 case nir_op_b2f16:
1092 case nir_op_b2f32:
1093 case nir_op_b2f64:
1094 if (try_emit_b2fi_of_inot(ntb, bld, result, instr))
1095 break;
1096 op[0].type = BRW_REGISTER_TYPE_D;
1097 op[0].negate = !op[0].negate;
1098 FALLTHROUGH;
1099 case nir_op_i2f64:
1100 case nir_op_i2i64:
1101 case nir_op_u2f64:
1102 case nir_op_u2u64:
1103 case nir_op_f2f64:
1104 case nir_op_f2i64:
1105 case nir_op_f2u64:
1106 case nir_op_i2i32:
1107 case nir_op_u2u32:
1108 case nir_op_f2i32:
1109 case nir_op_f2u32:
1110 case nir_op_i2f16:
1111 case nir_op_u2f16:
1112 case nir_op_f2i16:
1113 case nir_op_f2u16:
1114 case nir_op_f2i8:
1115 case nir_op_f2u8:
1116 if (result.type == BRW_REGISTER_TYPE_B ||
1117 result.type == BRW_REGISTER_TYPE_UB ||
1118 result.type == BRW_REGISTER_TYPE_HF)
1119 assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
1120
1121 if (op[0].type == BRW_REGISTER_TYPE_B ||
1122 op[0].type == BRW_REGISTER_TYPE_UB ||
1123 op[0].type == BRW_REGISTER_TYPE_HF)
1124 assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */
1125
1126 inst = bld.MOV(result, op[0]);
1127 break;
1128
1129 case nir_op_i2i8:
1130 case nir_op_u2u8:
1131 assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
1132 FALLTHROUGH;
1133 case nir_op_i2i16:
1134 case nir_op_u2u16: {
1135 /* Emit better code for u2u8(extract_u8(a, b)) and similar patterns.
1136 * Emitting the instructions one by one results in two MOV instructions
1137 * that won't be propagated. By handling both instructions here, a
1138 * single MOV is emitted.
1139 */
1140 nir_alu_instr *extract_instr = nir_src_as_alu_instr(instr->src[0].src);
1141 if (extract_instr != NULL) {
1142 if (extract_instr->op == nir_op_extract_u8 ||
1143 extract_instr->op == nir_op_extract_i8) {
1144 prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false);
1145
1146 const unsigned byte = nir_src_as_uint(extract_instr->src[1].src);
1147 const brw_reg_type type =
1148 brw_int_type(1, extract_instr->op == nir_op_extract_i8);
1149
1150 op[0] = subscript(op[0], type, byte);
1151 } else if (extract_instr->op == nir_op_extract_u16 ||
1152 extract_instr->op == nir_op_extract_i16) {
1153 prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false);
1154
1155 const unsigned word = nir_src_as_uint(extract_instr->src[1].src);
1156 const brw_reg_type type =
1157 brw_int_type(2, extract_instr->op == nir_op_extract_i16);
1158
1159 op[0] = subscript(op[0], type, word);
1160 }
1161 }
1162
1163 inst = bld.MOV(result, op[0]);
1164 break;
1165 }
1166
1167 case nir_op_fsat:
1168 inst = bld.MOV(result, op[0]);
1169 inst->saturate = true;
1170 break;
1171
1172 case nir_op_fneg:
1173 case nir_op_ineg:
1174 op[0].negate = true;
1175 inst = bld.MOV(result, op[0]);
1176 break;
1177
1178 case nir_op_fabs:
1179 case nir_op_iabs:
1180 op[0].negate = false;
1181 op[0].abs = true;
1182 inst = bld.MOV(result, op[0]);
1183 break;
1184
1185 case nir_op_f2f32:
1186 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1187 brw_rnd_mode rnd =
1188 brw_rnd_mode_from_execution_mode(execution_mode);
1189 bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1190 brw_imm_d(rnd));
1191 }
1192
1193 if (op[0].type == BRW_REGISTER_TYPE_HF)
1194 assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */
1195
1196 inst = bld.MOV(result, op[0]);
1197 break;
1198
1199 case nir_op_fsign:
1200 emit_fsign(ntb, bld, instr, result, op, 0);
1201 break;
1202
1203 case nir_op_frcp:
1204 inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
1205 break;
1206
1207 case nir_op_fexp2:
1208 inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
1209 break;
1210
1211 case nir_op_flog2:
1212 inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
1213 break;
1214
1215 case nir_op_fsin:
1216 inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
1217 break;
1218
1219 case nir_op_fcos:
1220 inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
1221 break;
1222
1223 case nir_op_fddx_fine:
1224 inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
1225 break;
1226 case nir_op_fddx:
1227 case nir_op_fddx_coarse:
1228 inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
1229 break;
1230 case nir_op_fddy_fine:
1231 inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
1232 break;
1233 case nir_op_fddy:
1234 case nir_op_fddy_coarse:
1235 inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
1236 break;
1237
1238 case nir_op_fadd:
1239 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1240 brw_rnd_mode rnd =
1241 brw_rnd_mode_from_execution_mode(execution_mode);
1242 bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1243 brw_imm_d(rnd));
1244 }
1245 FALLTHROUGH;
1246 case nir_op_iadd:
1247 inst = bld.ADD(result, op[0], op[1]);
1248 break;
1249
1250 case nir_op_iadd3:
1251 inst = bld.ADD3(result, op[0], op[1], op[2]);
1252 break;
1253
1254 case nir_op_iadd_sat:
1255 case nir_op_uadd_sat:
1256 inst = bld.ADD(result, op[0], op[1]);
1257 inst->saturate = true;
1258 break;
1259
1260 case nir_op_isub_sat:
1261 bld.emit(SHADER_OPCODE_ISUB_SAT, result, op[0], op[1]);
1262 break;
1263
1264 case nir_op_usub_sat:
1265 bld.emit(SHADER_OPCODE_USUB_SAT, result, op[0], op[1]);
1266 break;
1267
1268 case nir_op_irhadd:
1269 case nir_op_urhadd:
1270 assert(instr->def.bit_size < 64);
1271 inst = bld.AVG(result, op[0], op[1]);
1272 break;
1273
1274 case nir_op_ihadd:
1275 case nir_op_uhadd: {
1276 assert(instr->def.bit_size < 64);
1277 fs_reg tmp = bld.vgrf(result.type);
1278
1279 op[0] = resolve_source_modifiers(bld, op[0]);
1280 op[1] = resolve_source_modifiers(bld, op[1]);
1281
1282 /* AVG(x, y) - ((x ^ y) & 1) */
1283 bld.XOR(tmp, op[0], op[1]);
1284 bld.AND(tmp, tmp, retype(brw_imm_ud(1), result.type));
1285 bld.AVG(result, op[0], op[1]);
1286 inst = bld.ADD(result, result, tmp);
1287 inst->src[1].negate = true;
1288 break;
1289 }
1290
1291 case nir_op_fmul:
1292 for (unsigned i = 0; i < 2; i++) {
1293 if (can_fuse_fmul_fsign(instr, i)) {
1294 emit_fsign(ntb, bld, instr, result, op, i);
1295 return;
1296 }
1297 }
1298
1299 /* We emit the rounding mode after the previous fsign optimization since
1300 * it won't result in a MUL, but will try to negate the value by other
1301 * means.
1302 */
1303 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1304 brw_rnd_mode rnd =
1305 brw_rnd_mode_from_execution_mode(execution_mode);
1306 bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1307 brw_imm_d(rnd));
1308 }
1309
1310 inst = bld.MUL(result, op[0], op[1]);
1311 break;
1312
1313 case nir_op_imul_2x32_64:
1314 case nir_op_umul_2x32_64:
1315 bld.MUL(result, op[0], op[1]);
1316 break;
1317
1318 case nir_op_imul_32x16:
1319 case nir_op_umul_32x16: {
1320 const bool ud = instr->op == nir_op_umul_32x16;
1321 const enum brw_reg_type word_type =
1322 ud ? BRW_REGISTER_TYPE_UW : BRW_REGISTER_TYPE_W;
1323 const enum brw_reg_type dword_type =
1324 ud ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
1325
1326 assert(instr->def.bit_size == 32);
1327
1328 /* Before copy propagation there are no immediate values. */
1329 assert(op[0].file != IMM && op[1].file != IMM);
1330
1331 op[1] = subscript(op[1], word_type, 0);
1332
1333 bld.MUL(result, retype(op[0], dword_type), op[1]);
1334
1335 break;
1336 }
1337
1338 case nir_op_imul:
1339 assert(instr->def.bit_size < 64);
1340 bld.MUL(result, op[0], op[1]);
1341 break;
1342
1343 case nir_op_imul_high:
1344 case nir_op_umul_high:
1345 assert(instr->def.bit_size < 64);
1346 if (instr->def.bit_size == 32) {
1347 bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
1348 } else {
1349 fs_reg tmp = bld.vgrf(brw_reg_type_from_bit_size(32, op[0].type));
1350 bld.MUL(tmp, op[0], op[1]);
1351 bld.MOV(result, subscript(tmp, result.type, 1));
1352 }
1353 break;
1354
1355 case nir_op_idiv:
1356 case nir_op_udiv:
1357 assert(instr->def.bit_size < 64);
1358 bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
1359 break;
1360
1361 case nir_op_uadd_carry:
1362 unreachable("Should have been lowered by carry_to_arith().");
1363
1364 case nir_op_usub_borrow:
1365 unreachable("Should have been lowered by borrow_to_arith().");
1366
1367 case nir_op_umod:
1368 case nir_op_irem:
1369 /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
1370 * appears that our hardware just does the right thing for signed
1371 * remainder.
1372 */
1373 assert(instr->def.bit_size < 64);
1374 bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1375 break;
1376
1377 case nir_op_imod: {
1378 /* Get a regular C-style remainder. If a % b == 0, set the predicate. */
1379 bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1380
1381 /* Math instructions don't support conditional mod */
1382 inst = bld.MOV(bld.null_reg_d(), result);
1383 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1384
1385 /* Now, we need to determine if signs of the sources are different.
1386 * When we XOR the sources, the top bit is 0 if they are the same and 1
1387 * if they are different. We can then use a conditional modifier to
1388 * turn that into a predicate. This leads us to an XOR.l instruction.
1389 *
1390 * Technically, according to the PRM, you're not allowed to use .l on a
1391 * XOR instruction. However, empirical experiments and Curro's reading
1392 * of the simulator source both indicate that it's safe.
1393 */
1394 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
1395 inst = bld.XOR(tmp, op[0], op[1]);
1396 inst->predicate = BRW_PREDICATE_NORMAL;
1397 inst->conditional_mod = BRW_CONDITIONAL_L;
1398
1399 /* If the result of the initial remainder operation is non-zero and the
1400 * two sources have different signs, add in a copy of op[1] to get the
1401 * final integer modulus value.
1402 */
1403 inst = bld.ADD(result, result, op[1]);
1404 inst->predicate = BRW_PREDICATE_NORMAL;
1405 break;
1406 }
1407
1408 case nir_op_flt32:
1409 case nir_op_fge32:
1410 case nir_op_feq32:
1411 case nir_op_fneu32: {
1412 fs_reg dest = result;
1413
1414 const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1415 if (bit_size != 32) {
1416 dest = bld.vgrf(op[0].type, 1);
1417 bld.UNDEF(dest);
1418 }
1419
1420 bld.CMP(dest, op[0], op[1], brw_cmod_for_nir_comparison(instr->op));
1421
1422 if (bit_size > 32) {
1423 bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
1424 } else if(bit_size < 32) {
1425 /* When we convert the result to 32-bit we need to be careful and do
1426 * it as a signed conversion to get sign extension (for 32-bit true)
1427 */
1428 const brw_reg_type src_type =
1429 brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
1430
1431 bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
1432 }
1433 break;
1434 }
1435
1436 case nir_op_ilt32:
1437 case nir_op_ult32:
1438 case nir_op_ige32:
1439 case nir_op_uge32:
1440 case nir_op_ieq32:
1441 case nir_op_ine32: {
1442 fs_reg dest = result;
1443
1444 const uint32_t bit_size = type_sz(op[0].type) * 8;
1445 if (bit_size != 32) {
1446 dest = bld.vgrf(op[0].type, 1);
1447 bld.UNDEF(dest);
1448 }
1449
1450 bld.CMP(dest, op[0], op[1],
1451 brw_cmod_for_nir_comparison(instr->op));
1452
1453 if (bit_size > 32) {
1454 bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
1455 } else if (bit_size < 32) {
1456 /* When we convert the result to 32-bit we need to be careful and do
1457 * it as a signed conversion to get sign extension (for 32-bit true)
1458 */
1459 const brw_reg_type src_type =
1460 brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
1461
1462 bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
1463 }
1464 break;
1465 }
1466
1467 case nir_op_inot: {
1468 nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src);
1469
1470 if (inot_src_instr != NULL &&
1471 (inot_src_instr->op == nir_op_ior ||
1472 inot_src_instr->op == nir_op_ixor ||
1473 inot_src_instr->op == nir_op_iand)) {
1474 /* The sources of the source logical instruction are now the
1475 * sources of the instruction that will be generated.
1476 */
1477 prepare_alu_destination_and_sources(ntb, bld, inot_src_instr, op, false);
1478 resolve_inot_sources(ntb, bld, inot_src_instr, op);
1479
1480 /* Smash all of the sources and destination to be signed. This
1481 * doesn't matter for the operation of the instruction, but cmod
1482 * propagation fails on unsigned sources with negation (due to
1483 * fs_inst::can_do_cmod returning false).
1484 */
1485 result.type =
1486 brw_type_for_nir_type(devinfo,
1487 (nir_alu_type)(nir_type_int |
1488 instr->def.bit_size));
1489 op[0].type =
1490 brw_type_for_nir_type(devinfo,
1491 (nir_alu_type)(nir_type_int |
1492 nir_src_bit_size(inot_src_instr->src[0].src)));
1493 op[1].type =
1494 brw_type_for_nir_type(devinfo,
1495 (nir_alu_type)(nir_type_int |
1496 nir_src_bit_size(inot_src_instr->src[1].src)));
1497
1498 /* For XOR, only invert one of the sources. Arbitrarily choose
1499 * the first source.
1500 */
1501 op[0].negate = !op[0].negate;
1502 if (inot_src_instr->op != nir_op_ixor)
1503 op[1].negate = !op[1].negate;
1504
1505 switch (inot_src_instr->op) {
1506 case nir_op_ior:
1507 bld.AND(result, op[0], op[1]);
1508 return;
1509
1510 case nir_op_iand:
1511 bld.OR(result, op[0], op[1]);
1512 return;
1513
1514 case nir_op_ixor:
1515 bld.XOR(result, op[0], op[1]);
1516 return;
1517
1518 default:
1519 unreachable("impossible opcode");
1520 }
1521 }
1522 op[0] = resolve_source_modifiers(bld, op[0]);
1523 bld.NOT(result, op[0]);
1524 break;
1525 }
1526
1527 case nir_op_ixor:
1528 resolve_inot_sources(ntb, bld, instr, op);
1529 bld.XOR(result, op[0], op[1]);
1530 break;
1531 case nir_op_ior:
1532 resolve_inot_sources(ntb, bld, instr, op);
1533 bld.OR(result, op[0], op[1]);
1534 break;
1535 case nir_op_iand:
1536 resolve_inot_sources(ntb, bld, instr, op);
1537 bld.AND(result, op[0], op[1]);
1538 break;
1539
1540 case nir_op_fdot2:
1541 case nir_op_fdot3:
1542 case nir_op_fdot4:
1543 case nir_op_b32all_fequal2:
1544 case nir_op_b32all_iequal2:
1545 case nir_op_b32all_fequal3:
1546 case nir_op_b32all_iequal3:
1547 case nir_op_b32all_fequal4:
1548 case nir_op_b32all_iequal4:
1549 case nir_op_b32any_fnequal2:
1550 case nir_op_b32any_inequal2:
1551 case nir_op_b32any_fnequal3:
1552 case nir_op_b32any_inequal3:
1553 case nir_op_b32any_fnequal4:
1554 case nir_op_b32any_inequal4:
1555 unreachable("Lowered by nir_lower_alu_reductions");
1556
1557 case nir_op_ldexp:
1558 unreachable("not reached: should be handled by ldexp_to_arith()");
1559
1560 case nir_op_fsqrt:
1561 inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
1562 break;
1563
1564 case nir_op_frsq:
1565 inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
1566 break;
1567
1568 case nir_op_ftrunc:
1569 inst = bld.RNDZ(result, op[0]);
1570 break;
1571
1572 case nir_op_fceil: {
1573 op[0].negate = !op[0].negate;
1574 fs_reg temp = s.vgrf(glsl_float_type());
1575 bld.RNDD(temp, op[0]);
1576 temp.negate = true;
1577 inst = bld.MOV(result, temp);
1578 break;
1579 }
1580 case nir_op_ffloor:
1581 inst = bld.RNDD(result, op[0]);
1582 break;
1583 case nir_op_ffract:
1584 inst = bld.FRC(result, op[0]);
1585 break;
1586 case nir_op_fround_even:
1587 inst = bld.RNDE(result, op[0]);
1588 break;
1589
1590 case nir_op_fquantize2f16: {
1591 fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
1592 fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
1593 fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);
1594
1595 /* The destination stride must be at least as big as the source stride. */
1596 tmp16 = subscript(tmp16, BRW_REGISTER_TYPE_HF, 0);
1597
1598 /* Check for denormal */
1599 fs_reg abs_src0 = op[0];
1600 abs_src0.abs = true;
1601 bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
1602 BRW_CONDITIONAL_L);
1603 /* Get the appropriately signed zero */
1604 bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
1605 retype(op[0], BRW_REGISTER_TYPE_UD),
1606 brw_imm_ud(0x80000000));
1607 /* Do the actual F32 -> F16 -> F32 conversion */
1608 bld.MOV(tmp16, op[0]);
1609 bld.MOV(tmp32, tmp16);
1610 /* Select that or zero based on normal status */
1611 inst = bld.SEL(result, zero, tmp32);
1612 inst->predicate = BRW_PREDICATE_NORMAL;
1613 break;
1614 }
1615
1616 case nir_op_imin:
1617 case nir_op_umin:
1618 case nir_op_fmin:
1619 inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
1620 break;
1621
1622 case nir_op_imax:
1623 case nir_op_umax:
1624 case nir_op_fmax:
1625 inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
1626 break;
1627
1628 case nir_op_pack_snorm_2x16:
1629 case nir_op_pack_snorm_4x8:
1630 case nir_op_pack_unorm_2x16:
1631 case nir_op_pack_unorm_4x8:
1632 case nir_op_unpack_snorm_2x16:
1633 case nir_op_unpack_snorm_4x8:
1634 case nir_op_unpack_unorm_2x16:
1635 case nir_op_unpack_unorm_4x8:
1636 case nir_op_unpack_half_2x16:
1637 case nir_op_pack_half_2x16:
1638 unreachable("not reached: should be handled by lower_packing_builtins");
1639
1640 case nir_op_unpack_half_2x16_split_x_flush_to_zero:
1641 assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode);
1642 FALLTHROUGH;
1643 case nir_op_unpack_half_2x16_split_x:
1644 inst = bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_HF, 0));
1645 break;
1646
1647 case nir_op_unpack_half_2x16_split_y_flush_to_zero:
1648 assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode);
1649 FALLTHROUGH;
1650 case nir_op_unpack_half_2x16_split_y:
1651 inst = bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_HF, 1));
1652 break;
1653
1654 case nir_op_pack_64_2x32_split:
1655 case nir_op_pack_32_2x16_split:
1656 bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
1657 break;
1658
1659 case nir_op_pack_32_4x8_split:
1660 bld.emit(FS_OPCODE_PACK, result, op, 4);
1661 break;
1662
1663 case nir_op_unpack_64_2x32_split_x:
1664 case nir_op_unpack_64_2x32_split_y: {
1665 if (instr->op == nir_op_unpack_64_2x32_split_x)
1666 bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0));
1667 else
1668 bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
1669 break;
1670 }
1671
1672 case nir_op_unpack_32_2x16_split_x:
1673 case nir_op_unpack_32_2x16_split_y: {
1674 if (instr->op == nir_op_unpack_32_2x16_split_x)
1675 bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
1676 else
1677 bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
1678 break;
1679 }
1680
1681 case nir_op_fpow:
1682 inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
1683 break;
1684
1685 case nir_op_bitfield_reverse:
1686 assert(instr->def.bit_size == 32);
1687 assert(nir_src_bit_size(instr->src[0].src) == 32);
1688 bld.BFREV(result, op[0]);
1689 break;
1690
1691 case nir_op_bit_count:
1692 assert(instr->def.bit_size == 32);
1693 assert(nir_src_bit_size(instr->src[0].src) < 64);
1694 bld.CBIT(result, op[0]);
1695 break;
1696
1697 case nir_op_uclz:
1698 assert(instr->def.bit_size == 32);
1699 assert(nir_src_bit_size(instr->src[0].src) == 32);
1700 bld.LZD(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
1701 break;
1702
1703 case nir_op_ifind_msb: {
1704 assert(instr->def.bit_size == 32);
1705 assert(nir_src_bit_size(instr->src[0].src) == 32);
1706
1707 bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
1708
1709 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1710 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1711 * subtract the result from 31 to convert the MSB count into an LSB
1712 * count.
1713 */
1714 bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
1715
1716 inst = bld.ADD(result, result, brw_imm_d(31));
1717 inst->predicate = BRW_PREDICATE_NORMAL;
1718 inst->src[0].negate = true;
1719 break;
1720 }
1721
1722 case nir_op_find_lsb:
1723 assert(instr->def.bit_size == 32);
1724 assert(nir_src_bit_size(instr->src[0].src) == 32);
1725 bld.FBL(result, op[0]);
1726 break;
1727
1728 case nir_op_ubitfield_extract:
1729 case nir_op_ibitfield_extract:
1730 unreachable("should have been lowered");
1731 case nir_op_ubfe:
1732 case nir_op_ibfe:
1733 assert(instr->def.bit_size < 64);
1734 bld.BFE(result, op[2], op[1], op[0]);
1735 break;
1736 case nir_op_bfm:
1737 assert(instr->def.bit_size < 64);
1738 bld.BFI1(result, op[0], op[1]);
1739 break;
1740 case nir_op_bfi:
1741 assert(instr->def.bit_size < 64);
1742
1743 /* bfi is ((...) | (~src0 & src2)). The second part is zero when src2 is
1744 * either 0 or src0. Replacing the 0 with another value can eliminate a
1745 * temporary register.
1746 */
1747 if (is_const_zero(instr->src[2].src))
1748 bld.BFI2(result, op[0], op[1], op[0]);
1749 else
1750 bld.BFI2(result, op[0], op[1], op[2]);
1751
1752 break;
1753
1754 case nir_op_bitfield_insert:
1755 unreachable("not reached: should have been lowered");
1756
1757 /* With regards to implicit masking of the shift counts for 8- and 16-bit
1758 * types, the PRMs are **incorrect**. They falsely state that on Gen9+ only
1759 * the low bits of src1 matching the size of src0 (e.g., 4-bits for W or UW
1760  * src0) are used. The Bspec (backed by data from experimentation) states
1761 * that 0x3f is used for Q and UQ types, and 0x1f is used for **all** other
1762 * types.
1763 *
1764  * To match the behavior expected by the NIR opcodes, explicit masks for
1765 * 8- and 16-bit types must be added.
1766 */
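   /* Worked example of the masking below: for a 16-bit ishl, the AND
    * clamps the shift count to [0, 15], so a count of 17 behaves as a
    * shift by 1, matching NIR's rule that only the low log2(bit_size)
    * bits of the count are used.  Without the mask the hardware would
    * honor the low 5 bits and shift by 17.
    */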
1767 case nir_op_ishl:
1768 if (instr->def.bit_size < 32) {
1769 bld.AND(result, op[1], brw_imm_ud(instr->def.bit_size - 1));
1770 bld.SHL(result, op[0], result);
1771 } else {
1772 bld.SHL(result, op[0], op[1]);
1773 }
1774
1775 break;
1776 case nir_op_ishr:
1777 if (instr->def.bit_size < 32) {
1778 bld.AND(result, op[1], brw_imm_ud(instr->def.bit_size - 1));
1779 bld.ASR(result, op[0], result);
1780 } else {
1781 bld.ASR(result, op[0], op[1]);
1782 }
1783
1784 break;
1785 case nir_op_ushr:
1786 if (instr->def.bit_size < 32) {
1787 bld.AND(result, op[1], brw_imm_ud(instr->def.bit_size - 1));
1788 bld.SHR(result, op[0], result);
1789 } else {
1790 bld.SHR(result, op[0], op[1]);
1791 }
1792
1793 break;
1794
1795 case nir_op_urol:
1796 bld.ROL(result, op[0], op[1]);
1797 break;
1798 case nir_op_uror:
1799 bld.ROR(result, op[0], op[1]);
1800 break;
1801
1802 case nir_op_pack_half_2x16_split:
1803 bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
1804 break;
1805
1806 case nir_op_sdot_4x8_iadd:
1807 case nir_op_sdot_4x8_iadd_sat:
1808 inst = bld.DP4A(retype(result, BRW_REGISTER_TYPE_D),
1809 retype(op[2], BRW_REGISTER_TYPE_D),
1810 retype(op[0], BRW_REGISTER_TYPE_D),
1811 retype(op[1], BRW_REGISTER_TYPE_D));
1812
1813 if (instr->op == nir_op_sdot_4x8_iadd_sat)
1814 inst->saturate = true;
1815 break;
1816
1817 case nir_op_udot_4x8_uadd:
1818 case nir_op_udot_4x8_uadd_sat:
1819 inst = bld.DP4A(retype(result, BRW_REGISTER_TYPE_UD),
1820 retype(op[2], BRW_REGISTER_TYPE_UD),
1821 retype(op[0], BRW_REGISTER_TYPE_UD),
1822 retype(op[1], BRW_REGISTER_TYPE_UD));
1823
1824 if (instr->op == nir_op_udot_4x8_uadd_sat)
1825 inst->saturate = true;
1826 break;
1827
1828 case nir_op_sudot_4x8_iadd:
1829 case nir_op_sudot_4x8_iadd_sat:
1830 inst = bld.DP4A(retype(result, BRW_REGISTER_TYPE_D),
1831 retype(op[2], BRW_REGISTER_TYPE_D),
1832 retype(op[0], BRW_REGISTER_TYPE_D),
1833 retype(op[1], BRW_REGISTER_TYPE_UD));
1834
1835 if (instr->op == nir_op_sudot_4x8_iadd_sat)
1836 inst->saturate = true;
1837 break;
1838
1839 case nir_op_ffma:
1840 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1841 brw_rnd_mode rnd =
1842 brw_rnd_mode_from_execution_mode(execution_mode);
1843 bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1844 brw_imm_d(rnd));
1845 }
1846
1847 inst = bld.MAD(result, op[2], op[1], op[0]);
1848 break;
1849
1850 case nir_op_flrp:
1851 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1852 brw_rnd_mode rnd =
1853 brw_rnd_mode_from_execution_mode(execution_mode);
1854 bld.exec_all().emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1855 brw_imm_d(rnd));
1856 }
1857
1858 inst = bld.LRP(result, op[0], op[1], op[2]);
1859 break;
1860
1861 case nir_op_b32csel:
1862 if (optimize_frontfacing_ternary(ntb, instr, result))
1863 return;
1864
1865 bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
1866 inst = bld.SEL(result, op[1], op[2]);
1867 inst->predicate = BRW_PREDICATE_NORMAL;
1868 break;
1869
1870 case nir_op_extract_u8:
1871 case nir_op_extract_i8: {
1872 unsigned byte = nir_src_as_uint(instr->src[1].src);
1873
1874 /* The PRMs say:
1875 *
1876 * BDW+
1877 * There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
1878 * Use two instructions and a word or DWord intermediate integer type.
1879 */
1880 if (instr->def.bit_size == 64) {
1881 const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
1882
1883 if (instr->op == nir_op_extract_i8) {
1884 /* If we need to sign extend, extract to a word first */
1885 fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W);
1886 bld.MOV(w_temp, subscript(op[0], type, byte));
1887 bld.MOV(result, w_temp);
1888 } else if (byte & 1) {
1889 /* Extract the high byte from the word containing the desired byte
1890 * offset.
1891 */
1892 bld.SHR(result,
1893 subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2),
1894 brw_imm_uw(8));
1895 } else {
1896 /* Otherwise use an AND with 0xff and a word type */
1897 bld.AND(result,
1898 subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2),
1899 brw_imm_uw(0xff));
1900 }
1901 } else {
1902 const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
1903 bld.MOV(result, subscript(op[0], type, byte));
1904 }
1905 break;
1906 }
1907
1908 case nir_op_extract_u16:
1909 case nir_op_extract_i16: {
1910 const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
1911 unsigned word = nir_src_as_uint(instr->src[1].src);
1912 bld.MOV(result, subscript(op[0], type, word));
1913 break;
1914 }
1915
1916 default:
1917 unreachable("unhandled instruction");
1918 }
1919 }
1920
1921 static void
1922 fs_nir_emit_load_const(nir_to_brw_state &ntb,
1923 nir_load_const_instr *instr)
1924 {
1925 const intel_device_info *devinfo = ntb.devinfo;
1926 const fs_builder &bld = ntb.bld;
1927
1928 const brw_reg_type reg_type =
1929 brw_reg_type_from_bit_size(instr->def.bit_size, BRW_REGISTER_TYPE_D);
1930 fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
1931
1932 switch (instr->def.bit_size) {
1933 case 8:
1934 for (unsigned i = 0; i < instr->def.num_components; i++)
1935 bld.MOV(offset(reg, bld, i), setup_imm_b(bld, instr->value[i].i8));
1936 break;
1937
1938 case 16:
1939 for (unsigned i = 0; i < instr->def.num_components; i++)
1940 bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value[i].i16));
1941 break;
1942
1943 case 32:
1944 for (unsigned i = 0; i < instr->def.num_components; i++)
1945 bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value[i].i32));
1946 break;
1947
1948 case 64:
1949 if (!devinfo->has_64bit_int) {
1950 for (unsigned i = 0; i < instr->def.num_components; i++) {
1951 bld.MOV(retype(offset(reg, bld, i), BRW_REGISTER_TYPE_DF),
1952 brw_imm_df(instr->value[i].f64));
1953 }
1954 } else {
1955 for (unsigned i = 0; i < instr->def.num_components; i++)
1956 bld.MOV(offset(reg, bld, i), brw_imm_q(instr->value[i].i64));
1957 }
1958 break;
1959
1960 default:
1961 unreachable("Invalid bit size");
1962 }
1963
1964 ntb.ssa_values[instr->def.index] = reg;
1965 }
1966
1967 static bool
1968 get_nir_src_bindless(nir_to_brw_state &ntb, const nir_src &src)
1969 {
1970 return ntb.ssa_bind_infos[src.ssa->index].bindless;
1971 }
1972
1973 static bool
1974 is_resource_src(nir_src src)
1975 {
1976 return src.ssa->parent_instr->type == nir_instr_type_intrinsic &&
1977 nir_instr_as_intrinsic(src.ssa->parent_instr)->intrinsic == nir_intrinsic_resource_intel;
1978 }
1979
1980 static fs_reg
1981 get_resource_nir_src(nir_to_brw_state &ntb, const nir_src &src)
1982 {
1983 if (!is_resource_src(src))
1984 return fs_reg();
1985 return ntb.resource_values[src.ssa->index];
1986 }
1987
1988 static fs_reg
1989 get_nir_src(nir_to_brw_state &ntb, const nir_src &src)
1990 {
1991 nir_intrinsic_instr *load_reg = nir_load_reg_for_def(src.ssa);
1992
1993 fs_reg reg;
1994 if (!load_reg) {
1995 if (nir_src_is_undef(src)) {
1996 const brw_reg_type reg_type =
1997 brw_reg_type_from_bit_size(src.ssa->bit_size,
1998 BRW_REGISTER_TYPE_D);
1999 reg = ntb.bld.vgrf(reg_type, src.ssa->num_components);
2000 } else {
2001 reg = ntb.ssa_values[src.ssa->index];
2002 }
2003 } else {
2004 nir_intrinsic_instr *decl_reg = nir_reg_get_decl(load_reg->src[0].ssa);
2005 /* We don't handle indirects on locals */
2006 assert(nir_intrinsic_base(load_reg) == 0);
2007 assert(load_reg->intrinsic != nir_intrinsic_load_reg_indirect);
2008 reg = ntb.ssa_values[decl_reg->def.index];
2009 }
2010
2011 /* To avoid floating-point denorm flushing problems, set the type by
2012 * default to an integer type - instructions that need floating point
2013 * semantics will set this to F if they need to
2014 */
2015 reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src),
2016 BRW_REGISTER_TYPE_D);
2017
2018 return reg;
2019 }
2020
2021 /**
2022 * Return an IMM for constants; otherwise call get_nir_src() as normal.
2023 *
2024 * This function should not be called on any value which may be 64 bits.
2025 * We could theoretically support 64-bit on gfx8+ but we choose not to
2026 * because it wouldn't work in general (no gfx7 support) and there are
2027 * enough restrictions in 64-bit immediates that you can't take the return
2028 * value and treat it the same as the result of get_nir_src().
2029 */
2030 static fs_reg
2031 get_nir_src_imm(nir_to_brw_state &ntb, const nir_src &src)
2032 {
2033 assert(nir_src_bit_size(src) == 32);
2034 return nir_src_is_const(src) ?
2035 fs_reg(brw_imm_d(nir_src_as_int(src))) : get_nir_src(ntb, src);
2036 }
2037
2038 static fs_reg
2039 get_nir_def(nir_to_brw_state &ntb, const nir_def &def)
2040 {
2041 const fs_builder &bld = ntb.bld;
2042
2043 nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
2044 if (!store_reg) {
2045 const brw_reg_type reg_type =
2046 brw_reg_type_from_bit_size(def.bit_size,
2047 def.bit_size == 8 ?
2048 BRW_REGISTER_TYPE_D :
2049 BRW_REGISTER_TYPE_F);
2050 ntb.ssa_values[def.index] =
2051 bld.vgrf(reg_type, def.num_components);
2052 bld.UNDEF(ntb.ssa_values[def.index]);
2053 return ntb.ssa_values[def.index];
2054 } else {
2055 nir_intrinsic_instr *decl_reg =
2056 nir_reg_get_decl(store_reg->src[1].ssa);
2057 /* We don't handle indirects on locals */
2058 assert(nir_intrinsic_base(store_reg) == 0);
2059 assert(store_reg->intrinsic != nir_intrinsic_store_reg_indirect);
2060 return ntb.ssa_values[decl_reg->def.index];
2061 }
2062 }
2063
2064 static nir_component_mask_t
2065 get_nir_write_mask(const nir_def &def)
2066 {
2067 nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
2068 if (!store_reg) {
2069 return nir_component_mask(def.num_components);
2070 } else {
2071 return nir_intrinsic_write_mask(store_reg);
2072 }
2073 }
2074
2075 static fs_inst *
2076 emit_pixel_interpolater_send(const fs_builder &bld,
2077 enum opcode opcode,
2078 const fs_reg &dst,
2079 const fs_reg &src,
2080 const fs_reg &desc,
2081 const fs_reg &flag_reg,
2082 glsl_interp_mode interpolation)
2083 {
2084 struct brw_wm_prog_data *wm_prog_data =
2085 brw_wm_prog_data(bld.shader->stage_prog_data);
2086
2087 fs_reg srcs[INTERP_NUM_SRCS];
2088 srcs[INTERP_SRC_OFFSET] = src;
2089 srcs[INTERP_SRC_MSG_DESC] = desc;
2090 srcs[INTERP_SRC_DYNAMIC_MODE] = flag_reg;
2091
2092 fs_inst *inst = bld.emit(opcode, dst, srcs, INTERP_NUM_SRCS);
2093 /* 2 floats per slot returned */
2094 inst->size_written = 2 * dst.component_size(inst->exec_size);
2095 if (interpolation == INTERP_MODE_NOPERSPECTIVE) {
2096 inst->pi_noperspective = true;
2097 /* TGL BSpec says:
2098 * This field cannot be set to "Linear Interpolation"
2099  * unless Non-Perspective Barycentric Enable in 3DSTATE_CLIP is enabled.
2100 */
2101 wm_prog_data->uses_nonperspective_interp_modes = true;
2102 }
2103
2104 wm_prog_data->pulls_bary = true;
2105
2106 return inst;
2107 }
2108
2109 /**
2110 * Computes 1 << x, given a D/UD register containing some value x.
2111 */
2112 static fs_reg
2113 intexp2(const fs_builder &bld, const fs_reg &x)
2114 {
2115 assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);
2116
2117 fs_reg result = bld.vgrf(x.type, 1);
2118 fs_reg one = bld.vgrf(x.type, 1);
2119
2120 bld.MOV(one, retype(brw_imm_d(1), one.type));
2121 bld.SHL(result, one, x);
2122 return result;
2123 }
2124
2125 static void
2126 emit_gs_end_primitive(nir_to_brw_state &ntb, const nir_src &vertex_count_nir_src)
2127 {
2128 fs_visitor &s = ntb.s;
2129 assert(s.stage == MESA_SHADER_GEOMETRY);
2130
2131 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data);
2132
2133 if (s.gs_compile->control_data_header_size_bits == 0)
2134 return;
2135
2136 /* We can only do EndPrimitive() functionality when the control data
2137 * consists of cut bits. Fortunately, the only time it isn't is when the
2138 * output type is points, in which case EndPrimitive() is a no-op.
2139 */
2140 if (gs_prog_data->control_data_format !=
2141 GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
2142 return;
2143 }
2144
2145 /* Cut bits use one bit per vertex. */
2146 assert(s.gs_compile->control_data_bits_per_vertex == 1);
2147
2148 fs_reg vertex_count = get_nir_src(ntb, vertex_count_nir_src);
2149 vertex_count.type = BRW_REGISTER_TYPE_UD;
2150
2151 /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
2152 * vertex n, 0 otherwise. So all we need to do here is mark bit
2153 * (vertex_count - 1) % 32 in the cut_bits register to indicate that
2154 * EndPrimitive() was called after emitting vertex (vertex_count - 1);
2155 * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
2156 *
2157 * Note that if EndPrimitive() is called before emitting any vertices, this
2158 * will cause us to set bit 31 of the control_data_bits register to 1.
2159 * That's fine because:
2160 *
2161 * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
2162 * output, so the hardware will ignore cut bit 31.
2163 *
2164 * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
2165 * last vertex, so setting cut bit 31 has no effect (since the primitive
2166 * is automatically ended when the GS terminates).
2167 *
2168 * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
2169 * control_data_bits register to 0 when the first vertex is emitted.
2170 */
2171
2172 const fs_builder abld = ntb.bld.annotate("end primitive");
2173
2174 /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
2175 fs_reg prev_count = ntb.bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2176 abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
2177 fs_reg mask = intexp2(abld, prev_count);
2178 /* Note: we're relying on the fact that the GEN SHL instruction only pays
2179 * attention to the lower 5 bits of its second source argument, so on this
2180 * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
2181 * ((vertex_count - 1) % 32).
2182 */
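   /* E.g. with vertex_count == 33, prev_count == 32, so the SHL inside
    * intexp2() effectively computes 1 << (32 % 32) == 1 << 0, which is
    * exactly bit (vertex_count - 1) % 32 of the current batch of cut bits.
    */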
2183 abld.OR(s.control_data_bits, s.control_data_bits, mask);
2184 }
2185
2186 void
2187 fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
2188 {
2189 assert(stage == MESA_SHADER_GEOMETRY);
2190 assert(gs_compile->control_data_bits_per_vertex != 0);
2191
2192 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2193
2194 const fs_builder bld = fs_builder(this).at_end();
2195 const fs_builder abld = bld.annotate("emit control data bits");
2196 const fs_builder fwa_bld = bld.exec_all();
2197
2198 /* We use a single UD register to accumulate control data bits (32 bits
2199 * for each of the SIMD8 channels). So we need to write a DWord (32 bits)
2200 * at a time.
2201 *
2202 * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
2203  * We have to select a 128-bit group via the Global and Per-Slot Offsets, then
2204 * use the Channel Mask phase to enable/disable which DWord within that
2205 * group to write. (Remember, different SIMD8 channels may have emitted
2206 * different numbers of vertices, so we may need per-slot offsets.)
2207 *
2208 * Channel masking presents an annoying problem: we may have to replicate
2209 * the data up to 4 times:
2210 *
2211 * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
2212 *
2213 * To avoid penalizing shaders that emit a small number of vertices, we
2214 * can avoid these sometimes: if the size of the control data header is
2215 * <= 128 bits, then there is only 1 OWord. All SIMD8 channels will land
2216  * in the same 128-bit group, so we can skip per-slot offsets.
2217 *
2218 * Similarly, if the control data header is <= 32 bits, there is only one
2219 * DWord, so we can skip channel masks.
2220 */
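   /* For example, a GS emitting at most 16 vertices with the 1-bit cut
    * format has a 16-bit control data header, so both the per-slot
    * offsets and the channel masks below can be skipped.
    */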
2221 fs_reg channel_mask, per_slot_offset;
2222
2223 if (gs_compile->control_data_header_size_bits > 32)
2224 channel_mask = vgrf(glsl_uint_type());
2225
2226 if (gs_compile->control_data_header_size_bits > 128)
2227 per_slot_offset = vgrf(glsl_uint_type());
2228
2229 /* Figure out which DWord we're trying to write to using the formula:
2230 *
2231 * dword_index = (vertex_count - 1) * bits_per_vertex / 32
2232 *
2233 * Since bits_per_vertex is a power of two, and is known at compile
2234 * time, this can be optimized to:
2235 *
2236 * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
2237 */
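   /* E.g. for the cut-bit format (bits_per_vertex == 1) this is
    * (vertex_count - 1) >> 5, and for the stream-ID format
    * (bits_per_vertex == 2) it is (vertex_count - 1) >> 4, i.e. a new
    * DWord is started every 32 or 16 vertices respectively.
    */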
2238 if (channel_mask.file != BAD_FILE || per_slot_offset.file != BAD_FILE) {
2239 fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2240 fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2241 abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
2242 unsigned log2_bits_per_vertex =
2243 util_last_bit(gs_compile->control_data_bits_per_vertex);
2244 abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
2245
2246 if (per_slot_offset.file != BAD_FILE) {
2247 /* Set the per-slot offset to dword_index / 4, so that we'll write to
2248 * the appropriate OWord within the control data header.
2249 */
2250 abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u));
2251 }
2252
2253 /* Set the channel masks to 1 << (dword_index % 4), so that we'll
2254 * write to the appropriate DWORD within the OWORD.
2255 */
2256 fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2257 fwa_bld.AND(channel, dword_index, brw_imm_ud(3u));
2258 channel_mask = intexp2(fwa_bld, channel);
2259 /* Then the channel masks need to be in bits 23:16. */
2260 fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u));
2261 }
2262
2263 /* If there are channel masks, add 3 extra copies of the data. */
2264 const unsigned length = 1 + 3 * unsigned(channel_mask.file != BAD_FILE);
2265 fs_reg sources[4];
2266
2267 for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
2268 sources[i] = this->control_data_bits;
2269
2270 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2271 srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
2272 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offset;
2273 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = channel_mask;
2274 srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_F, length);
2275 srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length);
2276 abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
2277
2278 fs_inst *inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
2279 srcs, ARRAY_SIZE(srcs));
2280
2281 /* We need to increment Global Offset by 256-bits to make room for
2282 * Broadwell's extra "Vertex Count" payload at the beginning of the
2283 * URB entry. Since this is an OWord message, Global Offset is counted
2284 * in 128-bit units, so we must set it to 2.
2285 */
2286 if (gs_prog_data->static_vertex_count == -1)
2287 inst->offset = 2;
2288 }
2289
2290 static void
2291 set_gs_stream_control_data_bits(nir_to_brw_state &ntb, const fs_reg &vertex_count,
2292 unsigned stream_id)
2293 {
2294 fs_visitor &s = ntb.s;
2295
2296 /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
2297
2298 /* Note: we are calling this *before* increasing vertex_count, so
2299 * this->vertex_count == vertex_count - 1 in the formula above.
2300 */
2301
2302 /* Stream mode uses 2 bits per vertex */
2303 assert(s.gs_compile->control_data_bits_per_vertex == 2);
2304
2305 /* Must be a valid stream */
2306 assert(stream_id < 4); /* MAX_VERTEX_STREAMS */
2307
2308 /* Control data bits are initialized to 0 so we don't have to set any
2309 * bits when sending vertices to stream 0.
2310 */
2311 if (stream_id == 0)
2312 return;
2313
2314 const fs_builder abld = ntb.bld.annotate("set stream control data bits", NULL);
2315
2316 /* reg::sid = stream_id */
2317 fs_reg sid = ntb.bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2318 abld.MOV(sid, brw_imm_ud(stream_id));
2319
2320 /* reg:shift_count = 2 * (vertex_count - 1) */
2321 fs_reg shift_count = ntb.bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2322 abld.SHL(shift_count, vertex_count, brw_imm_ud(1u));
2323
2324 /* Note: we're relying on the fact that the GEN SHL instruction only pays
2325 * attention to the lower 5 bits of its second source argument, so on this
2326 * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
2327 * stream_id << ((2 * (vertex_count - 1)) % 32).
2328 */
2329 fs_reg mask = ntb.bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2330 abld.SHL(mask, sid, shift_count);
2331 abld.OR(s.control_data_bits, s.control_data_bits, mask);
2332 }
2333
2334 static void
2335 emit_gs_vertex(nir_to_brw_state &ntb, const nir_src &vertex_count_nir_src,
2336 unsigned stream_id)
2337 {
2338 fs_visitor &s = ntb.s;
2339
2340 assert(s.stage == MESA_SHADER_GEOMETRY);
2341
2342 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data);
2343
2344 fs_reg vertex_count = get_nir_src(ntb, vertex_count_nir_src);
2345 vertex_count.type = BRW_REGISTER_TYPE_UD;
2346
2347 /* Haswell and later hardware ignores the "Render Stream Select" bits
2348 * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
2349 * and instead sends all primitives down the pipeline for rasterization.
2350 * If the SOL stage is enabled, "Render Stream Select" is honored and
2351 * primitives bound to non-zero streams are discarded after stream output.
2352 *
2353  * Since the only purpose of primitives sent to non-zero streams is to
2354 * be recorded by transform feedback, we can simply discard all geometry
2355 * bound to these streams when transform feedback is disabled.
2356 */
2357 if (stream_id > 0 && !s.nir->info.has_transform_feedback_varyings)
2358 return;
2359
2360 /* If we're outputting 32 control data bits or less, then we can wait
2361 * until the shader is over to output them all. Otherwise we need to
2362 * output them as we go. Now is the time to do it, since we're about to
2363 * output the vertex_count'th vertex, so it's guaranteed that the
2364 * control data bits associated with the (vertex_count - 1)th vertex are
2365 * correct.
2366 */
2367 if (s.gs_compile->control_data_header_size_bits > 32) {
2368 const fs_builder abld =
2369 ntb.bld.annotate("emit vertex: emit control data bits");
2370
2371 /* Only emit control data bits if we've finished accumulating a batch
2372 * of 32 bits. This is the case when:
2373 *
2374 * (vertex_count * bits_per_vertex) % 32 == 0
2375 *
2376 * (in other words, when the last 5 bits of vertex_count *
2377 * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some
2378 * integer n (which is always the case, since bits_per_vertex is
2379 * always 1 or 2), this is equivalent to requiring that the last 5-n
2380 * bits of vertex_count are 0:
2381 *
2382 * vertex_count & (2^(5-n) - 1) == 0
2383 *
2384 * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
2385 * equivalent to:
2386 *
2387 * vertex_count & (32 / bits_per_vertex - 1) == 0
2388 *
2389 * TODO: If vertex_count is an immediate, we could do some of this math
2390 * at compile time...
2391 */
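      /* E.g. with bits_per_vertex == 2 the AND below tests
       * vertex_count & 15, so the accumulated control data bits are
       * flushed once every 16 emitted vertices.
       */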
2392 fs_inst *inst =
2393 abld.AND(ntb.bld.null_reg_d(), vertex_count,
2394 brw_imm_ud(32u / s.gs_compile->control_data_bits_per_vertex - 1u));
2395 inst->conditional_mod = BRW_CONDITIONAL_Z;
2396
2397 abld.IF(BRW_PREDICATE_NORMAL);
2398 /* If vertex_count is 0, then no control data bits have been
2399 * accumulated yet, so we can skip emitting them.
2400 */
2401 abld.CMP(ntb.bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
2402 BRW_CONDITIONAL_NEQ);
2403 abld.IF(BRW_PREDICATE_NORMAL);
2404 s.emit_gs_control_data_bits(vertex_count);
2405 abld.emit(BRW_OPCODE_ENDIF);
2406
2407 /* Reset control_data_bits to 0 so we can start accumulating a new
2408 * batch.
2409 *
2410 * Note: in the case where vertex_count == 0, this neutralizes the
2411 * effect of any call to EndPrimitive() that the shader may have
2412 * made before outputting its first vertex.
2413 */
2414 inst = abld.MOV(s.control_data_bits, brw_imm_ud(0u));
2415 inst->force_writemask_all = true;
2416 abld.emit(BRW_OPCODE_ENDIF);
2417 }
2418
2419 s.emit_urb_writes(vertex_count);
2420
2421 /* In stream mode we have to set control data bits for all vertices
2422 * unless we have disabled control data bits completely (which we do
2423  * for MESA_PRIM_POINTS outputs that don't use streams).
2424 */
2425 if (s.gs_compile->control_data_header_size_bits > 0 &&
2426 gs_prog_data->control_data_format ==
2427 GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
2428 set_gs_stream_control_data_bits(ntb, vertex_count, stream_id);
2429 }
2430 }
2431
2432 static void
2433 emit_gs_input_load(nir_to_brw_state &ntb, const fs_reg &dst,
2434 const nir_src &vertex_src,
2435 unsigned base_offset,
2436 const nir_src &offset_src,
2437 unsigned num_components,
2438 unsigned first_component)
2439 {
2440 const fs_builder &bld = ntb.bld;
2441 fs_visitor &s = ntb.s;
2442
2443 assert(type_sz(dst.type) == 4);
2444 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(s.prog_data);
2445 const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
2446
2447 /* TODO: figure out push input layout for invocations == 1 */
2448 if (gs_prog_data->invocations == 1 &&
2449 nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) &&
2450 4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) {
2451 int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 +
2452 nir_src_as_uint(vertex_src) * push_reg_count;
2453 const fs_reg attr = fs_reg(ATTR, 0, dst.type);
2454 for (unsigned i = 0; i < num_components; i++) {
2455 ntb.bld.MOV(offset(dst, bld, i),
2456 offset(attr, bld, imm_offset + i + first_component));
2457 }
2458 return;
2459 }
2460
2461 /* Resort to the pull model. Ensure the VUE handles are provided. */
2462 assert(gs_prog_data->base.include_vue_handles);
2463
2464 fs_reg start = s.gs_payload().icp_handle_start;
2465 fs_reg icp_handle = ntb.bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2466
2467 if (gs_prog_data->invocations == 1) {
2468 if (nir_src_is_const(vertex_src)) {
2469 /* The vertex index is constant; just select the proper URB handle. */
2470 icp_handle = offset(start, ntb.bld, nir_src_as_uint(vertex_src));
2471 } else {
2472 /* The vertex index is non-constant. We need to use indirect
2473 * addressing to fetch the proper URB handle.
2474 *
2475 * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
2476 * indicating that channel <n> should read the handle from
2477 * DWord <n>. We convert that to bytes by multiplying by 4.
2478 *
2479 * Next, we convert the vertex index to bytes by multiplying
2480 * by 32 (shifting by 5), and add the two together. This is
2481 * the final indirect byte offset.
2482 */
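         /* E.g. SIMD channel 3 fetching the handle of vertex 2 ends up
          * with an indirect byte offset of 2 * 32 + 3 * 4 = 76.
          */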
2483 fs_reg sequence =
2484 ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
2485 fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2486 fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2487 fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2488
2489 /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
2490 bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
2491 /* Convert vertex_index to bytes (multiply by 32) */
2492 bld.SHL(vertex_offset_bytes,
2493 retype(get_nir_src(ntb, vertex_src), BRW_REGISTER_TYPE_UD),
2494 brw_imm_ud(5u));
2495 bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2496
2497 /* Use first_icp_handle as the base offset. There is one register
2498 * of URB handles per vertex, so inform the register allocator that
2499 * we might read up to nir->info.gs.vertices_in registers.
2500 */
2501 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
2502 fs_reg(icp_offset_bytes),
2503 brw_imm_ud(s.nir->info.gs.vertices_in * REG_SIZE));
2504 }
2505 } else {
2506 assert(gs_prog_data->invocations > 1);
2507
2508 if (nir_src_is_const(vertex_src)) {
2509 unsigned vertex = nir_src_as_uint(vertex_src);
2510 bld.MOV(icp_handle, component(start, vertex));
2511 } else {
2512 /* The vertex index is non-constant. We need to use indirect
2513 * addressing to fetch the proper URB handle.
2514 *
2515 */
2516 fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2517
2518 /* Convert vertex_index to bytes (multiply by 4) */
2519 bld.SHL(icp_offset_bytes,
2520 retype(get_nir_src(ntb, vertex_src), BRW_REGISTER_TYPE_UD),
2521 brw_imm_ud(2u));
2522
2523 /* Use first_icp_handle as the base offset. There is one DWord
2524 * of URB handles per vertex, so inform the register allocator that
2525 * we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
2526 */
2527 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
2528 fs_reg(icp_offset_bytes),
2529 brw_imm_ud(DIV_ROUND_UP(s.nir->info.gs.vertices_in, 8) *
2530 REG_SIZE));
2531 }
2532 }
2533
2534 fs_inst *inst;
2535 fs_reg indirect_offset = get_nir_src(ntb, offset_src);
2536
2537 if (nir_src_is_const(offset_src)) {
2538 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2539 srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2540
2541 /* Constant indexing - use global offset. */
2542 if (first_component != 0) {
2543 unsigned read_components = num_components + first_component;
2544 fs_reg tmp = bld.vgrf(dst.type, read_components);
2545 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs,
2546 ARRAY_SIZE(srcs));
2547 inst->size_written = read_components *
2548 tmp.component_size(inst->exec_size);
2549 for (unsigned i = 0; i < num_components; i++) {
2550 bld.MOV(offset(dst, bld, i),
2551 offset(tmp, bld, i + first_component));
2552 }
2553 } else {
2554 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs,
2555 ARRAY_SIZE(srcs));
2556 inst->size_written = num_components *
2557 dst.component_size(inst->exec_size);
2558 }
2559 inst->offset = base_offset + nir_src_as_uint(offset_src);
2560 } else {
2561 /* Indirect indexing - use per-slot offsets as well. */
2562 unsigned read_components = num_components + first_component;
2563 fs_reg tmp = bld.vgrf(dst.type, read_components);
2564
2565 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2566 srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2567 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
2568
2569 if (first_component != 0) {
2570 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2571 srcs, ARRAY_SIZE(srcs));
2572 inst->size_written = read_components *
2573 tmp.component_size(inst->exec_size);
2574 for (unsigned i = 0; i < num_components; i++) {
2575 bld.MOV(offset(dst, bld, i),
2576 offset(tmp, bld, i + first_component));
2577 }
2578 } else {
2579 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst,
2580 srcs, ARRAY_SIZE(srcs));
2581 inst->size_written = num_components *
2582 dst.component_size(inst->exec_size);
2583 }
2584 inst->offset = base_offset;
2585 }
2586 }
2587
2588 static fs_reg
2589 get_indirect_offset(nir_to_brw_state &ntb, nir_intrinsic_instr *instr)
2590 {
2591 nir_src *offset_src = nir_get_io_offset_src(instr);
2592
2593 if (nir_src_is_const(*offset_src)) {
2594 /* The only constant offset we should find is 0. brw_nir.c's
2595 * add_const_offset_to_base() will fold other constant offsets
2596 * into the "base" index.
2597 */
2598 assert(nir_src_as_uint(*offset_src) == 0);
2599 return fs_reg();
2600 }
2601
2602 return get_nir_src(ntb, *offset_src);
2603 }
2604
2605 static void
2606 fs_nir_emit_vs_intrinsic(nir_to_brw_state &ntb,
2607 nir_intrinsic_instr *instr)
2608 {
2609 const fs_builder &bld = ntb.bld;
2610 fs_visitor &s = ntb.s;
2611 assert(s.stage == MESA_SHADER_VERTEX);
2612
2613 fs_reg dest;
2614 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2615 dest = get_nir_def(ntb, instr->def);
2616
2617 switch (instr->intrinsic) {
2618 case nir_intrinsic_load_vertex_id:
2619 case nir_intrinsic_load_base_vertex:
2620 unreachable("should be lowered by nir_lower_system_values()");
2621
2622 case nir_intrinsic_load_input: {
2623 assert(instr->def.bit_size == 32);
2624 const fs_reg src = offset(fs_reg(ATTR, 0, dest.type), bld,
2625 nir_intrinsic_base(instr) * 4 +
2626 nir_intrinsic_component(instr) +
2627 nir_src_as_uint(instr->src[0]));
2628
2629 for (unsigned i = 0; i < instr->num_components; i++)
2630 bld.MOV(offset(dest, bld, i), offset(src, bld, i));
2631 break;
2632 }
2633
2634 case nir_intrinsic_load_vertex_id_zero_base:
2635 case nir_intrinsic_load_instance_id:
2636 case nir_intrinsic_load_base_instance:
2637 case nir_intrinsic_load_draw_id:
2638 case nir_intrinsic_load_first_vertex:
2639 case nir_intrinsic_load_is_indexed_draw:
2640 unreachable("lowered by brw_nir_lower_vs_inputs");
2641
2642 default:
2643 fs_nir_emit_intrinsic(ntb, bld, instr);
2644 break;
2645 }
2646 }
2647
2648 static fs_reg
2649 get_tcs_single_patch_icp_handle(nir_to_brw_state &ntb, const fs_builder &bld,
2650 nir_intrinsic_instr *instr)
2651 {
2652 fs_visitor &s = ntb.s;
2653
2654 struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(s.prog_data);
2655 const nir_src &vertex_src = instr->src[0];
2656 nir_intrinsic_instr *vertex_intrin = nir_src_as_intrinsic(vertex_src);
2657
2658 const fs_reg start = s.tcs_payload().icp_handle_start;
2659
2660 fs_reg icp_handle;
2661
2662 if (nir_src_is_const(vertex_src)) {
2663 /* Emit a MOV to resolve <0,1,0> regioning. */
2664 icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2665 unsigned vertex = nir_src_as_uint(vertex_src);
2666 bld.MOV(icp_handle, component(start, vertex));
2667 } else if (tcs_prog_data->instances == 1 && vertex_intrin &&
2668 vertex_intrin->intrinsic == nir_intrinsic_load_invocation_id) {
2669 /* For the common case of only 1 instance, an array index of
2670 * gl_InvocationID means reading the handles from the start. Skip all
2671 * the indirect work.
2672 */
2673 icp_handle = start;
2674 } else {
2675 /* The vertex index is non-constant. We need to use indirect
2676 * addressing to fetch the proper URB handle.
2677 */
2678 icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2679
2680 /* Each ICP handle is a single DWord (4 bytes) */
2681 fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2682 bld.SHL(vertex_offset_bytes,
2683 retype(get_nir_src(ntb, vertex_src), BRW_REGISTER_TYPE_UD),
2684 brw_imm_ud(2u));
2685
2686 /* We might read up to 4 registers. */
2687 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2688 start, vertex_offset_bytes,
2689 brw_imm_ud(4 * REG_SIZE));
2690 }
2691
2692 return icp_handle;
2693 }
2694
2695 static fs_reg
2696 get_tcs_multi_patch_icp_handle(nir_to_brw_state &ntb, const fs_builder &bld,
2697 nir_intrinsic_instr *instr)
2698 {
2699 fs_visitor &s = ntb.s;
2700 const intel_device_info *devinfo = s.devinfo;
2701
2702 struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) s.key;
2703 const nir_src &vertex_src = instr->src[0];
2704 const unsigned grf_size_bytes = REG_SIZE * reg_unit(devinfo);
2705
2706 const fs_reg start = s.tcs_payload().icp_handle_start;
2707
2708 if (nir_src_is_const(vertex_src))
2709 return byte_offset(start, nir_src_as_uint(vertex_src) * grf_size_bytes);
2710
2711 /* The vertex index is non-constant. We need to use indirect
2712 * addressing to fetch the proper URB handle.
2713 *
2714 * First, we start with the sequence indicating that channel <n>
2715 * should read the handle from DWord <n>. We convert that to bytes
2716 * by multiplying by 4.
2717 *
2718 * Next, we convert the vertex index to bytes by multiplying
2719 * by the GRF size (by shifting), and add the two together. This is
2720 * the final indirect byte offset.
2721 */
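   /* E.g. with 32-byte GRFs, SIMD channel 3 reading the handle of vertex 2
    * produces an indirect byte offset of 2 * 32 + 3 * 4 = 76.
    */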
2722 fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2723 fs_reg sequence = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
2724 fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2725 fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2726 fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2727
2728 /* Offsets will be 0, 4, 8, ... */
2729 bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
2730  /* Convert vertex_index to bytes (multiply by the GRF size in bytes) */
2731 assert(util_is_power_of_two_nonzero(grf_size_bytes)); /* for ffs() */
2732 bld.SHL(vertex_offset_bytes,
2733 retype(get_nir_src(ntb, vertex_src), BRW_REGISTER_TYPE_UD),
2734 brw_imm_ud(ffs(grf_size_bytes) - 1));
2735 bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2736
2737 /* Use start of ICP handles as the base offset. There is one register
2738 * of URB handles per vertex, so inform the register allocator that
2739  * we might read up to brw_tcs_prog_key_input_vertices(tcs_key) registers.
2740 */
2741 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
2742 icp_offset_bytes,
2743 brw_imm_ud(brw_tcs_prog_key_input_vertices(tcs_key) *
2744 grf_size_bytes));
2745
2746 return icp_handle;
2747 }
2748
2749 static void
2750 setup_barrier_message_payload_gfx125(const fs_builder &bld,
2751 const fs_reg &msg_payload)
2752 {
2753 assert(bld.shader->devinfo->verx10 >= 125);
2754
2755 /* From BSpec: 54006, mov r0.2[31:24] into m0.2[31:24] and m0.2[23:16] */
2756 fs_reg m0_10ub = component(retype(msg_payload, BRW_REGISTER_TYPE_UB), 10);
2757 fs_reg r0_11ub =
2758 stride(suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UB), 11),
2759 0, 1, 0);
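   /* The <0;1,0> source region replicates r0 byte 11 (r0.2[31:24]) across
    * both lanes of the 2-wide MOV below, so payload bytes 10 and 11
    * (m0.2[23:16] and m0.2[31:24]) receive the same barrier ID value.
    */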
2760 bld.exec_all().group(2, 0).MOV(m0_10ub, r0_11ub);
2761 }
2762
2763 static void
2764 emit_barrier(nir_to_brw_state &ntb)
2765 {
2766 const intel_device_info *devinfo = ntb.devinfo;
2767 const fs_builder &bld = ntb.bld;
2768 fs_visitor &s = ntb.s;
2769
2770 /* We are getting the barrier ID from the compute shader header */
2771 assert(gl_shader_stage_uses_workgroup(s.stage));
2772
2773 fs_reg payload = fs_reg(VGRF, s.alloc.allocate(1), BRW_REGISTER_TYPE_UD);
2774
2775 /* Clear the message payload */
2776 bld.exec_all().group(8, 0).MOV(payload, brw_imm_ud(0u));
2777
2778 if (devinfo->verx10 >= 125) {
2779 setup_barrier_message_payload_gfx125(bld, payload);
2780 } else {
2781 assert(gl_shader_stage_is_compute(s.stage));
2782
2783 uint32_t barrier_id_mask;
2784 switch (devinfo->ver) {
2785 case 7:
2786 case 8:
2787 barrier_id_mask = 0x0f000000u; break;
2788 case 9:
2789 barrier_id_mask = 0x8f000000u; break;
2790 case 11:
2791 case 12:
2792 barrier_id_mask = 0x7f000000u; break;
2793 default:
2794 unreachable("barrier is only available on gen >= 7");
2795 }
2796
2797 /* Copy the barrier id from r0.2 to the message payload reg.2 */
2798 fs_reg r0_2 = fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD));
2799 bld.exec_all().group(1, 0).AND(component(payload, 2), r0_2,
2800 brw_imm_ud(barrier_id_mask));
2801 }
2802
2803 /* Emit a gateway "barrier" message using the payload we set up, followed
2804 * by a wait instruction.
2805 */
2806 bld.exec_all().emit(SHADER_OPCODE_BARRIER, reg_undef, payload);
2807 }
2808
2809 static void
2810 emit_tcs_barrier(nir_to_brw_state &ntb)
2811 {
2812 const intel_device_info *devinfo = ntb.devinfo;
2813 const fs_builder &bld = ntb.bld;
2814 fs_visitor &s = ntb.s;
2815
2816 assert(s.stage == MESA_SHADER_TESS_CTRL);
2817 struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(s.prog_data);
2818
2819 fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2820 fs_reg m0_2 = component(m0, 2);
2821
2822 const fs_builder chanbld = bld.exec_all().group(1, 0);
2823
2824 /* Zero the message header */
2825 bld.exec_all().MOV(m0, brw_imm_ud(0u));
2826
2827 if (devinfo->verx10 >= 125) {
2828 setup_barrier_message_payload_gfx125(bld, m0);
2829 } else if (devinfo->ver >= 11) {
2830 chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
2831 brw_imm_ud(INTEL_MASK(30, 24)));
2832
2833 /* Set the Barrier Count and the enable bit */
2834 chanbld.OR(m0_2, m0_2,
2835 brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15)));
2836 } else {
2837 /* Copy "Barrier ID" from r0.2, bits 16:13 */
2838 chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
2839 brw_imm_ud(INTEL_MASK(16, 13)));
2840
2841 /* Shift it up to bits 27:24. */
2842 chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));
2843
2844 /* Set the Barrier Count and the enable bit */
2845 chanbld.OR(m0_2, m0_2,
2846 brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
2847 }
2848
2849 bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
2850 }
2851
2852 static void
2853 fs_nir_emit_tcs_intrinsic(nir_to_brw_state &ntb,
2854 nir_intrinsic_instr *instr)
2855 {
2856 const intel_device_info *devinfo = ntb.devinfo;
2857 const fs_builder &bld = ntb.bld;
2858 fs_visitor &s = ntb.s;
2859
2860 assert(s.stage == MESA_SHADER_TESS_CTRL);
2861 struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(s.prog_data);
2862 struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
2863
2864 fs_reg dst;
2865 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2866 dst = get_nir_def(ntb, instr->def);
2867
2868 switch (instr->intrinsic) {
2869 case nir_intrinsic_load_primitive_id:
2870 bld.MOV(dst, s.tcs_payload().primitive_id);
2871 break;
2872 case nir_intrinsic_load_invocation_id:
2873 bld.MOV(retype(dst, s.invocation_id.type), s.invocation_id);
2874 break;
2875
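   /* Note on the barrier handling below: a TCS compiled with a single
    * instance processes all invocations of the patch in one SIMD thread
    * running in lockstep, so the workgroup execution barrier is a no-op
    * and is skipped in that case.
    */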
2876 case nir_intrinsic_barrier:
2877 if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
2878 fs_nir_emit_intrinsic(ntb, bld, instr);
2879 if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
2880 if (tcs_prog_data->instances != 1)
2881 emit_tcs_barrier(ntb);
2882 }
2883 break;
2884
2885 case nir_intrinsic_load_input:
2886 unreachable("nir_lower_io should never give us these.");
2887 break;
2888
2889 case nir_intrinsic_load_per_vertex_input: {
2890 assert(instr->def.bit_size == 32);
2891 fs_reg indirect_offset = get_indirect_offset(ntb, instr);
2892 unsigned imm_offset = nir_intrinsic_base(instr);
2893 fs_inst *inst;
2894
2895 const bool multi_patch =
2896 vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH;
2897
2898 fs_reg icp_handle = multi_patch ?
2899 get_tcs_multi_patch_icp_handle(ntb, bld, instr) :
2900 get_tcs_single_patch_icp_handle(ntb, bld, instr);
2901
2902 /* We can only read two double components with each URB read, so
2903 * we send two read messages in that case, each one loading up to
2904 * two double components.
2905 */
2906 unsigned num_components = instr->num_components;
2907 unsigned first_component = nir_intrinsic_component(instr);
2908
2909 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2910 srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2911
2912 if (indirect_offset.file == BAD_FILE) {
2913 /* Constant indexing - use global offset. */
2914 if (first_component != 0) {
2915 unsigned read_components = num_components + first_component;
2916 fs_reg tmp = bld.vgrf(dst.type, read_components);
2917 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs,
2918 ARRAY_SIZE(srcs));
2919 for (unsigned i = 0; i < num_components; i++) {
2920 bld.MOV(offset(dst, bld, i),
2921 offset(tmp, bld, i + first_component));
2922 }
2923 } else {
2924 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs,
2925 ARRAY_SIZE(srcs));
2926 }
2927 inst->offset = imm_offset;
2928 } else {
2929 /* Indirect indexing - use per-slot offsets as well. */
2930 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
2931
2932 if (first_component != 0) {
2933 unsigned read_components = num_components + first_component;
2934 fs_reg tmp = bld.vgrf(dst.type, read_components);
2935 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2936 srcs, ARRAY_SIZE(srcs));
2937 for (unsigned i = 0; i < num_components; i++) {
2938 bld.MOV(offset(dst, bld, i),
2939 offset(tmp, bld, i + first_component));
2940 }
2941 } else {
2942 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst,
2943 srcs, ARRAY_SIZE(srcs));
2944 }
2945 inst->offset = imm_offset;
2946 }
2947 inst->size_written = (num_components + first_component) *
2948 inst->dst.component_size(inst->exec_size);
2949
2950 /* Copy the temporary to the destination to deal with writemasking.
2951 *
2952 * Also attempt to deal with gl_PointSize being in the .w component.
2953 */
2954 if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
2955 assert(type_sz(dst.type) == 4);
2956 inst->dst = bld.vgrf(dst.type, 4);
2957 inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
2958 bld.MOV(dst, offset(inst->dst, bld, 3));
2959 }
2960 break;
2961 }
2962
2963 case nir_intrinsic_load_output:
2964 case nir_intrinsic_load_per_vertex_output: {
2965 assert(instr->def.bit_size == 32);
2966 fs_reg indirect_offset = get_indirect_offset(ntb, instr);
2967 unsigned imm_offset = nir_intrinsic_base(instr);
2968 unsigned first_component = nir_intrinsic_component(instr);
2969
2970 fs_inst *inst;
2971 if (indirect_offset.file == BAD_FILE) {
2972 /* This MOV replicates the output handle to all enabled channels
2973  * in SINGLE_PATCH mode.
2974 */
2975 fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2976 bld.MOV(patch_handle, s.tcs_payload().patch_urb_output);
2977
2978 {
2979 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2980 srcs[URB_LOGICAL_SRC_HANDLE] = patch_handle;
2981
2982 if (first_component != 0) {
2983 unsigned read_components =
2984 instr->num_components + first_component;
2985 fs_reg tmp = bld.vgrf(dst.type, read_components);
2986 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2987 srcs, ARRAY_SIZE(srcs));
2988 inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
2989 for (unsigned i = 0; i < instr->num_components; i++) {
2990 bld.MOV(offset(dst, bld, i),
2991 offset(tmp, bld, i + first_component));
2992 }
2993 } else {
2994 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst,
2995 srcs, ARRAY_SIZE(srcs));
2996 inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
2997 }
2998 inst->offset = imm_offset;
2999 }
3000 } else {
3001 /* Indirect indexing - use per-slot offsets as well. */
3002 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
3003 srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output;
3004 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
3005
3006 if (first_component != 0) {
3007 unsigned read_components =
3008 instr->num_components + first_component;
3009 fs_reg tmp = bld.vgrf(dst.type, read_components);
3010 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3011 srcs, ARRAY_SIZE(srcs));
3012 inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
3013 for (unsigned i = 0; i < instr->num_components; i++) {
3014 bld.MOV(offset(dst, bld, i),
3015 offset(tmp, bld, i + first_component));
3016 }
3017 } else {
3018 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dst,
3019 srcs, ARRAY_SIZE(srcs));
3020 inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
3021 }
3022 inst->offset = imm_offset;
3023 }
3024 break;
3025 }
3026
3027 case nir_intrinsic_store_output:
3028 case nir_intrinsic_store_per_vertex_output: {
3029 assert(nir_src_bit_size(instr->src[0]) == 32);
3030 fs_reg value = get_nir_src(ntb, instr->src[0]);
3031 fs_reg indirect_offset = get_indirect_offset(ntb, instr);
3032 unsigned imm_offset = nir_intrinsic_base(instr);
3033 unsigned mask = nir_intrinsic_write_mask(instr);
3034
3035 if (mask == 0)
3036 break;
3037
3038 unsigned num_components = util_last_bit(mask);
3039 unsigned first_component = nir_intrinsic_component(instr);
3040 assert((first_component + num_components) <= 4);
3041
3042 mask = mask << first_component;
3043
3044 const bool has_urb_lsc = devinfo->ver >= 20;
3045
3046 fs_reg mask_reg;
3047 if (mask != WRITEMASK_XYZW)
3048 mask_reg = brw_imm_ud(mask << 16);
3049
3050 fs_reg sources[4];
3051
3052 unsigned m = has_urb_lsc ? 0 : first_component;
3053 for (unsigned i = 0; i < num_components; i++) {
3054 int c = i + first_component;
3055 if (mask & (1 << c)) {
3056 sources[m++] = offset(value, bld, i);
3057 } else if (devinfo->ver < 20) {
3058 m++;
3059 }
3060 }
3061
3062 assert(has_urb_lsc || m == (first_component + num_components));
3063
3064 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
3065 srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output;
3066 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
3067 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask_reg;
3068 srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_F, m);
3069 srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(m);
3070 bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, m, 0);
3071
3072 fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
3073 srcs, ARRAY_SIZE(srcs));
3074 inst->offset = imm_offset;
3075 break;
3076 }
3077
3078 default:
3079 fs_nir_emit_intrinsic(ntb, bld, instr);
3080 break;
3081 }
3082 }
3083
3084 static void
3085 fs_nir_emit_tes_intrinsic(nir_to_brw_state &ntb,
3086 nir_intrinsic_instr *instr)
3087 {
3088 const intel_device_info *devinfo = ntb.devinfo;
3089 const fs_builder &bld = ntb.bld;
3090 fs_visitor &s = ntb.s;
3091
3092 assert(s.stage == MESA_SHADER_TESS_EVAL);
3093 struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(s.prog_data);
3094
3095 fs_reg dest;
3096 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3097 dest = get_nir_def(ntb, instr->def);
3098
3099 switch (instr->intrinsic) {
3100 case nir_intrinsic_load_primitive_id:
3101 bld.MOV(dest, s.tes_payload().primitive_id);
3102 break;
3103
3104 case nir_intrinsic_load_tess_coord:
3105 for (unsigned i = 0; i < 3; i++)
3106 bld.MOV(offset(dest, bld, i), s.tes_payload().coords[i]);
3107 break;
3108
3109 case nir_intrinsic_load_input:
3110 case nir_intrinsic_load_per_vertex_input: {
3111 assert(instr->def.bit_size == 32);
3112 fs_reg indirect_offset = get_indirect_offset(ntb, instr);
3113 unsigned imm_offset = nir_intrinsic_base(instr);
3114 unsigned first_component = nir_intrinsic_component(instr);
3115
3116 fs_inst *inst;
3117 if (indirect_offset.file == BAD_FILE) {
3118 /* Arbitrarily only push up to 32 vec4 slots worth of data,
3119 * which is 16 registers (since each holds 2 vec4 slots).
3120 */
3121 const unsigned max_push_slots = 32;
3122 if (imm_offset < max_push_slots) {
3123 const fs_reg src = horiz_offset(fs_reg(ATTR, 0, dest.type),
3124 4 * imm_offset + first_component);
3125 for (int i = 0; i < instr->num_components; i++)
3126 bld.MOV(offset(dest, bld, i), component(src, i));
3127
3128 tes_prog_data->base.urb_read_length =
3129 MAX2(tes_prog_data->base.urb_read_length,
3130 (imm_offset / 2) + 1);
3131 } else {
3132 /* Replicate the patch handle to all enabled channels */
3133 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
3134 srcs[URB_LOGICAL_SRC_HANDLE] = s.tes_payload().patch_urb_input;
3135
3136 if (first_component != 0) {
3137 unsigned read_components =
3138 instr->num_components + first_component;
3139 fs_reg tmp = bld.vgrf(dest.type, read_components);
3140 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3141 srcs, ARRAY_SIZE(srcs));
3142 inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
3143 for (unsigned i = 0; i < instr->num_components; i++) {
3144 bld.MOV(offset(dest, bld, i),
3145 offset(tmp, bld, i + first_component));
3146 }
3147 } else {
3148 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dest,
3149 srcs, ARRAY_SIZE(srcs));
3150 inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
3151 }
3152 inst->offset = imm_offset;
3153 }
3154 } else {
3155 /* Indirect indexing - use per-slot offsets as well. */
3156
3157 /* We can only read two double components with each URB read, so
3158 * we send two read messages in that case, each one loading up to
3159 * two double components.
3160 */
3161 unsigned num_components = instr->num_components;
3162
3163 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
3164 srcs[URB_LOGICAL_SRC_HANDLE] = s.tes_payload().patch_urb_input;
3165 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
3166
3167 if (first_component != 0) {
3168 unsigned read_components =
3169 num_components + first_component;
3170 fs_reg tmp = bld.vgrf(dest.type, read_components);
3171 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3172 srcs, ARRAY_SIZE(srcs));
3173 for (unsigned i = 0; i < num_components; i++) {
3174 bld.MOV(offset(dest, bld, i),
3175 offset(tmp, bld, i + first_component));
3176 }
3177 } else {
3178 inst = bld.emit(SHADER_OPCODE_URB_READ_LOGICAL, dest,
3179 srcs, ARRAY_SIZE(srcs));
3180 }
3181 inst->offset = imm_offset;
3182 inst->size_written = (num_components + first_component) *
3183 inst->dst.component_size(inst->exec_size);
3184 }
3185 break;
3186 }
3187 default:
3188 fs_nir_emit_intrinsic(ntb, bld, instr);
3189 break;
3190 }
3191 }
3192
3193 static void
3194 fs_nir_emit_gs_intrinsic(nir_to_brw_state &ntb,
3195 nir_intrinsic_instr *instr)
3196 {
3197 const fs_builder &bld = ntb.bld;
3198 fs_visitor &s = ntb.s;
3199
3200 assert(s.stage == MESA_SHADER_GEOMETRY);
3201 fs_reg indirect_offset;
3202
3203 fs_reg dest;
3204 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3205 dest = get_nir_def(ntb, instr->def);
3206
3207 switch (instr->intrinsic) {
3208 case nir_intrinsic_load_primitive_id:
3209 assert(s.stage == MESA_SHADER_GEOMETRY);
3210 assert(brw_gs_prog_data(s.prog_data)->include_primitive_id);
3211 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), s.gs_payload().primitive_id);
3212 break;
3213
3214 case nir_intrinsic_load_input:
3215 unreachable("load_input intrinsics are invalid for the GS stage");
3216
3217 case nir_intrinsic_load_per_vertex_input:
3218 emit_gs_input_load(ntb, dest, instr->src[0], nir_intrinsic_base(instr),
3219 instr->src[1], instr->num_components,
3220 nir_intrinsic_component(instr));
3221 break;
3222
3223 case nir_intrinsic_emit_vertex_with_counter:
3224 emit_gs_vertex(ntb, instr->src[0], nir_intrinsic_stream_id(instr));
3225 break;
3226
3227 case nir_intrinsic_end_primitive_with_counter:
3228 emit_gs_end_primitive(ntb, instr->src[0]);
3229 break;
3230
3231 case nir_intrinsic_set_vertex_and_primitive_count:
3232 bld.MOV(s.final_gs_vertex_count, get_nir_src(ntb, instr->src[0]));
3233 break;
3234
3235 case nir_intrinsic_load_invocation_id: {
3236 fs_reg val = ntb.system_values[SYSTEM_VALUE_INVOCATION_ID];
3237 assert(val.file != BAD_FILE);
3238 dest.type = val.type;
3239 bld.MOV(dest, val);
3240 break;
3241 }
3242
3243 default:
3244 fs_nir_emit_intrinsic(ntb, bld, instr);
3245 break;
3246 }
3247 }
3248
3249 /**
3250 * Fetch the current render target layer index.
3251 */
3252 static fs_reg
3253 fetch_render_target_array_index(const fs_builder &bld)
3254 {
3255 const fs_visitor *v = static_cast<const fs_visitor *>(bld.shader);
3256
3257 if (bld.shader->devinfo->ver >= 20) {
3258 /* Gfx20+ has separate Render Target Array indices for each pair
3259 * of subspans in order to support multiple polygons, so we need
3260 * to use a <1;8,0> region in order to select the correct word
3261 * for each channel.
3262 */
3263 const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
3264
3265 for (unsigned i = 0; i < DIV_ROUND_UP(bld.dispatch_width(), 16); i++) {
3266 const fs_builder hbld = bld.group(16, i);
3267 const struct brw_reg reg = retype(brw_vec1_grf(2 * i + 1, 1),
3268 BRW_REGISTER_TYPE_UW);
3269 hbld.AND(offset(idx, hbld, i), stride(reg, 1, 8, 0),
3270 brw_imm_uw(0x7ff));
3271 }
3272
3273 return idx;
3274 } else if (bld.shader->devinfo->ver >= 12 && v->max_polygons == 2) {
3275 /* According to the BSpec "PS Thread Payload for Normal
3276 * Dispatch", the render target array index is stored as bits
3277 * 26:16 of either the R1.1 or R1.6 poly info dwords, for the
3278 * first and second polygons respectively in multipolygon PS
3279 * dispatch mode.
3280 */
3281 assert(bld.dispatch_width() == 16);
3282 const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
3283
3284 for (unsigned i = 0; i < v->max_polygons; i++) {
3285 const fs_builder hbld = bld.group(8, i);
3286 const struct brw_reg g1 = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 3 + 10 * i);
3287 hbld.AND(offset(idx, hbld, i), g1, brw_imm_uw(0x7ff));
3288 }
3289
3290 return idx;
3291 } else if (bld.shader->devinfo->ver >= 12) {
3292 /* The render target array index is provided in the thread payload as
3293 * bits 26:16 of r1.1.
3294 */
3295 const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
3296 bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 3),
3297 brw_imm_uw(0x7ff));
3298 return idx;
3299 } else {
3300 /* The render target array index is provided in the thread payload as
3301 * bits 26:16 of r0.0.
3302 */
3303 const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
3304 bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1),
3305 brw_imm_uw(0x7ff));
3306 return idx;
3307 }
3308 }
3309
3310 /* Sample from the MCS surface attached to this multisample texture. */
3311 static fs_reg
3312 emit_mcs_fetch(nir_to_brw_state &ntb, const fs_reg &coordinate, unsigned components,
3313 const fs_reg &texture,
3314 const fs_reg &texture_handle)
3315 {
3316 const fs_builder &bld = ntb.bld;
3317
3318 const fs_reg dest = ntb.s.vgrf(glsl_uvec4_type());
3319
3320 fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
3321 srcs[TEX_LOGICAL_SRC_COORDINATE] = coordinate;
3322 srcs[TEX_LOGICAL_SRC_SURFACE] = texture;
3323 srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0);
3324 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = texture_handle;
3325 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(components);
3326 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);
3327 srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_d(0);
3328
3329 fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs,
3330 ARRAY_SIZE(srcs));
3331
3332 /* We only care about one or two regs of response, but the sampler always
3333 * writes 4/8.
3334 */
3335 inst->size_written = 4 * dest.component_size(inst->exec_size);
3336
3337 return dest;
3338 }
3339
3340 /**
3341 * Fake non-coherent framebuffer read implemented using TXF to fetch from the
3342 * framebuffer at the current fragment coordinates and sample index.
3343 */
3344 static fs_inst *
3345 emit_non_coherent_fb_read(nir_to_brw_state &ntb, const fs_builder &bld, const fs_reg &dst,
3346 unsigned target)
3347 {
3348 fs_visitor &s = ntb.s;
3349 const struct intel_device_info *devinfo = s.devinfo;
3350
3351 assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
3352 const brw_wm_prog_key *wm_key =
3353 reinterpret_cast<const brw_wm_prog_key *>(s.key);
3354 assert(!wm_key->coherent_fb_fetch);
3355
3356 /* Calculate the fragment coordinates. */
3357 const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
3358 bld.MOV(offset(coords, bld, 0), s.pixel_x);
3359 bld.MOV(offset(coords, bld, 1), s.pixel_y);
3360 bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
3361
3362 /* Calculate the sample index and MCS payload when multisampling. Luckily
3363 * the MCS fetch message behaves deterministically for UMS surfaces, so it
3364 * shouldn't be necessary to recompile based on whether the framebuffer is
3365 * CMS or UMS.
3366 */
3367 assert(wm_key->multisample_fbo == BRW_ALWAYS ||
3368 wm_key->multisample_fbo == BRW_NEVER);
3369 if (wm_key->multisample_fbo &&
3370 ntb.system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
3371 ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(ntb);
3372
3373 const fs_reg sample = ntb.system_values[SYSTEM_VALUE_SAMPLE_ID];
3374 const fs_reg mcs = wm_key->multisample_fbo ?
3375 emit_mcs_fetch(ntb, coords, 3, brw_imm_ud(target), fs_reg()) : fs_reg();
3376
3377 /* Use either a normal or a CMS texel fetch message depending on whether
3378 * the framebuffer is single or multisample. On SKL+ use the wide CMS
3379 * message just in case the framebuffer uses 16x multisampling; it should
3380 * be equivalent to the normal CMS fetch for lower multisampling modes.
3381 */
3382 opcode op;
3383 if (wm_key->multisample_fbo) {
3384 /* On SKL+ use the wide CMS message just in case the framebuffer uses 16x
3385 * multisampling; it should be equivalent to the normal CMS fetch for
3386 * lower multisampling modes.
3387 *
3388 * On Gfx12HP, only the CMS_W variant is available.
3389 */
3390 if (devinfo->verx10 >= 125)
3391 op = SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL;
3392 else
3393 op = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
3394 } else {
3395 op = SHADER_OPCODE_TXF_LOGICAL;
3396 }
3397
3398 /* Emit the instruction. */
3399 fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
3400 srcs[TEX_LOGICAL_SRC_COORDINATE] = coords;
3401 srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_ud(0);
3402 srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = sample;
3403 srcs[TEX_LOGICAL_SRC_MCS] = mcs;
3404 srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(target);
3405 srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0);
3406 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(3);
3407 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_ud(0);
3408 srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_ud(0);
3409
3410 fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
3411 inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3412
3413 return inst;
3414 }
3415
3416 /**
3417 * Actual coherent framebuffer read implemented using the native render target
3418 * read message. Requires SKL+.
3419 */
3420 static fs_inst *
3421 emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target)
3422 {
3423 fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst);
3424 inst->target = target;
3425 inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3426
3427 return inst;
3428 }
3429
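/**
 * Return regs[0] if it is already allocated; otherwise allocate a temporary
 * VGRF of the requested size (in components), make every entry of regs[]
 * alias it, and return it.
 */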
3430 static fs_reg
3431 alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n)
3432 {
3433 if (n && regs[0].file != BAD_FILE) {
3434 return regs[0];
3435
3436 } else {
3437 const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size);
3438
3439 for (unsigned i = 0; i < n; i++)
3440 regs[i] = tmp;
3441
3442 return tmp;
3443 }
3444 }
3445
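/**
 * Return (allocating on first use) the register backing a fragment shader
 * output. The location argument is the packed value built from the
 * BRW_NIR_FRAG_OUTPUT_LOCATION and BRW_NIR_FRAG_OUTPUT_INDEX fields.
 */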
3446 static fs_reg
3447 alloc_frag_output(nir_to_brw_state &ntb, unsigned location)
3448 {
3449 fs_visitor &s = ntb.s;
3450
3451 assert(s.stage == MESA_SHADER_FRAGMENT);
3452 const brw_wm_prog_key *const key =
3453 reinterpret_cast<const brw_wm_prog_key *>(s.key);
3454 const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION);
3455 const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX);
3456
3457 if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
3458 return alloc_temporary(ntb.bld, 4, &s.dual_src_output, 1);
3459
3460 else if (l == FRAG_RESULT_COLOR)
3461 return alloc_temporary(ntb.bld, 4, s.outputs,
3462 MAX2(key->nr_color_regions, 1));
3463
3464 else if (l == FRAG_RESULT_DEPTH)
3465 return alloc_temporary(ntb.bld, 1, &s.frag_depth, 1);
3466
3467 else if (l == FRAG_RESULT_STENCIL)
3468 return alloc_temporary(ntb.bld, 1, &s.frag_stencil, 1);
3469
3470 else if (l == FRAG_RESULT_SAMPLE_MASK)
3471 return alloc_temporary(ntb.bld, 1, &s.sample_mask, 1);
3472
3473 else if (l >= FRAG_RESULT_DATA0 &&
3474 l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS)
3475 return alloc_temporary(ntb.bld, 4,
3476 &s.outputs[l - FRAG_RESULT_DATA0], 1);
3477
3478 else
3479 unreachable("Invalid location");
3480 }
3481
3482 static void
3483 emit_is_helper_invocation(nir_to_brw_state &ntb, fs_reg result)
3484 {
3485 const fs_builder &bld = ntb.bld;
3486
3487 /* Unlike the regular gl_HelperInvocation, which is defined at dispatch
3488 * time, helperInvocationEXT() (aka SpvOpIsHelperInvocationEXT) also takes
3489 * demoted invocations into consideration.
3490 */
3491 result.type = BRW_REGISTER_TYPE_UD;
3492
3493 bld.MOV(result, brw_imm_ud(0));
3494
3495 /* See brw_sample_mask_reg() for why we split SIMD32 into SIMD16 here. */
3496 unsigned width = bld.dispatch_width();
3497 for (unsigned i = 0; i < DIV_ROUND_UP(width, 16); i++) {
3498 const fs_builder b = bld.group(MIN2(width, 16), i);
3499
3500 fs_inst *mov = b.MOV(offset(result, b, i), brw_imm_ud(~0));
3501
3502 /* The at() ensures that any code emitted to get the predicate happens
3503 * before the mov right above. This is not an issue elsewhere because
3504 * lowering code already set up the builder this way.
3505 */
3506 brw_emit_predicate_on_sample_mask(b.at(NULL, mov), mov);
3507 mov->predicate_inverse = true;
3508 }
3509 }
3510
3511 static void
3512 emit_fragcoord_interpolation(nir_to_brw_state &ntb, fs_reg wpos)
3513 {
3514 const fs_builder &bld = ntb.bld;
3515 fs_visitor &s = ntb.s;
3516
3517 assert(s.stage == MESA_SHADER_FRAGMENT);
3518
3519 /* gl_FragCoord.x */
3520 bld.MOV(wpos, s.pixel_x);
3521 wpos = offset(wpos, bld, 1);
3522
3523 /* gl_FragCoord.y */
3524 bld.MOV(wpos, s.pixel_y);
3525 wpos = offset(wpos, bld, 1);
3526
3527 /* gl_FragCoord.z */
3528 bld.MOV(wpos, s.pixel_z);
3529 wpos = offset(wpos, bld, 1);
3530
3531 /* gl_FragCoord.w: Already set up in emit_interpolation */
3532 bld.MOV(wpos, s.wpos_w);
3533 }
3534
3535 static fs_reg
3536 emit_frontfacing_interpolation(nir_to_brw_state &ntb)
3537 {
3538 const intel_device_info *devinfo = ntb.devinfo;
3539 const fs_builder &bld = ntb.bld;
3540 fs_visitor &s = ntb.s;
3541
3542 fs_reg ff = bld.vgrf(BRW_REGISTER_TYPE_D);
3543
3544 if (devinfo->ver >= 20) {
3545 /* Gfx20+ has separate back-facing bits for each pair of
3546 * subspans in order to support multiple polygons, so we need to
3547 * use a <1;8,0> region in order to select the correct word for
3548 * each channel.
3549 */
3550 const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW);
3551
3552 for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
3553 const fs_builder hbld = bld.group(16, i);
3554 const struct brw_reg gi_uw = retype(xe2_vec1_grf(i, 9),
3555 BRW_REGISTER_TYPE_UW);
3556 hbld.AND(offset(tmp, hbld, i), gi_uw, brw_imm_uw(0x800));
3557 }
3558
3559 bld.CMP(ff, tmp, brw_imm_uw(0), BRW_CONDITIONAL_Z);
3560
3561 } else if (devinfo->ver >= 12 && s.max_polygons == 2) {
3562 /* According to the BSpec "PS Thread Payload for Normal
3563 * Dispatch", the front/back facing interpolation bit is stored
3564 * as bit 15 of either the R1.1 or R1.6 poly info field, for the
3565 * first and second polygons respectively in multipolygon PS
3566 * dispatch mode.
3567 */
3568 assert(s.dispatch_width == 16);
3569 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_W);
3570
3571 for (unsigned i = 0; i < s.max_polygons; i++) {
3572 const fs_builder hbld = bld.group(8, i);
3573 const struct brw_reg g1 = retype(brw_vec1_grf(1, 1 + 5 * i),
3574 BRW_REGISTER_TYPE_W);
3575 hbld.ASR(offset(tmp, hbld, i), g1, brw_imm_d(15));
3576 }
3577
3578 bld.NOT(ff, tmp);
3579
3580 } else if (devinfo->ver >= 12) {
3581 fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W));
3582
3583 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_W);
3584 bld.ASR(tmp, g1, brw_imm_d(15));
3585 bld.NOT(ff, tmp);
3586 } else {
3587 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
3588 * a boolean result from this (~0/true or 0/false).
3589 *
3590 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
3591 * this task in only one instruction:
3592 * - a negation source modifier will flip the bit; and
3593 * - a W -> D type conversion will sign extend the bit into the high
3594 * word of the destination.
3595 *
3596 * An ASR 15 fills the low word of the destination.
3597 */
3598 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
3599 g0.negate = true;
3600
3601 bld.ASR(ff, g0, brw_imm_d(15));
3602 }
3603
3604 return ff;
3605 }
3606
3607 static fs_reg
3608 emit_samplepos_setup(nir_to_brw_state &ntb)
3609 {
3610 const fs_builder &bld = ntb.bld;
3611 fs_visitor &s = ntb.s;
3612
3613 assert(s.stage == MESA_SHADER_FRAGMENT);
3614 struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
3615
3616 const fs_builder abld = bld.annotate("compute sample position");
3617 fs_reg pos = abld.vgrf(BRW_REGISTER_TYPE_F, 2);
3618
3619 if (wm_prog_data->persample_dispatch == BRW_NEVER) {
3620 /* From the ARB_sample_shading specification:
3621 * "When rendering to a non-multisample buffer, or if multisample
3622 * rasterization is disabled, gl_SamplePosition will always be
3623 * (0.5, 0.5)."
3624 */
3625 bld.MOV(offset(pos, bld, 0), brw_imm_f(0.5f));
3626 bld.MOV(offset(pos, bld, 1), brw_imm_f(0.5f));
3627 return pos;
3628 }
3629
3630 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
3631 * mode will be enabled.
3632 *
3633 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
3634 * R31.1:0 Position Offset X/Y for Slot[3:0]
3635 * R31.3:2 Position Offset X/Y for Slot[7:4]
3636 * .....
3637 *
3638 * The X, Y sample positions come in as bytes in thread payload. So, read
3639 * the positions using vstride=16, width=8, hstride=2.
3640 */
3641 const fs_reg sample_pos_reg =
3642 fetch_payload_reg(abld, s.fs_payload().sample_pos_reg, BRW_REGISTER_TYPE_W);
3643
3644 for (unsigned i = 0; i < 2; i++) {
3645 fs_reg tmp_d = bld.vgrf(BRW_REGISTER_TYPE_D);
3646 abld.MOV(tmp_d, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, i));
3647 /* Convert int_sample_pos to floating point */
3648 fs_reg tmp_f = bld.vgrf(BRW_REGISTER_TYPE_F);
3649 abld.MOV(tmp_f, tmp_d);
3650 /* Scale to the range [0, 1] */
3651 abld.MUL(offset(pos, abld, i), tmp_f, brw_imm_f(1 / 16.0f));
3652 }
3653
3654 if (wm_prog_data->persample_dispatch == BRW_SOMETIMES) {
3655 check_dynamic_msaa_flag(abld, wm_prog_data,
3656 INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
3657 for (unsigned i = 0; i < 2; i++) {
3658 set_predicate(BRW_PREDICATE_NORMAL,
3659 bld.SEL(offset(pos, abld, i), offset(pos, abld, i),
3660 brw_imm_f(0.5f)));
3661 }
3662 }
3663
3664 return pos;
3665 }
3666
3667 static fs_reg
3668 emit_sampleid_setup(nir_to_brw_state &ntb)
3669 {
3670 const intel_device_info *devinfo = ntb.devinfo;
3671 const fs_builder &bld = ntb.bld;
3672 fs_visitor &s = ntb.s;
3673
3674 assert(s.stage == MESA_SHADER_FRAGMENT);
3675 ASSERTED brw_wm_prog_key *key = (brw_wm_prog_key*) s.key;
3676 struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
3677
3678 const fs_builder abld = bld.annotate("compute sample id");
3679 fs_reg sample_id = abld.vgrf(BRW_REGISTER_TYPE_UD);
3680
3681 assert(key->multisample_fbo != BRW_NEVER);
3682
3683 /* Sample ID comes in as 4-bit numbers in g1.0:
3684 *
3685 * 15:12 Slot 3 SampleID (only used in SIMD16)
3686 * 11:8 Slot 2 SampleID (only used in SIMD16)
3687 * 7:4 Slot 1 SampleID
3688 * 3:0 Slot 0 SampleID
3689 *
3690 * Each slot corresponds to four channels, so we want to replicate each
3691 * half-byte value to 4 channels in a row:
3692 *
3693 * dst+0: .7 .6 .5 .4 .3 .2 .1 .0
3694 * 7:4 7:4 7:4 7:4 3:0 3:0 3:0 3:0
3695 *
3696 * dst+1: .7 .6 .5 .4 .3 .2 .1 .0 (if SIMD16)
3697 * 15:12 15:12 15:12 15:12 11:8 11:8 11:8 11:8
3698 *
3699 * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
3700 * channels to read the first byte (7:0), and the second group of 8
3701 * channels to read the second byte (15:8). Then, we shift right by
3702 * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
3703 * values into place. Finally, we AND with 0xf to keep the low nibble.
3704 *
3705 * shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
3706 * and(16) dst<1>D tmp<8,8,1>W 0xf:W
3707 *
3708 * TODO: These payload bits exist on Gfx7 too, but they appear to always
3709 * be zero, so this code fails to work. We should find out why.
3710 */
3711 const fs_reg tmp = abld.vgrf(BRW_REGISTER_TYPE_UW);
3712
3713 for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
3714 const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
3715 /* According to the "PS Thread Payload for Normal Dispatch"
3716 * pages on the BSpec, the sample ids are stored in R0.8/R1.8
3717 * on gfx20+ and in R1.0/R2.0 on gfx8+.
3718 */
3719 const struct brw_reg id_reg = devinfo->ver >= 20 ? xe2_vec1_grf(i, 8) :
3720 brw_vec1_grf(i + 1, 0);
3721 hbld.SHR(offset(tmp, hbld, i),
3722 stride(retype(id_reg, BRW_REGISTER_TYPE_UB), 1, 8, 0),
3723 brw_imm_v(0x44440000));
3724 }
3725
3726 abld.AND(sample_id, tmp, brw_imm_w(0xf));
3727
3728 if (key->multisample_fbo == BRW_SOMETIMES) {
3729 check_dynamic_msaa_flag(abld, wm_prog_data,
3730 INTEL_MSAA_FLAG_MULTISAMPLE_FBO);
3731 set_predicate(BRW_PREDICATE_NORMAL,
3732 abld.SEL(sample_id, sample_id, brw_imm_ud(0)));
3733 }
3734
3735 return sample_id;
3736 }
3737
3738 static fs_reg
3739 emit_samplemaskin_setup(nir_to_brw_state &ntb)
3740 {
3741 const fs_builder &bld = ntb.bld;
3742 fs_visitor &s = ntb.s;
3743
3744 assert(s.stage == MESA_SHADER_FRAGMENT);
3745 struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
3746
3747 /* The HW doesn't provide us with expected values. */
3748 assert(wm_prog_data->coarse_pixel_dispatch != BRW_ALWAYS);
3749
3750 fs_reg coverage_mask =
3751 fetch_payload_reg(bld, s.fs_payload().sample_mask_in_reg, BRW_REGISTER_TYPE_D);
3752
3753 if (wm_prog_data->persample_dispatch == BRW_NEVER)
3754 return coverage_mask;
3755
3756 /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
3757 * and a mask representing which sample is being processed by the
3758 * current shader invocation.
3759 *
3760 * From the OES_sample_variables specification:
3761 * "When per-sample shading is active due to the use of a fragment input
3762 * qualified by "sample" or due to the use of the gl_SampleID or
3763 * gl_SamplePosition variables, only the bit for the current sample is
3764 * set in gl_SampleMaskIn."
3765 */
3766 const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
3767
3768 if (ntb.system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
3769 ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(ntb);
3770
3771 fs_reg one = s.vgrf(glsl_int_type());
3772 fs_reg enabled_mask = s.vgrf(glsl_int_type());
3773 abld.MOV(one, brw_imm_d(1));
3774 abld.SHL(enabled_mask, one, ntb.system_values[SYSTEM_VALUE_SAMPLE_ID]);
3775 fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_D);
3776 abld.AND(mask, enabled_mask, coverage_mask);
3777
3778 if (wm_prog_data->persample_dispatch == BRW_ALWAYS)
3779 return mask;
3780
3781 check_dynamic_msaa_flag(abld, wm_prog_data,
3782 INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
3783 set_predicate(BRW_PREDICATE_NORMAL, abld.SEL(mask, mask, coverage_mask));
3784
3785 return mask;
3786 }
3787
3788 static fs_reg
3789 emit_shading_rate_setup(nir_to_brw_state &ntb)
3790 {
3791 const intel_device_info *devinfo = ntb.devinfo;
3792 const fs_builder &bld = ntb.bld;
3793
3794 assert(devinfo->ver >= 11);
3795
3796 struct brw_wm_prog_data *wm_prog_data =
3797 brw_wm_prog_data(bld.shader->stage_prog_data);
3798
3799 /* The coarse pixel shading size fields overlap with other fields when not
3800 * in coarse pixel dispatch mode, so report 0 in that case.
3801 */
3802 if (wm_prog_data->coarse_pixel_dispatch == BRW_NEVER)
3803 return brw_imm_ud(0);
3804
3805 const fs_builder abld = bld.annotate("compute fragment shading rate");
3806
3807 /* The shading rates provided in the shader are the actual 2D shading
3808 * rate while the SPIR-V built-in is the enum value that has the shading
3809 * rate encoded as a bitfield. Fortunately, the bitfield value is just
3810 * the shading rate divided by two and shifted.
3811 */
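   /* Worked example (a sketch, assuming power-of-two coarse pixel sizes up
    * to 4x4): an actual coarse pixel size of 4x2 gives
    *    int_rate_x = (4 >> 1) << 2 = 8
    *    int_rate_y =  2 >> 1       = 1
    * so rate = 8 | 1 = 0x9, i.e. log2(width) in bits 3:2 and log2(height)
    * in bits 1:0 of the built-in's bitfield.
    */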
3812
3813 /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */
3814 fs_reg actual_x = fs_reg(retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UB));
3815 /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
3816 fs_reg actual_y = byte_offset(actual_x, 1);
3817
3818 fs_reg int_rate_x = bld.vgrf(BRW_REGISTER_TYPE_UD);
3819 fs_reg int_rate_y = bld.vgrf(BRW_REGISTER_TYPE_UD);
3820
3821 abld.SHR(int_rate_y, actual_y, brw_imm_ud(1));
3822 abld.SHR(int_rate_x, actual_x, brw_imm_ud(1));
3823 abld.SHL(int_rate_x, int_rate_x, brw_imm_ud(2));
3824
3825 fs_reg rate = abld.vgrf(BRW_REGISTER_TYPE_UD);
3826 abld.OR(rate, int_rate_x, int_rate_y);
3827
3828 if (wm_prog_data->coarse_pixel_dispatch == BRW_ALWAYS)
3829 return rate;
3830
3831 check_dynamic_msaa_flag(abld, wm_prog_data,
3832 INTEL_MSAA_FLAG_COARSE_RT_WRITES);
3833 set_predicate(BRW_PREDICATE_NORMAL, abld.SEL(rate, rate, brw_imm_ud(0)));
3834
3835 return rate;
3836 }
3837
3838 static void
3839 fs_nir_emit_fs_intrinsic(nir_to_brw_state &ntb,
3840 nir_intrinsic_instr *instr)
3841 {
3842 const intel_device_info *devinfo = ntb.devinfo;
3843 const fs_builder &bld = ntb.bld;
3844 fs_visitor &s = ntb.s;
3845
3846 assert(s.stage == MESA_SHADER_FRAGMENT);
3847
3848 fs_reg dest;
3849 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3850 dest = get_nir_def(ntb, instr->def);
3851
3852 switch (instr->intrinsic) {
3853 case nir_intrinsic_load_front_face:
3854 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
3855 emit_frontfacing_interpolation(ntb));
3856 break;
3857
3858 case nir_intrinsic_load_sample_pos:
3859 case nir_intrinsic_load_sample_pos_or_center: {
3860 fs_reg sample_pos = ntb.system_values[SYSTEM_VALUE_SAMPLE_POS];
3861 assert(sample_pos.file != BAD_FILE);
3862 dest.type = sample_pos.type;
3863 bld.MOV(dest, sample_pos);
3864 bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
3865 break;
3866 }
3867
3868 case nir_intrinsic_load_layer_id:
3869 dest.type = BRW_REGISTER_TYPE_UD;
3870 bld.MOV(dest, fetch_render_target_array_index(bld));
3871 break;
3872
3873 case nir_intrinsic_is_helper_invocation:
3874 emit_is_helper_invocation(ntb, dest);
3875 break;
3876
3877 case nir_intrinsic_load_helper_invocation:
3878 case nir_intrinsic_load_sample_mask_in:
3879 case nir_intrinsic_load_sample_id:
3880 case nir_intrinsic_load_frag_shading_rate: {
3881 gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3882 fs_reg val = ntb.system_values[sv];
3883 assert(val.file != BAD_FILE);
3884 dest.type = val.type;
3885 bld.MOV(dest, val);
3886 break;
3887 }
3888
3889 case nir_intrinsic_store_output: {
3890 const fs_reg src = get_nir_src(ntb, instr->src[0]);
3891 const unsigned store_offset = nir_src_as_uint(instr->src[1]);
3892 const unsigned location = nir_intrinsic_base(instr) +
3893 SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION);
3894 const fs_reg new_dest = retype(alloc_frag_output(ntb, location),
3895 src.type);
3896
3897 for (unsigned j = 0; j < instr->num_components; j++)
3898 bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
3899 offset(src, bld, j));
3900
3901 break;
3902 }
3903
3904 case nir_intrinsic_load_output: {
3905 const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
3906 BRW_NIR_FRAG_OUTPUT_LOCATION);
3907 assert(l >= FRAG_RESULT_DATA0);
3908 const unsigned load_offset = nir_src_as_uint(instr->src[0]);
3909 const unsigned target = l - FRAG_RESULT_DATA0 + load_offset;
3910 const fs_reg tmp = bld.vgrf(dest.type, 4);
3911
3912 if (reinterpret_cast<const brw_wm_prog_key *>(s.key)->coherent_fb_fetch)
3913 emit_coherent_fb_read(bld, tmp, target);
3914 else
3915 emit_non_coherent_fb_read(ntb, bld, tmp, target);
3916
3917 for (unsigned j = 0; j < instr->num_components; j++) {
3918 bld.MOV(offset(dest, bld, j),
3919 offset(tmp, bld, nir_intrinsic_component(instr) + j));
3920 }
3921
3922 break;
3923 }
3924
3925 case nir_intrinsic_demote:
3926 case nir_intrinsic_discard:
3927 case nir_intrinsic_terminate:
3928 case nir_intrinsic_demote_if:
3929 case nir_intrinsic_discard_if:
3930 case nir_intrinsic_terminate_if: {
3931 /* We track our discarded pixels in f0.1/f1.0. By predicating on it, we
3932 * can update just the flag bits that aren't yet discarded. If there's
3933 * no condition, we emit a CMP of g0 != g0, so all currently executing
3934 * channels will get turned off.
3935 */
3936 fs_inst *cmp = NULL;
3937 if (instr->intrinsic == nir_intrinsic_demote_if ||
3938 instr->intrinsic == nir_intrinsic_discard_if ||
3939 instr->intrinsic == nir_intrinsic_terminate_if) {
3940 nir_alu_instr *alu = nir_src_as_alu_instr(instr->src[0]);
3941
3942 if (alu != NULL &&
3943 alu->op != nir_op_bcsel) {
3944 /* Re-emit the instruction that generated the Boolean value, but
3945 * do not store it. Since this instruction will be conditional,
3946 * other instructions that want to use the real Boolean value may
3947 * get garbage. This was a problem for piglit's fs-discard-exit-2
3948 * test.
3949 *
3950 * Ideally we'd detect that the instruction cannot have a
3951 * conditional modifier before emitting the instructions. Alas,
3952 * that is nigh impossible. Instead, we're going to assume the
3953 * instruction (or last instruction) generated can have a
3954 * conditional modifier. If it cannot, fall back to the old-style
3955 * compare, and hope dead code elimination will clean up the
3956 * extra instructions generated.
3957 */
3958 fs_nir_emit_alu(ntb, alu, false);
3959
3960 cmp = (fs_inst *) s.instructions.get_tail();
3961 if (cmp->conditional_mod == BRW_CONDITIONAL_NONE) {
3962 if (cmp->can_do_cmod())
3963 cmp->conditional_mod = BRW_CONDITIONAL_Z;
3964 else
3965 cmp = NULL;
3966 } else {
3967 /* The old sequence that would have been generated is,
3968 * basically, bool_result == false. This is equivalent to
3969 * !bool_result, so negate the old modifier.
3970 */
3971 cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod);
3972 }
3973 }
3974
3975 if (cmp == NULL) {
3976 cmp = bld.CMP(bld.null_reg_f(), get_nir_src(ntb, instr->src[0]),
3977 brw_imm_d(0), BRW_CONDITIONAL_Z);
3978 }
3979 } else {
3980 fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
3981 BRW_REGISTER_TYPE_UW));
3982 cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
3983 }
3984
3985 cmp->predicate = BRW_PREDICATE_NORMAL;
3986 cmp->flag_subreg = sample_mask_flag_subreg(s);
3987
3988 fs_inst *jump = bld.emit(BRW_OPCODE_HALT);
3989 jump->flag_subreg = sample_mask_flag_subreg(s);
3990 jump->predicate_inverse = true;
3991
3992 if (instr->intrinsic == nir_intrinsic_terminate ||
3993 instr->intrinsic == nir_intrinsic_terminate_if) {
3994 jump->predicate = BRW_PREDICATE_NORMAL;
3995 } else {
3996 /* Only jump when the whole quad is demoted. For historical
3997 * reasons this is also used for discard.
3998 */
3999 jump->predicate = (devinfo->ver >= 20 ? XE2_PREDICATE_ANY :
4000 BRW_PREDICATE_ALIGN1_ANY4H);
4001 }
4002 break;
4003 }
4004
4005 case nir_intrinsic_load_input: {
4006 /* In fragment shaders, load_input is used for either flat inputs or
4007 * per-primitive inputs.
4008 */
4009 assert(instr->def.bit_size == 32);
4010 unsigned base = nir_intrinsic_base(instr);
4011 unsigned comp = nir_intrinsic_component(instr);
4012 unsigned num_components = instr->num_components;
4013
4014 const struct brw_wm_prog_key *wm_key = (brw_wm_prog_key*) s.key;
4015
4016 if (wm_key->mesh_input == BRW_SOMETIMES) {
4017 assert(devinfo->verx10 >= 125);
4018 /* The FS payload gives us the viewport and layer clamped to valid
4019 * ranges, but the spec for gl_ViewportIndex and gl_Layer includes
4020 * the language:
4021 * the fragment stage will read the same value written by the
4022 * geometry stage, even if that value is out of range.
4023 *
4024 * Which is why these are normally passed as regular attributes.
4025 * This isn't tested anywhere except some GL-only piglit tests
4026 * though, so for the case where the FS may be used against either a
4027 * traditional pipeline or a mesh one, where the position of these
4028 * will change depending on the previous stage, read them from the
4029 * payload to simplify things until the requisite magic is in place.
4030 */
4031 if (base == VARYING_SLOT_LAYER || base == VARYING_SLOT_VIEWPORT) {
4032 assert(num_components == 1);
4033 fs_reg g1(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD));
4034
4035 unsigned mask, shift_count;
4036 if (base == VARYING_SLOT_LAYER) {
4037 shift_count = 16;
4038 mask = 0x7ff << shift_count;
4039 } else {
4040 shift_count = 27;
4041 mask = 0xf << shift_count;
4042 }
4043
4044 fs_reg vp_or_layer = bld.vgrf(BRW_REGISTER_TYPE_UD);
4045 bld.AND(vp_or_layer, g1, brw_imm_ud(mask));
4046 fs_reg shifted_value = bld.vgrf(BRW_REGISTER_TYPE_UD);
4047 bld.SHR(shifted_value, vp_or_layer, brw_imm_ud(shift_count));
4048 bld.MOV(offset(dest, bld, 0), retype(shifted_value, dest.type));
4049 break;
4050 }
4051 }
4052
4053 /* TODO(mesh): Multiview. Verify and handle these special cases for Mesh. */
4054
4055 /* Special case fields in the VUE header */
4056 if (base == VARYING_SLOT_LAYER)
4057 comp = 1;
4058 else if (base == VARYING_SLOT_VIEWPORT)
4059 comp = 2;
4060
4061 if (BITFIELD64_BIT(base) & s.nir->info.per_primitive_inputs) {
4062 assert(base != VARYING_SLOT_PRIMITIVE_INDICES);
4063 for (unsigned int i = 0; i < num_components; i++) {
4064 bld.MOV(offset(dest, bld, i),
4065 retype(s.per_primitive_reg(bld, base, comp + i), dest.type));
4066 }
4067 } else {
4068 /* Gfx20+ packs the plane parameters of a single logical
4069 * input in a vec3 format instead of the previously used vec4
4070 * format.
4071 */
4072 const unsigned k = devinfo->ver >= 20 ? 0 : 3;
4073 for (unsigned int i = 0; i < num_components; i++) {
4074 bld.MOV(offset(dest, bld, i),
4075 retype(s.interp_reg(bld, base, comp + i, k), dest.type));
4076 }
4077 }
4078 break;
4079 }
4080
4081 case nir_intrinsic_load_fs_input_interp_deltas: {
4082 assert(s.stage == MESA_SHADER_FRAGMENT);
4083 assert(nir_src_as_uint(instr->src[0]) == 0);
4084 const unsigned base = nir_intrinsic_base(instr);
4085 const unsigned comp = nir_intrinsic_component(instr);
4086 dest.type = BRW_REGISTER_TYPE_F;
4087
4088 /* Gfx20+ packs the plane parameters of a single logical
4089 * input in a vec3 format instead of the previously used vec4
4090 * format.
4091 */
4092 if (devinfo->ver >= 20) {
4093 bld.MOV(offset(dest, bld, 0), s.interp_reg(bld, base, comp, 0));
4094 bld.MOV(offset(dest, bld, 1), s.interp_reg(bld, base, comp, 2));
4095 bld.MOV(offset(dest, bld, 2), s.interp_reg(bld, base, comp, 1));
4096 } else {
4097 bld.MOV(offset(dest, bld, 0), s.interp_reg(bld, base, comp, 3));
4098 bld.MOV(offset(dest, bld, 1), s.interp_reg(bld, base, comp, 1));
4099 bld.MOV(offset(dest, bld, 2), s.interp_reg(bld, base, comp, 0));
4100 }
4101
4102 break;
4103 }
4104
4105 case nir_intrinsic_load_barycentric_pixel:
4106 case nir_intrinsic_load_barycentric_centroid:
4107 case nir_intrinsic_load_barycentric_sample: {
4108 /* Use the delta_xy values computed from the payload */
4109 enum brw_barycentric_mode bary = brw_barycentric_mode(instr);
4110 const fs_reg srcs[] = { offset(s.delta_xy[bary], bld, 0),
4111 offset(s.delta_xy[bary], bld, 1) };
4112 bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
4113 break;
4114 }
4115
4116 case nir_intrinsic_load_barycentric_at_sample: {
4117 const glsl_interp_mode interpolation =
4118 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
4119
4120 fs_reg msg_data;
4121 if (nir_src_is_const(instr->src[0])) {
4122 msg_data = brw_imm_ud(nir_src_as_uint(instr->src[0]) << 4);
4123 } else {
4124 const fs_reg sample_src = retype(get_nir_src(ntb, instr->src[0]),
4125 BRW_REGISTER_TYPE_UD);
4126 const fs_reg sample_id = bld.emit_uniformize(sample_src);
4127 msg_data = component(bld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD), 0);
4128 bld.exec_all().group(1, 0).SHL(msg_data, sample_id, brw_imm_ud(4u));
4129 }
4130
4131 fs_reg flag_reg;
4132 struct brw_wm_prog_key *wm_prog_key = (struct brw_wm_prog_key *) s.key;
4133 if (wm_prog_key->multisample_fbo == BRW_SOMETIMES) {
4134 struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
4135
4136 check_dynamic_msaa_flag(bld.exec_all().group(8, 0),
4137 wm_prog_data,
4138 INTEL_MSAA_FLAG_MULTISAMPLE_FBO);
4139 flag_reg = brw_flag_reg(0, 0);
4140 }
4141
4142 emit_pixel_interpolater_send(bld,
4143 FS_OPCODE_INTERPOLATE_AT_SAMPLE,
4144 dest,
4145 fs_reg(), /* src */
4146 msg_data,
4147 flag_reg,
4148 interpolation);
4149 break;
4150 }
4151
4152 case nir_intrinsic_load_barycentric_at_offset: {
4153 const glsl_interp_mode interpolation =
4154 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
4155
4156 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
4157
4158 if (const_offset) {
4159 assert(nir_src_bit_size(instr->src[0]) == 32);
4160 unsigned off_x = const_offset[0].u32 & 0xf;
4161 unsigned off_y = const_offset[1].u32 & 0xf;
4162
4163 emit_pixel_interpolater_send(bld,
4164 FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
4165 dest,
4166 fs_reg(), /* src */
4167 brw_imm_ud(off_x | (off_y << 4)),
4168 fs_reg(), /* flag_reg */
4169 interpolation);
4170 } else {
4171 fs_reg src = retype(get_nir_src(ntb, instr->src[0]), BRW_REGISTER_TYPE_D);
4172 const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
4173 emit_pixel_interpolater_send(bld,
4174 opcode,
4175 dest,
4176 src,
4177 brw_imm_ud(0u),
4178 fs_reg(), /* flag_reg */
4179 interpolation);
4180 }
4181 break;
4182 }
4183
4184 case nir_intrinsic_load_frag_coord:
4185 emit_fragcoord_interpolation(ntb, dest);
4186 break;
4187
4188 case nir_intrinsic_load_interpolated_input: {
4189 assert(instr->src[0].ssa &&
4190 instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
4191 nir_intrinsic_instr *bary_intrinsic =
4192 nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
4193 nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
4194 fs_reg dst_xy;
4195
4196 if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
4197 bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
4198 /* Use the result of the PI message. */
4199 dst_xy = retype(get_nir_src(ntb, instr->src[0]), BRW_REGISTER_TYPE_F);
4200 } else {
4201 /* Use the delta_xy values computed from the payload */
4202 enum brw_barycentric_mode bary = brw_barycentric_mode(bary_intrinsic);
4203 dst_xy = s.delta_xy[bary];
4204 }
4205
4206 for (unsigned int i = 0; i < instr->num_components; i++) {
4207 fs_reg interp =
4208 s.interp_reg(bld, nir_intrinsic_base(instr),
4209 nir_intrinsic_component(instr) + i, 0);
4210 interp.type = BRW_REGISTER_TYPE_F;
4211 dest.type = BRW_REGISTER_TYPE_F;
4212
4213 bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
4214 }
4215 break;
4216 }
4217
4218 default:
4219 fs_nir_emit_intrinsic(ntb, bld, instr);
4220 break;
4221 }
4222 }
4223
4224 static void
4225 fs_nir_emit_cs_intrinsic(nir_to_brw_state &ntb,
4226 nir_intrinsic_instr *instr)
4227 {
4228 const intel_device_info *devinfo = ntb.devinfo;
4229 const fs_builder &bld = ntb.bld;
4230 fs_visitor &s = ntb.s;
4231
4232 assert(gl_shader_stage_uses_workgroup(s.stage));
4233 struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(s.prog_data);
4234
4235 fs_reg dest;
4236 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4237 dest = get_nir_def(ntb, instr->def);
4238
4239 switch (instr->intrinsic) {
4240 case nir_intrinsic_barrier:
4241 if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
4242 fs_nir_emit_intrinsic(ntb, bld, instr);
4243 if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
4244 /* The whole workgroup fits in a single HW thread, so all the
4245 * invocations are already executed lock-step. Instead of an actual
4246 * barrier just emit a scheduling fence, that will generate no code.
4247 */
4248 if (!s.nir->info.workgroup_size_variable &&
4249 s.workgroup_size() <= s.dispatch_width) {
4250 bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE);
4251 break;
4252 }
4253
4254 emit_barrier(ntb);
4255 cs_prog_data->uses_barrier = true;
4256 }
4257 break;
4258
4259 case nir_intrinsic_load_subgroup_id:
4260 s.cs_payload().load_subgroup_id(bld, dest);
4261 break;
4262
4263 case nir_intrinsic_load_local_invocation_id:
4264 /* This is only used for hardware-generated local IDs. */
4265 assert(cs_prog_data->generate_local_id);
4266
4267 dest.type = BRW_REGISTER_TYPE_UD;
4268
4269 for (unsigned i = 0; i < 3; i++)
4270 bld.MOV(offset(dest, bld, i), s.cs_payload().local_invocation_id[i]);
4271 break;
4272
4273 case nir_intrinsic_load_workgroup_id:
4274 case nir_intrinsic_load_workgroup_id_zero_base: {
4275 fs_reg val = ntb.system_values[SYSTEM_VALUE_WORKGROUP_ID];
4276 assert(val.file != BAD_FILE);
4277 dest.type = val.type;
4278 for (unsigned i = 0; i < 3; i++)
4279 bld.MOV(offset(dest, bld, i), offset(val, bld, i));
4280 break;
4281 }
4282
4283 case nir_intrinsic_load_num_workgroups: {
4284 assert(instr->def.bit_size == 32);
4285
4286 cs_prog_data->uses_num_work_groups = true;
4287
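      /* The dispatch size is read back through an untyped surface read; the
       * driver is assumed to bind the indirect parameters buffer at binding
       * table index 0, which is why the surface source below is hard-coded.
       */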
4288 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4289 srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(0);
4290 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4291 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(3); /* num components */
4292 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = brw_imm_ud(0);
4293 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
4294 fs_inst *inst =
4295 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4296 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4297 inst->size_written = 3 * s.dispatch_width * 4;
4298 break;
4299 }
4300
4301 case nir_intrinsic_shared_atomic:
4302 case nir_intrinsic_shared_atomic_swap:
4303 fs_nir_emit_surface_atomic(ntb, bld, instr, brw_imm_ud(GFX7_BTI_SLM),
4304 false /* bindless */);
4305 break;
4306
4307 case nir_intrinsic_load_shared: {
4308 const unsigned bit_size = instr->def.bit_size;
4309 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4310 srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM);
4311
4312 fs_reg addr = get_nir_src(ntb, instr->src[0]);
4313 int base = nir_intrinsic_base(instr);
4314 if (base) {
4315 fs_reg addr_off = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
4316 bld.ADD(addr_off, addr, brw_imm_d(base));
4317 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr_off;
4318 } else {
4319 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr;
4320 }
4321
4322 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4323 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
4324
4325 /* Make dest unsigned because that's what the temporary will be */
4326 dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4327
4328 /* Read the vector */
4329 assert(bit_size <= 32);
4330 assert(nir_intrinsic_align(instr) > 0);
4331 if (bit_size == 32 &&
4332 nir_intrinsic_align(instr) >= 4) {
4333 assert(instr->def.num_components <= 4);
4334 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4335 fs_inst *inst =
4336 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4337 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4338 inst->size_written = instr->num_components * s.dispatch_width * 4;
4339 } else {
4340 assert(instr->def.num_components == 1);
4341 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
4342
4343 fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
4344 bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
4345 read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
4346 bld.MOV(dest, subscript(read_result, dest.type, 0));
4347 }
4348 break;
4349 }
4350
4351 case nir_intrinsic_store_shared: {
4352 const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4353 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4354 srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM);
4355
4356 fs_reg addr = get_nir_src(ntb, instr->src[1]);
4357 int base = nir_intrinsic_base(instr);
4358 if (base) {
4359 fs_reg addr_off = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
4360 bld.ADD(addr_off, addr, brw_imm_d(base));
4361 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr_off;
4362 } else {
4363 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr;
4364 }
4365
4366 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4367 /* No point in masking with the sample mask here; we're handling compute
4368 * intrinsics.
4369 */
4370 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
4371
4372 fs_reg data = get_nir_src(ntb, instr->src[0]);
4373 data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4374
4375 assert(bit_size <= 32);
4376 assert(nir_intrinsic_write_mask(instr) ==
4377 (1u << instr->num_components) - 1);
4378 assert(nir_intrinsic_align(instr) > 0);
4379 if (bit_size == 32 &&
4380 nir_intrinsic_align(instr) >= 4) {
4381 assert(nir_src_num_components(instr->src[0]) <= 4);
4382 srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4383 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4384 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
4385 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4386 } else {
4387 assert(nir_src_num_components(instr->src[0]) == 1);
4388 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
4389
4390 srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
4391 bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
4392
4393 bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
4394 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4395 }
4396 break;
4397 }
4398
4399 case nir_intrinsic_load_workgroup_size: {
4400 /* Should have been lowered by brw_nir_lower_cs_intrinsics() or
4401 * crocus/iris_setup_uniforms() for the variable group size case.
4402 */
4403 unreachable("Should have been lowered");
4404 break;
4405 }
4406
4407 case nir_intrinsic_dpas_intel: {
4408 const unsigned sdepth = nir_intrinsic_systolic_depth(instr);
4409 const unsigned rcount = nir_intrinsic_repeat_count(instr);
4410
4411 const brw_reg_type dest_type =
4412 brw_type_for_nir_type(devinfo, nir_intrinsic_dest_type(instr));
4413 const brw_reg_type src_type =
4414 brw_type_for_nir_type(devinfo, nir_intrinsic_src_type(instr));
4415
4416 dest = retype(dest, dest_type);
4417 fs_reg src2 = retype(get_nir_src(ntb, instr->src[2]), dest_type);
4418 const fs_reg dest_hf = dest;
4419
4420 fs_builder bld8 = bld.exec_all().group(8, 0);
4421 fs_builder bld16 = bld.exec_all().group(16, 0);
4422
4423 /* DG2 cannot have the destination or source 0 of DPAS be float16. It is
4424 * still advantageous to support these formats for memory and bandwidth
4425 * savings.
4426 *
4427 * The float16 source must be expanded to float32.
4428 */
4429 if (devinfo->verx10 == 125 && dest_type == BRW_REGISTER_TYPE_HF &&
4430 !s.compiler->lower_dpas) {
4431 dest = bld8.vgrf(BRW_REGISTER_TYPE_F, rcount);
4432
4433 if (src2.file != ARF) {
4434 const fs_reg src2_hf = src2;
4435
4436 src2 = bld8.vgrf(BRW_REGISTER_TYPE_F, rcount);
4437
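            /* Each SIMD16 MOV below widens one GRF worth of HF values into
             * two GRFs of F, hence the doubled destination byte offset.
             */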
4438 for (unsigned i = 0; i < 4; i++) {
4439 bld16.MOV(byte_offset(src2, REG_SIZE * i * 2),
4440 byte_offset(src2_hf, REG_SIZE * i));
4441 }
4442 } else {
4443 src2 = retype(src2, BRW_REGISTER_TYPE_F);
4444 }
4445 }
4446
4447 bld8.DPAS(dest,
4448 src2,
4449 retype(get_nir_src(ntb, instr->src[1]), src_type),
4450 retype(get_nir_src(ntb, instr->src[0]), src_type),
4451 sdepth,
4452 rcount)
4453 ->saturate = nir_intrinsic_saturate(instr);
4454
4455 /* Compact the destination to float16 (from float32). */
4456 if (!dest.equals(dest_hf)) {
4457 for (unsigned i = 0; i < 4; i++) {
4458 bld16.MOV(byte_offset(dest_hf, REG_SIZE * i),
4459 byte_offset(dest, REG_SIZE * i * 2));
4460 }
4461 }
4462
4463 cs_prog_data->uses_systolic = true;
4464 break;
4465 }
4466
4467 default:
4468 fs_nir_emit_intrinsic(ntb, bld, instr);
4469 break;
4470 }
4471 }
4472
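/* Emit an LSC fence on the UGM SFID and make subsequent instructions wait on
 * its completion by feeding the (otherwise unused) destination register into
 * a scheduling fence.
 */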
4473 static void
4474 emit_rt_lsc_fence(const fs_builder &bld,
4475 enum lsc_fence_scope scope,
4476 enum lsc_flush_type flush_type)
4477 {
4478 const intel_device_info *devinfo = bld.shader->devinfo;
4479
4480 const fs_builder ubld = bld.exec_all().group(8, 0);
4481 fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4482 fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, tmp,
4483 brw_imm_ud(0) /* desc */,
4484 brw_imm_ud(0) /* ex_desc */,
4485 brw_vec8_grf(0, 0) /* payload */);
4486 send->sfid = GFX12_SFID_UGM;
4487 send->desc = lsc_fence_msg_desc(devinfo, scope, flush_type, true);
4488 send->mlen = reg_unit(devinfo); /* g0 header */
4489 send->ex_mlen = 0;
4490 /* Temp write for scheduling */
4491 send->size_written = REG_SIZE * reg_unit(devinfo);
4492 send->send_has_side_effects = true;
4493
4494 ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), tmp);
4495 }
4496
4497
4498 static void
4499 fs_nir_emit_bs_intrinsic(nir_to_brw_state &ntb,
4500 nir_intrinsic_instr *instr)
4501 {
4502 const fs_builder &bld = ntb.bld;
4503 fs_visitor &s = ntb.s;
4504
4505 assert(brw_shader_stage_is_bindless(s.stage));
4506 const bs_thread_payload &payload = s.bs_payload();
4507
4508 fs_reg dest;
4509 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4510 dest = get_nir_def(ntb, instr->def);
4511
4512 switch (instr->intrinsic) {
4513 case nir_intrinsic_load_btd_global_arg_addr_intel:
4514 bld.MOV(dest, retype(payload.global_arg_ptr, dest.type));
4515 break;
4516
4517 case nir_intrinsic_load_btd_local_arg_addr_intel:
4518 bld.MOV(dest, retype(payload.local_arg_ptr, dest.type));
4519 break;
4520
4521 case nir_intrinsic_load_btd_shader_type_intel:
4522 payload.load_shader_type(bld, dest);
4523 break;
4524
4525 default:
4526 fs_nir_emit_intrinsic(ntb, bld, instr);
4527 break;
4528 }
4529 }
4530
4531 static fs_reg
4532 brw_nir_reduction_op_identity(const fs_builder &bld,
4533 nir_op op, brw_reg_type type)
4534 {
4535 nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);
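   /* For reference, nir_alu_binop_identity() returns e.g. 0 for iadd/ior/
    * ixor, 1 for imul, ~0 for iand and umin, INT_MAX for imin, and 0 for
    * umax, so seeding a scan or reduction with this value leaves the result
    * unchanged.
    */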
4536 switch (type_sz(type)) {
4537 case 1:
4538 if (type == BRW_REGISTER_TYPE_UB) {
4539 return brw_imm_uw(value.u8);
4540 } else {
4541 assert(type == BRW_REGISTER_TYPE_B);
4542 return brw_imm_w(value.i8);
4543 }
4544 case 2:
4545 return retype(brw_imm_uw(value.u16), type);
4546 case 4:
4547 return retype(brw_imm_ud(value.u32), type);
4548 case 8:
4549 if (type == BRW_REGISTER_TYPE_DF)
4550 return brw_imm_df(value.f64);
4551 else
4552 return retype(brw_imm_u64(value.u64), type);
4553 default:
4554 unreachable("Invalid type size");
4555 }
4556 }
4557
4558 static opcode
4559 brw_op_for_nir_reduction_op(nir_op op)
4560 {
4561 switch (op) {
4562 case nir_op_iadd: return BRW_OPCODE_ADD;
4563 case nir_op_fadd: return BRW_OPCODE_ADD;
4564 case nir_op_imul: return BRW_OPCODE_MUL;
4565 case nir_op_fmul: return BRW_OPCODE_MUL;
4566 case nir_op_imin: return BRW_OPCODE_SEL;
4567 case nir_op_umin: return BRW_OPCODE_SEL;
4568 case nir_op_fmin: return BRW_OPCODE_SEL;
4569 case nir_op_imax: return BRW_OPCODE_SEL;
4570 case nir_op_umax: return BRW_OPCODE_SEL;
4571 case nir_op_fmax: return BRW_OPCODE_SEL;
4572 case nir_op_iand: return BRW_OPCODE_AND;
4573 case nir_op_ior: return BRW_OPCODE_OR;
4574 case nir_op_ixor: return BRW_OPCODE_XOR;
4575 default:
4576 unreachable("Invalid reduction operation");
4577 }
4578 }
4579
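/* Min/max reductions are implemented as a SEL above; the table below supplies
 * the conditional modifier that turns that SEL into the intended min (L) or
 * max (GE) comparison.
 */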
4580 static brw_conditional_mod
4581 brw_cond_mod_for_nir_reduction_op(nir_op op)
4582 {
4583 switch (op) {
4584 case nir_op_iadd: return BRW_CONDITIONAL_NONE;
4585 case nir_op_fadd: return BRW_CONDITIONAL_NONE;
4586 case nir_op_imul: return BRW_CONDITIONAL_NONE;
4587 case nir_op_fmul: return BRW_CONDITIONAL_NONE;
4588 case nir_op_imin: return BRW_CONDITIONAL_L;
4589 case nir_op_umin: return BRW_CONDITIONAL_L;
4590 case nir_op_fmin: return BRW_CONDITIONAL_L;
4591 case nir_op_imax: return BRW_CONDITIONAL_GE;
4592 case nir_op_umax: return BRW_CONDITIONAL_GE;
4593 case nir_op_fmax: return BRW_CONDITIONAL_GE;
4594 case nir_op_iand: return BRW_CONDITIONAL_NONE;
4595 case nir_op_ior: return BRW_CONDITIONAL_NONE;
4596 case nir_op_ixor: return BRW_CONDITIONAL_NONE;
4597 default:
4598 unreachable("Invalid reduction operation");
4599 }
4600 }
4601
4602 struct rebuild_resource {
4603 unsigned idx;
4604 std::vector<nir_def *> array;
4605 };
4606
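/* nir_foreach_src() callback: depth-first walk of the SSA chain feeding a
 * resource handle, appending each def after its own sources so that
 * res->array ends up in a valid emission order (already-seen defs are
 * skipped).
 */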
4607 static bool
4608 add_rebuild_src(nir_src *src, void *state)
4609 {
4610 struct rebuild_resource *res = (struct rebuild_resource *) state;
4611
4612 for (nir_def *def : res->array) {
4613 if (def == src->ssa)
4614 return true;
4615 }
4616
4617 nir_foreach_src(src->ssa->parent_instr, add_rebuild_src, state);
4618 res->array.push_back(src->ssa);
4619 return true;
4620 }
4621
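/**
 * Try to re-materialize the SSA chain producing a resource handle as scalar
 * SIMD8 instructions at the current builder location, recording each emitted
 * instruction in ntb.resource_insts. Returns the rebuilt uniform value, or a
 * BAD_FILE register if the chain contains something not handled below.
 */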
4622 static fs_reg
4623 try_rebuild_resource(nir_to_brw_state &ntb, const brw::fs_builder &bld, nir_def *resource_def)
4624 {
4625 /* Create a builder at the location of the resource_intel intrinsic */
4626 fs_builder ubld8 = bld.exec_all().group(8, 0);
4627
4628 struct rebuild_resource resources = {};
4629 resources.idx = 0;
4630
4631 if (!nir_foreach_src(resource_def->parent_instr,
4632 add_rebuild_src, &resources))
4633 return fs_reg();
4634 resources.array.push_back(resource_def);
4635
4636 if (resources.array.size() == 1) {
4637 nir_def *def = resources.array[0];
4638
4639 if (def->parent_instr->type == nir_instr_type_load_const) {
4640 nir_load_const_instr *load_const =
4641 nir_instr_as_load_const(def->parent_instr);
4642 return brw_imm_ud(load_const->value[0].i32);
4643 } else {
4644 assert(def->parent_instr->type == nir_instr_type_intrinsic &&
4645 (nir_instr_as_intrinsic(def->parent_instr)->intrinsic ==
4646 nir_intrinsic_load_uniform));
4647 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(def->parent_instr);
4648 unsigned base_offset = nir_intrinsic_base(intrin);
4649 unsigned load_offset = nir_src_as_uint(intrin->src[0]);
4650 fs_reg src(UNIFORM, base_offset / 4, BRW_REGISTER_TYPE_UD);
4651 src.offset = load_offset + base_offset % 4;
4652 return src;
4653 }
4654 }
4655
4656 for (unsigned i = 0; i < resources.array.size(); i++) {
4657 nir_def *def = resources.array[i];
4658
4659 nir_instr *instr = def->parent_instr;
4660 switch (instr->type) {
4661 case nir_instr_type_load_const: {
4662 nir_load_const_instr *load_const =
4663 nir_instr_as_load_const(instr);
4664 fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
4665 ntb.resource_insts[def->index] =
4666 ubld8.MOV(dst, brw_imm_ud(load_const->value[0].i32));
4667 break;
4668 }
4669
4670 case nir_instr_type_alu: {
4671 nir_alu_instr *alu = nir_instr_as_alu(instr);
4672
4673 if (nir_op_infos[alu->op].num_inputs == 2) {
4674 if (alu->src[0].swizzle[0] != 0 ||
4675 alu->src[1].swizzle[0] != 0)
4676 break;
4677 } else if (nir_op_infos[alu->op].num_inputs == 3) {
4678 if (alu->src[0].swizzle[0] != 0 ||
4679 alu->src[1].swizzle[0] != 0 ||
4680 alu->src[2].swizzle[0] != 0)
4681 break;
4682 } else {
4683 /* Unsupported ALU input count */
4684 break;
4685 }
4686
4687 switch (alu->op) {
4688 case nir_op_iadd: {
4689 fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
4690 fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst;
4691 fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst;
4692 assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
4693 assert(src0.type == BRW_REGISTER_TYPE_UD);
4694 ntb.resource_insts[def->index] =
4695 ubld8.ADD(dst,
4696 src0.file != IMM ? src0 : src1,
4697 src0.file != IMM ? src1 : src0);
4698 break;
4699 }
4700 case nir_op_iadd3: {
4701 fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
4702 fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst;
4703 fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst;
4704 fs_reg src2 = ntb.resource_insts[alu->src[2].src.ssa->index]->dst;
4705 assert(src0.file != BAD_FILE && src1.file != BAD_FILE && src2.file != BAD_FILE);
4706 assert(src0.type == BRW_REGISTER_TYPE_UD);
4707 ntb.resource_insts[def->index] =
4708 ubld8.ADD3(dst,
4709 src1.file == IMM ? src1 : src0,
4710 src1.file == IMM ? src0 : src1,
4711 src2);
4712 break;
4713 }
4714 case nir_op_ushr: {
4715 fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
4716 fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst;
4717 fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst;
4718 assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
4719 assert(src0.type == BRW_REGISTER_TYPE_UD);
4720 ntb.resource_insts[def->index] = ubld8.SHR(dst, src0, src1);
4721 break;
4722 }
4723 case nir_op_ishl: {
4724 fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
4725 fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst;
4726 fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst;
4727 assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
4728 assert(src0.type == BRW_REGISTER_TYPE_UD);
4729 ntb.resource_insts[def->index] = ubld8.SHL(dst, src0, src1);
4730 break;
4731 }
4732 case nir_op_mov: {
4733 break;
4734 }
4735 default:
4736 break;
4737 }
4738 break;
4739 }
4740
4741 case nir_instr_type_intrinsic: {
4742 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
4743 switch (intrin->intrinsic) {
4744 case nir_intrinsic_resource_intel:
4745 ntb.resource_insts[def->index] =
4746 ntb.resource_insts[intrin->src[1].ssa->index];
4747 break;
4748
4749 case nir_intrinsic_load_uniform: {
4750 if (!nir_src_is_const(intrin->src[0]))
4751 break;
4752
4753 unsigned base_offset = nir_intrinsic_base(intrin);
4754 unsigned load_offset = nir_src_as_uint(intrin->src[0]);
4755 fs_reg dst = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
4756 fs_reg src(UNIFORM, base_offset / 4, BRW_REGISTER_TYPE_UD);
4757 src.offset = load_offset + base_offset % 4;
4758 ntb.resource_insts[def->index] = ubld8.MOV(dst, src);
4759 break;
4760 }
4761
4762 default:
4763 break;
4764 }
4765 break;
4766 }
4767
4768 default:
4769 break;
4770 }
4771
4772 if (ntb.resource_insts[def->index] == NULL)
4773 return fs_reg();
4774 }
4775
4776 assert(ntb.resource_insts[resource_def->index] != NULL);
4777 return component(ntb.resource_insts[resource_def->index]->dst, 0);
4778 }
4779
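/* Resolve the surface index for an image intrinsic: prefer a value rebuilt
 * from a resource_intel chain when one is available, otherwise uniformize
 * the first source.
 */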
4780 static fs_reg
4781 get_nir_image_intrinsic_image(nir_to_brw_state &ntb, const brw::fs_builder &bld,
4782 nir_intrinsic_instr *instr)
4783 {
4784 if (is_resource_src(instr->src[0])) {
4785 fs_reg surf_index = get_resource_nir_src(ntb, instr->src[0]);
4786 if (surf_index.file != BAD_FILE)
4787 return surf_index;
4788 }
4789
4790 fs_reg image = retype(get_nir_src_imm(ntb, instr->src[0]), BRW_REGISTER_TYPE_UD);
4791 fs_reg surf_index = image;
4792
4793 return bld.emit_uniformize(surf_index);
4794 }
4795
4796 static fs_reg
4797 get_nir_buffer_intrinsic_index(nir_to_brw_state &ntb, const brw::fs_builder &bld,
4798 nir_intrinsic_instr *instr)
4799 {
4800 /* SSBO stores are weird in that their index is in src[1] */
4801 const bool is_store =
4802 instr->intrinsic == nir_intrinsic_store_ssbo ||
4803 instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
4804 nir_src src = is_store ? instr->src[1] : instr->src[0];
4805
4806 if (nir_src_is_const(src)) {
4807 return brw_imm_ud(nir_src_as_uint(src));
4808 } else if (is_resource_src(src)) {
4809 fs_reg surf_index = get_resource_nir_src(ntb, src);
4810 if (surf_index.file != BAD_FILE)
4811 return surf_index;
4812 }
4813 return bld.emit_uniformize(get_nir_src(ntb, src));
4814 }
4815
4816 /**
4817 * The offsets we get from NIR act as if each SIMD channel has its own blob
4818 * of contiguous space. However, if we actually place each SIMD channel in
4819 * its own space, we end up with terrible cache performance because each SIMD
4820 * channel accesses a different cache line even when they're all accessing the
4821 * same byte offset. To deal with this problem, we swizzle the address using
4822 * a simple algorithm which ensures that any time a SIMD message reads or
4823 * writes the same address, it's all in the same cache line. We have to keep
4824 * the bottom two bits fixed so that we can read/write up to a dword at a time
4825 * and the individual element is contiguous. We do this by splitting the
4826 * address as follows:
4827 *
4828 * 31 4-6 2 0
4829 * +-------------------------------+------------+----------+
4830 * | Hi address bits | chan index | addr low |
4831 * +-------------------------------+------------+----------+
4832 *
4833 * In other words, the bottom two address bits stay, and the top 30 get
4834 * shifted up so that we can stick the SIMD channel index in the middle. This
4835 * way, we can access 8, 16, or 32-bit elements and, when accessing a 32-bit
4836 * element at the same logical offset, the scratch read/write instruction acts
4837 * on contiguous elements and we get good cache locality.
4838 */
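/* For illustration (SIMD16, so chan_index_bits == 4): a byte address of 0x14
 * in channel 3 becomes ((0x14 & ~3) << 4) | (3 << 2) | (0x14 & 3) = 0x14c,
 * i.e. the dword index 0x14 >> 2 lands in the high bits, the channel index in
 * bits [5:2] and the byte-within-dword in bits [1:0], matching the diagram
 * above.
 */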
4839 static fs_reg
4840 swizzle_nir_scratch_addr(nir_to_brw_state &ntb,
4841 const brw::fs_builder &bld,
4842 const fs_reg &nir_addr,
4843 bool in_dwords)
4844 {
4845 fs_visitor &s = ntb.s;
4846
4847 const fs_reg &chan_index =
4848 ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
4849 const unsigned chan_index_bits = ffs(s.dispatch_width) - 1;
4850
4851 fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD);
4852 if (in_dwords) {
4853 /* In this case, we know the address is aligned to a DWORD and we want
4854 * the final address in DWORDs.
4855 */
4856 bld.SHL(addr, nir_addr, brw_imm_ud(chan_index_bits - 2));
4857 bld.OR(addr, addr, chan_index);
4858 } else {
4859 /* This case is substantially more annoying because we have to pay
4860 * attention to those pesky two bottom bits.
4861 */
4862 fs_reg addr_hi = bld.vgrf(BRW_REGISTER_TYPE_UD);
4863 bld.AND(addr_hi, nir_addr, brw_imm_ud(~0x3u));
4864 bld.SHL(addr_hi, addr_hi, brw_imm_ud(chan_index_bits));
4865 fs_reg chan_addr = bld.vgrf(BRW_REGISTER_TYPE_UD);
4866 bld.SHL(chan_addr, chan_index, brw_imm_ud(2));
4867 bld.AND(addr, nir_addr, brw_imm_ud(0x3u));
4868 bld.OR(addr, addr, addr_hi);
4869 bld.OR(addr, addr, chan_addr);
4870 }
4871 return addr;
4872 }
4873
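/* Pick the largest OWord block size (in dwords) that fits the remaining
 * dwords to transfer. For example, 70 remaining dwords select a 64-dword
 * block on LSC platforms, 48 select 32, and 12 select 8.
 */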
4874 static unsigned
4875 choose_oword_block_size_dwords(const struct intel_device_info *devinfo,
4876 unsigned dwords)
4877 {
4878 unsigned block;
4879 if (devinfo->has_lsc && dwords >= 64) {
4880 block = 64;
4881 } else if (dwords >= 32) {
4882 block = 32;
4883 } else if (dwords >= 16) {
4884 block = 16;
4885 } else {
4886 block = 8;
4887 }
4888 assert(block <= dwords);
4889 return block;
4890 }
4891
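/* Add an immediate to a 64-bit address. Without native 64-bit integer
 * support the address is handled as a low/high dword pair: e.g. with
 * low == 0xfffffffc and v == 8, the low add wraps to 4 and raises the
 * overflow flag, and the predicated add then carries a 1 into high.
 */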
4892 static void
4893 increment_a64_address(const fs_builder &bld, fs_reg address, uint32_t v)
4894 {
4895 if (bld.shader->devinfo->has_64bit_int) {
4896 bld.ADD(address, address, brw_imm_ud(v));
4897 } else {
4898 fs_reg low = retype(address, BRW_REGISTER_TYPE_UD);
4899 fs_reg high = offset(low, bld, 1);
4900
4901 /* Add low and if that overflows, add carry to high. */
4902 bld.ADD(low, low, brw_imm_ud(v))->conditional_mod = BRW_CONDITIONAL_O;
4903 bld.ADD(high, high, brw_imm_ud(0x1))->predicate = BRW_PREDICATE_NORMAL;
4904 }
4905 }
4906
4907 static fs_reg
4908 emit_fence(const fs_builder &bld, enum opcode opcode,
4909 uint8_t sfid, uint32_t desc,
4910 bool commit_enable, uint8_t bti)
4911 {
4912 assert(opcode == SHADER_OPCODE_INTERLOCK ||
4913 opcode == SHADER_OPCODE_MEMORY_FENCE);
4914
4915 fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);
4916 fs_inst *fence = bld.emit(opcode, dst, brw_vec8_grf(0, 0),
4917 brw_imm_ud(commit_enable),
4918 brw_imm_ud(bti));
4919 fence->sfid = sfid;
4920 fence->desc = desc;
4921
4922 return dst;
4923 }
4924
4925 static uint32_t
4926 lsc_fence_descriptor_for_intrinsic(const struct intel_device_info *devinfo,
4927 nir_intrinsic_instr *instr)
4928 {
4929 assert(devinfo->has_lsc);
4930
4931 enum lsc_fence_scope scope = LSC_FENCE_LOCAL;
4932 enum lsc_flush_type flush_type = LSC_FLUSH_TYPE_NONE;
4933
4934 if (nir_intrinsic_has_memory_scope(instr)) {
4935 switch (nir_intrinsic_memory_scope(instr)) {
4936 case SCOPE_DEVICE:
4937 case SCOPE_QUEUE_FAMILY:
4938 scope = LSC_FENCE_TILE;
4939 flush_type = LSC_FLUSH_TYPE_EVICT;
4940 break;
4941 case SCOPE_WORKGROUP:
4942 scope = LSC_FENCE_THREADGROUP;
4943 break;
4944 case SCOPE_SHADER_CALL:
4945 case SCOPE_INVOCATION:
4946 case SCOPE_SUBGROUP:
4947 case SCOPE_NONE:
4948 break;
4949 }
4950 } else {
4951 /* No scope defined. */
4952 scope = LSC_FENCE_TILE;
4953 flush_type = LSC_FLUSH_TYPE_EVICT;
4954 }
4955 return lsc_fence_msg_desc(devinfo, scope, flush_type, true);
4956 }
4957
4958 /**
4959 * Create a MOV to read the timestamp register.
4960 */
4961 static fs_reg
4962 get_timestamp(const fs_builder &bld)
4963 {
4964 fs_visitor &s = *bld.shader;
4965
4966 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
4967 BRW_ARF_TIMESTAMP,
4968 0),
4969 BRW_REGISTER_TYPE_UD));
4970
4971 fs_reg dst = fs_reg(VGRF, s.alloc.allocate(1), BRW_REGISTER_TYPE_UD);
4972
4973 /* We want to read the 3 fields we care about even if it's not enabled in
4974 * the dispatch.
4975 */
4976 bld.group(4, 0).exec_all().MOV(dst, ts);
4977
4978 return dst;
4979 }
4980
4981 static unsigned
4982 component_from_intrinsic(nir_intrinsic_instr *instr)
4983 {
4984 if (nir_intrinsic_has_component(instr))
4985 return nir_intrinsic_component(instr);
4986 else
4987 return 0;
4988 }
4989
4990 static void
4991 adjust_handle_and_offset(const fs_builder &bld,
4992 fs_reg &urb_handle,
4993 unsigned &urb_global_offset)
4994 {
4995 /* Make sure that the URB global offset is below 2048 (2^11), because
4996 * that's the maximum value that can be encoded in the Message Descriptor.
4997 */
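/* For example, urb_global_offset == 3000 yields adjustment == 2048: the
 * handle is advanced by 2048 and the remaining offset of 952 fits in the
 * message descriptor.
 */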
4998 unsigned adjustment = (urb_global_offset >> 11) << 11;
4999
5000 if (adjustment) {
5001 fs_builder ubld8 = bld.group(8, 0).exec_all();
5002 /* Allocate new register to not overwrite the shared URB handle. */
5003 fs_reg new_handle = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
5004 ubld8.ADD(new_handle, urb_handle, brw_imm_ud(adjustment));
5005 urb_handle = new_handle;
5006 urb_global_offset -= adjustment;
5007 }
5008 }
5009
5010 static void
5011 emit_urb_direct_vec4_write(const fs_builder &bld,
5012 unsigned urb_global_offset,
5013 const fs_reg &src,
5014 fs_reg urb_handle,
5015 unsigned dst_comp_offset,
5016 unsigned comps,
5017 unsigned mask)
5018 {
5019 for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
5020 fs_builder bld8 = bld.group(8, q);
5021
5022 fs_reg payload_srcs[8];
5023 unsigned length = 0;
5024
5025 for (unsigned i = 0; i < dst_comp_offset; i++)
5026 payload_srcs[length++] = reg_undef;
5027
5028 for (unsigned c = 0; c < comps; c++)
5029 payload_srcs[length++] = quarter(offset(src, bld, c), q);
5030
5031 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
5032 srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
5033 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16);
5034 srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
5035 BRW_REGISTER_TYPE_F);
5036 srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length);
5037 bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
5038
5039 fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
5040 reg_undef, srcs, ARRAY_SIZE(srcs));
5041 inst->offset = urb_global_offset;
5042 assert(inst->offset < 2048);
5043 }
5044 }
5045
5046 static void
5047 emit_urb_direct_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
5048 const fs_reg &src, fs_reg urb_handle)
5049 {
5050 assert(nir_src_bit_size(instr->src[0]) == 32);
5051
5052 nir_src *offset_nir_src = nir_get_io_offset_src(instr);
5053 assert(nir_src_is_const(*offset_nir_src));
5054
5055 const unsigned comps = nir_src_num_components(instr->src[0]);
5056 assert(comps <= 4);
5057
5058 const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
5059 nir_src_as_uint(*offset_nir_src) +
5060 component_from_intrinsic(instr);
5061
5062 /* URB writes are vec4 aligned but the intrinsic offsets are in dwords.
5063 * We can write up to 8 dwords, so a single vec4 write is enough.
5064 */
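/* For example, with base + offset + component == 7 and a single-component
 * write mask of 0x1, comp_shift becomes 3 and mask becomes 0x8, so the value
 * lands in the .w channel of vec4 slot 1 (urb_global_offset == 1).
 */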
5065 const unsigned comp_shift = offset_in_dwords % 4;
5066 const unsigned mask = nir_intrinsic_write_mask(instr) << comp_shift;
5067
5068 unsigned urb_global_offset = offset_in_dwords / 4;
5069 adjust_handle_and_offset(bld, urb_handle, urb_global_offset);
5070
5071 emit_urb_direct_vec4_write(bld, urb_global_offset, src, urb_handle,
5072 comp_shift, comps, mask);
5073 }
5074
5075 static void
5076 emit_urb_direct_vec4_write_xe2(const fs_builder &bld,
5077 unsigned offset_in_bytes,
5078 const fs_reg &src,
5079 fs_reg urb_handle,
5080 unsigned comps,
5081 unsigned mask)
5082 {
5083 const struct intel_device_info *devinfo = bld.shader->devinfo;
5084 const unsigned runit = reg_unit(devinfo);
5085 const unsigned write_size = 8 * runit;
5086
5087 if (offset_in_bytes > 0) {
5088 fs_builder bldall = bld.group(write_size, 0).exec_all();
5089 fs_reg new_handle = bldall.vgrf(BRW_REGISTER_TYPE_UD);
5090 bldall.ADD(new_handle, urb_handle, brw_imm_ud(offset_in_bytes));
5091 urb_handle = new_handle;
5092 }
5093
5094 for (unsigned q = 0; q < bld.dispatch_width() / write_size; q++) {
5095 fs_builder hbld = bld.group(write_size, q);
5096
5097 fs_reg payload_srcs[comps];
5098
5099 for (unsigned c = 0; c < comps; c++)
5100 payload_srcs[c] = horiz_offset(offset(src, bld, c), write_size * q);
5101
5102 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
5103 srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
5104 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16);
5105 int nr = bld.shader->alloc.allocate(comps * runit);
5106 srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, nr, BRW_REGISTER_TYPE_F);
5107 srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(comps);
5108 hbld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, comps, 0);
5109
5110 hbld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
5111 reg_undef, srcs, ARRAY_SIZE(srcs));
5112 }
5113 }
5114
5115 static void
5116 emit_urb_direct_writes_xe2(const fs_builder &bld, nir_intrinsic_instr *instr,
5117 const fs_reg &src, fs_reg urb_handle)
5118 {
5119 assert(nir_src_bit_size(instr->src[0]) == 32);
5120
5121 nir_src *offset_nir_src = nir_get_io_offset_src(instr);
5122 assert(nir_src_is_const(*offset_nir_src));
5123
5124 const unsigned comps = nir_src_num_components(instr->src[0]);
5125 assert(comps <= 4);
5126
5127 const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
5128 nir_src_as_uint(*offset_nir_src) +
5129 component_from_intrinsic(instr);
5130
5131 const unsigned mask = nir_intrinsic_write_mask(instr);
5132
5133 emit_urb_direct_vec4_write_xe2(bld, offset_in_dwords * 4, src,
5134 urb_handle, comps, mask);
5135 }
5136
5137 static void
5138 emit_urb_indirect_vec4_write(const fs_builder &bld,
5139 const fs_reg &offset_src,
5140 unsigned base,
5141 const fs_reg &src,
5142 fs_reg urb_handle,
5143 unsigned dst_comp_offset,
5144 unsigned comps,
5145 unsigned mask)
5146 {
5147 for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
5148 fs_builder bld8 = bld.group(8, q);
5149
5150 /* offset is always positive, so signedness doesn't matter */
5151 assert(offset_src.type == BRW_REGISTER_TYPE_D ||
5152 offset_src.type == BRW_REGISTER_TYPE_UD);
5153 fs_reg off = bld8.vgrf(offset_src.type, 1);
5154 bld8.MOV(off, quarter(offset_src, q));
5155 bld8.ADD(off, off, brw_imm_ud(base));
5156 bld8.SHR(off, off, brw_imm_ud(2));
5157
5158 fs_reg payload_srcs[8];
5159 unsigned length = 0;
5160
5161 for (unsigned i = 0; i < dst_comp_offset; i++)
5162 payload_srcs[length++] = reg_undef;
5163
5164 for (unsigned c = 0; c < comps; c++)
5165 payload_srcs[length++] = quarter(offset(src, bld, c), q);
5166
5167 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
5168 srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
5169 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
5170 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16);
5171 srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
5172 BRW_REGISTER_TYPE_F);
5173 srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length);
5174 bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
5175
5176 fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
5177 reg_undef, srcs, ARRAY_SIZE(srcs));
5178 inst->offset = 0;
5179 }
5180 }
5181
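/* Indirect URB write where (offset + base) % 4 is known to be "mod" for every
 * channel, so a single per-slot write suffices. For example, write_mask 0x3
 * with mod == 2 shifts the channel mask to 0xc and places the two components
 * in .zw of each slot.
 */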
5182 static void
5183 emit_urb_indirect_writes_mod(const fs_builder &bld, nir_intrinsic_instr *instr,
5184 const fs_reg &src, const fs_reg &offset_src,
5185 fs_reg urb_handle, unsigned mod)
5186 {
5187 assert(nir_src_bit_size(instr->src[0]) == 32);
5188
5189 const unsigned comps = nir_src_num_components(instr->src[0]);
5190 assert(comps <= 4);
5191
5192 const unsigned base_in_dwords = nir_intrinsic_base(instr) +
5193 component_from_intrinsic(instr);
5194
5195 const unsigned comp_shift = mod;
5196 const unsigned mask = nir_intrinsic_write_mask(instr) << comp_shift;
5197
5198 emit_urb_indirect_vec4_write(bld, offset_src, base_in_dwords, src,
5199 urb_handle, comp_shift, comps, mask);
5200 }
5201
5202 static void
5203 emit_urb_indirect_writes_xe2(const fs_builder &bld, nir_intrinsic_instr *instr,
5204 const fs_reg &src, const fs_reg &offset_src,
5205 fs_reg urb_handle)
5206 {
5207 assert(nir_src_bit_size(instr->src[0]) == 32);
5208
5209 const struct intel_device_info *devinfo = bld.shader->devinfo;
5210 const unsigned runit = reg_unit(devinfo);
5211 const unsigned write_size = 8 * runit;
5212
5213 const unsigned comps = nir_src_num_components(instr->src[0]);
5214 assert(comps <= 4);
5215
5216 const unsigned base_in_dwords = nir_intrinsic_base(instr) +
5217 component_from_intrinsic(instr);
5218
5219 if (base_in_dwords > 0) {
5220 fs_builder bldall = bld.group(write_size, 0).exec_all();
5221 fs_reg new_handle = bldall.vgrf(BRW_REGISTER_TYPE_UD);
5222 bldall.ADD(new_handle, urb_handle, brw_imm_ud(base_in_dwords * 4));
5223 urb_handle = new_handle;
5224 }
5225
5226 const unsigned mask = nir_intrinsic_write_mask(instr);
5227
5228 for (unsigned q = 0; q < bld.dispatch_width() / write_size; q++) {
5229 fs_builder wbld = bld.group(write_size, q);
5230
5231 fs_reg payload_srcs[comps];
5232
5233 for (unsigned c = 0; c < comps; c++)
5234 payload_srcs[c] = horiz_offset(offset(src, bld, c), write_size * q);
5235
5236 fs_reg addr = wbld.vgrf(BRW_REGISTER_TYPE_UD);
5237 wbld.SHL(addr, horiz_offset(offset_src, write_size * q), brw_imm_ud(2));
5238 wbld.ADD(addr, addr, urb_handle);
5239
5240 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
5241 srcs[URB_LOGICAL_SRC_HANDLE] = addr;
5242 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(mask << 16);
5243 int nr = bld.shader->alloc.allocate(comps * runit);
5244 srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, nr, BRW_REGISTER_TYPE_F);
5245 srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(comps);
5246 wbld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, comps, 0);
5247
5248 wbld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
5249 reg_undef, srcs, ARRAY_SIZE(srcs));
5250 }
5251 }
5252
5253 static void
5254 emit_urb_indirect_writes(const fs_builder &bld, nir_intrinsic_instr *instr,
5255 const fs_reg &src, const fs_reg &offset_src,
5256 fs_reg urb_handle)
5257 {
5258 assert(nir_src_bit_size(instr->src[0]) == 32);
5259
5260 const unsigned comps = nir_src_num_components(instr->src[0]);
5261 assert(comps <= 4);
5262
5263 const unsigned base_in_dwords = nir_intrinsic_base(instr) +
5264 component_from_intrinsic(instr);
5265
5266 /* Use the URB write message that allows different offsets per slot. The
5267 * offset is in units of vec4s (128 bits), so we use one write per component,
5268 * replicating it in the sources and applying the appropriate channel mask
5269 * based on the dword offset.
5270 */
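/* For example, a channel whose dword offset (offset + base + c) is 5 gets a
 * per-slot channel mask of (1 << (5 & 3)) << 16 == 0x20000 (the .y channel)
 * and a per-slot offset of 5 >> 2 == 1.
 */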
5271
5272 for (unsigned c = 0; c < comps; c++) {
5273 if (((1 << c) & nir_intrinsic_write_mask(instr)) == 0)
5274 continue;
5275
5276 fs_reg src_comp = offset(src, bld, c);
5277
5278 for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
5279 fs_builder bld8 = bld.group(8, q);
5280
5281 /* offset is always positive, so signedness doesn't matter */
5282 assert(offset_src.type == BRW_REGISTER_TYPE_D ||
5283 offset_src.type == BRW_REGISTER_TYPE_UD);
5284 fs_reg off = bld8.vgrf(offset_src.type, 1);
5285 bld8.MOV(off, quarter(offset_src, q));
5286 bld8.ADD(off, off, brw_imm_ud(c + base_in_dwords));
5287
5288 fs_reg mask = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
5289 bld8.AND(mask, off, brw_imm_ud(0x3));
5290
5291 fs_reg one = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
5292 bld8.MOV(one, brw_imm_ud(1));
5293 bld8.SHL(mask, one, mask);
5294 bld8.SHL(mask, mask, brw_imm_ud(16));
5295
5296 bld8.SHR(off, off, brw_imm_ud(2));
5297
5298 fs_reg payload_srcs[4];
5299 unsigned length = 0;
5300
5301 for (unsigned j = 0; j < 4; j++)
5302 payload_srcs[length++] = quarter(src_comp, q);
5303
5304 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
5305 srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
5306 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
5307 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask;
5308 srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF, bld.shader->alloc.allocate(length),
5309 BRW_REGISTER_TYPE_F);
5310 srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length);
5311 bld8.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], payload_srcs, length, 0);
5312
5313 fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
5314 reg_undef, srcs, ARRAY_SIZE(srcs));
5315 inst->offset = 0;
5316 }
5317 }
5318 }
5319
5320 static void
5321 emit_urb_direct_reads(const fs_builder &bld, nir_intrinsic_instr *instr,
5322 const fs_reg &dest, fs_reg urb_handle)
5323 {
5324 assert(instr->def.bit_size == 32);
5325
5326 unsigned comps = instr->def.num_components;
5327 if (comps == 0)
5328 return;
5329
5330 nir_src *offset_nir_src = nir_get_io_offset_src(instr);
5331 assert(nir_src_is_const(*offset_nir_src));
5332
5333 const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
5334 nir_src_as_uint(*offset_nir_src) +
5335 component_from_intrinsic(instr);
5336
5337 unsigned urb_global_offset = offset_in_dwords / 4;
5338 adjust_handle_and_offset(bld, urb_handle, urb_global_offset);
5339
5340 const unsigned comp_offset = offset_in_dwords % 4;
5341 const unsigned num_regs = comp_offset + comps;
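/* E.g. offset_in_dwords == 9 with comps == 2: urb_global_offset == 2,
 * comp_offset == 1 and num_regs == 3, and the destination components are
 * taken from data registers 1 and 2 below.
 */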
5342
5343 fs_builder ubld8 = bld.group(8, 0).exec_all();
5344 fs_reg data = ubld8.vgrf(BRW_REGISTER_TYPE_UD, num_regs);
5345 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
5346 srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
5347
5348 fs_inst *inst = ubld8.emit(SHADER_OPCODE_URB_READ_LOGICAL, data,
5349 srcs, ARRAY_SIZE(srcs));
5350 inst->offset = urb_global_offset;
5351 assert(inst->offset < 2048);
5352 inst->size_written = num_regs * REG_SIZE;
5353
5354 for (unsigned c = 0; c < comps; c++) {
5355 fs_reg dest_comp = offset(dest, bld, c);
5356 fs_reg data_comp = horiz_stride(offset(data, ubld8, comp_offset + c), 0);
5357 bld.MOV(retype(dest_comp, BRW_REGISTER_TYPE_UD), data_comp);
5358 }
5359 }
5360
5361 static void
5362 emit_urb_direct_reads_xe2(const fs_builder &bld, nir_intrinsic_instr *instr,
5363 const fs_reg &dest, fs_reg urb_handle)
5364 {
5365 assert(instr->def.bit_size == 32);
5366
5367 unsigned comps = instr->def.num_components;
5368 if (comps == 0)
5369 return;
5370
5371 nir_src *offset_nir_src = nir_get_io_offset_src(instr);
5372 assert(nir_src_is_const(*offset_nir_src));
5373
5374 fs_builder ubld16 = bld.group(16, 0).exec_all();
5375
5376 const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
5377 nir_src_as_uint(*offset_nir_src) +
5378 component_from_intrinsic(instr);
5379
5380 if (offset_in_dwords > 0) {
5381 fs_reg new_handle = ubld16.vgrf(BRW_REGISTER_TYPE_UD);
5382 ubld16.ADD(new_handle, urb_handle, brw_imm_ud(offset_in_dwords * 4));
5383 urb_handle = new_handle;
5384 }
5385
5386 fs_reg data = ubld16.vgrf(BRW_REGISTER_TYPE_UD, comps);
5387 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
5388 srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
5389
5390 fs_inst *inst = ubld16.emit(SHADER_OPCODE_URB_READ_LOGICAL,
5391 data, srcs, ARRAY_SIZE(srcs));
5392 inst->size_written = 2 * comps * REG_SIZE;
5393
5394 for (unsigned c = 0; c < comps; c++) {
5395 fs_reg dest_comp = offset(dest, bld, c);
5396 fs_reg data_comp = horiz_stride(offset(data, ubld16, c), 0);
5397 bld.MOV(retype(dest_comp, BRW_REGISTER_TYPE_UD), data_comp);
5398 }
5399 }
5400
5401 static void
5402 emit_urb_indirect_reads(const fs_builder &bld, nir_intrinsic_instr *instr,
5403 const fs_reg &dest, const fs_reg &offset_src, fs_reg urb_handle)
5404 {
5405 assert(instr->def.bit_size == 32);
5406
5407 unsigned comps = instr->def.num_components;
5408 if (comps == 0)
5409 return;
5410
5411 fs_reg seq_ud;
5412 {
5413 fs_builder ubld8 = bld.group(8, 0).exec_all();
5414 seq_ud = ubld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
5415 fs_reg seq_uw = ubld8.vgrf(BRW_REGISTER_TYPE_UW, 1);
5416 ubld8.MOV(seq_uw, fs_reg(brw_imm_v(0x76543210)));
5417 ubld8.MOV(seq_ud, seq_uw);
5418 ubld8.SHL(seq_ud, seq_ud, brw_imm_ud(2));
5419 }
5420
5421 const unsigned base_in_dwords = nir_intrinsic_base(instr) +
5422 component_from_intrinsic(instr);
5423
5424 for (unsigned c = 0; c < comps; c++) {
5425 for (unsigned q = 0; q < bld.dispatch_width() / 8; q++) {
5426 fs_builder bld8 = bld.group(8, q);
5427
5428 /* offset is always positive, so signedness doesn't matter */
5429 assert(offset_src.type == BRW_REGISTER_TYPE_D ||
5430 offset_src.type == BRW_REGISTER_TYPE_UD);
5431 fs_reg off = bld8.vgrf(offset_src.type, 1);
5432 bld8.MOV(off, quarter(offset_src, q));
5433 bld8.ADD(off, off, brw_imm_ud(base_in_dwords + c));
5434
5435 STATIC_ASSERT(IS_POT(REG_SIZE) && REG_SIZE > 1);
5436
5437 fs_reg comp = bld8.vgrf(BRW_REGISTER_TYPE_UD, 1);
5438 bld8.AND(comp, off, brw_imm_ud(0x3));
5439 bld8.SHL(comp, comp, brw_imm_ud(ffs(REG_SIZE) - 1));
5440 bld8.ADD(comp, comp, seq_ud);
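/* comp now holds, per lane, the byte offset into the 4-GRF read below:
 * (dword within the vec4) * REG_SIZE + lane * 4. E.g. lane 2 reading
 * component 3 of its slot uses 3 * 32 + 2 * 4 == 104.
 */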
5441
5442 bld8.SHR(off, off, brw_imm_ud(2));
5443
5444 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
5445 srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
5446 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = off;
5447
5448 fs_reg data = bld8.vgrf(BRW_REGISTER_TYPE_UD, 4);
5449
5450 fs_inst *inst = bld8.emit(SHADER_OPCODE_URB_READ_LOGICAL,
5451 data, srcs, ARRAY_SIZE(srcs));
5452 inst->offset = 0;
5453 inst->size_written = 4 * REG_SIZE;
5454
5455 fs_reg dest_comp = offset(dest, bld, c);
5456 bld8.emit(SHADER_OPCODE_MOV_INDIRECT,
5457 retype(quarter(dest_comp, q), BRW_REGISTER_TYPE_UD),
5458 data,
5459 comp,
5460 brw_imm_ud(4 * REG_SIZE));
5461 }
5462 }
5463 }
5464
5465 static void
5466 emit_urb_indirect_reads_xe2(const fs_builder &bld, nir_intrinsic_instr *instr,
5467 const fs_reg &dest, const fs_reg &offset_src,
5468 fs_reg urb_handle)
5469 {
5470 assert(instr->def.bit_size == 32);
5471
5472 unsigned comps = instr->def.num_components;
5473 if (comps == 0)
5474 return;
5475
5476 fs_builder ubld16 = bld.group(16, 0).exec_all();
5477
5478 const unsigned offset_in_dwords = nir_intrinsic_base(instr) +
5479 component_from_intrinsic(instr);
5480
5481 if (offset_in_dwords > 0) {
5482 fs_reg new_handle = ubld16.vgrf(BRW_REGISTER_TYPE_UD);
5483 ubld16.ADD(new_handle, urb_handle, brw_imm_ud(offset_in_dwords * 4));
5484 urb_handle = new_handle;
5485 }
5486
5487 fs_reg data = ubld16.vgrf(BRW_REGISTER_TYPE_UD, comps);
5488
5489
5490 for (unsigned q = 0; q < bld.dispatch_width() / 16; q++) {
5491 fs_builder wbld = bld.group(16, q);
5492
5493 fs_reg addr = wbld.vgrf(BRW_REGISTER_TYPE_UD);
5494 wbld.SHL(addr, horiz_offset(offset_src, 16 * q), brw_imm_ud(2));
5495 wbld.ADD(addr, addr, urb_handle);
5496
5497 fs_reg srcs[URB_LOGICAL_NUM_SRCS];
5498 srcs[URB_LOGICAL_SRC_HANDLE] = addr;
5499
5500 fs_inst *inst = wbld.emit(SHADER_OPCODE_URB_READ_LOGICAL,
5501 data, srcs, ARRAY_SIZE(srcs));
5502 inst->size_written = 2 * comps * REG_SIZE;
5503
5504 for (unsigned c = 0; c < comps; c++) {
5505 fs_reg dest_comp = horiz_offset(offset(dest, bld, c), 16 * q);
5506 fs_reg data_comp = offset(data, wbld, c);
5507 wbld.MOV(retype(dest_comp, BRW_REGISTER_TYPE_UD), data_comp);
5508 }
5509 }
5510 }
5511
5512 static void
5513 emit_task_mesh_store(nir_to_brw_state &ntb,
5514 const fs_builder &bld, nir_intrinsic_instr *instr,
5515 const fs_reg &urb_handle)
5516 {
5517 fs_reg src = get_nir_src(ntb, instr->src[0]);
5518 nir_src *offset_nir_src = nir_get_io_offset_src(instr);
5519
5520 if (nir_src_is_const(*offset_nir_src)) {
5521 if (bld.shader->devinfo->ver >= 20)
5522 emit_urb_direct_writes_xe2(bld, instr, src, urb_handle);
5523 else
5524 emit_urb_direct_writes(bld, instr, src, urb_handle);
5525 } else {
5526 if (bld.shader->devinfo->ver >= 20) {
5527 emit_urb_indirect_writes_xe2(bld, instr, src, get_nir_src(ntb, *offset_nir_src), urb_handle);
5528 return;
5529 }
5530 bool use_mod = false;
5531 unsigned mod;
5532
5533 /* Try to calculate the value of (offset + base) % 4. If we can do
5534 * this, then we can do indirect writes using only 1 URB write.
5535 */
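/* E.g. if the analysis proves the NIR offset is always 2 (mod 4) and
 * base + component == 1, mod ends up as 3, so every channel writes the same
 * dword slot within its vec4 and one masked URB write is enough.
 */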
5536 use_mod = nir_mod_analysis(nir_get_scalar(offset_nir_src->ssa, 0), nir_type_uint, 4, &mod);
5537 if (use_mod) {
5538 mod += nir_intrinsic_base(instr) + component_from_intrinsic(instr);
5539 mod %= 4;
5540 }
5541
5542 if (use_mod) {
5543 emit_urb_indirect_writes_mod(bld, instr, src, get_nir_src(ntb, *offset_nir_src), urb_handle, mod);
5544 } else {
5545 emit_urb_indirect_writes(bld, instr, src, get_nir_src(ntb, *offset_nir_src), urb_handle);
5546 }
5547 }
5548 }
5549
5550 static void
5551 emit_task_mesh_load(nir_to_brw_state &ntb,
5552 const fs_builder &bld, nir_intrinsic_instr *instr,
5553 const fs_reg &urb_handle)
5554 {
5555 fs_reg dest = get_nir_def(ntb, instr->def);
5556 nir_src *offset_nir_src = nir_get_io_offset_src(instr);
5557
5558 /* TODO(mesh): for per_vertex and per_primitive, if we could keep around
5559 * the non-array-index offset, we could use it to decide whether we can
5560 * perform a single large aligned read instead of one per component.
5561 */
5562
5563 if (nir_src_is_const(*offset_nir_src)) {
5564 if (bld.shader->devinfo->ver >= 20)
5565 emit_urb_direct_reads_xe2(bld, instr, dest, urb_handle);
5566 else
5567 emit_urb_direct_reads(bld, instr, dest, urb_handle);
5568 } else {
5569 if (bld.shader->devinfo->ver >= 20)
5570 emit_urb_indirect_reads_xe2(bld, instr, dest, get_nir_src(ntb, *offset_nir_src), urb_handle);
5571 else
5572 emit_urb_indirect_reads(bld, instr, dest, get_nir_src(ntb, *offset_nir_src), urb_handle);
5573 }
5574 }
5575
5576 static void
5577 fs_nir_emit_task_mesh_intrinsic(nir_to_brw_state &ntb, const fs_builder &bld,
5578 nir_intrinsic_instr *instr)
5579 {
5580 fs_visitor &s = ntb.s;
5581
5582 assert(s.stage == MESA_SHADER_MESH || s.stage == MESA_SHADER_TASK);
5583 const task_mesh_thread_payload &payload = s.task_mesh_payload();
5584
5585 fs_reg dest;
5586 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5587 dest = get_nir_def(ntb, instr->def);
5588
5589 switch (instr->intrinsic) {
5590 case nir_intrinsic_load_mesh_inline_data_intel: {
5591 fs_reg data = offset(payload.inline_parameter, 1, nir_intrinsic_align_offset(instr));
5592 bld.MOV(dest, retype(data, dest.type));
5593 break;
5594 }
5595
5596 case nir_intrinsic_load_draw_id:
5597 dest = retype(dest, BRW_REGISTER_TYPE_UD);
5598 bld.MOV(dest, payload.extended_parameter_0);
5599 break;
5600
5601 case nir_intrinsic_load_local_invocation_id:
5602 unreachable("local invocation id should have been lowered earlier");
5603 break;
5604
5605 case nir_intrinsic_load_local_invocation_index:
5606 dest = retype(dest, BRW_REGISTER_TYPE_UD);
5607 bld.MOV(dest, payload.local_index);
5608 break;
5609
5610 case nir_intrinsic_load_num_workgroups:
5611 dest = retype(dest, BRW_REGISTER_TYPE_UD);
5612 bld.MOV(offset(dest, bld, 0), brw_uw1_grf(0, 13)); /* g0.6 >> 16 */
5613 bld.MOV(offset(dest, bld, 1), brw_uw1_grf(0, 8)); /* g0.4 & 0xffff */
5614 bld.MOV(offset(dest, bld, 2), brw_uw1_grf(0, 9)); /* g0.4 >> 16 */
5615 break;
5616
5617 case nir_intrinsic_load_workgroup_index:
5618 dest = retype(dest, BRW_REGISTER_TYPE_UD);
5619 bld.MOV(dest, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
5620 break;
5621
5622 default:
5623 fs_nir_emit_cs_intrinsic(ntb, instr);
5624 break;
5625 }
5626 }
5627
5628 static void
5629 fs_nir_emit_task_intrinsic(nir_to_brw_state &ntb,
5630 nir_intrinsic_instr *instr)
5631 {
5632 const fs_builder &bld = ntb.bld;
5633 fs_visitor &s = ntb.s;
5634
5635 assert(s.stage == MESA_SHADER_TASK);
5636 const task_mesh_thread_payload &payload = s.task_mesh_payload();
5637
5638 switch (instr->intrinsic) {
5639 case nir_intrinsic_store_output:
5640 case nir_intrinsic_store_task_payload:
5641 emit_task_mesh_store(ntb, bld, instr, payload.urb_output);
5642 break;
5643
5644 case nir_intrinsic_load_output:
5645 case nir_intrinsic_load_task_payload:
5646 emit_task_mesh_load(ntb, bld, instr, payload.urb_output);
5647 break;
5648
5649 default:
5650 fs_nir_emit_task_mesh_intrinsic(ntb, bld, instr);
5651 break;
5652 }
5653 }
5654
5655 static void
5656 fs_nir_emit_mesh_intrinsic(nir_to_brw_state &ntb,
5657 nir_intrinsic_instr *instr)
5658 {
5659 const fs_builder &bld = ntb.bld;
5660 fs_visitor &s = ntb.s;
5661
5662 assert(s.stage == MESA_SHADER_MESH);
5663 const task_mesh_thread_payload &payload = s.task_mesh_payload();
5664
5665 switch (instr->intrinsic) {
5666 case nir_intrinsic_store_per_primitive_output:
5667 case nir_intrinsic_store_per_vertex_output:
5668 case nir_intrinsic_store_output:
5669 emit_task_mesh_store(ntb, bld, instr, payload.urb_output);
5670 break;
5671
5672 case nir_intrinsic_load_per_vertex_output:
5673 case nir_intrinsic_load_per_primitive_output:
5674 case nir_intrinsic_load_output:
5675 emit_task_mesh_load(ntb, bld, instr, payload.urb_output);
5676 break;
5677
5678 case nir_intrinsic_load_task_payload:
5679 emit_task_mesh_load(ntb, bld, instr, payload.task_urb_input);
5680 break;
5681
5682 default:
5683 fs_nir_emit_task_mesh_intrinsic(ntb, bld, instr);
5684 break;
5685 }
5686 }
5687
5688 static void
5689 fs_nir_emit_intrinsic(nir_to_brw_state &ntb,
5690 const fs_builder &bld, nir_intrinsic_instr *instr)
5691 {
5692 const intel_device_info *devinfo = ntb.devinfo;
5693 fs_visitor &s = ntb.s;
5694
5695 /* We handle this as a special case */
5696 if (instr->intrinsic == nir_intrinsic_decl_reg) {
5697 assert(nir_intrinsic_num_array_elems(instr) == 0);
5698 unsigned bit_size = nir_intrinsic_bit_size(instr);
5699 unsigned num_components = nir_intrinsic_num_components(instr);
5700 const brw_reg_type reg_type =
5701 brw_reg_type_from_bit_size(bit_size, bit_size == 8 ?
5702 BRW_REGISTER_TYPE_D :
5703 BRW_REGISTER_TYPE_F);
5704
5705 /* Re-use the destination's slot in the table for the register */
5706 ntb.ssa_values[instr->def.index] =
5707 bld.vgrf(reg_type, num_components);
5708 return;
5709 }
5710
5711 fs_reg dest;
5712 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5713 dest = get_nir_def(ntb, instr->def);
5714
5715 switch (instr->intrinsic) {
5716 case nir_intrinsic_resource_intel:
5717 ntb.ssa_bind_infos[instr->def.index].valid = true;
5718 ntb.ssa_bind_infos[instr->def.index].bindless =
5719 (nir_intrinsic_resource_access_intel(instr) &
5720 nir_resource_intel_bindless) != 0;
5721 ntb.ssa_bind_infos[instr->def.index].block =
5722 nir_intrinsic_resource_block_intel(instr);
5723 ntb.ssa_bind_infos[instr->def.index].set =
5724 nir_intrinsic_desc_set(instr);
5725 ntb.ssa_bind_infos[instr->def.index].binding =
5726 nir_intrinsic_binding(instr);
5727
5728 if (nir_intrinsic_resource_access_intel(instr) &
5729 nir_resource_intel_non_uniform) {
5730 ntb.resource_values[instr->def.index] = fs_reg();
5731 } else {
5732 ntb.resource_values[instr->def.index] =
5733 try_rebuild_resource(ntb, bld, instr->src[1].ssa);
5734 }
5735 ntb.ssa_values[instr->def.index] =
5736 ntb.ssa_values[instr->src[1].ssa->index];
5737 break;
5738
5739 case nir_intrinsic_load_reg:
5740 case nir_intrinsic_store_reg:
5741 /* Nothing to do with these. */
5742 break;
5743
5744 case nir_intrinsic_image_load:
5745 case nir_intrinsic_image_store:
5746 case nir_intrinsic_image_atomic:
5747 case nir_intrinsic_image_atomic_swap:
5748 case nir_intrinsic_bindless_image_load:
5749 case nir_intrinsic_bindless_image_store:
5750 case nir_intrinsic_bindless_image_atomic:
5751 case nir_intrinsic_bindless_image_atomic_swap: {
5752 /* Get some metadata from the image intrinsic. */
5753 const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
5754
5755 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5756
5757 switch (instr->intrinsic) {
5758 case nir_intrinsic_image_load:
5759 case nir_intrinsic_image_store:
5760 case nir_intrinsic_image_atomic:
5761 case nir_intrinsic_image_atomic_swap:
5762 srcs[SURFACE_LOGICAL_SRC_SURFACE] =
5763 get_nir_image_intrinsic_image(ntb, bld, instr);
5764 break;
5765
5766 default:
5767 /* Bindless */
5768 srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] =
5769 get_nir_image_intrinsic_image(ntb, bld, instr);
5770 break;
5771 }
5772
5773 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
5774 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] =
5775 brw_imm_ud(nir_image_intrinsic_coord_components(instr));
5776
5777 /* Emit an image load, store or atomic op. */
5778 if (instr->intrinsic == nir_intrinsic_image_load ||
5779 instr->intrinsic == nir_intrinsic_bindless_image_load) {
5780 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
5781 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
5782 fs_inst *inst =
5783 bld.emit(SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
5784 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5785 inst->size_written = instr->num_components * s.dispatch_width * 4;
5786 } else if (instr->intrinsic == nir_intrinsic_image_store ||
5787 instr->intrinsic == nir_intrinsic_bindless_image_store) {
5788 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
5789 srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(ntb, instr->src[3]);
5790 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
5791 bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
5792 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5793 } else {
5794 unsigned num_srcs = info->num_srcs;
5795 enum lsc_opcode op = lsc_aop_for_nir_intrinsic(instr);
5796 if (op == LSC_OP_ATOMIC_INC || op == LSC_OP_ATOMIC_DEC) {
5797 assert(num_srcs == 4);
5798 num_srcs = 3;
5799 }
5800
5801 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
5802
5803 fs_reg data;
5804 if (num_srcs >= 4)
5805 data = get_nir_src(ntb, instr->src[3]);
5806 if (num_srcs >= 5) {
5807 fs_reg tmp = bld.vgrf(data.type, 2);
5808 fs_reg sources[2] = { data, get_nir_src(ntb, instr->src[4]) };
5809 bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5810 data = tmp;
5811 }
5812 srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5813 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
5814
5815 bld.emit(SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
5816 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5817 }
5818 break;
5819 }
5820
5821 case nir_intrinsic_image_size:
5822 case nir_intrinsic_bindless_image_size: {
5823 /* Cube image sizes should have previously been lowered to a 2D array */
5824 assert(nir_intrinsic_image_dim(instr) != GLSL_SAMPLER_DIM_CUBE);
5825
5826 /* Unlike the [un]typed load and store opcodes, the TXS that this turns
5827 * into will handle the binding table index for us in the generator.
5828 * Incidentally, this means that we can handle bindless with exactly the
5829 * same code.
5830 */
5831 fs_reg image = retype(get_nir_src_imm(ntb, instr->src[0]),
5832 BRW_REGISTER_TYPE_UD);
5833 image = bld.emit_uniformize(image);
5834
5835 assert(nir_src_as_uint(instr->src[1]) == 0);
5836
5837 fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
5838 if (instr->intrinsic == nir_intrinsic_image_size)
5839 srcs[TEX_LOGICAL_SRC_SURFACE] = image;
5840 else
5841 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image;
5842 srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
5843 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(0);
5844 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);
5845 srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_d(0);
5846
5847 /* Since the image size is always uniform, we can just emit a SIMD8
5848 * query instruction and splat the result out.
5849 */
5850 const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
5851
5852 fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
5853 fs_inst *inst = ubld.emit(SHADER_OPCODE_IMAGE_SIZE_LOGICAL,
5854 tmp, srcs, ARRAY_SIZE(srcs));
5855 inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
5856
5857 for (unsigned c = 0; c < instr->def.num_components; ++c) {
5858 bld.MOV(offset(retype(dest, tmp.type), bld, c),
5859 component(offset(tmp, ubld, c), 0));
5860 }
5861 break;
5862 }
5863
5864 case nir_intrinsic_image_load_raw_intel: {
5865 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5866 srcs[SURFACE_LOGICAL_SRC_SURFACE] =
5867 get_nir_image_intrinsic_image(ntb, bld, instr);
5868 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
5869 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
5870 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
5871 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
5872
5873 fs_inst *inst =
5874 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
5875 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5876 inst->size_written = instr->num_components * s.dispatch_width * 4;
5877 break;
5878 }
5879
5880 case nir_intrinsic_image_store_raw_intel: {
5881 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5882 srcs[SURFACE_LOGICAL_SRC_SURFACE] =
5883 get_nir_image_intrinsic_image(ntb, bld, instr);
5884 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
5885 srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(ntb, instr->src[2]);
5886 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
5887 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
5888 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
5889
5890 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
5891 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5892 break;
5893 }
5894
5895 case nir_intrinsic_barrier:
5896 case nir_intrinsic_begin_invocation_interlock:
5897 case nir_intrinsic_end_invocation_interlock: {
5898 bool ugm_fence, slm_fence, tgm_fence, urb_fence;
5899 enum opcode opcode = BRW_OPCODE_NOP;
5900
5901 /* Handling interlock intrinsics here will allow the logic for IVB
5902 * render cache (see below) to be reused.
5903 */
5904
5905 switch (instr->intrinsic) {
5906 case nir_intrinsic_barrier: {
5907 /* Note we only care about the memory part of the
5908 * barrier. The execution part will be taken care
5909 * of by the stage-specific intrinsic handler functions.
5910 */
5911 nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
5912 ugm_fence = modes & (nir_var_mem_ssbo | nir_var_mem_global);
5913 slm_fence = modes & nir_var_mem_shared;
5914 tgm_fence = modes & nir_var_image;
5915 urb_fence = modes & (nir_var_shader_out | nir_var_mem_task_payload);
5916 if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
5917 opcode = SHADER_OPCODE_MEMORY_FENCE;
5918 break;
5919 }
5920
5921 case nir_intrinsic_begin_invocation_interlock:
5922 /* For beginInvocationInterlockARB(), we will generate a memory fence
5923 * but with a different opcode so that generator can pick SENDC
5924 * instead of SEND.
5925 */
5926 assert(s.stage == MESA_SHADER_FRAGMENT);
5927 ugm_fence = tgm_fence = true;
5928 slm_fence = urb_fence = false;
5929 opcode = SHADER_OPCODE_INTERLOCK;
5930 break;
5931
5932 case nir_intrinsic_end_invocation_interlock:
5933 /* For endInvocationInterlockARB(), we need to insert a memory fence which
5934 * stalls in the shader until the memory transactions prior to that
5935 * fence are complete. This ensures that the shader does not end before
5936 * any writes from its critical section have landed. Otherwise, you can
5937 * end up with a case where the next invocation on that pixel properly
5938 * stalls for previous FS invocation on its pixel to complete but
5939 * doesn't actually wait for the dataport memory transactions from that
5940 * thread to land before submitting its own.
5941 */
5942 assert(s.stage == MESA_SHADER_FRAGMENT);
5943 ugm_fence = tgm_fence = true;
5944 slm_fence = urb_fence = false;
5945 opcode = SHADER_OPCODE_MEMORY_FENCE;
5946 break;
5947
5948 default:
5949 unreachable("invalid intrinsic");
5950 }
5951
5952 if (opcode == BRW_OPCODE_NOP)
5953 break;
5954
5955 if (s.nir->info.shared_size > 0) {
5956 assert(gl_shader_stage_uses_workgroup(s.stage));
5957 } else {
5958 slm_fence = false;
5959 }
5960
5961 /* If the workgroup fits in a single HW thread, the messages for SLM are
5962 * processed in-order and the shader itself is already synchronized so
5963 * the memory fence is not necessary.
5964 *
5965 * TODO: Check if applies for many HW threads sharing same Data Port.
5966 */
5967 if (!s.nir->info.workgroup_size_variable &&
5968 slm_fence && s.workgroup_size() <= s.dispatch_width)
5969 slm_fence = false;
5970
5971 switch (s.stage) {
5972 case MESA_SHADER_TESS_CTRL:
5973 case MESA_SHADER_TASK:
5974 case MESA_SHADER_MESH:
5975 break;
5976 default:
5977 urb_fence = false;
5978 break;
5979 }
5980
5981 unsigned fence_regs_count = 0;
5982 fs_reg fence_regs[4] = {};
5983
5984 const fs_builder ubld = bld.group(8, 0);
5985
5986 /* A memory barrier with acquire semantics requires us to
5987 * guarantee that memory operations of the specified storage
5988 * class sequenced-after the barrier aren't reordered before the
5989 * barrier, nor before any previous atomic operation
5990 * sequenced-before the barrier which may be synchronizing this
5991 * acquire barrier with a prior release sequence.
5992 *
5993 * In order to guarantee the latter we must make sure that any
5994 * such previous operation has completed execution before
5995 * invalidating the relevant caches, since otherwise some cache
5996 * could be polluted by a concurrent thread after its
5997 * invalidation but before the previous atomic completes, which
5998 * could lead to a violation of the expected memory ordering if
5999 * a subsequent memory read hits the polluted cacheline, which
6000 * would return a stale value read from memory before the
6001 * completion of the atomic sequenced-before the barrier.
6002 *
6003 * This ordering inversion can be avoided trivially if the
6004 * operations we need to order are all handled by a single
6005 * in-order cache, since the flush implied by the memory fence
6006 * occurs after any pending operations have completed, however
6007 * that doesn't help us when dealing with multiple caches
6008 * processing requests out of order, in which case we need to
6009 * explicitly stall the EU until any pending memory operations
6010 * have executed.
6011 *
6012 * Note that this might be somewhat heavy-handed in some cases.
6013 * In particular when this memory fence was inserted by
6014 * spirv_to_nir() lowering an atomic with acquire semantics into
6015 * an atomic+barrier sequence we could do a better job by
6016 * synchronizing with respect to that one atomic *only*, but
6017 * that would require additional information not currently
6018 * available to the backend.
6019 *
6020 * XXX - Use an alternative workaround on IVB and ICL, since
6021 * SYNC.ALLWR is only available on Gfx12+.
6022 */
6023 if (devinfo->ver >= 12 &&
6024 (!nir_intrinsic_has_memory_scope(instr) ||
6025 (nir_intrinsic_memory_semantics(instr) & NIR_MEMORY_ACQUIRE))) {
6026 ubld.exec_all().group(1, 0).emit(
6027 BRW_OPCODE_SYNC, ubld.null_reg_ud(), brw_imm_ud(TGL_SYNC_ALLWR));
6028 }
6029
6030 if (devinfo->has_lsc) {
6031 assert(devinfo->verx10 >= 125);
6032 uint32_t desc =
6033 lsc_fence_descriptor_for_intrinsic(devinfo, instr);
6034 if (ugm_fence) {
6035 fence_regs[fence_regs_count++] =
6036 emit_fence(ubld, opcode, GFX12_SFID_UGM, desc,
6037 true /* commit_enable */,
6038 0 /* bti; ignored for LSC */);
6039 }
6040
6041 if (tgm_fence) {
6042 fence_regs[fence_regs_count++] =
6043 emit_fence(ubld, opcode, GFX12_SFID_TGM, desc,
6044 true /* commit_enable */,
6045 0 /* bti; ignored for LSC */);
6046 }
6047
6048 if (slm_fence) {
6049 assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
6050 if (intel_needs_workaround(devinfo, 14014063774)) {
6051 /* Wa_14014063774
6052 *
6053 * Before SLM fence compiler needs to insert SYNC.ALLWR in order
6054 * to avoid the SLM data race.
6055 */
6056 ubld.exec_all().group(1, 0).emit(
6057 BRW_OPCODE_SYNC, ubld.null_reg_ud(),
6058 brw_imm_ud(TGL_SYNC_ALLWR));
6059 }
6060 fence_regs[fence_regs_count++] =
6061 emit_fence(ubld, opcode, GFX12_SFID_SLM, desc,
6062 true /* commit_enable */,
6063 0 /* BTI; ignored for LSC */);
6064 }
6065
6066 if (urb_fence) {
6067 assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
6068 fence_regs[fence_regs_count++] =
6069 emit_fence(ubld, opcode, BRW_SFID_URB, desc,
6070 true /* commit_enable */,
6071 0 /* BTI; ignored for LSC */);
6072 }
6073 } else if (devinfo->ver >= 11) {
6074 if (tgm_fence || ugm_fence || urb_fence) {
6075 fence_regs[fence_regs_count++] =
6076 emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
6077 true /* commit_enable HSD ES # 1404612949 */,
6078 0 /* BTI = 0 means data cache */);
6079 }
6080
6081 if (slm_fence) {
6082 assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
6083 fence_regs[fence_regs_count++] =
6084 emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
6085 true /* commit_enable HSD ES # 1404612949 */,
6086 GFX7_BTI_SLM);
6087 }
6088 } else {
6089 /* Simulation also complains on Gfx9 if we do not enable commit.
6090 */
6091 const bool commit_enable =
6092 instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
6093 devinfo->ver == 9;
6094
6095 if (tgm_fence || ugm_fence || slm_fence || urb_fence) {
6096 fence_regs[fence_regs_count++] =
6097 emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
6098 commit_enable, 0 /* BTI */);
6099 }
6100 }
6101
6102 assert(fence_regs_count <= ARRAY_SIZE(fence_regs));
6103
6104 /* Be conservative on Gen11+ and always stall after a fence, since
6105 * there are two different fences and the shader might want to
6106 * synchronize between them.
6107 *
6108 * TODO: Use scope and visibility information for the barriers from NIR
6109 * to make a better decision on whether we need to stall.
6110 */
6111 bool force_stall = devinfo->ver >= 11;
6112
6113 /* There are four cases where we want to insert a stall:
6114 *
6115 * 1. If we're a nir_intrinsic_end_invocation_interlock. This is
6116 * required to ensure that the shader EOT doesn't happen until
6117 * after the fence returns. Otherwise, we might end up with the
6118 * next shader invocation for that pixel not respecting our fence
6119 * because it may happen on a different HW thread.
6120 *
6121 * 2. If we have multiple fences. This is required to ensure that
6122 * they all complete and nothing gets weirdly out-of-order.
6123 *
6124 * 3. If we have no fences. In this case, we need at least a
6125 * scheduling barrier to keep the compiler from moving things
6126 * around in an invalid way.
6127 *
6128 * 4. On Gen11+ and platforms with LSC, we have multiple fence types;
6129 * without further information about the fence, we need to force a
6130 * stall.
6131 */
6132 if (instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
6133 fence_regs_count != 1 || devinfo->has_lsc || force_stall) {
6134 ubld.exec_all().group(1, 0).emit(
6135 FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
6136 fence_regs, fence_regs_count);
6137 }
6138
6139 break;
6140 }
6141
6142 case nir_intrinsic_shader_clock: {
6143 /* We cannot do anything if there is an event, so ignore it for now */
6144 const fs_reg shader_clock = get_timestamp(bld);
6145 const fs_reg srcs[] = { component(shader_clock, 0),
6146 component(shader_clock, 1) };
6147 bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
6148 break;
6149 }
6150
6151 case nir_intrinsic_load_reloc_const_intel: {
6152 uint32_t id = nir_intrinsic_param_idx(instr);
6153
6154 /* Emit the reloc in the smallest SIMD size to limit register usage. */
6155 const fs_builder ubld = bld.exec_all().group(1, 0);
6156 fs_reg small_dest = ubld.vgrf(dest.type);
6157 ubld.UNDEF(small_dest);
6158 ubld.exec_all().group(1, 0).emit(SHADER_OPCODE_MOV_RELOC_IMM,
6159 small_dest, brw_imm_ud(id));
6160
6161 /* Copy propagation will get rid of this MOV. */
6162 bld.MOV(dest, component(small_dest, 0));
6163 break;
6164 }
6165
6166 case nir_intrinsic_load_uniform: {
6167 /* Offsets are in bytes but they should always be aligned to
6168 * the type size.
6169 */
6170 unsigned base_offset = nir_intrinsic_base(instr);
6171 assert(base_offset % 4 == 0 || base_offset % type_sz(dest.type) == 0);
6172
6173 fs_reg src(UNIFORM, base_offset / 4, dest.type);
6174
6175 if (nir_src_is_const(instr->src[0])) {
6176 unsigned load_offset = nir_src_as_uint(instr->src[0]);
6177 assert(load_offset % type_sz(dest.type) == 0);
6178 /* The base offset can only address 32-bit units, so for 16-bit
6179 * data add the base offset modulo 4 bytes to the byte offset used
6180 * to read from within the source register.
6181 */
6182 src.offset = load_offset + base_offset % 4;
6183
6184 for (unsigned j = 0; j < instr->num_components; j++) {
6185 bld.MOV(offset(dest, bld, j), offset(src, bld, j));
6186 }
6187 } else {
6188 fs_reg indirect = retype(get_nir_src(ntb, instr->src[0]),
6189 BRW_REGISTER_TYPE_UD);
6190
6191 /* We need to pass a size to the MOV_INDIRECT but we don't want it to
6192 * go past the end of the uniform. In order to keep the n'th
6193 * component from running past, we subtract off the size of all but
6194 * one component of the vector.
6195 */
6196 assert(nir_intrinsic_range(instr) >=
6197 instr->num_components * type_sz(dest.type));
6198 unsigned read_size = nir_intrinsic_range(instr) -
6199 (instr->num_components - 1) * type_sz(dest.type);
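/* E.g. a range of 32 bytes for a vec4 of 32-bit values gives
 * read_size == 32 - 3 * 4 == 20 bytes per MOV_INDIRECT.
 */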
6200
6201 bool supports_64bit_indirects = !intel_device_info_is_9lp(devinfo);
6202
6203 if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
6204 for (unsigned j = 0; j < instr->num_components; j++) {
6205 bld.emit(SHADER_OPCODE_MOV_INDIRECT,
6206 offset(dest, bld, j), offset(src, bld, j),
6207 indirect, brw_imm_ud(read_size));
6208 }
6209 } else {
6210 const unsigned num_mov_indirects =
6211 type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD);
6212 /* We read a little less per MOV_INDIRECT, as they are now
6213 * 32-bit ones instead of 64-bit. Adjust read_size accordingly.
6214 */
6215 const unsigned read_size_32bit = read_size -
6216 (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD);
6217 for (unsigned j = 0; j < instr->num_components; j++) {
6218 for (unsigned i = 0; i < num_mov_indirects; i++) {
6219 bld.emit(SHADER_OPCODE_MOV_INDIRECT,
6220 subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i),
6221 subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i),
6222 indirect, brw_imm_ud(read_size_32bit));
6223 }
6224 }
6225 }
6226 }
6227 break;
6228 }
6229
6230 case nir_intrinsic_load_ubo:
6231 case nir_intrinsic_load_ubo_uniform_block_intel: {
6232 fs_reg surface, surface_handle;
6233
6234 if (get_nir_src_bindless(ntb, instr->src[0]))
6235 surface_handle = get_nir_buffer_intrinsic_index(ntb, bld, instr);
6236 else
6237 surface = get_nir_buffer_intrinsic_index(ntb, bld, instr);
6238
6239 if (!nir_src_is_const(instr->src[1])) {
6240 if (instr->intrinsic == nir_intrinsic_load_ubo) {
6241 /* load_ubo with non-uniform offset */
6242 fs_reg base_offset = retype(get_nir_src(ntb, instr->src[1]),
6243 BRW_REGISTER_TYPE_UD);
6244
6245 const unsigned comps_per_load = type_sz(dest.type) == 8 ? 2 : 4;
6246
6247 for (int i = 0; i < instr->num_components; i += comps_per_load) {
6248 const unsigned remaining = instr->num_components - i;
6249 s.VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i),
6250 surface, surface_handle,
6251 base_offset,
6252 i * type_sz(dest.type),
6253 instr->def.bit_size / 8,
6254 MIN2(remaining, comps_per_load));
6255 }
6256
6257 s.prog_data->has_ubo_pull = true;
6258 } else {
6259 /* load_ubo with uniform offset */
6260 const fs_builder ubld1 = bld.exec_all().group(1, 0);
6261 const fs_builder ubld8 = bld.exec_all().group(8, 0);
6262 const fs_builder ubld16 = bld.exec_all().group(16, 0);
6263
6264 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6265
6266 srcs[SURFACE_LOGICAL_SRC_SURFACE] = surface;
6267 srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = surface_handle;
6268
6269 const nir_src load_offset = instr->src[1];
6270 if (nir_src_is_const(load_offset)) {
6271 fs_reg addr = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
6272 ubld8.MOV(addr, brw_imm_ud(nir_src_as_uint(load_offset)));
6273 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = component(addr, 0);
6274 } else {
6275 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
6276 bld.emit_uniformize(get_nir_src(ntb, load_offset));
6277 }
6278
6279 const unsigned total_dwords =
6280 ALIGN(instr->num_components, REG_SIZE * reg_unit(devinfo) / 4);
6281 unsigned loaded_dwords = 0;
6282
6283 const fs_reg packed_consts =
6284 ubld1.vgrf(BRW_REGISTER_TYPE_UD, total_dwords);
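/* E.g. 12 components round up to total_dwords == 16 (with reg_unit == 1),
 * which is fetched as a single 16-dword OWord block read below.
 */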
6285
6286 while (loaded_dwords < total_dwords) {
6287 const unsigned block =
6288 choose_oword_block_size_dwords(devinfo,
6289 total_dwords - loaded_dwords);
6290 const unsigned block_bytes = block * 4;
6291
6292 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block);
6293
6294 const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
6295 ubld.emit(SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
6296 retype(byte_offset(packed_consts, loaded_dwords * 4), BRW_REGISTER_TYPE_UD),
6297 srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written =
6298 align(block_bytes, REG_SIZE * reg_unit(devinfo));
6299
6300 loaded_dwords += block;
6301
6302 ubld1.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
6303 srcs[SURFACE_LOGICAL_SRC_ADDRESS],
6304 brw_imm_ud(block_bytes));
6305 }
6306
6307 for (unsigned c = 0; c < instr->num_components; c++) {
6308 bld.MOV(retype(offset(dest, bld, c), BRW_REGISTER_TYPE_UD),
6309 component(packed_consts, c));
6310 }
6311
6312 s.prog_data->has_ubo_pull = true;
6313 }
6314 } else {
6315 /* Even if we are loading doubles, a pull constant load will load
6316 * a 32-bit vec4, so we should only reserve vgrf space for that. If we
6317 * need to load a full dvec4 we will have to emit 2 loads. This is
6318 * similar to demote_pull_constants(), except that in that case we
6319 * see individual accesses to each component of the vector and then
6320 * we let CSE deal with duplicate loads. Here we see a vector access
6321 * and we have to split it if necessary.
6322 */
6323 const unsigned type_size = type_sz(dest.type);
6324 const unsigned load_offset = nir_src_as_uint(instr->src[1]);
6325 const unsigned ubo_block =
6326 brw_nir_ubo_surface_index_get_push_block(instr->src[0]);
6327 const unsigned offset_256b = load_offset / 32;
6328 const unsigned end_256b =
6329 DIV_ROUND_UP(load_offset + type_size * instr->num_components, 32);
6330
6331 /* See if we've selected this as a push constant candidate */
6332 fs_reg push_reg;
6333 for (int i = 0; i < 4; i++) {
6334 const struct brw_ubo_range *range = &s.prog_data->ubo_ranges[i];
6335 if (range->block == ubo_block &&
6336 offset_256b >= range->start &&
6337 end_256b <= range->start + range->length) {
6338
6339 push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type);
6340 push_reg.offset = load_offset - 32 * range->start;
6341 break;
6342 }
6343 }
6344
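/* Worked example (illustrative) for the range check above: a vec4 of
 * floats at byte offset 80 gives offset_256b = 80 / 32 = 2 and
 * end_256b = DIV_ROUND_UP(96, 32) = 3, both in 32-byte units.  If one of
 * the ubo_ranges covers those blocks, the data is read straight from the
 * pushed UNIFORM register below, at byte offset 80 - 32 * range->start.
 */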
6345 if (push_reg.file != BAD_FILE) {
6346 for (unsigned i = 0; i < instr->num_components; i++) {
6347 bld.MOV(offset(dest, bld, i),
6348 byte_offset(push_reg, i * type_size));
6349 }
6350 break;
6351 }
6352
6353 s.prog_data->has_ubo_pull = true;
6354
6355 const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
6356 const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
6357
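/* Illustrative example of the cacheline splitting below: a dvec4 at byte
 * offset 48 straddles a 64-byte block, so the first message reads the
 * block at offset 0 and covers components 0-1 (bytes 48-63), and the
 * second reads the block at offset 64 for components 2-3.
 */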
6358 for (unsigned c = 0; c < instr->num_components;) {
6359 const unsigned base = load_offset + c * type_size;
6360 /* Number of usable components in the next block-aligned load. */
6361 const unsigned count = MIN2(instr->num_components - c,
6362 (block_sz - base % block_sz) / type_size);
6363
6364 const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6365 fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
6366 srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = surface;
6367 srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
6368 srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1));
6369 srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz);
6370
6371 ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
6372 srcs, PULL_UNIFORM_CONSTANT_SRCS);
6373
6374 const fs_reg consts =
6375 retype(byte_offset(packed_consts, base & (block_sz - 1)),
6376 dest.type);
6377
6378 for (unsigned d = 0; d < count; d++)
6379 bld.MOV(offset(dest, bld, c + d), component(consts, d));
6380
6381 c += count;
6382 }
6383 }
6384 break;
6385 }
6386
6387 case nir_intrinsic_load_global:
6388 case nir_intrinsic_load_global_constant: {
6389 assert(instr->def.bit_size <= 32);
6390 assert(nir_intrinsic_align(instr) > 0);
6391 fs_reg srcs[A64_LOGICAL_NUM_SRCS];
6392 srcs[A64_LOGICAL_ADDRESS] = get_nir_src(ntb, instr->src[0]);
6393 srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */
6394 srcs[A64_LOGICAL_ENABLE_HELPERS] =
6395 brw_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS);
6396
6397 if (instr->def.bit_size == 32 &&
6398 nir_intrinsic_align(instr) >= 4) {
6399 assert(instr->def.num_components <= 4);
6400
6401 srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components);
6402
6403 fs_inst *inst =
6404 bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL, dest,
6405 srcs, A64_LOGICAL_NUM_SRCS);
6406 inst->size_written = instr->num_components *
6407 inst->dst.component_size(inst->exec_size);
6408 } else {
6409 const unsigned bit_size = instr->def.bit_size;
6410 assert(instr->def.num_components == 1);
6411 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
6412
6413 srcs[A64_LOGICAL_ARG] = brw_imm_ud(bit_size);
6414
6415 bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL, tmp,
6416 srcs, A64_LOGICAL_NUM_SRCS);
6417 bld.MOV(dest, subscript(tmp, dest.type, 0));
6418 }
6419 break;
6420 }
6421
6422 case nir_intrinsic_store_global: {
6423 assert(nir_src_bit_size(instr->src[0]) <= 32);
6424 assert(nir_intrinsic_write_mask(instr) ==
6425 (1u << instr->num_components) - 1);
6426 assert(nir_intrinsic_align(instr) > 0);
6427
6428 fs_reg srcs[A64_LOGICAL_NUM_SRCS];
6429 srcs[A64_LOGICAL_ADDRESS] = get_nir_src(ntb, instr->src[1]);
6430 srcs[A64_LOGICAL_ENABLE_HELPERS] =
6431 brw_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS);
6432
6433 if (nir_src_bit_size(instr->src[0]) == 32 &&
6434 nir_intrinsic_align(instr) >= 4) {
6435 assert(nir_src_num_components(instr->src[0]) <= 4);
6436
6437 srcs[A64_LOGICAL_SRC] = get_nir_src(ntb, instr->src[0]); /* Data */
6438 srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components);
6439
6440 bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL, fs_reg(),
6441 srcs, A64_LOGICAL_NUM_SRCS);
6442 } else {
6443 assert(nir_src_num_components(instr->src[0]) == 1);
6444 const unsigned bit_size = nir_src_bit_size(instr->src[0]);
6445 brw_reg_type data_type =
6446 brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
6447 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
6448 bld.MOV(tmp, retype(get_nir_src(ntb, instr->src[0]), data_type));
6449
6450 srcs[A64_LOGICAL_SRC] = tmp;
6451 srcs[A64_LOGICAL_ARG] = brw_imm_ud(bit_size);
6452
6453 bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL, fs_reg(),
6454 srcs, A64_LOGICAL_NUM_SRCS);
6455 }
6456 break;
6457 }
6458
6459 case nir_intrinsic_global_atomic:
6460 case nir_intrinsic_global_atomic_swap:
6461 fs_nir_emit_global_atomic(ntb, bld, instr);
6462 break;
6463
6464 case nir_intrinsic_load_global_const_block_intel: {
6465 assert(instr->def.bit_size == 32);
6466 assert(instr->num_components == 8 || instr->num_components == 16);
6467
6468 const fs_builder ubld = bld.exec_all().group(instr->num_components, 0);
6469 fs_reg load_val;
6470
6471 bool is_pred_const = nir_src_is_const(instr->src[1]);
6472 if (is_pred_const && nir_src_as_uint(instr->src[1]) == 0) {
6473 /* In this case, we don't want the UBO load at all. We really
6474 * shouldn't get here but it's possible.
6475 */
6476 load_val = brw_imm_ud(0);
6477 } else {
6478 /* The uniformize operation may stomp the flag, so do this first */
6479 fs_reg addr = bld.emit_uniformize(get_nir_src(ntb, instr->src[0]));
6480
6481 load_val = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6482
6483 /* If the predicate is constant and we got here, then it's non-zero
6484 * and we don't need the predicate at all.
6485 */
6486 if (!is_pred_const) {
6487 /* Load the predicate */
6488 fs_reg pred = bld.emit_uniformize(get_nir_src(ntb, instr->src[1]));
6489 fs_inst *mov = ubld.MOV(bld.null_reg_d(), pred);
6490 mov->conditional_mod = BRW_CONDITIONAL_NZ;
6491
6492 /* Stomp the destination with 0 if we're OOB */
6493 mov = ubld.MOV(load_val, brw_imm_ud(0));
6494 mov->predicate = BRW_PREDICATE_NORMAL;
6495 mov->predicate_inverse = true;
6496 }
6497
6498 fs_reg srcs[A64_LOGICAL_NUM_SRCS];
6499 srcs[A64_LOGICAL_ADDRESS] = addr;
6500 srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */
6501 srcs[A64_LOGICAL_ARG] = brw_imm_ud(instr->num_components);
6502 /* This intrinsic loads memory from a uniform address, sometimes
6503 * shared across lanes. We never need to mask it.
6504 */
6505 srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0);
6506
6507 fs_inst *load = ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL,
6508 load_val, srcs, A64_LOGICAL_NUM_SRCS);
6509 if (!is_pred_const)
6510 load->predicate = BRW_PREDICATE_NORMAL;
6511 }
6512
6513 /* From the HW perspective, we just did a single SIMD16 instruction
6514 * which loaded a dword in each SIMD channel. From NIR's perspective,
6515 * this instruction returns a vec16. Any users of this data in the
6516 * back-end will expect a vec16 per SIMD channel so we have to emit a
6517 * pile of MOVs to resolve this discrepancy. Fortunately, copy-prop
6518 * will generally clean them up for us.
6519 */
6520 for (unsigned i = 0; i < instr->num_components; i++) {
6521 bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD),
6522 component(load_val, i));
6523 }
6524 break;
6525 }
6526
6527 case nir_intrinsic_load_global_constant_uniform_block_intel: {
6528 const unsigned total_dwords = ALIGN(instr->num_components,
6529 REG_SIZE * reg_unit(devinfo) / 4);
6530 unsigned loaded_dwords = 0;
6531
6532 const fs_builder ubld1 = bld.exec_all().group(1, 0);
6533 const fs_builder ubld8 = bld.exec_all().group(8, 0);
6534 const fs_builder ubld16 = bld.exec_all().group(16, 0);
6535
6536 const fs_reg packed_consts =
6537 ubld1.vgrf(BRW_REGISTER_TYPE_UD, total_dwords);
6538 fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[0]));
6539
6540 while (loaded_dwords < total_dwords) {
6541 const unsigned block =
6542 choose_oword_block_size_dwords(devinfo,
6543 total_dwords - loaded_dwords);
6544 const unsigned block_bytes = block * 4;
6545
6546 const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
6547
6548 fs_reg srcs[A64_LOGICAL_NUM_SRCS];
6549 srcs[A64_LOGICAL_ADDRESS] = address;
6550 srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */
6551 srcs[A64_LOGICAL_ARG] = brw_imm_ud(block);
6552 srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0);
6553 ubld.emit(SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
6554 retype(byte_offset(packed_consts, loaded_dwords * 4), BRW_REGISTER_TYPE_UD),
6555 srcs, A64_LOGICAL_NUM_SRCS)->size_written =
6556 align(block_bytes, REG_SIZE * reg_unit(devinfo));
6557
6558 increment_a64_address(ubld1, address, block_bytes);
6559 loaded_dwords += block;
6560 }
6561
6562 for (unsigned c = 0; c < instr->num_components; c++)
6563 bld.MOV(retype(offset(dest, bld, c), BRW_REGISTER_TYPE_UD),
6564 component(packed_consts, c));
6565
6566 break;
6567 }
6568
6569 case nir_intrinsic_load_ssbo: {
6570 const unsigned bit_size = instr->def.bit_size;
6571 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6572 srcs[get_nir_src_bindless(ntb, instr->src[0]) ?
6573 SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
6574 SURFACE_LOGICAL_SRC_SURFACE] =
6575 get_nir_buffer_intrinsic_index(ntb, bld, instr);
6576 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
6577 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
6578 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
6579
6580 /* Make dest unsigned because that's what the temporary will be */
6581 dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
6582
6583 /* Read the vector */
6584 assert(bit_size <= 32);
6585 assert(nir_intrinsic_align(instr) > 0);
6586 if (bit_size == 32 &&
6587 nir_intrinsic_align(instr) >= 4) {
6588 assert(instr->def.num_components <= 4);
6589 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
6590 fs_inst *inst =
6591 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
6592 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
6593 inst->size_written = instr->num_components * s.dispatch_width * 4;
6594 } else {
6595 assert(instr->def.num_components == 1);
6596 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
6597
6598 fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
6599 bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
6600 read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
6601 bld.MOV(dest, subscript(read_result, dest.type, 0));
6602 }
6603 break;
6604 }
6605
6606 case nir_intrinsic_store_ssbo: {
6607 const unsigned bit_size = nir_src_bit_size(instr->src[0]);
6608 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6609 srcs[get_nir_src_bindless(ntb, instr->src[1]) ?
6610 SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
6611 SURFACE_LOGICAL_SRC_SURFACE] =
6612 get_nir_buffer_intrinsic_index(ntb, bld, instr);
6613 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[2]);
6614 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
6615 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
6616
6617 fs_reg data = get_nir_src(ntb, instr->src[0]);
6618 data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
6619
6620 assert(bit_size <= 32);
6621 assert(nir_intrinsic_write_mask(instr) ==
6622 (1u << instr->num_components) - 1);
6623 assert(nir_intrinsic_align(instr) > 0);
6624 if (bit_size == 32 &&
6625 nir_intrinsic_align(instr) >= 4) {
6626 assert(nir_src_num_components(instr->src[0]) <= 4);
6627 srcs[SURFACE_LOGICAL_SRC_DATA] = data;
6628 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
6629 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
6630 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
6631 } else {
6632 assert(nir_src_num_components(instr->src[0]) == 1);
6633 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
6634
6635 srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
6636 bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
6637
6638 bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
6639 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
6640 }
6641 break;
6642 }
6643
6644 case nir_intrinsic_load_ssbo_uniform_block_intel:
6645 case nir_intrinsic_load_shared_uniform_block_intel: {
6646 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6647
6648 const bool is_ssbo =
6649 instr->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel;
6650 if (is_ssbo) {
6651 srcs[get_nir_src_bindless(ntb, instr->src[0]) ?
6652 SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
6653 SURFACE_LOGICAL_SRC_SURFACE] =
6654 get_nir_buffer_intrinsic_index(ntb, bld, instr);
6655 } else {
6656 srcs[SURFACE_LOGICAL_SRC_SURFACE] = fs_reg(brw_imm_ud(GFX7_BTI_SLM));
6657 }
6658
6659 const unsigned total_dwords = ALIGN(instr->num_components,
6660 REG_SIZE * reg_unit(devinfo) / 4);
6661 unsigned loaded_dwords = 0;
6662
6663 const fs_builder ubld1 = bld.exec_all().group(1, 0);
6664 const fs_builder ubld8 = bld.exec_all().group(8, 0);
6665 const fs_builder ubld16 = bld.exec_all().group(16, 0);
6666
6667 const fs_reg packed_consts =
6668 ubld1.vgrf(BRW_REGISTER_TYPE_UD, total_dwords);
6669
6670 const nir_src load_offset = is_ssbo ? instr->src[1] : instr->src[0];
6671 if (nir_src_is_const(load_offset)) {
6672 fs_reg addr = ubld8.vgrf(BRW_REGISTER_TYPE_UD);
6673 ubld8.MOV(addr, brw_imm_ud(nir_src_as_uint(load_offset)));
6674 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = component(addr, 0);
6675 } else {
6676 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
6677 bld.emit_uniformize(get_nir_src(ntb, load_offset));
6678 }
6679
6680 while (loaded_dwords < total_dwords) {
6681 const unsigned block =
6682 choose_oword_block_size_dwords(devinfo,
6683 total_dwords - loaded_dwords);
6684 const unsigned block_bytes = block * 4;
6685
6686 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block);
6687
6688 const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
6689 ubld.emit(SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
6690 retype(byte_offset(packed_consts, loaded_dwords * 4), BRW_REGISTER_TYPE_UD),
6691 srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written =
6692 align(block_bytes, REG_SIZE * reg_unit(devinfo));
6693
6694 loaded_dwords += block;
6695
6696 ubld1.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
6697 srcs[SURFACE_LOGICAL_SRC_ADDRESS],
6698 brw_imm_ud(block_bytes));
6699 }
6700
6701 for (unsigned c = 0; c < instr->num_components; c++)
6702 bld.MOV(retype(offset(dest, bld, c), BRW_REGISTER_TYPE_UD),
6703 component(packed_consts, c));
6704
6705 break;
6706 }
6707
6708 case nir_intrinsic_store_output: {
6709 assert(nir_src_bit_size(instr->src[0]) == 32);
6710 fs_reg src = get_nir_src(ntb, instr->src[0]);
6711
6712 unsigned store_offset = nir_src_as_uint(instr->src[1]);
6713 unsigned num_components = instr->num_components;
6714 unsigned first_component = nir_intrinsic_component(instr);
6715
6716 fs_reg new_dest = retype(offset(s.outputs[instr->const_index[0]], bld,
6717 4 * store_offset), src.type);
6718 for (unsigned j = 0; j < num_components; j++) {
6719 bld.MOV(offset(new_dest, bld, j + first_component),
6720 offset(src, bld, j));
6721 }
6722 break;
6723 }
6724
6725 case nir_intrinsic_ssbo_atomic:
6726 case nir_intrinsic_ssbo_atomic_swap:
6727 fs_nir_emit_surface_atomic(ntb, bld, instr,
6728 get_nir_buffer_intrinsic_index(ntb, bld, instr),
6729 get_nir_src_bindless(ntb, instr->src[0]));
6730 break;
6731
6732 case nir_intrinsic_get_ssbo_size: {
6733 assert(nir_src_num_components(instr->src[0]) == 1);
6734
6735 /* A resinfo's sampler message is used to get the buffer size. The
6736 * SIMD8 writeback message consists of four registers and the SIMD16
6737 * writeback message consists of eight destination registers (two per
6738 * component). Because we are only interested in the first channel of
6739 * the first returned component, where resinfo returns the buffer size
6740 * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
6741 * the dispatch width.
6742 */
6743 const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
6744 fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6745 fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
6746
6747 /* Set LOD = 0 */
6748 ubld.MOV(src_payload, brw_imm_d(0));
6749
6750 fs_reg srcs[GET_BUFFER_SIZE_SRCS];
6751 srcs[get_nir_src_bindless(ntb, instr->src[0]) ?
6752 GET_BUFFER_SIZE_SRC_SURFACE_HANDLE :
6753 GET_BUFFER_SIZE_SRC_SURFACE] =
6754 get_nir_buffer_intrinsic_index(ntb, bld, instr);
6755 srcs[GET_BUFFER_SIZE_SRC_LOD] = src_payload;
6756 fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
6757 srcs, GET_BUFFER_SIZE_SRCS);
6758 inst->header_size = 0;
6759 inst->mlen = reg_unit(devinfo);
6760 inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
6761
6762 /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
6763 *
6764 * "Out-of-bounds checking is always performed at a DWord granularity. If
6765 * any part of the DWord is out-of-bounds then the whole DWord is
6766 * considered out-of-bounds."
6767 *
6768 * This implies that types smaller than 4 bytes need to be padded if
6769 * they don't complete the last dword of the buffer. But as we need to
6770 * maintain the original size, we have to reverse that padding to
6771 * return the correct size (for instance, to compute the number of
6772 * elements of an unsized array). Since we stored the needed padding
6773 * in the last two bits of the surface size, we recover the original
6774 * buffer_size here by reversing the surface_size calculation:
6775 *
6776 * surface_size = isl_align(buffer_size, 4) +
6777 * (isl_align(buffer_size, 4) - buffer_size)
6778 *
6779 * buffer_size = (surface_size & ~3) - (surface_size & 3)
6780 */
6781
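/* Illustrative example: a 7-byte buffer is recorded as
 * surface_size = 8 + (8 - 7) = 9, and the code below recovers
 * buffer_size = (9 & ~3) - (9 & 3) = 8 - 1 = 7.
 */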
6782 fs_reg size_aligned4 = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6783 fs_reg size_padding = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6784 fs_reg buffer_size = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6785
6786 ubld.AND(size_padding, ret_payload, brw_imm_ud(3));
6787 ubld.AND(size_aligned4, ret_payload, brw_imm_ud(~3));
6788 ubld.ADD(buffer_size, size_aligned4, negate(size_padding));
6789
6790 bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
6791 break;
6792 }
6793
6794 case nir_intrinsic_load_scratch: {
6795 assert(instr->def.num_components == 1);
6796 const unsigned bit_size = instr->def.bit_size;
6797 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6798
6799 if (devinfo->verx10 >= 125) {
6800 const fs_builder ubld = bld.exec_all().group(1, 0);
6801 fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0);
6802 ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
6803 brw_imm_ud(INTEL_MASK(31, 10)));
6804 srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX125_NON_BINDLESS);
6805 srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle;
6806 } else {
6807 srcs[SURFACE_LOGICAL_SRC_SURFACE] =
6808 brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
6809 }
6810
6811 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
6812 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
6813 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
6814 const fs_reg nir_addr = get_nir_src(ntb, instr->src[0]);
6815
6816 /* Make dest unsigned because that's what the temporary will be */
6817 dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
6818
6819 /* Read the vector */
6820 assert(instr->def.num_components == 1);
6821 assert(bit_size <= 32);
6822 assert(nir_intrinsic_align(instr) > 0);
6823 if (bit_size == 32 &&
6824 nir_intrinsic_align(instr) >= 4) {
6825 if (devinfo->verx10 >= 125) {
6826 assert(bit_size == 32 &&
6827 nir_intrinsic_align(instr) >= 4);
6828
6829 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
6830 swizzle_nir_scratch_addr(ntb, bld, nir_addr, false);
6831 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1);
6832
6833 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
6834 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
6835 } else {
6836 /* The offset for a DWORD scattered message is in dwords. */
6837 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
6838 swizzle_nir_scratch_addr(ntb, bld, nir_addr, true);
6839
6840 bld.emit(SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL,
6841 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
6842 }
6843 } else {
6844 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
6845 swizzle_nir_scratch_addr(ntb, bld, nir_addr, false);
6846
6847 fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
6848 bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
6849 read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
6850 bld.MOV(dest, read_result);
6851 }
6852
6853 s.shader_stats.fill_count += DIV_ROUND_UP(s.dispatch_width, 16);
6854 break;
6855 }
6856
6857 case nir_intrinsic_store_scratch: {
6858 assert(nir_src_num_components(instr->src[0]) == 1);
6859 const unsigned bit_size = nir_src_bit_size(instr->src[0]);
6860 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6861
6862 if (devinfo->verx10 >= 125) {
6863 const fs_builder ubld = bld.exec_all().group(1, 0);
6864 fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0);
6865 ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
6866 brw_imm_ud(INTEL_MASK(31, 10)));
6867 srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX125_NON_BINDLESS);
6868 srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle;
6869 } else {
6870 srcs[SURFACE_LOGICAL_SRC_SURFACE] =
6871 brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
6872 }
6873
6874 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
6875 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
6876 /**
6877 * While this instruction has side-effects, it should not be predicated
6878 * on sample mask, because otherwise fs helper invocations would
6879 * load undefined values from scratch memory. And scratch memory
6880 * load-stores are produced from operations without side-effects, thus
6881 * they should not have different behaviour in the helper invocations.
6882 */
6883 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
6884 const fs_reg nir_addr = get_nir_src(ntb, instr->src[1]);
6885
6886 fs_reg data = get_nir_src(ntb, instr->src[0]);
6887 data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
6888
6889 assert(nir_src_num_components(instr->src[0]) == 1);
6890 assert(bit_size <= 32);
6891 assert(nir_intrinsic_write_mask(instr) == 1);
6892 assert(nir_intrinsic_align(instr) > 0);
6893 if (bit_size == 32 &&
6894 nir_intrinsic_align(instr) >= 4) {
6895 if (devinfo->verx10 >= 125) {
6896 srcs[SURFACE_LOGICAL_SRC_DATA] = data;
6897
6898 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
6899 swizzle_nir_scratch_addr(ntb, bld, nir_addr, false);
6900 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1);
6901
6902 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
6903 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
6904 } else {
6905 srcs[SURFACE_LOGICAL_SRC_DATA] = data;
6906
6907 /* The offset for a DWORD scattered message is in dwords. */
6908 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
6909 swizzle_nir_scratch_addr(ntb, bld, nir_addr, true);
6910
6911 bld.emit(SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL,
6912 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
6913 }
6914 } else {
6915 srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
6916 bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
6917
6918 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
6919 swizzle_nir_scratch_addr(ntb, bld, nir_addr, false);
6920
6921 bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
6922 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
6923 }
6924 s.shader_stats.spill_count += DIV_ROUND_UP(s.dispatch_width, 16);
6925 break;
6926 }
6927
6928 case nir_intrinsic_load_subgroup_size:
6929 /* This should only happen for fragment shaders, because every other
6930 * case is lowered in NIR so that we can optimize on it.
6931 */
6932 assert(s.stage == MESA_SHADER_FRAGMENT);
6933 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(s.dispatch_width));
6934 break;
6935
6936 case nir_intrinsic_load_subgroup_invocation:
6937 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
6938 ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
6939 break;
6940
6941 case nir_intrinsic_load_subgroup_eq_mask:
6942 case nir_intrinsic_load_subgroup_ge_mask:
6943 case nir_intrinsic_load_subgroup_gt_mask:
6944 case nir_intrinsic_load_subgroup_le_mask:
6945 case nir_intrinsic_load_subgroup_lt_mask:
6946 unreachable("not reached");
6947
6948 case nir_intrinsic_vote_any: {
6949 const fs_builder ubld1 = bld.exec_all().group(1, 0);
6950
6951 /* The any/all predicates do not consider channel enables. To prevent
6952 * dead channels from affecting the result, we initialize the flag
6953 * with the identity value for the logical operation.
6954 */
6955 if (s.dispatch_width == 32) {
6956 /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
6957 ubld1.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
6958 brw_imm_ud(0));
6959 } else {
6960 ubld1.MOV(brw_flag_reg(0, 0), brw_imm_uw(0));
6961 }
6962 bld.CMP(bld.null_reg_d(), get_nir_src(ntb, instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);
6963
6964 /* For some reason, the any/all predicates don't work properly with
6965 * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
6966 * doesn't read the correct subset of the flag register and you end up
6967 * getting garbage in the second half. Work around this by using a pair
6968 * of 1-wide MOVs and scattering the result.
6969 */
6970 const fs_builder ubld = devinfo->ver >= 20 ? bld.exec_all() : ubld1;
6971 fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
6972 ubld.MOV(res1, brw_imm_d(0));
6973 set_predicate(devinfo->ver >= 20 ? XE2_PREDICATE_ANY :
6974 s.dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ANY8H :
6975 s.dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :
6976 BRW_PREDICATE_ALIGN1_ANY32H,
6977 ubld.MOV(res1, brw_imm_d(-1)));
6978
6979 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
6980 break;
6981 }
6982 case nir_intrinsic_vote_all: {
6983 const fs_builder ubld1 = bld.exec_all().group(1, 0);
6984
6985 /* The any/all predicates do not consider channel enables. To prevent
6986 * dead channels from affecting the result, we initialize the flag
6987 * with the identity value for the logical operation.
6988 */
6989 if (s.dispatch_width == 32) {
6990 /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
6991 ubld1.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
6992 brw_imm_ud(0xffffffff));
6993 } else {
6994 ubld1.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
6995 }
6996 bld.CMP(bld.null_reg_d(), get_nir_src(ntb, instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);
6997
6998 /* For some reason, the any/all predicates don't work properly with
6999 * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
7000 * doesn't read the correct subset of the flag register and you end up
7001 * getting garbage in the second half. Work around this by using a pair
7002 * of 1-wide MOVs and scattering the result.
7003 */
7004 const fs_builder ubld = devinfo->ver >= 20 ? bld.exec_all() : ubld1;
7005 fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
7006 ubld.MOV(res1, brw_imm_d(0));
7007 set_predicate(devinfo->ver >= 20 ? XE2_PREDICATE_ALL :
7008 s.dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ALL8H :
7009 s.dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
7010 BRW_PREDICATE_ALIGN1_ALL32H,
7011 ubld.MOV(res1, brw_imm_d(-1)));
7012
7013 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
7014 break;
7015 }
7016 case nir_intrinsic_vote_feq:
7017 case nir_intrinsic_vote_ieq: {
7018 fs_reg value = get_nir_src(ntb, instr->src[0]);
7019 if (instr->intrinsic == nir_intrinsic_vote_feq) {
7020 const unsigned bit_size = nir_src_bit_size(instr->src[0]);
7021 value.type = bit_size == 8 ? BRW_REGISTER_TYPE_B :
7022 brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F);
7023 }
7024
7025 fs_reg uniformized = bld.emit_uniformize(value);
7026 const fs_builder ubld1 = bld.exec_all().group(1, 0);
7027
7028 /* The any/all predicates do not consider channel enables. To prevent
7029 * dead channels from affecting the result, we initialize the flag
7030 * with the identity value for the logical operation.
7031 */
7032 if (s.dispatch_width == 32) {
7033 /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
7034 ubld1.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
7035 brw_imm_ud(0xffffffff));
7036 } else {
7037 ubld1.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
7038 }
7039 bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z);
7040
7041 /* For some reason, the any/all predicates don't work properly with
7042 * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
7043 * doesn't read the correct subset of the flag register and you end up
7044 * getting garbage in the second half. Work around this by using a pair
7045 * of 1-wide MOVs and scattering the result.
7046 */
7047 const fs_builder ubld = devinfo->ver >= 20 ? bld.exec_all() : ubld1;
7048 fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
7049 ubld.MOV(res1, brw_imm_d(0));
7050 set_predicate(devinfo->ver >= 20 ? XE2_PREDICATE_ALL :
7051 s.dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ALL8H :
7052 s.dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
7053 BRW_PREDICATE_ALIGN1_ALL32H,
7054 ubld.MOV(res1, brw_imm_d(-1)));
7055
7056 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
7057 break;
7058 }
7059
7060 case nir_intrinsic_ballot: {
7061 if (instr->def.bit_size > 32) {
7062 dest.type = BRW_REGISTER_TYPE_UQ;
7063 } else {
7064 dest.type = BRW_REGISTER_TYPE_UD;
7065 }
7066
7067 /* Implement a fast-path for ballot(true). */
7068 if (nir_src_is_const(instr->src[0]) &&
7069 nir_src_as_bool(instr->src[0])) {
7070 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
7071 bld.exec_all().emit(SHADER_OPCODE_LOAD_LIVE_CHANNELS, tmp);
7072 bld.MOV(dest, fs_reg(component(tmp, 0)));
7073 break;
7074 }
7075
7076 const fs_reg value = retype(get_nir_src(ntb, instr->src[0]),
7077 BRW_REGISTER_TYPE_UD);
7078 struct brw_reg flag = brw_flag_reg(0, 0);
7079
7080 if (s.dispatch_width == 32)
7081 flag.type = BRW_REGISTER_TYPE_UD;
7082
7083 bld.exec_all().group(1, 0).MOV(flag, retype(brw_imm_ud(0u), flag.type));
7084 bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ);
7085 bld.MOV(dest, flag);
7086 break;
7087 }
7088
7089 case nir_intrinsic_read_invocation: {
7090 const fs_reg value = get_nir_src(ntb, instr->src[0]);
7091 const fs_reg invocation = get_nir_src(ntb, instr->src[1]);
7092
7093 fs_reg tmp = bld.vgrf(value.type);
7094
7095 /* If the subgroup_size picked by NIR is larger than the dispatch
7096 * size picked by the backend (this can happen in RT and FS), clamp
7097 * the invocation index to the dispatch size.
7098 */
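/* For example, with a SIMD16 dispatch the index is ANDed with 15, so an
 * out-of-range invocation index such as 17 maps to lane 1.
 */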
7099 fs_reg bound_invocation;
7100 if (s.api_subgroup_size == 0 ||
7101 bld.dispatch_width() < s.api_subgroup_size) {
7102 bound_invocation = bld.vgrf(BRW_REGISTER_TYPE_UD);
7103 bld.AND(bound_invocation, invocation, brw_imm_ud(s.dispatch_width - 1));
7104 } else {
7105 bound_invocation = invocation;
7106 }
7107 bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value,
7108 bld.emit_uniformize(bound_invocation));
7109
7110 bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0)));
7111 break;
7112 }
7113
7114 case nir_intrinsic_read_first_invocation: {
7115 const fs_reg value = get_nir_src(ntb, instr->src[0]);
7116 bld.MOV(retype(dest, value.type), bld.emit_uniformize(value));
7117 break;
7118 }
7119
7120 case nir_intrinsic_shuffle: {
7121 const fs_reg value = get_nir_src(ntb, instr->src[0]);
7122 const fs_reg index = get_nir_src(ntb, instr->src[1]);
7123
7124 bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
7125 break;
7126 }
7127
7128 case nir_intrinsic_first_invocation: {
7129 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
7130 bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
7131 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
7132 fs_reg(component(tmp, 0)));
7133 break;
7134 }
7135
7136 case nir_intrinsic_last_invocation: {
7137 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
7138 bld.exec_all().emit(SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL, tmp);
7139 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
7140 fs_reg(component(tmp, 0)));
7141 break;
7142 }
7143
7144 case nir_intrinsic_quad_broadcast: {
7145 const fs_reg value = get_nir_src(ntb, instr->src[0]);
7146 const unsigned index = nir_src_as_uint(instr->src[1]);
7147
7148 bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),
7149 value, brw_imm_ud(index), brw_imm_ud(4));
7150 break;
7151 }
7152
7153 case nir_intrinsic_quad_swap_horizontal: {
7154 const fs_reg value = get_nir_src(ntb, instr->src[0]);
7155 const fs_reg tmp = bld.vgrf(value.type);
7156
7157 const fs_builder ubld = bld.exec_all().group(s.dispatch_width / 2, 0);
7158
7159 const fs_reg src_left = horiz_stride(value, 2);
7160 const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
7161 const fs_reg tmp_left = horiz_stride(tmp, 2);
7162 const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);
7163
7164 ubld.MOV(tmp_left, src_right);
7165 ubld.MOV(tmp_right, src_left);
7166
7167 bld.MOV(retype(dest, value.type), tmp);
7168 break;
7169 }
7170
7171 case nir_intrinsic_quad_swap_vertical: {
7172 const fs_reg value = get_nir_src(ntb, instr->src[0]);
7173 if (nir_src_bit_size(instr->src[0]) == 32) {
7174 /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
7175 const fs_reg tmp = bld.vgrf(value.type);
7176 const fs_builder ubld = bld.exec_all();
7177 ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
7178 brw_imm_ud(BRW_SWIZZLE4(2,3,0,1)));
7179 bld.MOV(retype(dest, value.type), tmp);
7180 } else {
7181 /* For larger data types, we have to either emit dispatch_width many
7182 * MOVs or else fall back to doing indirects.
7183 */
7184 fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
7185 bld.XOR(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
7186 brw_imm_w(0x2));
7187 bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
7188 }
7189 break;
7190 }
7191
7192 case nir_intrinsic_quad_swap_diagonal: {
7193 const fs_reg value = get_nir_src(ntb, instr->src[0]);
7194 if (nir_src_bit_size(instr->src[0]) == 32) {
7195 /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
7196 const fs_reg tmp = bld.vgrf(value.type);
7197 const fs_builder ubld = bld.exec_all();
7198 ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
7199 brw_imm_ud(BRW_SWIZZLE4(3,2,1,0)));
7200 bld.MOV(retype(dest, value.type), tmp);
7201 } else {
7202 /* For larger data types, we have to either emit dispatch_width many
7203 * MOVs or else fall back to doing indirects.
7204 */
7205 fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
7206 bld.XOR(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
7207 brw_imm_w(0x3));
7208 bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
7209 }
7210 break;
7211 }
7212
7213 case nir_intrinsic_reduce: {
7214 fs_reg src = get_nir_src(ntb, instr->src[0]);
7215 nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
7216 unsigned cluster_size = nir_intrinsic_cluster_size(instr);
7217 if (cluster_size == 0 || cluster_size > s.dispatch_width)
7218 cluster_size = s.dispatch_width;
7219
7220 /* Figure out the source type */
7221 src.type = brw_type_for_nir_type(devinfo,
7222 (nir_alu_type)(nir_op_infos[redop].input_types[0] |
7223 nir_src_bit_size(instr->src[0])));
7224
7225 fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
7226 opcode brw_op = brw_op_for_nir_reduction_op(redop);
7227 brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
7228
7229 /* Set up a register for all of our scratching around and initialize it
7230 * to reduction operation's identity value.
7231 */
7232 fs_reg scan = bld.vgrf(src.type);
7233 bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
7234
7235 bld.emit_scan(brw_op, scan, cluster_size, cond_mod);
7236
7237 dest.type = src.type;
7238 if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {
7239 /* In this case, the CLUSTER_BROADCAST instruction isn't needed
7240 * because the distance between clusters is at least 2 GRFs, so we
7241 * don't need the weird striding of the CLUSTER_BROADCAST instruction
7242 * and can just do regular MOVs.
7243 */
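/* Illustrative example: a SIMD16 64-bit reduce with cluster_size == 16
 * gives groups = (16 * 8) / 64 = 2 and group_size = 8, and both 8-wide
 * MOVs below read component 15 of the scan, i.e. the last channel of the
 * single cluster.
 */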
7244 assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);
7245 const unsigned groups =
7246 (s.dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);
7247 const unsigned group_size = s.dispatch_width / groups;
7248 for (unsigned i = 0; i < groups; i++) {
7249 const unsigned cluster = (i * group_size) / cluster_size;
7250 const unsigned comp = cluster * cluster_size + (cluster_size - 1);
7251 bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),
7252 component(scan, comp));
7253 }
7254 } else {
7255 bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,
7256 brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size));
7257 }
7258 break;
7259 }
7260
7261 case nir_intrinsic_inclusive_scan:
7262 case nir_intrinsic_exclusive_scan: {
7263 fs_reg src = get_nir_src(ntb, instr->src[0]);
7264 nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
7265
7266 /* Figure out the source type */
7267 src.type = brw_type_for_nir_type(devinfo,
7268 (nir_alu_type)(nir_op_infos[redop].input_types[0] |
7269 nir_src_bit_size(instr->src[0])));
7270
7271 fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
7272 opcode brw_op = brw_op_for_nir_reduction_op(redop);
7273 brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
7274
7275 /* Set up a register for all of our scratching around and initialize it
7276 * to reduction operation's identity value.
7277 */
7278 fs_reg scan = bld.vgrf(src.type);
7279 const fs_builder allbld = bld.exec_all();
7280 allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
7281
7282 if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
7283 /* Exclusive scan is a bit harder because we have to do an annoying
7284 * shift of the contents before we can begin. To make things worse,
7285 * we can't do this with a normal stride; we have to use indirects.
7286 */
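/* For example, an exclusive iadd over channels holding [a, b, c, d]
 * first shuffles them into [identity, a, b, c] (identity == 0 for iadd);
 * the inclusive scan of that shifted vector then yields the exclusive
 * prefix sums [0, a, a+b, a+b+c].
 */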
7287 fs_reg shifted = bld.vgrf(src.type);
7288 fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
7289 allbld.ADD(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
7290 brw_imm_w(-1));
7291 allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
7292 allbld.group(1, 0).MOV(component(shifted, 0), identity);
7293 scan = shifted;
7294 }
7295
7296 bld.emit_scan(brw_op, scan, s.dispatch_width, cond_mod);
7297
7298 bld.MOV(retype(dest, src.type), scan);
7299 break;
7300 }
7301
7302 case nir_intrinsic_load_global_block_intel: {
7303 assert(instr->def.bit_size == 32);
7304
7305 fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[0]));
7306
7307 const fs_builder ubld1 = bld.exec_all().group(1, 0);
7308 const fs_builder ubld8 = bld.exec_all().group(8, 0);
7309 const fs_builder ubld16 = bld.exec_all().group(16, 0);
7310
7311 const unsigned total = instr->num_components * s.dispatch_width;
7312 unsigned loaded = 0;
7313
7314 while (loaded < total) {
7315 const unsigned block =
7316 choose_oword_block_size_dwords(devinfo, total - loaded);
7317 const unsigned block_bytes = block * 4;
7318
7319 const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
7320
7321 fs_reg srcs[A64_LOGICAL_NUM_SRCS];
7322 srcs[A64_LOGICAL_ADDRESS] = address;
7323 srcs[A64_LOGICAL_SRC] = fs_reg(); /* No source data */
7324 srcs[A64_LOGICAL_ARG] = brw_imm_ud(block);
7325 srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(1);
7326 ubld.emit(SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
7327 retype(byte_offset(dest, loaded * 4), BRW_REGISTER_TYPE_UD),
7328 srcs, A64_LOGICAL_NUM_SRCS)->size_written = block_bytes;
7329
7330 increment_a64_address(ubld1, address, block_bytes);
7331 loaded += block;
7332 }
7333
7334 assert(loaded == total);
7335 break;
7336 }
7337
7338 case nir_intrinsic_store_global_block_intel: {
7339 assert(nir_src_bit_size(instr->src[0]) == 32);
7340
7341 fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[1]));
7342 fs_reg src = get_nir_src(ntb, instr->src[0]);
7343
7344 const fs_builder ubld1 = bld.exec_all().group(1, 0);
7345 const fs_builder ubld8 = bld.exec_all().group(8, 0);
7346 const fs_builder ubld16 = bld.exec_all().group(16, 0);
7347
7348 const unsigned total = instr->num_components * s.dispatch_width;
7349 unsigned written = 0;
7350
7351 while (written < total) {
7352 const unsigned block =
7353 choose_oword_block_size_dwords(devinfo, total - written);
7354
7355 fs_reg srcs[A64_LOGICAL_NUM_SRCS];
7356 srcs[A64_LOGICAL_ADDRESS] = address;
7357 srcs[A64_LOGICAL_SRC] = retype(byte_offset(src, written * 4),
7358 BRW_REGISTER_TYPE_UD);
7359 srcs[A64_LOGICAL_ARG] = brw_imm_ud(block);
7360 srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0);
7361
7362 const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
7363 ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL, fs_reg(),
7364 srcs, A64_LOGICAL_NUM_SRCS);
7365
7366 const unsigned block_bytes = block * 4;
7367 increment_a64_address(ubld1, address, block_bytes);
7368 written += block;
7369 }
7370
7371 assert(written == total);
7372 break;
7373 }
7374
7375 case nir_intrinsic_load_shared_block_intel:
7376 case nir_intrinsic_load_ssbo_block_intel: {
7377 assert(instr->def.bit_size == 32);
7378
7379 const bool is_ssbo =
7380 instr->intrinsic == nir_intrinsic_load_ssbo_block_intel;
7381 fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[is_ssbo ? 1 : 0]));
7382
7383 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
7384 srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ?
7385 get_nir_buffer_intrinsic_index(ntb, bld, instr) :
7386 fs_reg(brw_imm_ud(GFX7_BTI_SLM));
7387 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address;
7388
7389 const fs_builder ubld1 = bld.exec_all().group(1, 0);
7390 const fs_builder ubld8 = bld.exec_all().group(8, 0);
7391 const fs_builder ubld16 = bld.exec_all().group(16, 0);
7392
7393 const unsigned total = instr->num_components * s.dispatch_width;
7394 unsigned loaded = 0;
7395
7396 while (loaded < total) {
7397 const unsigned block =
7398 choose_oword_block_size_dwords(devinfo, total - loaded);
7399 const unsigned block_bytes = block * 4;
7400
7401 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block);
7402
7403 const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
7404 ubld.emit(SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
7405 retype(byte_offset(dest, loaded * 4), BRW_REGISTER_TYPE_UD),
7406 srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = block_bytes;
7407
7408 ubld1.ADD(address, address, brw_imm_ud(block_bytes));
7409 loaded += block;
7410 }
7411
7412 assert(loaded == total);
7413 break;
7414 }
7415
7416 case nir_intrinsic_store_shared_block_intel:
7417 case nir_intrinsic_store_ssbo_block_intel: {
7418 assert(nir_src_bit_size(instr->src[0]) == 32);
7419
7420 const bool is_ssbo =
7421 instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
7422
7423 fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[is_ssbo ? 2 : 1]));
7424 fs_reg src = get_nir_src(ntb, instr->src[0]);
7425
7426 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
7427 srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ?
7428 get_nir_buffer_intrinsic_index(ntb, bld, instr) :
7429 fs_reg(brw_imm_ud(GFX7_BTI_SLM));
7430 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address;
7431
7432 const fs_builder ubld1 = bld.exec_all().group(1, 0);
7433 const fs_builder ubld8 = bld.exec_all().group(8, 0);
7434 const fs_builder ubld16 = bld.exec_all().group(16, 0);
7435
7436 const unsigned total = instr->num_components * s.dispatch_width;
7437 unsigned written = 0;
7438
7439 while (written < total) {
7440 const unsigned block =
7441 choose_oword_block_size_dwords(devinfo, total - written);
7442
7443 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block);
7444 srcs[SURFACE_LOGICAL_SRC_DATA] =
7445 retype(byte_offset(src, written * 4), BRW_REGISTER_TYPE_UD);
7446
7447 const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
7448 ubld.emit(SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL,
7449 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
7450
7451 const unsigned block_bytes = block * 4;
7452 ubld1.ADD(address, address, brw_imm_ud(block_bytes));
7453 written += block;
7454 }
7455
7456 assert(written == total);
7457 break;
7458 }
7459
7460 case nir_intrinsic_load_topology_id_intel: {
7461 /* These bits move around on basically every hardware generation, so
7462 * don't try to handle unknown platforms; just fail if the platform
7463 * hasn't explicitly been enabled here.
7464 */
7465 assert(devinfo->ver >= 12 && devinfo->ver <= 20);
7466
7467 /* Here is what the layout of SR0 looks like on Gfx12
7468 * https://gfxspecs.intel.com/Predator/Home/Index/47256
7469 * [13:11] : Slice ID.
7470 * [10:9] : Dual-SubSlice ID
7471 * [8] : SubSlice ID
7472 * [7] : EUID[2] (aka EU Row ID)
7473 * [6] : Reserved
7474 * [5:4] : EUID[1:0]
7475 * [2:0] : Thread ID
7476 *
7477 * Xe2: Engine 3D and GPGPU Programs, EU Overview, Registers and
7478 * Register Regions, ARF Registers, State Register,
7479 * https://gfxspecs.intel.com/Predator/Home/Index/56623
7480 * [15:11] : Slice ID.
7481 * [9:8] : SubSlice ID
7482 * [6:4] : EUID
7483 * [2:0] : Thread ID
7484 */
7485 fs_reg raw_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
7486 bld.emit(SHADER_OPCODE_READ_SR_REG, raw_id, brw_imm_ud(0));
7487 switch (nir_intrinsic_base(instr)) {
7488 case BRW_TOPOLOGY_ID_DSS:
7489 if (devinfo->ver >= 20) {
7490 /* Xe2+: 3D and GPGPU Programs, Shared Functions, Ray Tracing:
7491 * https://gfxspecs.intel.com/Predator/Home/Index/56936
7492 *
7493 * Note: DSSID in all formulas below is a logical identifier of an
7494 * XeCore (a value that goes from 0 to (number_of_slices *
7495 * number_of_XeCores_per_slice -1). SW can get this value from
7496 * either:
7497 *
7498 * - Message Control Register LogicalSSID field (only in shaders
7499 * eligible for Mid-Thread Preemption).
7500 * - Calculated based on the State Register with the following formula:
7501 * DSSID = StateRegister.SliceID * GT_ARCH_SS_PER_SLICE +
7502 * StateRegister.SubSliceID where GT_ARCH_SS_PER_SLICE is an
7503 * architectural parameter defined per product SKU.
7504 *
7505 * We are using the state register to calculate the DSSID.
7506 */
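/* Illustrative example: with SliceID == 2, SubSliceID == 3 and
 * max_subslices_per_slice == 4, the code below computes
 * DSSID = 2 * 4 + 3 = 11.
 */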
7507 fs_reg slice_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
7508 fs_reg subslice_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
7509 bld.AND(slice_id, raw_id, brw_imm_ud(INTEL_MASK(15, 11)));
7510 bld.SHR(slice_id, slice_id, brw_imm_ud(11));
7511
7512 /* Assert that max subslices covers at least 2 bits that we use for
7513 * subslices.
7514 */
7515 assert(devinfo->max_subslices_per_slice >= (1 << 2));
7516 bld.MUL(slice_id, slice_id,
7517 brw_imm_ud(devinfo->max_subslices_per_slice));
7518 bld.AND(subslice_id, raw_id, brw_imm_ud(INTEL_MASK(9, 8)));
7519 bld.SHR(subslice_id, subslice_id, brw_imm_ud(8));
7520 bld.ADD(retype(dest, BRW_REGISTER_TYPE_UD), slice_id,
7521 subslice_id);
7522 } else {
7523 bld.AND(raw_id, raw_id, brw_imm_ud(0x3fff));
7524 /* Get rid of anything below dualsubslice */
7525 bld.SHR(retype(dest, BRW_REGISTER_TYPE_UD), raw_id, brw_imm_ud(9));
7526 }
7527 break;
7528 case BRW_TOPOLOGY_ID_EU_THREAD_SIMD: {
7529 s.limit_dispatch_width(16, "Topology helper for Ray queries, "
7530 "not supported in SIMD32 mode.");
7531 fs_reg dst = retype(dest, BRW_REGISTER_TYPE_UD);
7532
7533 if (devinfo->ver >= 20) {
7534 /* Xe2+: Graphics Engine, 3D and GPGPU Programs, Shared Functions
7535 * Ray Tracing,
7536 * https://gfxspecs.intel.com/Predator/Home/Index/56936
7537 *
7538 * SyncStackID = (EUID[2:0] << 8) | (ThreadID[2:0] << 4) |
7539 * SIMDLaneID[3:0];
7540 *
7541 * This section just deals with the EUID part.
7542 *
7543 * The 3-bit EU[2:0] we need to build for ray query memory address
7544 * computations is a bit odd:
7545 *
7546 * EU[2:0] = raw_id[6:4] (identified as EUID[2:0])
7547 */
7548 bld.AND(dst, raw_id, brw_imm_ud(INTEL_MASK(6, 4)));
7549 bld.SHL(dst, dst, brw_imm_ud(4));
7550 } else {
7551 /* EU[3:0] << 7
7552 *
7553 * The 4-bit EU[3:0] we need to build for ray query memory address
7554 * computations is a bit odd:
7555 *
7556 * EU[1:0] = raw_id[5:4] (identified as EUID[1:0])
7557 * EU[2] = raw_id[8] (identified as SubSlice ID)
7558 * EU[3] = raw_id[7] (identified as EUID[2] or Row ID)
7559 */
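/* Summary of the bit shuffling below: dst ends up holding
 * EUID[3:0] << 7 | ThreadID[2:0] << 4 | SIMDLaneID[3:0], once the
 * ThreadID and SIMD lane terms are merged in by the code that follows.
 */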
7560 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
7561 bld.AND(tmp, raw_id, brw_imm_ud(INTEL_MASK(7, 7)));
7562 bld.SHL(dst, tmp, brw_imm_ud(3));
7563 bld.AND(tmp, raw_id, brw_imm_ud(INTEL_MASK(8, 8)));
7564 bld.SHL(tmp, tmp, brw_imm_ud(1));
7565 bld.OR(dst, dst, tmp);
7566 bld.AND(tmp, raw_id, brw_imm_ud(INTEL_MASK(5, 4)));
7567 bld.SHL(tmp, tmp, brw_imm_ud(3));
7568 bld.OR(dst, dst, tmp);
7569 }
7570
7571 /* ThreadID[2:0] << 4 (ThreadID comes from raw_id[2:0]) */
7572 {
7573 bld.AND(raw_id, raw_id, brw_imm_ud(INTEL_MASK(2, 0)));
7574 bld.SHL(raw_id, raw_id, brw_imm_ud(4));
7575 bld.OR(dst, dst, raw_id);
7576 }
7577
7578 /* LaneID[0:3] << 0 (Use nir SYSTEM_VALUE_SUBGROUP_INVOCATION) */
7579 assert(bld.dispatch_width() <= 16); /* Limit to 4 bits */
7580 bld.ADD(dst, dst,
7581 ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
7582 break;
7583 }
7584 default:
7585 unreachable("Invalid topology id type");
7586 }
7587 break;
7588 }
7589
7590 case nir_intrinsic_load_btd_stack_id_intel:
7591 if (s.stage == MESA_SHADER_COMPUTE) {
7592 assert(brw_cs_prog_data(s.prog_data)->uses_btd_stack_ids);
7593 } else {
7594 assert(brw_shader_stage_is_bindless(s.stage));
7595 }
7596 /* Stack IDs are always in R1 regardless of whether we're coming from a
7597 * bindless shader or a regular compute shader.
7598 */
7599 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
7600 retype(brw_vec8_grf(1 * reg_unit(devinfo), 0), BRW_REGISTER_TYPE_UW));
7601 break;
7602
7603 case nir_intrinsic_btd_spawn_intel:
7604 if (s.stage == MESA_SHADER_COMPUTE) {
7605 assert(brw_cs_prog_data(s.prog_data)->uses_btd_stack_ids);
7606 } else {
7607 assert(brw_shader_stage_is_bindless(s.stage));
7608 }
7609 /* Make sure all the pointers to resume shaders have landed where other
7610 * threads can see them.
7611 */
7612 emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE);
7613
7614 bld.emit(SHADER_OPCODE_BTD_SPAWN_LOGICAL, bld.null_reg_ud(),
7615 bld.emit_uniformize(get_nir_src(ntb, instr->src[0])),
7616 get_nir_src(ntb, instr->src[1]));
7617 break;
7618
7619 case nir_intrinsic_btd_retire_intel:
7620 if (s.stage == MESA_SHADER_COMPUTE) {
7621 assert(brw_cs_prog_data(s.prog_data)->uses_btd_stack_ids);
7622 } else {
7623 assert(brw_shader_stage_is_bindless(s.stage));
7624 }
7625 /* Make sure all the pointers to resume shaders have landed where other
7626 * threads can see them.
7627 */
7628 emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE);
7629 bld.emit(SHADER_OPCODE_BTD_RETIRE_LOGICAL);
7630 break;
7631
7632 case nir_intrinsic_trace_ray_intel: {
7633 const bool synchronous = nir_intrinsic_synchronous(instr);
7634 assert(brw_shader_stage_is_bindless(s.stage) || synchronous);
7635
7636 /* Make sure all the previous RT structure writes are visible to the RT
7637 * fixed function within the DSS, as well as stack pointers to resume
7638 * shaders.
7639 */
7640 emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_NONE);
7641
7642 fs_reg srcs[RT_LOGICAL_NUM_SRCS];
7643
7644 fs_reg globals = get_nir_src(ntb, instr->src[0]);
7645 srcs[RT_LOGICAL_SRC_GLOBALS] = bld.emit_uniformize(globals);
7646 srcs[RT_LOGICAL_SRC_BVH_LEVEL] = get_nir_src(ntb, instr->src[1]);
7647 srcs[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] = get_nir_src(ntb, instr->src[2]);
7648 srcs[RT_LOGICAL_SRC_SYNCHRONOUS] = brw_imm_ud(synchronous);
7649 bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL, bld.null_reg_ud(),
7650 srcs, RT_LOGICAL_NUM_SRCS);
7651
7652 /* There is no actual value to use in the destination register of the
7653 * synchronous trace instruction. All of the communication with the HW
7654 * unit happens through memory reads/writes. So to ensure that the
7655 * operation has completed before we go read the results in memory, we
7656 * need a barrier followed by an invalidate before accessing memory.
7657 */
7658 if (synchronous) {
7659 bld.emit(BRW_OPCODE_SYNC, bld.null_reg_ud(), brw_imm_ud(TGL_SYNC_ALLWR));
7660 emit_rt_lsc_fence(bld, LSC_FENCE_LOCAL, LSC_FLUSH_TYPE_INVALIDATE);
7661 }
7662 break;
7663 }
7664
7665 default:
7666 #ifndef NDEBUG
7667 assert(instr->intrinsic < nir_num_intrinsics);
7668 fprintf(stderr, "intrinsic: %s\n", nir_intrinsic_infos[instr->intrinsic].name);
7669 #endif
7670 unreachable("unknown intrinsic");
7671 }
7672 }
7673
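/* Widen a 16-bit atomic source to a 32-bit UD temporary, since the atomic
 * data payload is built from dword-sized registers; 32-bit and 64-bit
 * sources are passed through unchanged.
 */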
7674 static fs_reg
7675 expand_to_32bit(const fs_builder &bld, const fs_reg &src)
7676 {
7677 if (type_sz(src.type) == 2) {
7678 fs_reg src32 = bld.vgrf(BRW_REGISTER_TYPE_UD);
7679 bld.MOV(src32, retype(src, BRW_REGISTER_TYPE_UW));
7680 return src32;
7681 } else {
7682 return src;
7683 }
7684 }
7685
7686 static void
7687 fs_nir_emit_surface_atomic(nir_to_brw_state &ntb, const fs_builder &bld,
7688 nir_intrinsic_instr *instr,
7689 fs_reg surface,
7690 bool bindless)
7691 {
7692 const intel_device_info *devinfo = ntb.devinfo;
7693 fs_visitor &s = ntb.s;
7694
7695 enum lsc_opcode op = lsc_aop_for_nir_intrinsic(instr);
7696 int num_data = lsc_op_num_data_values(op);
7697
7698 bool shared = surface.file == IMM && surface.ud == GFX7_BTI_SLM;
7699
7700 /* The BTI untyped atomic messages only support 32-bit atomics. If you
7701 * just look at the big table of messages in Vol 7 of the SKL PRM, they
7702 * appear to exist. However, if you look at Vol 2a, there are no message
7703 * descriptors provided for Qword atomic ops except for A64 messages.
7704 *
7705 * 16-bit float atomics are supported, however.
7706 */
7707 assert(instr->def.bit_size == 32 ||
7708 (instr->def.bit_size == 64 && devinfo->has_lsc) ||
7709 (instr->def.bit_size == 16 &&
7710 (devinfo->has_lsc || lsc_opcode_is_atomic_float(op))));
7711
7712 fs_reg dest = get_nir_def(ntb, instr->def);
7713
7714 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
7715 srcs[bindless ?
7716 SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
7717 SURFACE_LOGICAL_SRC_SURFACE] = surface;
7718 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
7719 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
7720 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
7721
7722 if (shared) {
7723 /* SLM - Get the offset */
7724 if (nir_src_is_const(instr->src[0])) {
7725 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
7726 brw_imm_ud(nir_intrinsic_base(instr) +
7727 nir_src_as_uint(instr->src[0]));
7728 } else {
7729 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = s.vgrf(glsl_uint_type());
7730 bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
7731 retype(get_nir_src(ntb, instr->src[0]), BRW_REGISTER_TYPE_UD),
7732 brw_imm_ud(nir_intrinsic_base(instr)));
7733 }
7734 } else {
7735 /* SSBOs */
7736 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
7737 }
7738
7739 fs_reg data;
7740 if (num_data >= 1)
7741 data = expand_to_32bit(bld, get_nir_src(ntb, instr->src[shared ? 1 : 2]));
7742
7743 if (num_data >= 2) {
7744 fs_reg tmp = bld.vgrf(data.type, 2);
7745 fs_reg sources[2] = {
7746 data,
7747 expand_to_32bit(bld, get_nir_src(ntb, instr->src[shared ? 2 : 3]))
7748 };
7749 bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
7750 data = tmp;
7751 }
7752 srcs[SURFACE_LOGICAL_SRC_DATA] = data;
7753
7754 /* Emit the actual atomic operation */
7755
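/* For 16-bit atomics the return payload is still one dword per channel;
 * the 16-bit case below receives it in a UD temporary and narrows it back
 * to the 16-bit destination with a MOV.
 */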
7756 switch (instr->def.bit_size) {
7757 case 16: {
7758 fs_reg dest32 = bld.vgrf(BRW_REGISTER_TYPE_UD);
7759 bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
7760 retype(dest32, dest.type),
7761 srcs, SURFACE_LOGICAL_NUM_SRCS);
7762 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UW),
7763 retype(dest32, BRW_REGISTER_TYPE_UD));
7764 break;
7765 }
7766
7767 case 32:
7768 case 64:
7769 bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
7770 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
7771 break;
7772 default:
7773 unreachable("Unsupported bit size");
7774 }
7775 }
7776
7777 static void
7778 fs_nir_emit_global_atomic(nir_to_brw_state &ntb, const fs_builder &bld,
7779 nir_intrinsic_instr *instr)
7780 {
7781 enum lsc_opcode op = lsc_aop_for_nir_intrinsic(instr);
7782 int num_data = lsc_op_num_data_values(op);
7783
7784 fs_reg dest = get_nir_def(ntb, instr->def);
7785
7786 fs_reg addr = get_nir_src(ntb, instr->src[0]);
7787
7788 fs_reg data;
7789 if (num_data >= 1)
7790 data = expand_to_32bit(bld, get_nir_src(ntb, instr->src[1]));
7791
7792 if (num_data >= 2) {
7793 fs_reg tmp = bld.vgrf(data.type, 2);
7794 fs_reg sources[2] = {
7795 data,
7796 expand_to_32bit(bld, get_nir_src(ntb, instr->src[2]))
7797 };
7798 bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
7799 data = tmp;
7800 }
7801
7802 fs_reg srcs[A64_LOGICAL_NUM_SRCS];
7803 srcs[A64_LOGICAL_ADDRESS] = addr;
7804 srcs[A64_LOGICAL_SRC] = data;
7805 srcs[A64_LOGICAL_ARG] = brw_imm_ud(op);
7806 srcs[A64_LOGICAL_ENABLE_HELPERS] = brw_imm_ud(0);
7807
7808 switch (instr->def.bit_size) {
7809 case 16: {
7810 fs_reg dest32 = bld.vgrf(BRW_REGISTER_TYPE_UD);
7811 bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
7812 retype(dest32, dest.type),
7813 srcs, A64_LOGICAL_NUM_SRCS);
7814 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UW), dest32);
7815 break;
7816 }
7817 case 32:
7818 case 64:
7819 bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, dest,
7820 srcs, A64_LOGICAL_NUM_SRCS);
7821 break;
7822 default:
7823 unreachable("Unsupported bit size");
7824 }
7825 }
7826
7827 static void
7828 fs_nir_emit_texture(nir_to_brw_state &ntb,
7829 nir_tex_instr *instr)
7830 {
7831 const intel_device_info *devinfo = ntb.devinfo;
7832 const fs_builder &bld = ntb.bld;
7833 fs_visitor &s = ntb.s;
7834
7835 fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
7836
7837 /* SKL PRMs: Volume 7: 3D-Media-GPGPU:
7838 *
7839 * "The Pixel Null Mask field, when enabled via the Pixel Null Mask
7840 * Enable will be incorrect for sample_c when applied to a surface with
7841 * 64-bit per texel format such as R16G16BA16_UNORM. Pixel Null mask
7842 * Enable may incorrectly report pixels as referencing a Null surface."
7843 *
7844 * We'll take care of this in NIR.
7845 */
7846 assert(!instr->is_sparse || srcs[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE);
7847
7848 srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_ud(instr->is_sparse);
7849
7850 int lod_components = 0;
7851
7852 /* The hardware requires a LOD for buffer textures */
7853 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
7854 srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);
7855
7856 ASSERTED bool got_lod = false;
7857 ASSERTED bool got_bias = false;
7858 bool pack_lod_and_array_index = false;
7859 bool pack_lod_bias_and_offset = false;
7860 uint32_t header_bits = 0;
7861 for (unsigned i = 0; i < instr->num_srcs; i++) {
7862 nir_src nir_src = instr->src[i].src;
7863 fs_reg src = get_nir_src(ntb, nir_src);
7864 switch (instr->src[i].src_type) {
7865 case nir_tex_src_bias:
7866 assert(!got_lod);
7867 got_bias = true;
7868
7869 srcs[TEX_LOGICAL_SRC_LOD] =
7870 retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_REGISTER_TYPE_F);
7871 break;
7872 case nir_tex_src_comparator:
7873 srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F);
7874 break;
7875 case nir_tex_src_coord:
7876 switch (instr->op) {
7877 case nir_texop_txf:
7878 case nir_texop_txf_ms:
7879 case nir_texop_txf_ms_mcs_intel:
7880 case nir_texop_samples_identical:
7881 srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D);
7882 break;
7883 default:
7884 srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F);
7885 break;
7886 }
7887 break;
7888 case nir_tex_src_ddx:
7889 srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F);
7890 lod_components = nir_tex_instr_src_size(instr, i);
7891 break;
7892 case nir_tex_src_ddy:
7893 srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F);
7894 break;
7895 case nir_tex_src_lod:
7896 assert(!got_bias);
7897 got_lod = true;
7898
7899 switch (instr->op) {
7900 case nir_texop_txs:
7901 srcs[TEX_LOGICAL_SRC_LOD] =
7902 retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_REGISTER_TYPE_UD);
7903 break;
7904 case nir_texop_txf:
7905 srcs[TEX_LOGICAL_SRC_LOD] =
7906 retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_REGISTER_TYPE_D);
7907 break;
7908 default:
7909 srcs[TEX_LOGICAL_SRC_LOD] =
7910 retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_REGISTER_TYPE_F);
7911 break;
7912 }
7913 break;
7914 case nir_tex_src_min_lod:
7915 srcs[TEX_LOGICAL_SRC_MIN_LOD] =
7916 retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_REGISTER_TYPE_F);
7917 break;
7918 case nir_tex_src_ms_index:
7919 srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD);
7920 break;
7921
7922 case nir_tex_src_offset: {
7923 uint32_t offset_bits = 0;
7924 if (brw_texture_offset(instr, i, &offset_bits)) {
7925 header_bits |= offset_bits;
7926 } else {
7927 /* On gfx12.5+, if the offsets are not both constant and in the
7928 * [-8, 7] range, nir_lower_tex() will have already lowered the
7929 * source offset. So we should never reach this point.
7930 */
7931 assert(devinfo->verx10 < 125);
7932 srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
7933 retype(src, BRW_REGISTER_TYPE_D);
7934 }
7935 break;
7936 }
7937
7938 case nir_tex_src_projector:
7939 unreachable("should be lowered");
7940
7941 case nir_tex_src_texture_offset: {
7942 assert(srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE);
7943 /* Emit code to evaluate the actual indexing expression */
7944 if (instr->texture_index == 0 && is_resource_src(nir_src))
7945 srcs[TEX_LOGICAL_SRC_SURFACE] = get_resource_nir_src(ntb, nir_src);
7946 if (srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE) {
7947 fs_reg tmp = s.vgrf(glsl_uint_type());
7948 bld.ADD(tmp, src, brw_imm_ud(instr->texture_index));
7949 srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
7950 }
7951 assert(srcs[TEX_LOGICAL_SRC_SURFACE].file != BAD_FILE);
7952 break;
7953 }
7954
7955 case nir_tex_src_sampler_offset: {
7956 /* Emit code to evaluate the actual indexing expression */
7957 if (instr->sampler_index == 0 && is_resource_src(nir_src))
7958 srcs[TEX_LOGICAL_SRC_SAMPLER] = get_resource_nir_src(ntb, nir_src);
7959 if (srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE) {
7960 fs_reg tmp = s.vgrf(glsl_uint_type());
7961 bld.ADD(tmp, src, brw_imm_ud(instr->sampler_index));
7962 srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
7963 }
7964 break;
7965 }
7966
7967 case nir_tex_src_texture_handle:
7968 assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1);
7969 srcs[TEX_LOGICAL_SRC_SURFACE] = fs_reg();
7970 if (is_resource_src(nir_src))
7971 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = get_resource_nir_src(ntb, nir_src);
7972 if (srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE].file == BAD_FILE)
7973 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src);
7974 break;
7975
7976 case nir_tex_src_sampler_handle:
7977 assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
7978 srcs[TEX_LOGICAL_SRC_SAMPLER] = fs_reg();
7979 if (is_resource_src(nir_src))
7980 srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = get_resource_nir_src(ntb, nir_src);
7981 if (srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE].file == BAD_FILE)
7982 srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src);
7983 break;
7984
7985 case nir_tex_src_ms_mcs_intel:
7986 assert(instr->op == nir_texop_txf_ms);
7987 srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);
7988 break;
7989
7990 /* If this parameter is present, we are packing offset U, V and LOD/Bias
7991 * into a single (32-bit) value.
7992 */
7993 case nir_tex_src_backend2:
7994 assert(instr->op == nir_texop_tg4);
7995 pack_lod_bias_and_offset = true;
7996 srcs[TEX_LOGICAL_SRC_LOD] =
7997 retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_REGISTER_TYPE_F);
7998 break;
7999
8000 /* If this parameter is present, we are packing either the explicit LOD
8001 * or LOD bias and the array index into a single (32-bit) value when
8002 * 32-bit texture coordinates are used.
8003 */
8004 case nir_tex_src_backend1:
8005 assert(!got_lod && !got_bias);
8006 got_lod = true;
8007 pack_lod_and_array_index = true;
8008 assert(instr->op == nir_texop_txl || instr->op == nir_texop_txb);
8009 srcs[TEX_LOGICAL_SRC_LOD] =
8010 retype(get_nir_src_imm(ntb, instr->src[i].src), BRW_REGISTER_TYPE_F);
8011 break;
8012
8013 default:
8014 unreachable("unknown texture source");
8015 }
8016 }
8017
8018 /* If the surface or sampler were not specified through sources, use the
8019 * instruction index.
8020 */
8021 if (srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE &&
8022 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE].file == BAD_FILE)
8023 srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(instr->texture_index);
8024 if (srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE &&
8025 srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE].file == BAD_FILE)
8026 srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(instr->sampler_index);
8027
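   /* Compressed multisample surfaces need the MCS (multisample control
    * surface) value alongside the ld2dms payload.  If the shader did not
    * provide it explicitly (txf_ms_mcs_intel), fetch it up front with
    * emit_mcs_fetch() for txf_ms and samples_identical.
    */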
8028 if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
8029 (instr->op == nir_texop_txf_ms ||
8030 instr->op == nir_texop_samples_identical)) {
8031 srcs[TEX_LOGICAL_SRC_MCS] =
8032 emit_mcs_fetch(ntb, srcs[TEX_LOGICAL_SRC_COORDINATE],
8033 instr->coord_components,
8034 srcs[TEX_LOGICAL_SRC_SURFACE],
8035 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]);
8036 }
8037
8038 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
8039 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);
8040
8041 enum opcode opcode;
8042 switch (instr->op) {
8043 case nir_texop_tex:
8044 opcode = SHADER_OPCODE_TEX_LOGICAL;
8045 break;
8046 case nir_texop_txb:
8047 opcode = FS_OPCODE_TXB_LOGICAL;
8048 break;
8049 case nir_texop_txl:
8050 opcode = SHADER_OPCODE_TXL_LOGICAL;
8051 break;
8052 case nir_texop_txd:
8053 opcode = SHADER_OPCODE_TXD_LOGICAL;
8054 break;
8055 case nir_texop_txf:
8056 opcode = SHADER_OPCODE_TXF_LOGICAL;
8057 break;
8058 case nir_texop_txf_ms:
8059 /* On Gfx12HP there is only CMS_W available. From the Bspec: Shared
8060 * Functions - 3D Sampler - Messages - Message Format:
8061 *
8062 * ld2dms REMOVEDBY(GEN:HAS:1406788836)
8063 */
8064 if (devinfo->verx10 >= 125)
8065 opcode = SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL;
8066 else
8067 opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
8068 break;
8069 case nir_texop_txf_ms_mcs_intel:
8070 opcode = SHADER_OPCODE_TXF_MCS_LOGICAL;
8071 break;
8072 case nir_texop_query_levels:
8073 case nir_texop_txs:
8074 opcode = SHADER_OPCODE_TXS_LOGICAL;
8075 break;
8076 case nir_texop_lod:
8077 opcode = SHADER_OPCODE_LOD_LOGICAL;
8078 break;
8079 case nir_texop_tg4: {
8080 if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE) {
8081 opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;
8082 } else {
8083 opcode = SHADER_OPCODE_TG4_LOGICAL;
8084 if (devinfo->ver >= 20) {
8085 /* If the SPV_AMD_texture_gather_bias_lod extension is enabled, all
8086 * texture gather functions (i.e. both the ones which take the extra
8087 * bias argument and the ones that do not) fetch texels from the
8088 * implicit LOD in the fragment shader stage. In all other shader
8089 * stages, the base level is used instead.
8090 */
8091 if (instr->is_gather_implicit_lod)
8092 opcode = SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL;
8093
8094 if (got_bias)
8095 opcode = SHADER_OPCODE_TG4_BIAS_LOGICAL;
8096
8097 if (got_lod)
8098 opcode = SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL;
8099
8100 if (pack_lod_bias_and_offset) {
8101 if (got_lod)
8102 opcode = SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL;
8103 if (got_bias)
8104 opcode = SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL;
8105 }
8106 }
8107 }
8108 break;
8109 }
8110 case nir_texop_texture_samples:
8111 opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;
8112 break;
8113 case nir_texop_samples_identical: {
8114 fs_reg dst = retype(get_nir_def(ntb, instr->def), BRW_REGISTER_TYPE_D);
8115
8116 /* If mcs is an immediate value, it means there is no MCS. In that case
8117 * just return false.
8118 */
8119 if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) {
8120 bld.MOV(dst, brw_imm_ud(0u));
8121 } else {
8122 fs_reg tmp = s.vgrf(glsl_uint_type());
8123 bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS],
8124 offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
8125 bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
8126 }
8127 return;
8128 }
8129 default:
8130 unreachable("unknown texture opcode");
8131 }
8132
8133 if (instr->op == nir_texop_tg4) {
8134 header_bits |= instr->component << 16;
8135 }
8136
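   /* Emit the texture instruction itself.  The destination is allocated as
    * four components (plus one when sparse residency is requested), and
    * header_bits (the immediate texel offsets and, for gathers, the source
    * channel select) is handed to the generator via inst->offset.
    */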
8137 fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4 + instr->is_sparse);
8138 fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
8139 inst->offset = header_bits;
8140
8141 inst->has_packed_lod_ai_src = pack_lod_and_array_index;
8142
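   /* Record in size_written only the components that are actually read, so
    * later passes know how much of the destination is really defined.  tg4
    * and query_levels results keep the full four components, and sparse
    * results always include one extra register of residency data at the
    * end.
    */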
8143 const unsigned dest_size = nir_tex_instr_dest_size(instr);
8144 if (instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
8145 unsigned write_mask = nir_def_components_read(&instr->def);
8146 assert(write_mask != 0); /* dead code should have been eliminated */
8147 if (instr->is_sparse) {
8148 inst->size_written = (util_last_bit(write_mask) - 1) *
8149 inst->dst.component_size(inst->exec_size) +
8150 (reg_unit(devinfo) * REG_SIZE);
8151 } else {
8152 inst->size_written = util_last_bit(write_mask) *
8153 inst->dst.component_size(inst->exec_size);
8154 }
8155 } else {
8156 inst->size_written = 4 * inst->dst.component_size(inst->exec_size) +
8157 (instr->is_sparse ? (reg_unit(devinfo) * REG_SIZE) : 0);
8158 }
8159
8160 if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
8161 inst->shadow_compare = true;
8162
8163 /* Wa_14012688258:
8164 *
8165 * Don't trim zeros at the end of payload for sample operations
8166 * in cube and cube arrays.
8167 */
8168 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
8169 intel_needs_workaround(devinfo, 14012688258)) {
8170
8171 /* Compiler should send U,V,R parameters even if V,R are 0. */
8172 if (srcs[TEX_LOGICAL_SRC_COORDINATE].file != BAD_FILE)
8173 assert(instr->coord_components >= 3u);
8174
8175 /* See opt_zero_samples(). */
8176 inst->keep_payload_trailing_zeros = true;
8177 }
8178
8179 fs_reg nir_dest[5];
8180 for (unsigned i = 0; i < dest_size; i++)
8181 nir_dest[i] = offset(dst, bld, i);
8182
8183 if (instr->op == nir_texop_query_levels) {
8184 /* # levels is in .w */
8185 if (devinfo->ver == 9) {
8186 /**
8187 * Wa_1940217:
8188 *
8189 * When a surface of type SURFTYPE_NULL is accessed by resinfo, the
8190 * MIPCount returned is undefined instead of 0.
8191 */
8192 fs_inst *mov = bld.MOV(bld.null_reg_d(), dst);
8193 mov->conditional_mod = BRW_CONDITIONAL_NZ;
8194 nir_dest[0] = bld.vgrf(BRW_REGISTER_TYPE_D);
8195 fs_inst *sel = bld.SEL(nir_dest[0], offset(dst, bld, 3), brw_imm_d(0));
8196 sel->predicate = BRW_PREDICATE_NORMAL;
8197 } else {
8198 nir_dest[0] = offset(dst, bld, 3);
8199 }
8200 }
8201
8202 /* The residency bits are only in the first component. */
8203 if (instr->is_sparse)
8204 nir_dest[dest_size - 1] = component(offset(dst, bld, dest_size - 1), 0);
8205
8206 bld.LOAD_PAYLOAD(get_nir_def(ntb, instr->def), nir_dest, dest_size, 0);
8207 }
8208
8209 static void
8210 fs_nir_emit_jump(nir_to_brw_state &ntb, nir_jump_instr *instr)
8211 {
8212 switch (instr->type) {
8213 case nir_jump_break:
8214 ntb.bld.emit(BRW_OPCODE_BREAK);
8215 break;
8216 case nir_jump_continue:
8217 ntb.bld.emit(BRW_OPCODE_CONTINUE);
8218 break;
8219 case nir_jump_halt:
8220 ntb.bld.emit(BRW_OPCODE_HALT);
8221 break;
8222 case nir_jump_return:
8223 default:
8224 unreachable("unknown jump");
8225 }
8226 }
8227
8228 /*
8229 * This helper takes a source register and un/shuffles it into the destination
8230 * register.
8231 *
8232 * If the source type size is smaller than the destination type size, the
8233 * operation needed is a component shuffle. The opposite case is an
8234 * unshuffle. If the source and destination type sizes are equal, the
8235 * shuffle degenerates into a simple MOV.
8236 *
8237 * For example, if the source is a 16-bit type and the destination is
8238 * 32-bit, a 3-component .xyz 16-bit vector in SIMD8 would be:
8239 *
8240 * |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
8241 * |z1|z2|z3|z4|z5|z6|z7|z8| | | | | | | | |
8242 *
8243 * This helper will return the following 2 32-bit components with the 16-bit
8244 * values shuffled:
8245 *
8246 * |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
8247 * |z1 |z2 |z3 |z4 |z5 |z6 |z7 |z8 |
8248 *
8249 * For an unshuffle, the example would be the opposite: a 64-bit type
8250 * source and a 32-bit destination. A 2-component .xy 64-bit vector in
8251 * SIMD8 would be:
8252 *
8253 * | x1l x1h | x2l x2h | x3l x3h | x4l x4h |
8254 * | x5l x5h | x6l x6h | x7l x7h | x8l x8h |
8255 * | y1l y1h | y2l y2h | y3l y3h | y4l y4h |
8256 * | y5l y5h | y6l y6h | y7l y7h | y8l y8h |
8257 *
8258 * The returned result would be the following 4 32-bit components unshuffled:
8259 *
8260 * | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
8261 * | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
8262 * | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
8263 * | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
8264 *
8265 * - The source and destination registers must not overlap.
8266 * - Component units are measured in terms of the smaller of the source
8267 * and destination types, because we are un/shuffling the smaller
8268 * components from/into the bigger ones.
8269 * - The first_component parameter allows skipping source components.
8270 */
8271 void
8272 shuffle_src_to_dst(const fs_builder &bld,
8273 const fs_reg &dst,
8274 const fs_reg &src,
8275 uint32_t first_component,
8276 uint32_t components)
8277 {
8278 if (type_sz(src.type) == type_sz(dst.type)) {
8279 assert(!regions_overlap(dst,
8280 type_sz(dst.type) * bld.dispatch_width() * components,
8281 offset(src, bld, first_component),
8282 type_sz(src.type) * bld.dispatch_width() * components));
8283 for (unsigned i = 0; i < components; i++) {
8284 bld.MOV(retype(offset(dst, bld, i), src.type),
8285 offset(src, bld, i + first_component));
8286 }
8287 } else if (type_sz(src.type) < type_sz(dst.type)) {
8288 /* Source is shuffled into destination */
8289 unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
8290 assert(!regions_overlap(dst,
8291 type_sz(dst.type) * bld.dispatch_width() *
8292 DIV_ROUND_UP(components, size_ratio),
8293 offset(src, bld, first_component),
8294 type_sz(src.type) * bld.dispatch_width() * components));
8295
8296 brw_reg_type shuffle_type =
8297 brw_reg_type_from_bit_size(8 * type_sz(src.type),
8298 BRW_REGISTER_TYPE_D);
8299 for (unsigned i = 0; i < components; i++) {
8300 fs_reg shuffle_component_i =
8301 subscript(offset(dst, bld, i / size_ratio),
8302 shuffle_type, i % size_ratio);
8303 bld.MOV(shuffle_component_i,
8304 retype(offset(src, bld, i + first_component), shuffle_type));
8305 }
8306 } else {
8307 /* Source is unshuffled into destination */
8308 unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
8309 assert(!regions_overlap(dst,
8310 type_sz(dst.type) * bld.dispatch_width() * components,
8311 offset(src, bld, first_component / size_ratio),
8312 type_sz(src.type) * bld.dispatch_width() *
8313 DIV_ROUND_UP(components + (first_component % size_ratio),
8314 size_ratio)));
8315
8316 brw_reg_type shuffle_type =
8317 brw_reg_type_from_bit_size(8 * type_sz(dst.type),
8318 BRW_REGISTER_TYPE_D);
8319 for (unsigned i = 0; i < components; i++) {
8320 fs_reg shuffle_component_i =
8321 subscript(offset(src, bld, (first_component + i) / size_ratio),
8322 shuffle_type, (first_component + i) % size_ratio);
8323 bld.MOV(retype(offset(dst, bld, i), shuffle_type),
8324 shuffle_component_i);
8325 }
8326 }
8327 }
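/* Illustrative call of shuffle_src_to_dst() (a sketch, not a caller taken
 * from this file): packing four 16-bit components into the low/high halves
 * of 32-bit destination channels would look roughly like
 *
 *    shuffle_src_to_dst(bld, dst_32bit, src_16bit, 0, 4);
 *
 * which emits one MOV per 16-bit component, writing through subscript()
 * into the matching half of the wider destination channel.
 */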
8328
8329 void
8330 shuffle_from_32bit_read(const fs_builder &bld,
8331 const fs_reg &dst,
8332 const fs_reg &src,
8333 uint32_t first_component,
8334 uint32_t components)
8335 {
8336 assert(type_sz(src.type) == 4);
8337
8338 /* This function takes components in units of the destination type, while
8339 * shuffle_src_to_dst() takes components in units of the smallest type.
8340 */
8341 if (type_sz(dst.type) > 4) {
8342 assert(type_sz(dst.type) == 8);
8343 first_component *= 2;
8344 components *= 2;
8345 }
8346
8347 shuffle_src_to_dst(bld, dst, src, first_component, components);
8348 }
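/* Example of the unit conversion in shuffle_from_32bit_read() (a sketch):
 * reading two 64-bit components starting at destination component 1 from a
 * 32-bit payload becomes shuffle_src_to_dst(bld, dst, src, 2, 4), i.e. four
 * 32-bit components starting at source component 2.
 */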
8349
8350 static void
8351 fs_nir_emit_instr(nir_to_brw_state &ntb, nir_instr *instr)
8352 {
8353 ntb.bld = ntb.bld.annotate(NULL, instr);
8354
8355 switch (instr->type) {
8356 case nir_instr_type_alu:
8357 fs_nir_emit_alu(ntb, nir_instr_as_alu(instr), true);
8358 break;
8359
8360 case nir_instr_type_deref:
8361 unreachable("All derefs should've been lowered");
8362 break;
8363
8364 case nir_instr_type_intrinsic:
8365 switch (ntb.s.stage) {
8366 case MESA_SHADER_VERTEX:
8367 fs_nir_emit_vs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
8368 break;
8369 case MESA_SHADER_TESS_CTRL:
8370 fs_nir_emit_tcs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
8371 break;
8372 case MESA_SHADER_TESS_EVAL:
8373 fs_nir_emit_tes_intrinsic(ntb, nir_instr_as_intrinsic(instr));
8374 break;
8375 case MESA_SHADER_GEOMETRY:
8376 fs_nir_emit_gs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
8377 break;
8378 case MESA_SHADER_FRAGMENT:
8379 fs_nir_emit_fs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
8380 break;
8381 case MESA_SHADER_COMPUTE:
8382 case MESA_SHADER_KERNEL:
8383 fs_nir_emit_cs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
8384 break;
8385 case MESA_SHADER_RAYGEN:
8386 case MESA_SHADER_ANY_HIT:
8387 case MESA_SHADER_CLOSEST_HIT:
8388 case MESA_SHADER_MISS:
8389 case MESA_SHADER_INTERSECTION:
8390 case MESA_SHADER_CALLABLE:
8391 fs_nir_emit_bs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
8392 break;
8393 case MESA_SHADER_TASK:
8394 fs_nir_emit_task_intrinsic(ntb, nir_instr_as_intrinsic(instr));
8395 break;
8396 case MESA_SHADER_MESH:
8397 fs_nir_emit_mesh_intrinsic(ntb, nir_instr_as_intrinsic(instr));
8398 break;
8399 default:
8400 unreachable("unsupported shader stage");
8401 }
8402 break;
8403
8404 case nir_instr_type_tex:
8405 fs_nir_emit_texture(ntb, nir_instr_as_tex(instr));
8406 break;
8407
8408 case nir_instr_type_load_const:
8409 fs_nir_emit_load_const(ntb, nir_instr_as_load_const(instr));
8410 break;
8411
8412 case nir_instr_type_undef:
8413 /* We create a new VGRF for undefs on every use (by handling
8414 * them in get_nir_src()), rather than for each definition.
8415 * This helps register coalescing eliminate MOVs from undef.
8416 */
8417 break;
8418
8419 case nir_instr_type_jump:
8420 fs_nir_emit_jump(ntb, nir_instr_as_jump(instr));
8421 break;
8422
8423 default:
8424 unreachable("unknown instruction type");
8425 }
8426 }
8427
8428 static unsigned
8429 brw_rnd_mode_from_nir(unsigned mode, unsigned *mask)
8430 {
8431 unsigned brw_mode = 0;
8432 *mask = 0;
8433
8434 if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
8435 FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
8436 FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
8437 mode) {
8438 brw_mode |= BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT;
8439 *mask |= BRW_CR0_RND_MODE_MASK;
8440 }
8441 if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
8442 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
8443 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
8444 mode) {
8445 brw_mode |= BRW_RND_MODE_RTNE << BRW_CR0_RND_MODE_SHIFT;
8446 *mask |= BRW_CR0_RND_MODE_MASK;
8447 }
8448 if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) {
8449 brw_mode |= BRW_CR0_FP16_DENORM_PRESERVE;
8450 *mask |= BRW_CR0_FP16_DENORM_PRESERVE;
8451 }
8452 if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) {
8453 brw_mode |= BRW_CR0_FP32_DENORM_PRESERVE;
8454 *mask |= BRW_CR0_FP32_DENORM_PRESERVE;
8455 }
8456 if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) {
8457 brw_mode |= BRW_CR0_FP64_DENORM_PRESERVE;
8458 *mask |= BRW_CR0_FP64_DENORM_PRESERVE;
8459 }
8460 if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16)
8461 *mask |= BRW_CR0_FP16_DENORM_PRESERVE;
8462 if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32)
8463 *mask |= BRW_CR0_FP32_DENORM_PRESERVE;
8464 if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64)
8465 *mask |= BRW_CR0_FP64_DENORM_PRESERVE;
8466 if (mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE)
8467 *mask |= BRW_CR0_FP_MODE_MASK;
8468
8469 if (*mask != 0)
8470 assert((*mask & brw_mode) == brw_mode);
8471
8472 return brw_mode;
8473 }
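/* Worked example for brw_rnd_mode_from_nir(): with
 * mode = FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
 *        FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16
 * the function returns BRW_RND_MODE_RTNE << BRW_CR0_RND_MODE_SHIFT and sets
 * *mask = BRW_CR0_RND_MODE_MASK | BRW_CR0_FP16_DENORM_PRESERVE.  The fp16
 * denorm bit is covered by the mask but left clear in the returned mode,
 * which is presumably how flush-to-zero is expressed at the cr0 level.
 */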
8474
8475 static void
8476 emit_shader_float_controls_execution_mode(nir_to_brw_state &ntb)
8477 {
8478 const fs_builder &bld = ntb.bld;
8479 fs_visitor &s = ntb.s;
8480
8481 unsigned execution_mode = s.nir->info.float_controls_execution_mode;
8482 if (execution_mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE)
8483 return;
8484
8485 fs_builder ubld = bld.exec_all().group(1, 0);
8486 fs_builder abld = ubld.annotate("shader float controls execution mode");
8487 unsigned mask, mode = brw_rnd_mode_from_nir(execution_mode, &mask);
8488
8489 if (mask == 0)
8490 return;
8491
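   /* Emit a single scalar (SIMD1, exec_all) FLOAT_CONTROL_MODE instruction
    * carrying the mode/mask pair; it is presumably lowered later into an
    * update of the cr0 rounding/denorm bits selected by the mask.
    */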
8492 abld.emit(SHADER_OPCODE_FLOAT_CONTROL_MODE, bld.null_reg_ud(),
8493 brw_imm_d(mode), brw_imm_d(mask));
8494 }
8495
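/* Top-level NIR -> BRW entry point.  Sets up the float-controls execution
 * mode, the output/uniform/system-value storage and scratch size, then
 * translates the entry point's CFG and finally appends a HALT target so
 * that any HALT instructions emitted along the way (e.g. for discards)
 * have a landing point.
 */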
8496 void
8497 nir_to_brw(fs_visitor *s)
8498 {
8499 nir_to_brw_state ntb = {
8500 .s = *s,
8501 .nir = s->nir,
8502 .devinfo = s->devinfo,
8503 .mem_ctx = ralloc_context(NULL),
8504 .bld = fs_builder(s).at_end(),
8505 };
8506
8507 emit_shader_float_controls_execution_mode(ntb);
8508
8509 /* emit the arrays used for inputs and outputs - load/store intrinsics will
8510 * be converted to reads/writes of these arrays
8511 */
8512 fs_nir_setup_outputs(ntb);
8513 fs_nir_setup_uniforms(ntb.s);
8514 fs_nir_emit_system_values(ntb);
8515 ntb.s.last_scratch = ALIGN(ntb.nir->scratch_size, 4) * ntb.s.dispatch_width;
8516
8517 fs_nir_emit_impl(ntb, nir_shader_get_entrypoint((nir_shader *)ntb.nir));
8518
8519 ntb.bld.emit(SHADER_OPCODE_HALT_TARGET);
8520
8521 ralloc_free(ntb.mem_ctx);
8522 }
8523
8524