1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "elk_fs.h"
25 #include "elk_fs_builder.h"
26 #include "elk_nir.h"
27 #include "elk_nir_private.h"
28 #include "elk_eu.h"
29 #include "nir.h"
30 #include "nir_intrinsics.h"
31 #include "nir_search_helpers.h"
32 #include "util/u_math.h"
33 #include "util/bitscan.h"
34
35 #include <vector>
36
37 using namespace elk;
38
39 struct elk_fs_bind_info {
40 bool valid;
41 bool bindless;
42 unsigned block;
43 unsigned set;
44 unsigned binding;
45 };
46
47 struct nir_to_elk_state {
48 elk_fs_visitor &s;
49 const nir_shader *nir;
50 const intel_device_info *devinfo;
51 void *mem_ctx;
52
53 /* Points to the end of the program. Annotated with the current NIR
54 * instruction when applicable.
55 */
56 fs_builder bld;
57
58 elk_fs_reg *ssa_values;
59 elk_fs_inst **resource_insts;
60 struct elk_fs_bind_info *ssa_bind_infos;
61 elk_fs_reg *resource_values;
62 elk_fs_reg *system_values;
63 };
64
65 static elk_fs_reg get_nir_src(nir_to_elk_state &ntb, const nir_src &src);
66 static elk_fs_reg get_nir_def(nir_to_elk_state &ntb, const nir_def &def);
67 static nir_component_mask_t get_nir_write_mask(const nir_def &def);
68
69 static void fs_nir_emit_intrinsic(nir_to_elk_state &ntb, const fs_builder &bld, nir_intrinsic_instr *instr);
70 static elk_fs_reg emit_samplepos_setup(nir_to_elk_state &ntb);
71 static elk_fs_reg emit_sampleid_setup(nir_to_elk_state &ntb);
72 static elk_fs_reg emit_samplemaskin_setup(nir_to_elk_state &ntb);
73 static elk_fs_reg emit_shading_rate_setup(nir_to_elk_state &ntb);
74
75 static void fs_nir_emit_impl(nir_to_elk_state &ntb, nir_function_impl *impl);
76 static void fs_nir_emit_cf_list(nir_to_elk_state &ntb, exec_list *list);
77 static void fs_nir_emit_if(nir_to_elk_state &ntb, nir_if *if_stmt);
78 static void fs_nir_emit_loop(nir_to_elk_state &ntb, nir_loop *loop);
79 static void fs_nir_emit_block(nir_to_elk_state &ntb, nir_block *block);
80 static void fs_nir_emit_instr(nir_to_elk_state &ntb, nir_instr *instr);
81
82 static void fs_nir_emit_surface_atomic(nir_to_elk_state &ntb,
83 const fs_builder &bld,
84 nir_intrinsic_instr *instr,
85 elk_fs_reg surface,
86 bool bindless);
87 static void fs_nir_emit_global_atomic(nir_to_elk_state &ntb,
88 const fs_builder &bld,
89 nir_intrinsic_instr *instr);
90
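/* Pre-allocate VGRFs for the shader outputs, merging output variables
 * that overlap the same vec4 slots into a single contiguous allocation.
 * Tessellation control and fragment shaders are skipped here.
 */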
91 static void
92 fs_nir_setup_outputs(nir_to_elk_state &ntb)
93 {
94 elk_fs_visitor &s = ntb.s;
95
96 if (s.stage == MESA_SHADER_TESS_CTRL ||
97 s.stage == MESA_SHADER_FRAGMENT)
98 return;
99
100 unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
101
102 /* Calculate the size of output registers in a separate pass, before
103 * allocating them. With ARB_enhanced_layouts, multiple output variables
104 * may occupy the same slot, but have different type sizes.
105 */
106 nir_foreach_shader_out_variable(var, s.nir) {
107 const int loc = var->data.driver_location;
108 const unsigned var_vec4s = nir_variable_count_slots(var, var->type);
109 vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
110 }
111
112 for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
113 if (vec4s[loc] == 0) {
114 loc++;
115 continue;
116 }
117
118 unsigned reg_size = vec4s[loc];
119
120 /* Check if there are any ranges that start within this range and extend
121 * past it. If so, include them in this allocation.
122 */
123 for (unsigned i = 1; i < reg_size; i++) {
124 assert(i + loc < ARRAY_SIZE(vec4s));
125 reg_size = MAX2(vec4s[i + loc] + i, reg_size);
126 }
127
128 elk_fs_reg reg = ntb.bld.vgrf(ELK_REGISTER_TYPE_F, 4 * reg_size);
129 for (unsigned i = 0; i < reg_size; i++) {
130 assert(loc + i < ARRAY_SIZE(s.outputs));
131 s.outputs[loc + i] = offset(reg, ntb.bld, 4 * i);
132 }
133
134 loc += reg_size;
135 }
136 }
137
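/* Derive the push-constant uniform count from NIR. For compute shaders
 * on platforms before Gfx12.5, the subgroup ID builtin is appended as
 * the last uniform.
 */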
138 static void
139 fs_nir_setup_uniforms(elk_fs_visitor &s)
140 {
141 const intel_device_info *devinfo = s.devinfo;
142
143 /* Only the first compile gets to set up uniforms. */
144 if (s.push_constant_loc)
145 return;
146
147 s.uniforms = s.nir->num_uniforms / 4;
148
149 if (gl_shader_stage_is_compute(s.stage) && devinfo->verx10 < 125) {
150 /* Add uniforms for builtins after regular NIR uniforms. */
151 assert(s.uniforms == s.prog_data->nr_params);
152
153 /* Subgroup ID must be the last uniform on the list. This will make
154 * it easier later to split between cross-thread and per-thread
155 * uniforms.
156 */
157 uint32_t *param = elk_stage_prog_data_add_params(s.prog_data, 1);
158 *param = ELK_PARAM_BUILTIN_SUBGROUP_ID;
159 s.uniforms++;
160 }
161 }
162
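/* Assemble a uvec3 workgroup ID from the compute thread payload:
 * X lives in r0.1, Y in r0.6 and Z in r0.7.
 */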
163 static elk_fs_reg
164 emit_work_group_id_setup(nir_to_elk_state &ntb)
165 {
166 elk_fs_visitor &s = ntb.s;
167 const fs_builder &bld = ntb.bld;
168
169 assert(gl_shader_stage_is_compute(s.stage));
170
171 elk_fs_reg id = bld.vgrf(ELK_REGISTER_TYPE_UD, 3);
172
173 struct elk_reg r0_1(retype(elk_vec1_grf(0, 1), ELK_REGISTER_TYPE_UD));
174 bld.MOV(id, r0_1);
175
176 struct elk_reg r0_6(retype(elk_vec1_grf(0, 6), ELK_REGISTER_TYPE_UD));
177 struct elk_reg r0_7(retype(elk_vec1_grf(0, 7), ELK_REGISTER_TYPE_UD));
178 bld.MOV(offset(id, bld, 1), r0_6);
179 bld.MOV(offset(id, bld, 2), r0_7);
180
181 return id;
182 }
183
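/* Scan a NIR block for system-value intrinsics and lazily emit the
 * setup code for each value the first time it is encountered.
 */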
184 static bool
185 emit_system_values_block(nir_to_elk_state &ntb, nir_block *block)
186 {
187 elk_fs_visitor &s = ntb.s;
188 elk_fs_reg *reg;
189
190 nir_foreach_instr(instr, block) {
191 if (instr->type != nir_instr_type_intrinsic)
192 continue;
193
194 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
195 switch (intrin->intrinsic) {
196 case nir_intrinsic_load_vertex_id:
197 case nir_intrinsic_load_base_vertex:
198 unreachable("should be lowered by nir_lower_system_values().");
199
200 case nir_intrinsic_load_vertex_id_zero_base:
201 case nir_intrinsic_load_is_indexed_draw:
202 case nir_intrinsic_load_first_vertex:
203 case nir_intrinsic_load_instance_id:
204 case nir_intrinsic_load_base_instance:
205 unreachable("should be lowered by elk_nir_lower_vs_inputs().");
206 break;
207
208 case nir_intrinsic_load_draw_id:
209 unreachable("should be lowered by elk_nir_lower_vs_inputs().");
210 break;
211
212 case nir_intrinsic_load_invocation_id:
213 if (s.stage == MESA_SHADER_TESS_CTRL)
214 break;
215 assert(s.stage == MESA_SHADER_GEOMETRY);
216 reg = &ntb.system_values[SYSTEM_VALUE_INVOCATION_ID];
217 if (reg->file == BAD_FILE) {
218 *reg = s.gs_payload().instance_id;
219 }
220 break;
221
222 case nir_intrinsic_load_sample_pos:
223 case nir_intrinsic_load_sample_pos_or_center:
224 assert(s.stage == MESA_SHADER_FRAGMENT);
225 reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_POS];
226 if (reg->file == BAD_FILE)
227 *reg = emit_samplepos_setup(ntb);
228 break;
229
230 case nir_intrinsic_load_sample_id:
231 assert(s.stage == MESA_SHADER_FRAGMENT);
232 reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_ID];
233 if (reg->file == BAD_FILE)
234 *reg = emit_sampleid_setup(ntb);
235 break;
236
237 case nir_intrinsic_load_sample_mask_in:
238 assert(s.stage == MESA_SHADER_FRAGMENT);
239 assert(s.devinfo->ver >= 7);
240 reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
241 if (reg->file == BAD_FILE)
242 *reg = emit_samplemaskin_setup(ntb);
243 break;
244
245 case nir_intrinsic_load_workgroup_id:
246 case nir_intrinsic_load_workgroup_id_zero_base:
247 assert(gl_shader_stage_is_compute(s.stage));
248 reg = &ntb.system_values[SYSTEM_VALUE_WORKGROUP_ID];
249 if (reg->file == BAD_FILE)
250 *reg = emit_work_group_id_setup(ntb);
251 break;
252
253 case nir_intrinsic_load_helper_invocation:
254 assert(s.stage == MESA_SHADER_FRAGMENT);
255 reg = &ntb.system_values[SYSTEM_VALUE_HELPER_INVOCATION];
256 if (reg->file == BAD_FILE) {
257 const fs_builder abld =
258 ntb.bld.annotate("gl_HelperInvocation", NULL);
259
260 /* On Gfx6+ (gl_HelperInvocation is only exposed on Gfx7+) the
261 * pixel mask is in g1.7 of the thread payload.
262 *
263 * We move the per-channel pixel enable bit to the low bit of each
264 * channel by shifting the byte containing the pixel mask by the
265 * vector immediate 0x76543210UV.
266 *
267 * The region of <1,8,0> reads only 1 byte (the pixel masks for
268 * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
269 * masks for 2 and 3) in SIMD16.
270 */
271 elk_fs_reg shifted = abld.vgrf(ELK_REGISTER_TYPE_UW, 1);
272
273 for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
274 const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
275 /* According to the "PS Thread Payload for Normal
276 * Dispatch" pages on the BSpec, the dispatch mask is
277 * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on
278 * gfx6+.
279 */
280 const struct elk_reg reg = s.devinfo->ver >= 20 ?
281 xe2_vec1_grf(i, 15) : elk_vec1_grf(i + 1, 7);
282 hbld.SHR(offset(shifted, hbld, i),
283 stride(retype(reg, ELK_REGISTER_TYPE_UB), 1, 8, 0),
284 elk_imm_v(0x76543210));
285 }
286
287 /* A set bit in the pixel mask means the channel is enabled, but
288 * that is the opposite of gl_HelperInvocation so we need to invert
289 * the mask.
290 *
291 * The negate source-modifier bit of logical instructions on Gfx8+
292 * performs 1's complement negation, so we can use that instead of
293 * a NOT instruction.
294 */
295 elk_fs_reg inverted = negate(shifted);
296 if (s.devinfo->ver < 8) {
297 inverted = abld.vgrf(ELK_REGISTER_TYPE_UW);
298 abld.NOT(inverted, shifted);
299 }
300
301 /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
302 * with 1 and negating.
303 */
304 elk_fs_reg anded = abld.vgrf(ELK_REGISTER_TYPE_UD, 1);
305 abld.AND(anded, inverted, elk_imm_uw(1));
306
307 elk_fs_reg dst = abld.vgrf(ELK_REGISTER_TYPE_D, 1);
308 abld.MOV(dst, negate(retype(anded, ELK_REGISTER_TYPE_D)));
309 *reg = dst;
310 }
311 break;
312
313 case nir_intrinsic_load_frag_shading_rate:
314 reg = &ntb.system_values[SYSTEM_VALUE_FRAG_SHADING_RATE];
315 if (reg->file == BAD_FILE)
316 *reg = emit_shading_rate_setup(ntb);
317 break;
318
319 default:
320 break;
321 }
322 }
323
324 return true;
325 }
326
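/* Allocate the system-values table, unconditionally emit
 * gl_SubgroupInvocation, then walk the shader for any other system
 * values it uses.
 */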
327 static void
328 fs_nir_emit_system_values(nir_to_elk_state &ntb)
329 {
330 const fs_builder &bld = ntb.bld;
331 elk_fs_visitor &s = ntb.s;
332
333 ntb.system_values = ralloc_array(ntb.mem_ctx, elk_fs_reg, SYSTEM_VALUE_MAX);
334 for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
335 ntb.system_values[i] = elk_fs_reg();
336 }
337
338 /* Always emit SUBGROUP_INVOCATION. Dead code will clean it up if we
339 * never end up using it.
340 */
341 {
342 const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
343 elk_fs_reg &reg = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
344 reg = abld.vgrf(ELK_REGISTER_TYPE_UW);
345 abld.UNDEF(reg);
346
347 const fs_builder allbld8 = abld.group(8, 0).exec_all();
348 allbld8.MOV(reg, elk_imm_v(0x76543210));
349 if (s.dispatch_width > 8)
350 allbld8.ADD(byte_offset(reg, 16), reg, elk_imm_uw(8u));
351 if (s.dispatch_width > 16) {
352 const fs_builder allbld16 = abld.group(16, 0).exec_all();
353 allbld16.ADD(byte_offset(reg, 32), reg, elk_imm_uw(16u));
354 }
355 }
356
357 nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)s.nir);
358 nir_foreach_block(block, impl)
359 emit_system_values_block(ntb, block);
360 }
361
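/* Allocate the per-SSA-def lookup tables and emit code for the
 * function body.
 */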
362 static void
363 fs_nir_emit_impl(nir_to_elk_state &ntb, nir_function_impl *impl)
364 {
365 ntb.ssa_values = rzalloc_array(ntb.mem_ctx, elk_fs_reg, impl->ssa_alloc);
366 ntb.resource_insts = rzalloc_array(ntb.mem_ctx, elk_fs_inst *, impl->ssa_alloc);
367 ntb.ssa_bind_infos = rzalloc_array(ntb.mem_ctx, struct elk_fs_bind_info, impl->ssa_alloc);
368 ntb.resource_values = rzalloc_array(ntb.mem_ctx, elk_fs_reg, impl->ssa_alloc);
369
370 fs_nir_emit_cf_list(ntb, &impl->body);
371 }
372
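/* Emit each node of a NIR control-flow list (blocks, ifs and loops)
 * in source order.
 */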
373 static void
374 fs_nir_emit_cf_list(nir_to_elk_state &ntb, exec_list *list)
375 {
376 exec_list_validate(list);
377 foreach_list_typed(nir_cf_node, node, node, list) {
378 switch (node->type) {
379 case nir_cf_node_if:
380 fs_nir_emit_if(ntb, nir_cf_node_as_if(node));
381 break;
382
383 case nir_cf_node_loop:
384 fs_nir_emit_loop(ntb, nir_cf_node_as_loop(node));
385 break;
386
387 case nir_cf_node_block:
388 fs_nir_emit_block(ntb, nir_cf_node_as_block(node));
389 break;
390
391 default:
392 unreachable("Invalid CFG node block");
393 }
394 }
395 }
396
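/* Emit an IF/ELSE/ENDIF sequence for a NIR if-statement. A leading
 * inot on the condition is folded into the predicate inversion
 * instead of emitting a separate NOT.
 */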
397 static void
398 fs_nir_emit_if(nir_to_elk_state &ntb, nir_if *if_stmt)
399 {
400 const intel_device_info *devinfo = ntb.devinfo;
401 const fs_builder &bld = ntb.bld;
402
403 bool invert;
404 elk_fs_reg cond_reg;
405
406 /* If the condition has the form !other_condition, use other_condition as
407 * the source, but invert the predicate on the if instruction.
408 */
409 nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition);
410 if (cond != NULL && cond->op == nir_op_inot) {
411 invert = true;
412 cond_reg = get_nir_src(ntb, cond->src[0].src);
413 cond_reg = offset(cond_reg, bld, cond->src[0].swizzle[0]);
414
415 if (devinfo->ver <= 5 &&
416 (cond->instr.pass_flags & ELK_NIR_BOOLEAN_MASK) == ELK_NIR_BOOLEAN_NEEDS_RESOLVE) {
417 /* redo boolean resolve on gen5 */
418 elk_fs_reg masked = ntb.s.vgrf(glsl_int_type());
419 bld.AND(masked, cond_reg, elk_imm_d(1));
420 masked.negate = true;
421 elk_fs_reg tmp = bld.vgrf(cond_reg.type);
422 bld.MOV(retype(tmp, ELK_REGISTER_TYPE_D), masked);
423 cond_reg = tmp;
424 }
425 } else {
426 invert = false;
427 cond_reg = get_nir_src(ntb, if_stmt->condition);
428 }
429
430 /* first, put the condition into f0 */
431 elk_fs_inst *inst = bld.MOV(bld.null_reg_d(),
432 retype(cond_reg, ELK_REGISTER_TYPE_D));
433 inst->conditional_mod = ELK_CONDITIONAL_NZ;
434
435 bld.IF(ELK_PREDICATE_NORMAL)->predicate_inverse = invert;
436
437 fs_nir_emit_cf_list(ntb, &if_stmt->then_list);
438
439 if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
440 bld.emit(ELK_OPCODE_ELSE);
441 fs_nir_emit_cf_list(ntb, &if_stmt->else_list);
442 }
443
444 bld.emit(ELK_OPCODE_ENDIF);
445
446 if (devinfo->ver < 7)
447 ntb.s.limit_dispatch_width(16, "Non-uniform control flow unsupported "
448 "in SIMD32 mode.");
449 }
450
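/* Emit a DO/WHILE pair around the loop body. */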
451 static void
452 fs_nir_emit_loop(nir_to_elk_state &ntb, nir_loop *loop)
453 {
454 const intel_device_info *devinfo = ntb.devinfo;
455 const fs_builder &bld = ntb.bld;
456
457 assert(!nir_loop_has_continue_construct(loop));
458 bld.emit(ELK_OPCODE_DO);
459
460 fs_nir_emit_cf_list(ntb, &loop->body);
461
462 bld.emit(ELK_OPCODE_WHILE);
463
464 if (devinfo->ver < 7)
465 ntb.s.limit_dispatch_width(16, "Non-uniform control flow unsupported "
466 "in SIMD32 mode.");
467 }
468
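/* Emit all instructions of a single NIR basic block, then restore the
 * builder, which per-instruction emission may have re-annotated.
 */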
469 static void
470 fs_nir_emit_block(nir_to_elk_state &ntb, nir_block *block)
471 {
472 fs_builder bld = ntb.bld;
473
474 nir_foreach_instr(instr, block) {
475 fs_nir_emit_instr(ntb, instr);
476 }
477
478 ntb.bld = bld;
479 }
480
481 /**
482 * Recognizes a parent instruction of nir_op_extract_* and changes the type to
483 * match instr.
484 */
485 static bool
486 optimize_extract_to_float(nir_to_elk_state &ntb, nir_alu_instr *instr,
487 const elk_fs_reg &result)
488 {
489 const intel_device_info *devinfo = ntb.devinfo;
490 const fs_builder &bld = ntb.bld;
491
492 if (!instr->src[0].src.ssa->parent_instr)
493 return false;
494
495 if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
496 return false;
497
498 nir_alu_instr *src0 =
499 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
500
501 if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
502 src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
503 return false;
504
505 unsigned element = nir_src_as_uint(src0->src[1].src);
506
507 /* Element type to extract. */
508 const elk_reg_type type = elk_int_type(
509 src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1,
510 src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);
511
512 elk_fs_reg op0 = get_nir_src(ntb, src0->src[0].src);
513 op0.type = elk_type_for_nir_type(devinfo,
514 (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
515 nir_src_bit_size(src0->src[0].src)));
516 op0 = offset(op0, bld, src0->src[0].swizzle[0]);
517
518 bld.MOV(result, subscript(op0, type, element));
519 return true;
520 }
521
522 static bool
523 optimize_frontfacing_ternary(nir_to_elk_state &ntb,
524 nir_alu_instr *instr,
525 const elk_fs_reg &result)
526 {
527 const intel_device_info *devinfo = ntb.devinfo;
528 elk_fs_visitor &s = ntb.s;
529
530 nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src);
531 if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face)
532 return false;
533
534 if (!nir_src_is_const(instr->src[1].src) ||
535 !nir_src_is_const(instr->src[2].src))
536 return false;
537
538 const float value1 = nir_src_as_float(instr->src[1].src);
539 const float value2 = nir_src_as_float(instr->src[2].src);
540 if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
541 return false;
542
543 /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
544 assert(value1 == -value2);
545
546 elk_fs_reg tmp = s.vgrf(glsl_int_type());
547
548 if (devinfo->ver >= 20) {
549 /* Gfx20+ has separate back-facing bits for each pair of
550 * subspans in order to support multiple polygons, so we need to
551 * use a <1;8,0> region in order to select the correct word for
552 * each channel. Unfortunately they're no longer aligned to the
553 * sign bit of a 16-bit word, so a left shift is necessary.
554 */
555 elk_fs_reg ff = ntb.bld.vgrf(ELK_REGISTER_TYPE_UW);
556
557 for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
558 const fs_builder hbld = ntb.bld.group(16, i);
559 const struct elk_reg gi_uw = retype(xe2_vec1_grf(i, 9),
560 ELK_REGISTER_TYPE_UW);
561 hbld.SHL(offset(ff, hbld, i), stride(gi_uw, 1, 8, 0), elk_imm_ud(4));
562 }
563
564 if (value1 == -1.0f)
565 ff.negate = true;
566
567 ntb.bld.OR(subscript(tmp, ELK_REGISTER_TYPE_UW, 1), ff,
568 elk_imm_uw(0x3f80));
569
570 } else if (devinfo->ver >= 12 && s.max_polygons == 2) {
571 /* According to the BSpec "PS Thread Payload for Normal
572 * Dispatch", the front/back facing interpolation bit is stored
573 * as bit 15 of either the R1.1 or R1.6 poly info field, for the
574 * first and second polygons respectively in multipolygon PS
575 * dispatch mode.
576 */
577 assert(s.dispatch_width == 16);
578
579 for (unsigned i = 0; i < s.max_polygons; i++) {
580 const fs_builder hbld = ntb.bld.group(8, i);
581 struct elk_reg g1 = retype(elk_vec1_grf(1, 1 + 5 * i),
582 ELK_REGISTER_TYPE_UW);
583
584 if (value1 == -1.0f)
585 g1.negate = true;
586
587 hbld.OR(subscript(offset(tmp, hbld, i), ELK_REGISTER_TYPE_UW, 1),
588 g1, elk_imm_uw(0x3f80));
589 }
590
591 } else if (devinfo->ver >= 12) {
592 /* Bit 15 of g1.1 is 0 if the polygon is front facing. */
593 elk_fs_reg g1 = elk_fs_reg(retype(elk_vec1_grf(1, 1), ELK_REGISTER_TYPE_W));
594
595 /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
596 *
597 * or(8) tmp.1<2>W g1.1<0,1,0>W 0x00003f80W
598 * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
599 *
600 * and negate g1.1<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
601 */
602 if (value1 == -1.0f)
603 g1.negate = true;
604
605 ntb.bld.OR(subscript(tmp, ELK_REGISTER_TYPE_W, 1),
606 g1, elk_imm_uw(0x3f80));
607 } else if (devinfo->ver >= 6) {
608 /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
609 elk_fs_reg g0 = elk_fs_reg(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_W));
610
611 /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
612 *
613 * or(8) tmp.1<2>W g0.0<0,1,0>W 0x00003f80W
614 * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
615 *
616 * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
617 *
618 * This negation looks like it's safe in practice, because bits 0:4 will
619 * surely be TRIANGLES.
620 */
621
622 if (value1 == -1.0f) {
623 g0.negate = true;
624 }
625
626 ntb.bld.OR(subscript(tmp, ELK_REGISTER_TYPE_W, 1),
627 g0, elk_imm_uw(0x3f80));
628 } else {
629 /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
630 elk_fs_reg g1_6 = elk_fs_reg(retype(elk_vec1_grf(1, 6), ELK_REGISTER_TYPE_D));
631
632 /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
633 *
634 * or(8) tmp<1>D g1.6<0,1,0>D 0x3f800000D
635 * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
636 *
637 * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
638 *
639 * This negation looks like it's safe in practice, because bits 0:4 will
640 * surely be TRIANGLES.
641 */
642
643 if (value1 == -1.0f) {
644 g1_6.negate = true;
645 }
646
647 ntb.bld.OR(tmp, g1_6, elk_imm_d(0x3f800000));
648 }
649 ntb.bld.AND(retype(result, ELK_REGISTER_TYPE_D), tmp, elk_imm_d(0xbf800000));
650
651 return true;
652 }
653
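/* Map an explicit-rounding f2f16 opcode to the rounding mode it
 * requires.
 */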
654 static elk_rnd_mode
655 elk_rnd_mode_from_nir_op(const nir_op op) {
656 switch (op) {
657 case nir_op_f2f16_rtz:
658 return ELK_RND_MODE_RTZ;
659 case nir_op_f2f16_rtne:
660 return ELK_RND_MODE_RTNE;
661 default:
662 unreachable("Operation doesn't support rounding mode");
663 }
664 }
665
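/* Pick the rounding mode requested by the shader's float-controls
 * execution mode, if any.
 */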
666 static elk_rnd_mode
667 elk_rnd_mode_from_execution_mode(unsigned execution_mode)
668 {
669 if (nir_has_any_rounding_mode_rtne(execution_mode))
670 return ELK_RND_MODE_RTNE;
671 if (nir_has_any_rounding_mode_rtz(execution_mode))
672 return ELK_RND_MODE_RTZ;
673 return ELK_RND_MODE_UNSPECIFIED;
674 }
675
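/* Fetch the destination and sources of an ALU instruction, retype
 * them according to nir_op_infos, and for scalarized ops offset the
 * registers to the single written channel. Moves and vecN ops are
 * returned un-offset so the caller can handle their per-channel
 * swizzles.
 */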
676 static elk_fs_reg
677 prepare_alu_destination_and_sources(nir_to_elk_state &ntb,
678 const fs_builder &bld,
679 nir_alu_instr *instr,
680 elk_fs_reg *op,
681 bool need_dest)
682 {
683 const intel_device_info *devinfo = ntb.devinfo;
684
685 elk_fs_reg result =
686 need_dest ? get_nir_def(ntb, instr->def) : bld.null_reg_ud();
687
688 result.type = elk_type_for_nir_type(devinfo,
689 (nir_alu_type)(nir_op_infos[instr->op].output_type |
690 instr->def.bit_size));
691
692 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
693 op[i] = get_nir_src(ntb, instr->src[i].src);
694 op[i].type = elk_type_for_nir_type(devinfo,
695 (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
696 nir_src_bit_size(instr->src[i].src)));
697 }
698
699 /* Move and vecN instructions may still be vectorized. Return the raw,
700 * vectorized source and destination so that elk_fs_visitor::nir_emit_alu can
701 * handle it. Other callers should not have to handle these kinds of
702 * instructions.
703 */
704 switch (instr->op) {
705 case nir_op_mov:
706 case nir_op_vec2:
707 case nir_op_vec3:
708 case nir_op_vec4:
709 case nir_op_vec8:
710 case nir_op_vec16:
711 return result;
712 default:
713 break;
714 }
715
716 /* At this point, we have dealt with any instruction that operates on
717 * more than a single channel. Therefore, we can just adjust the source
718 * and destination registers for that channel and emit the instruction.
719 */
720 unsigned channel = 0;
721 if (nir_op_infos[instr->op].output_size == 0) {
722 /* Since NIR is doing the scalarizing for us, we should only ever see
723 * vectorized operations with a single channel.
724 */
725 nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
726 assert(util_bitcount(write_mask) == 1);
727 channel = ffs(write_mask) - 1;
728
729 result = offset(result, bld, channel);
730 }
731
732 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
733 assert(nir_op_infos[instr->op].input_sizes[i] < 2);
734 op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
735 }
736
737 return result;
738 }
739
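/* Copy a source into a fresh temporary if it carries abs/negate
 * modifiers, for consumers that cannot accept source modifiers.
 */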
740 static elk_fs_reg
741 resolve_source_modifiers(const fs_builder &bld, const elk_fs_reg &src)
742 {
743 if (!src.abs && !src.negate)
744 return src;
745
746 elk_fs_reg temp = bld.vgrf(src.type);
747 bld.MOV(temp, src);
748
749 return temp;
750 }
751
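/* For each source of a two-source logical op, fold an inot producer
 * into a negate source modifier; otherwise make sure the source has
 * no modifiers at all.
 */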
752 static void
753 resolve_inot_sources(nir_to_elk_state &ntb, const fs_builder &bld, nir_alu_instr *instr,
754 elk_fs_reg *op)
755 {
756 for (unsigned i = 0; i < 2; i++) {
757 nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src);
758
759 if (inot_instr != NULL && inot_instr->op == nir_op_inot) {
760 /* The source of the inot is now the source of instr. */
761 prepare_alu_destination_and_sources(ntb, bld, inot_instr, &op[i], false);
762
763 assert(!op[i].negate);
764 op[i].negate = true;
765 } else {
766 op[i] = resolve_source_modifiers(bld, op[i]);
767 }
768 }
769 }
770
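/* Try to emit b2f(inot(a)) or b2i(inot(a)) as a single ADD of 1,
 * relying on NIR booleans being 0 or -1. Returns true on success.
 */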
771 static bool
772 try_emit_b2fi_of_inot(nir_to_elk_state &ntb, const fs_builder &bld,
773 elk_fs_reg result,
774 nir_alu_instr *instr)
775 {
776 const intel_device_info *devinfo = bld.shader->devinfo;
777
778 if (devinfo->ver < 6 || devinfo->verx10 >= 125)
779 return false;
780
781 nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src);
782
783 if (inot_instr == NULL || inot_instr->op != nir_op_inot)
784 return false;
785
786 /* HF is also possible as a destination on BDW+. For nir_op_b2i, the set
787 * of valid size-changing combinations is a bit more complex.
788 *
789 * The source restriction is just because I was lazy about generating the
790 * constant below.
791 */
792 if (instr->def.bit_size != 32 ||
793 nir_src_bit_size(inot_instr->src[0].src) != 32)
794 return false;
795
796 /* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0. Since a can only be 0 or -1,
797 * this is float(1 + a).
798 */
799 elk_fs_reg op;
800
801 prepare_alu_destination_and_sources(ntb, bld, inot_instr, &op, false);
802
803 /* Ignore the saturate modifier, if there is one. The result of the
804 * arithmetic can only be 0 or 1, so the clamping will do nothing anyway.
805 */
806 bld.ADD(result, op, elk_imm_d(1));
807
808 return true;
809 }
810
811 /**
812 * Emit code for nir_op_fsign possibly fused with a nir_op_fmul
813 *
814 * If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of
815 * the source of \c instr that is a \c nir_op_fsign.
816 */
817 static void
818 emit_fsign(nir_to_elk_state &ntb, const fs_builder &bld, const nir_alu_instr *instr,
819 elk_fs_reg result, elk_fs_reg *op, unsigned fsign_src)
820 {
821 elk_fs_visitor &s = ntb.s;
822 const intel_device_info *devinfo = ntb.devinfo;
823
824 elk_fs_inst *inst;
825
826 assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul);
827 assert(fsign_src < nir_op_infos[instr->op].num_inputs);
828
829 if (instr->op != nir_op_fsign) {
830 const nir_alu_instr *const fsign_instr =
831 nir_src_as_alu_instr(instr->src[fsign_src].src);
832
833 /* op[fsign_src] has the nominal result of the fsign, and op[1 -
834 * fsign_src] has the other multiply source. This must be rearranged so
835 * that op[0] is the source of the fsign and op[1] is the other multiply
836 * source.
837 */
838 if (fsign_src != 0)
839 op[1] = op[0];
840
841 op[0] = get_nir_src(ntb, fsign_instr->src[0].src);
842
843 const nir_alu_type t =
844 (nir_alu_type)(nir_op_infos[instr->op].input_types[0] |
845 nir_src_bit_size(fsign_instr->src[0].src));
846
847 op[0].type = elk_type_for_nir_type(devinfo, t);
848
849 unsigned channel = 0;
850 if (nir_op_infos[instr->op].output_size == 0) {
851 /* Since NIR is doing the scalarizing for us, we should only ever see
852 * vectorized operations with a single channel.
853 */
854 nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
855 assert(util_bitcount(write_mask) == 1);
856 channel = ffs(write_mask) - 1;
857 }
858
859 op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]);
860 }
861
862 if (type_sz(op[0].type) == 2) {
863 /* AND(val, 0x8000) gives the sign bit.
864 *
865 * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero.
866 */
867 elk_fs_reg zero = retype(elk_imm_uw(0), ELK_REGISTER_TYPE_HF);
868 bld.CMP(bld.null_reg_f(), op[0], zero, ELK_CONDITIONAL_NZ);
869
870 op[0].type = ELK_REGISTER_TYPE_UW;
871 result.type = ELK_REGISTER_TYPE_UW;
872 bld.AND(result, op[0], elk_imm_uw(0x8000u));
873
874 if (instr->op == nir_op_fsign)
875 inst = bld.OR(result, result, elk_imm_uw(0x3c00u));
876 else {
877 /* Use XOR here to get the result sign correct. */
878 inst = bld.XOR(result, result, retype(op[1], ELK_REGISTER_TYPE_UW));
879 }
880
881 inst->predicate = ELK_PREDICATE_NORMAL;
882 } else if (type_sz(op[0].type) == 4) {
883 /* AND(val, 0x80000000) gives the sign bit.
884 *
885 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
886 * zero.
887 */
888 bld.CMP(bld.null_reg_f(), op[0], elk_imm_f(0.0f), ELK_CONDITIONAL_NZ);
889
890 op[0].type = ELK_REGISTER_TYPE_UD;
891 result.type = ELK_REGISTER_TYPE_UD;
892 bld.AND(result, op[0], elk_imm_ud(0x80000000u));
893
894 if (instr->op == nir_op_fsign)
895 inst = bld.OR(result, result, elk_imm_ud(0x3f800000u));
896 else {
897 /* Use XOR here to get the result sign correct. */
898 inst = bld.XOR(result, result, retype(op[1], ELK_REGISTER_TYPE_UD));
899 }
900
901 inst->predicate = ELK_PREDICATE_NORMAL;
902 } else {
903 /* For doubles we do the same but we need to consider:
904 *
905 * - 2-src instructions can't operate with 64-bit immediates
906 * - The sign is encoded in the high 32-bit of each DF
907 * - We need to produce a DF result.
908 */
909
910 elk_fs_reg zero = s.vgrf(glsl_double_type());
911 bld.MOV(zero, elk_setup_imm_df(bld, 0.0));
912 bld.CMP(bld.null_reg_df(), op[0], zero, ELK_CONDITIONAL_NZ);
913
914 bld.MOV(result, zero);
915
916 elk_fs_reg r = subscript(result, ELK_REGISTER_TYPE_UD, 1);
917 bld.AND(r, subscript(op[0], ELK_REGISTER_TYPE_UD, 1),
918 elk_imm_ud(0x80000000u));
919
920 if (instr->op == nir_op_fsign) {
921 set_predicate(ELK_PREDICATE_NORMAL,
922 bld.OR(r, r, elk_imm_ud(0x3ff00000u)));
923 } else {
924 if (devinfo->has_64bit_int) {
925 /* This could be done better in some cases. If the scale is an
926 * immediate with the low 32-bits all 0, emitting a separate XOR and
927 * OR would allow an algebraic optimization to remove the OR. There
928 * are currently zero instances of fsign(double(x))*IMM in shader-db
929 * or any test suite, so it is hard to care at this time.
930 */
931 elk_fs_reg result_int64 = retype(result, ELK_REGISTER_TYPE_UQ);
932 inst = bld.XOR(result_int64, result_int64,
933 retype(op[1], ELK_REGISTER_TYPE_UQ));
934 } else {
935 elk_fs_reg result_int64 = retype(result, ELK_REGISTER_TYPE_UQ);
936 bld.MOV(subscript(result_int64, ELK_REGISTER_TYPE_UD, 0),
937 subscript(op[1], ELK_REGISTER_TYPE_UD, 0));
938 bld.XOR(subscript(result_int64, ELK_REGISTER_TYPE_UD, 1),
939 subscript(result_int64, ELK_REGISTER_TYPE_UD, 1),
940 subscript(op[1], ELK_REGISTER_TYPE_UD, 1));
941 }
942 }
943 }
944 }
945
946 /**
947 * Determine whether sources of a nir_op_fmul can be fused with a nir_op_fsign
948 *
949 * Checks the operands of a \c nir_op_fmul to determine whether or not
950 * \c emit_fsign could fuse the multiplication with the \c sign() calculation.
951 *
952 * \param instr The multiplication instruction
953 *
954 * \param fsign_src The source of \c instr that may or may not be a
955 * \c nir_op_fsign
956 */
957 static bool
958 can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src)
959 {
960 assert(instr->op == nir_op_fmul);
961
962 nir_alu_instr *const fsign_instr =
963 nir_src_as_alu_instr(instr->src[fsign_src].src);
964
965 /* Rules:
966 *
967 * 1. instr->src[fsign_src] must be a nir_op_fsign.
968 * 2. The nir_op_fsign can only be used by this multiplication.
969 * 3. The source that is the nir_op_fsign does not have source modifiers.
970 * \c emit_fsign only examines the source modifiers of the source of the
971 * \c nir_op_fsign.
972 *
973 * The nir_op_fsign must also not have the saturate modifier, but steps
974 * have already been taken (in nir_opt_algebraic) to ensure that.
975 */
976 return fsign_instr != NULL && fsign_instr->op == nir_op_fsign &&
977 is_used_once(fsign_instr);
978 }
979
980 static bool
981 is_const_zero(const nir_src &src)
982 {
983 return nir_src_is_const(src) && nir_src_as_int(src) == 0;
984 }
985
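/* Emit native code for a single NIR ALU instruction. */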
986 static void
987 fs_nir_emit_alu(nir_to_elk_state &ntb, nir_alu_instr *instr,
988 bool need_dest)
989 {
990 const intel_device_info *devinfo = ntb.devinfo;
991 const fs_builder &bld = ntb.bld;
992 elk_fs_visitor &s = ntb.s;
993
994 elk_fs_inst *inst;
995 unsigned execution_mode =
996 bld.shader->nir->info.float_controls_execution_mode;
997
998 elk_fs_reg op[NIR_MAX_VEC_COMPONENTS];
999 elk_fs_reg result = prepare_alu_destination_and_sources(ntb, bld, instr, op, need_dest);
1000
1001 #ifndef NDEBUG
1002 /* Everything except raw moves, some type conversions, iabs, and ineg
1003 * should have 8-bit sources lowered by nir_lower_bit_size in
1004 * elk_preprocess_nir or by elk_nir_lower_conversions in
1005 * elk_postprocess_nir.
1006 */
1007 switch (instr->op) {
1008 case nir_op_mov:
1009 case nir_op_vec2:
1010 case nir_op_vec3:
1011 case nir_op_vec4:
1012 case nir_op_vec8:
1013 case nir_op_vec16:
1014 case nir_op_i2f16:
1015 case nir_op_i2f32:
1016 case nir_op_i2i16:
1017 case nir_op_i2i32:
1018 case nir_op_u2f16:
1019 case nir_op_u2f32:
1020 case nir_op_u2u16:
1021 case nir_op_u2u32:
1022 case nir_op_iabs:
1023 case nir_op_ineg:
1024 case nir_op_pack_32_4x8_split:
1025 break;
1026
1027 default:
1028 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1029 assert(type_sz(op[i].type) > 1);
1030 }
1031 }
1032 #endif
1033
1034 switch (instr->op) {
1035 case nir_op_mov:
1036 case nir_op_vec2:
1037 case nir_op_vec3:
1038 case nir_op_vec4:
1039 case nir_op_vec8:
1040 case nir_op_vec16: {
1041 elk_fs_reg temp = result;
1042 bool need_extra_copy = false;
1043
1044 nir_intrinsic_instr *store_reg =
1045 nir_store_reg_for_def(&instr->def);
1046 if (store_reg != NULL) {
1047 nir_def *dest_reg = store_reg->src[1].ssa;
1048 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1049 nir_intrinsic_instr *load_reg =
1050 nir_load_reg_for_def(instr->src[i].src.ssa);
1051 if (load_reg == NULL)
1052 continue;
1053
1054 if (load_reg->src[0].ssa == dest_reg) {
1055 need_extra_copy = true;
1056 temp = bld.vgrf(result.type, 4);
1057 break;
1058 }
1059 }
1060 }
1061
1062 nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
1063 unsigned last_bit = util_last_bit(write_mask);
1064
1065 for (unsigned i = 0; i < last_bit; i++) {
1066 if (!(write_mask & (1 << i)))
1067 continue;
1068
1069 if (instr->op == nir_op_mov) {
1070 bld.MOV(offset(temp, bld, i),
1071 offset(op[0], bld, instr->src[0].swizzle[i]));
1072 } else {
1073 bld.MOV(offset(temp, bld, i),
1074 offset(op[i], bld, instr->src[i].swizzle[0]));
1075 }
1076 }
1077
1078 /* In this case the source and destination registers were the same,
1079 * so we need to insert an extra set of moves in order to deal with
1080 * any swizzling.
1081 */
1082 if (need_extra_copy) {
1083 for (unsigned i = 0; i < last_bit; i++) {
1084 if (!(write_mask & (1 << i)))
1085 continue;
1086
1087 bld.MOV(offset(result, bld, i), offset(temp, bld, i));
1088 }
1089 }
1090 return;
1091 }
1092
1093 case nir_op_i2f32:
1094 case nir_op_u2f32:
1095 if (optimize_extract_to_float(ntb, instr, result))
1096 return;
1097 inst = bld.MOV(result, op[0]);
1098 break;
1099
1100 case nir_op_f2f16_rtne:
1101 case nir_op_f2f16_rtz:
1102 case nir_op_f2f16: {
1103 elk_rnd_mode rnd = ELK_RND_MODE_UNSPECIFIED;
1104
1105 if (nir_op_f2f16 == instr->op)
1106 rnd = elk_rnd_mode_from_execution_mode(execution_mode);
1107 else
1108 rnd = elk_rnd_mode_from_nir_op(instr->op);
1109
1110 if (ELK_RND_MODE_UNSPECIFIED != rnd)
1111 bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), elk_imm_d(rnd));
1112
1113 assert(type_sz(op[0].type) < 8); /* elk_nir_lower_conversions */
1114 inst = bld.F32TO16(result, op[0]);
1115 break;
1116 }
1117
1118 case nir_op_b2i8:
1119 case nir_op_b2i16:
1120 case nir_op_b2i32:
1121 case nir_op_b2i64:
1122 case nir_op_b2f16:
1123 case nir_op_b2f32:
1124 case nir_op_b2f64:
1125 if (try_emit_b2fi_of_inot(ntb, bld, result, instr))
1126 break;
1127 op[0].type = ELK_REGISTER_TYPE_D;
1128 op[0].negate = !op[0].negate;
1129 FALLTHROUGH;
1130 case nir_op_i2f64:
1131 case nir_op_i2i64:
1132 case nir_op_u2f64:
1133 case nir_op_u2u64:
1134 case nir_op_f2f64:
1135 case nir_op_f2i64:
1136 case nir_op_f2u64:
1137 case nir_op_i2i32:
1138 case nir_op_u2u32:
1139 case nir_op_f2i32:
1140 case nir_op_f2u32:
1141 case nir_op_i2f16:
1142 case nir_op_u2f16:
1143 case nir_op_f2i16:
1144 case nir_op_f2u16:
1145 case nir_op_f2i8:
1146 case nir_op_f2u8:
1147 if (result.type == ELK_REGISTER_TYPE_B ||
1148 result.type == ELK_REGISTER_TYPE_UB ||
1149 result.type == ELK_REGISTER_TYPE_HF)
1150 assert(type_sz(op[0].type) < 8); /* elk_nir_lower_conversions */
1151
1152 if (op[0].type == ELK_REGISTER_TYPE_B ||
1153 op[0].type == ELK_REGISTER_TYPE_UB ||
1154 op[0].type == ELK_REGISTER_TYPE_HF)
1155 assert(type_sz(result.type) < 8); /* elk_nir_lower_conversions */
1156
1157 inst = bld.MOV(result, op[0]);
1158 break;
1159
1160 case nir_op_i2i8:
1161 case nir_op_u2u8:
1162 assert(type_sz(op[0].type) < 8); /* elk_nir_lower_conversions */
1163 FALLTHROUGH;
1164 case nir_op_i2i16:
1165 case nir_op_u2u16: {
1166 /* Emit better code for u2u8(extract_u8(a, b)) and similar patterns.
1167 * Emitting the instructions one by one results in two MOV instructions
1168 * that won't be propagated. By handling both instructions here, a
1169 * single MOV is emitted.
1170 */
1171 nir_alu_instr *extract_instr = nir_src_as_alu_instr(instr->src[0].src);
1172 if (extract_instr != NULL) {
1173 if (extract_instr->op == nir_op_extract_u8 ||
1174 extract_instr->op == nir_op_extract_i8) {
1175 prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false);
1176
1177 const unsigned byte = nir_src_as_uint(extract_instr->src[1].src);
1178 const elk_reg_type type =
1179 elk_int_type(1, extract_instr->op == nir_op_extract_i8);
1180
1181 op[0] = subscript(op[0], type, byte);
1182 } else if (extract_instr->op == nir_op_extract_u16 ||
1183 extract_instr->op == nir_op_extract_i16) {
1184 prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false);
1185
1186 const unsigned word = nir_src_as_uint(extract_instr->src[1].src);
1187 const elk_reg_type type =
1188 elk_int_type(2, extract_instr->op == nir_op_extract_i16);
1189
1190 op[0] = subscript(op[0], type, word);
1191 }
1192 }
1193
1194 inst = bld.MOV(result, op[0]);
1195 break;
1196 }
1197
1198 case nir_op_fsat:
1199 inst = bld.MOV(result, op[0]);
1200 inst->saturate = true;
1201 break;
1202
1203 case nir_op_fneg:
1204 case nir_op_ineg:
1205 op[0].negate = true;
1206 inst = bld.MOV(result, op[0]);
1207 break;
1208
1209 case nir_op_fabs:
1210 case nir_op_iabs:
1211 op[0].negate = false;
1212 op[0].abs = true;
1213 inst = bld.MOV(result, op[0]);
1214 break;
1215
1216 case nir_op_f2f32:
1217 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1218 elk_rnd_mode rnd =
1219 elk_rnd_mode_from_execution_mode(execution_mode);
1220 bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1221 elk_imm_d(rnd));
1222 }
1223
1224 if (op[0].type == ELK_REGISTER_TYPE_HF)
1225 assert(type_sz(result.type) < 8); /* elk_nir_lower_conversions */
1226
1227 inst = bld.MOV(result, op[0]);
1228 break;
1229
1230 case nir_op_fsign:
1231 emit_fsign(ntb, bld, instr, result, op, 0);
1232 break;
1233
1234 case nir_op_frcp:
1235 inst = bld.emit(ELK_SHADER_OPCODE_RCP, result, op[0]);
1236 break;
1237
1238 case nir_op_fexp2:
1239 inst = bld.emit(ELK_SHADER_OPCODE_EXP2, result, op[0]);
1240 break;
1241
1242 case nir_op_flog2:
1243 inst = bld.emit(ELK_SHADER_OPCODE_LOG2, result, op[0]);
1244 break;
1245
1246 case nir_op_fsin:
1247 inst = bld.emit(ELK_SHADER_OPCODE_SIN, result, op[0]);
1248 break;
1249
1250 case nir_op_fcos:
1251 inst = bld.emit(ELK_SHADER_OPCODE_COS, result, op[0]);
1252 break;
1253
1254 case nir_op_fddx_fine:
1255 inst = bld.emit(ELK_FS_OPCODE_DDX_FINE, result, op[0]);
1256 break;
1257 case nir_op_fddx:
1258 case nir_op_fddx_coarse:
1259 inst = bld.emit(ELK_FS_OPCODE_DDX_COARSE, result, op[0]);
1260 break;
1261 case nir_op_fddy_fine:
1262 inst = bld.emit(ELK_FS_OPCODE_DDY_FINE, result, op[0]);
1263 break;
1264 case nir_op_fddy:
1265 case nir_op_fddy_coarse:
1266 inst = bld.emit(ELK_FS_OPCODE_DDY_COARSE, result, op[0]);
1267 break;
1268
1269 case nir_op_fadd:
1270 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1271 elk_rnd_mode rnd =
1272 elk_rnd_mode_from_execution_mode(execution_mode);
1273 bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1274 elk_imm_d(rnd));
1275 }
1276 FALLTHROUGH;
1277 case nir_op_iadd:
1278 inst = bld.ADD(result, op[0], op[1]);
1279 break;
1280
1281 case nir_op_iadd3:
1282 inst = bld.ADD3(result, op[0], op[1], op[2]);
1283 break;
1284
1285 case nir_op_iadd_sat:
1286 case nir_op_uadd_sat:
1287 inst = bld.ADD(result, op[0], op[1]);
1288 inst->saturate = true;
1289 break;
1290
1291 case nir_op_isub_sat:
1292 bld.emit(ELK_SHADER_OPCODE_ISUB_SAT, result, op[0], op[1]);
1293 break;
1294
1295 case nir_op_usub_sat:
1296 bld.emit(ELK_SHADER_OPCODE_USUB_SAT, result, op[0], op[1]);
1297 break;
1298
1299 case nir_op_irhadd:
1300 case nir_op_urhadd:
1301 assert(instr->def.bit_size < 64);
1302 inst = bld.AVG(result, op[0], op[1]);
1303 break;
1304
1305 case nir_op_ihadd:
1306 case nir_op_uhadd: {
1307 assert(instr->def.bit_size < 64);
1308 elk_fs_reg tmp = bld.vgrf(result.type);
1309
1310 if (devinfo->ver >= 8) {
1311 op[0] = resolve_source_modifiers(bld, op[0]);
1312 op[1] = resolve_source_modifiers(bld, op[1]);
1313 }
1314
1315 /* AVG(x, y) - ((x ^ y) & 1) */
1316 bld.XOR(tmp, op[0], op[1]);
1317 bld.AND(tmp, tmp, retype(elk_imm_ud(1), result.type));
1318 bld.AVG(result, op[0], op[1]);
1319 inst = bld.ADD(result, result, tmp);
1320 inst->src[1].negate = true;
1321 break;
1322 }
1323
1324 case nir_op_fmul:
1325 for (unsigned i = 0; i < 2; i++) {
1326 if (can_fuse_fmul_fsign(instr, i)) {
1327 emit_fsign(ntb, bld, instr, result, op, i);
1328 return;
1329 }
1330 }
1331
1332 /* We emit the rounding mode after the previous fsign optimization since
1333 * it won't result in a MUL, but will try to negate the value by other
1334 * means.
1335 */
1336 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1337 elk_rnd_mode rnd =
1338 elk_rnd_mode_from_execution_mode(execution_mode);
1339 bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1340 elk_imm_d(rnd));
1341 }
1342
1343 inst = bld.MUL(result, op[0], op[1]);
1344 break;
1345
1346 case nir_op_imul_2x32_64:
1347 case nir_op_umul_2x32_64:
1348 bld.MUL(result, op[0], op[1]);
1349 break;
1350
1351 case nir_op_imul_32x16:
1352 case nir_op_umul_32x16: {
1353 const bool ud = instr->op == nir_op_umul_32x16;
1354 const enum elk_reg_type word_type =
1355 ud ? ELK_REGISTER_TYPE_UW : ELK_REGISTER_TYPE_W;
1356 const enum elk_reg_type dword_type =
1357 ud ? ELK_REGISTER_TYPE_UD : ELK_REGISTER_TYPE_D;
1358
1359 assert(instr->def.bit_size == 32);
1360
1361 /* Before copy propagation there are no immediate values. */
1362 assert(op[0].file != IMM && op[1].file != IMM);
1363
1364 op[1] = subscript(op[1], word_type, 0);
1365
1366 if (devinfo->ver >= 7)
1367 bld.MUL(result, retype(op[0], dword_type), op[1]);
1368 else
1369 bld.MUL(result, op[1], retype(op[0], dword_type));
1370
1371 break;
1372 }
1373
1374 case nir_op_imul:
1375 assert(instr->def.bit_size < 64);
1376 bld.MUL(result, op[0], op[1]);
1377 break;
1378
1379 case nir_op_imul_high:
1380 case nir_op_umul_high:
1381 assert(instr->def.bit_size < 64);
1382 if (instr->def.bit_size == 32) {
1383 bld.emit(ELK_SHADER_OPCODE_MULH, result, op[0], op[1]);
1384 } else {
1385 elk_fs_reg tmp = bld.vgrf(elk_reg_type_from_bit_size(32, op[0].type));
1386 bld.MUL(tmp, op[0], op[1]);
1387 bld.MOV(result, subscript(tmp, result.type, 1));
1388 }
1389 break;
1390
1391 case nir_op_idiv:
1392 case nir_op_udiv:
1393 assert(instr->def.bit_size < 64);
1394 bld.emit(ELK_SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
1395 break;
1396
1397 case nir_op_uadd_carry:
1398 unreachable("Should have been lowered by carry_to_arith().");
1399
1400 case nir_op_usub_borrow:
1401 unreachable("Should have been lowered by borrow_to_arith().");
1402
1403 case nir_op_umod:
1404 case nir_op_irem:
1405 /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
1406 * appears that our hardware just does the right thing for signed
1407 * remainder.
1408 */
1409 assert(instr->def.bit_size < 64);
1410 bld.emit(ELK_SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1411 break;
1412
1413 case nir_op_imod: {
1414 /* Get a regular C-style remainder. If a % b == 0, set the predicate. */
1415 bld.emit(ELK_SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1416
1417 /* Math instructions don't support conditional mod */
1418 inst = bld.MOV(bld.null_reg_d(), result);
1419 inst->conditional_mod = ELK_CONDITIONAL_NZ;
1420
1421 /* Now, we need to determine if signs of the sources are different.
1422 * When we XOR the sources, the top bit is 0 if they are the same and 1
1423 * if they are different. We can then use a conditional modifier to
1424 * turn that into a predicate. This leads us to an XOR.l instruction.
1425 *
1426 * Technically, according to the PRM, you're not allowed to use .l on a
1427 * XOR instruction. However, empirical experiments and Curro's reading
1428 * of the simulator source both indicate that it's safe.
1429 */
1430 elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_D);
1431 inst = bld.XOR(tmp, op[0], op[1]);
1432 inst->predicate = ELK_PREDICATE_NORMAL;
1433 inst->conditional_mod = ELK_CONDITIONAL_L;
1434
1435 /* If the result of the initial remainder operation is non-zero and the
1436 * two sources have different signs, add in a copy of op[1] to get the
1437 * final integer modulus value.
1438 */
1439 inst = bld.ADD(result, result, op[1]);
1440 inst->predicate = ELK_PREDICATE_NORMAL;
1441 break;
1442 }
1443
1444 case nir_op_flt32:
1445 case nir_op_fge32:
1446 case nir_op_feq32:
1447 case nir_op_fneu32: {
1448 elk_fs_reg dest = result;
1449
1450 const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1451 if (bit_size != 32) {
1452 dest = bld.vgrf(op[0].type, 1);
1453 bld.UNDEF(dest);
1454 }
1455
1456 bld.CMP(dest, op[0], op[1], elk_cmod_for_nir_comparison(instr->op));
1457
1458 if (bit_size > 32) {
1459 bld.MOV(result, subscript(dest, ELK_REGISTER_TYPE_UD, 0));
1460 } else if(bit_size < 32) {
1461 /* When we convert the result to 32-bit we need to be careful and do
1462 * it as a signed conversion to get sign extension (for 32-bit true)
1463 */
1464 const elk_reg_type src_type =
1465 elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_D);
1466
1467 bld.MOV(retype(result, ELK_REGISTER_TYPE_D), retype(dest, src_type));
1468 }
1469 break;
1470 }
1471
1472 case nir_op_ilt32:
1473 case nir_op_ult32:
1474 case nir_op_ige32:
1475 case nir_op_uge32:
1476 case nir_op_ieq32:
1477 case nir_op_ine32: {
1478 elk_fs_reg dest = result;
1479
1480 const uint32_t bit_size = type_sz(op[0].type) * 8;
1481 if (bit_size != 32) {
1482 dest = bld.vgrf(op[0].type, 1);
1483 bld.UNDEF(dest);
1484 }
1485
1486 bld.CMP(dest, op[0], op[1],
1487 elk_cmod_for_nir_comparison(instr->op));
1488
1489 if (bit_size > 32) {
1490 bld.MOV(result, subscript(dest, ELK_REGISTER_TYPE_UD, 0));
1491 } else if (bit_size < 32) {
1492 /* When we convert the result to 32-bit we need to be careful and do
1493 * it as a signed conversion to get sign extension (for 32-bit true)
1494 */
1495 const elk_reg_type src_type =
1496 elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_D);
1497
1498 bld.MOV(retype(result, ELK_REGISTER_TYPE_D), retype(dest, src_type));
1499 }
1500 break;
1501 }
1502
1503 case nir_op_inot:
1504 if (devinfo->ver >= 8) {
1505 nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src);
1506
1507 if (inot_src_instr != NULL &&
1508 (inot_src_instr->op == nir_op_ior ||
1509 inot_src_instr->op == nir_op_ixor ||
1510 inot_src_instr->op == nir_op_iand)) {
1511 /* The sources of the source logical instruction are now the
1512 * sources of the instruction that will be generated.
1513 */
1514 prepare_alu_destination_and_sources(ntb, bld, inot_src_instr, op, false);
1515 resolve_inot_sources(ntb, bld, inot_src_instr, op);
1516
1517 /* Smash all of the sources and destination to be signed. This
1518 * doesn't matter for the operation of the instruction, but cmod
1519 * propagation fails on unsigned sources with negation (due to
1520 * elk_fs_inst::can_do_cmod returning false).
1521 */
1522 result.type =
1523 elk_type_for_nir_type(devinfo,
1524 (nir_alu_type)(nir_type_int |
1525 instr->def.bit_size));
1526 op[0].type =
1527 elk_type_for_nir_type(devinfo,
1528 (nir_alu_type)(nir_type_int |
1529 nir_src_bit_size(inot_src_instr->src[0].src)));
1530 op[1].type =
1531 elk_type_for_nir_type(devinfo,
1532 (nir_alu_type)(nir_type_int |
1533 nir_src_bit_size(inot_src_instr->src[1].src)));
1534
1535 /* For XOR, only invert one of the sources. Arbitrarily choose
1536 * the first source.
1537 */
1538 op[0].negate = !op[0].negate;
1539 if (inot_src_instr->op != nir_op_ixor)
1540 op[1].negate = !op[1].negate;
1541
1542 switch (inot_src_instr->op) {
1543 case nir_op_ior:
1544 bld.AND(result, op[0], op[1]);
1545 return;
1546
1547 case nir_op_iand:
1548 bld.OR(result, op[0], op[1]);
1549 return;
1550
1551 case nir_op_ixor:
1552 bld.XOR(result, op[0], op[1]);
1553 return;
1554
1555 default:
1556 unreachable("impossible opcode");
1557 }
1558 }
1559 op[0] = resolve_source_modifiers(bld, op[0]);
1560 }
1561 bld.NOT(result, op[0]);
1562 break;
1563 case nir_op_ixor:
1564 if (devinfo->ver >= 8) {
1565 resolve_inot_sources(ntb, bld, instr, op);
1566 }
1567 bld.XOR(result, op[0], op[1]);
1568 break;
1569 case nir_op_ior:
1570 if (devinfo->ver >= 8) {
1571 resolve_inot_sources(ntb, bld, instr, op);
1572 }
1573 bld.OR(result, op[0], op[1]);
1574 break;
1575 case nir_op_iand:
1576 if (devinfo->ver >= 8) {
1577 resolve_inot_sources(ntb, bld, instr, op);
1578 }
1579 bld.AND(result, op[0], op[1]);
1580 break;
1581
1582 case nir_op_fdot2:
1583 case nir_op_fdot3:
1584 case nir_op_fdot4:
1585 case nir_op_b32all_fequal2:
1586 case nir_op_b32all_iequal2:
1587 case nir_op_b32all_fequal3:
1588 case nir_op_b32all_iequal3:
1589 case nir_op_b32all_fequal4:
1590 case nir_op_b32all_iequal4:
1591 case nir_op_b32any_fnequal2:
1592 case nir_op_b32any_inequal2:
1593 case nir_op_b32any_fnequal3:
1594 case nir_op_b32any_inequal3:
1595 case nir_op_b32any_fnequal4:
1596 case nir_op_b32any_inequal4:
1597 unreachable("Lowered by nir_lower_alu_reductions");
1598
1599 case nir_op_ldexp:
1600 unreachable("not reached: should be handled by ldexp_to_arith()");
1601
1602 case nir_op_fsqrt:
1603 inst = bld.emit(ELK_SHADER_OPCODE_SQRT, result, op[0]);
1604 break;
1605
1606 case nir_op_frsq:
1607 inst = bld.emit(ELK_SHADER_OPCODE_RSQ, result, op[0]);
1608 break;
1609
1610 case nir_op_ftrunc:
1611 inst = bld.RNDZ(result, op[0]);
1612 if (devinfo->ver < 6) {
1613 set_condmod(ELK_CONDITIONAL_R, inst);
1614 set_predicate(ELK_PREDICATE_NORMAL,
1615 bld.ADD(result, result, elk_imm_f(1.0f)));
1616 inst = bld.MOV(result, result); /* for potential saturation */
1617 }
1618 break;
1619
1620 case nir_op_fceil: {
1621 op[0].negate = !op[0].negate;
1622 elk_fs_reg temp = s.vgrf(glsl_float_type());
1623 bld.RNDD(temp, op[0]);
1624 temp.negate = true;
1625 inst = bld.MOV(result, temp);
1626 break;
1627 }
1628 case nir_op_ffloor:
1629 inst = bld.RNDD(result, op[0]);
1630 break;
1631 case nir_op_ffract:
1632 inst = bld.FRC(result, op[0]);
1633 break;
1634 case nir_op_fround_even:
1635 inst = bld.RNDE(result, op[0]);
1636 if (devinfo->ver < 6) {
1637 set_condmod(ELK_CONDITIONAL_R, inst);
1638 set_predicate(ELK_PREDICATE_NORMAL,
1639 bld.ADD(result, result, elk_imm_f(1.0f)));
1640 inst = bld.MOV(result, result); /* for potential saturation */
1641 }
1642 break;
1643
1644 case nir_op_fquantize2f16: {
1645 elk_fs_reg tmp16 = bld.vgrf(ELK_REGISTER_TYPE_D);
1646 elk_fs_reg tmp32 = bld.vgrf(ELK_REGISTER_TYPE_F);
1647 elk_fs_reg zero = bld.vgrf(ELK_REGISTER_TYPE_F);
1648
1649 /* The destination stride must be at least as big as the source stride. */
1650 tmp16 = subscript(tmp16, ELK_REGISTER_TYPE_HF, 0);
1651
1652 /* Check for denormal */
1653 elk_fs_reg abs_src0 = op[0];
1654 abs_src0.abs = true;
1655 bld.CMP(bld.null_reg_f(), abs_src0, elk_imm_f(ldexpf(1.0, -14)),
1656 ELK_CONDITIONAL_L);
1657 /* Get the appropriately signed zero */
1658 bld.AND(retype(zero, ELK_REGISTER_TYPE_UD),
1659 retype(op[0], ELK_REGISTER_TYPE_UD),
1660 elk_imm_ud(0x80000000));
1661 /* Do the actual F32 -> F16 -> F32 conversion */
1662 bld.F32TO16(tmp16, op[0]);
1663 bld.F16TO32(tmp32, tmp16);
1664 /* Select that or zero based on normal status */
1665 inst = bld.SEL(result, zero, tmp32);
1666 inst->predicate = ELK_PREDICATE_NORMAL;
1667 break;
1668 }
1669
1670 case nir_op_imin:
1671 case nir_op_umin:
1672 case nir_op_fmin:
1673 inst = bld.emit_minmax(result, op[0], op[1], ELK_CONDITIONAL_L);
1674 break;
1675
1676 case nir_op_imax:
1677 case nir_op_umax:
1678 case nir_op_fmax:
1679 inst = bld.emit_minmax(result, op[0], op[1], ELK_CONDITIONAL_GE);
1680 break;
1681
1682 case nir_op_pack_snorm_2x16:
1683 case nir_op_pack_snorm_4x8:
1684 case nir_op_pack_unorm_2x16:
1685 case nir_op_pack_unorm_4x8:
1686 case nir_op_unpack_snorm_2x16:
1687 case nir_op_unpack_snorm_4x8:
1688 case nir_op_unpack_unorm_2x16:
1689 case nir_op_unpack_unorm_4x8:
1690 case nir_op_unpack_half_2x16:
1691 case nir_op_pack_half_2x16:
1692 unreachable("not reached: should be handled by lower_packing_builtins");
1693
1694 case nir_op_unpack_half_2x16_split_x_flush_to_zero:
1695 assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode);
1696 FALLTHROUGH;
1697 case nir_op_unpack_half_2x16_split_x:
1698 inst = bld.F16TO32(result, subscript(op[0], ELK_REGISTER_TYPE_HF, 0));
1699 break;
1700
1701 case nir_op_unpack_half_2x16_split_y_flush_to_zero:
1702 assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode);
1703 FALLTHROUGH;
1704 case nir_op_unpack_half_2x16_split_y:
1705 inst = bld.F16TO32(result, subscript(op[0], ELK_REGISTER_TYPE_HF, 1));
1706 break;
1707
1708 case nir_op_pack_64_2x32_split:
1709 case nir_op_pack_32_2x16_split:
1710 bld.emit(ELK_FS_OPCODE_PACK, result, op[0], op[1]);
1711 break;
1712
1713 case nir_op_pack_32_4x8_split:
1714 bld.emit(ELK_FS_OPCODE_PACK, result, op, 4);
1715 break;
1716
1717 case nir_op_unpack_64_2x32_split_x:
1718 case nir_op_unpack_64_2x32_split_y: {
1719 if (instr->op == nir_op_unpack_64_2x32_split_x)
1720 bld.MOV(result, subscript(op[0], ELK_REGISTER_TYPE_UD, 0));
1721 else
1722 bld.MOV(result, subscript(op[0], ELK_REGISTER_TYPE_UD, 1));
1723 break;
1724 }
1725
1726 case nir_op_unpack_32_2x16_split_x:
1727 case nir_op_unpack_32_2x16_split_y: {
1728 if (instr->op == nir_op_unpack_32_2x16_split_x)
1729 bld.MOV(result, subscript(op[0], ELK_REGISTER_TYPE_UW, 0));
1730 else
1731 bld.MOV(result, subscript(op[0], ELK_REGISTER_TYPE_UW, 1));
1732 break;
1733 }
1734
1735 case nir_op_fpow:
1736 inst = bld.emit(ELK_SHADER_OPCODE_POW, result, op[0], op[1]);
1737 break;
1738
1739 case nir_op_bitfield_reverse:
1740 assert(instr->def.bit_size == 32);
1741 assert(nir_src_bit_size(instr->src[0].src) == 32);
1742 bld.BFREV(result, op[0]);
1743 break;
1744
1745 case nir_op_bit_count:
1746 assert(instr->def.bit_size == 32);
1747 assert(nir_src_bit_size(instr->src[0].src) < 64);
1748 bld.CBIT(result, op[0]);
1749 break;
1750
1751 case nir_op_uclz:
1752 assert(instr->def.bit_size == 32);
1753 assert(nir_src_bit_size(instr->src[0].src) == 32);
1754 bld.LZD(retype(result, ELK_REGISTER_TYPE_UD), op[0]);
1755 break;
1756
1757 case nir_op_ifind_msb: {
1758 assert(instr->def.bit_size == 32);
1759 assert(nir_src_bit_size(instr->src[0].src) == 32);
1760 assert(devinfo->ver >= 7);
1761
1762 bld.FBH(retype(result, ELK_REGISTER_TYPE_UD), op[0]);
1763
1764 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1765 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1766 * subtract the result from 31 to convert the MSB count into an LSB
1767 * count.
1768 */
1769 bld.CMP(bld.null_reg_d(), result, elk_imm_d(-1), ELK_CONDITIONAL_NZ);
1770
1771 inst = bld.ADD(result, result, elk_imm_d(31));
1772 inst->predicate = ELK_PREDICATE_NORMAL;
1773 inst->src[0].negate = true;
1774 break;
1775 }
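   /* Worked example (values chosen for illustration): for op[0] = 0x00000100,
    * FBH counts from the MSB side and returns 23; the predicated ADD with a
    * negated source then computes 31 - 23 = 8, which is what findMSB()
    * expects.  For op[0] = 0, FBH returns 0xFFFFFFFF, the CMP leaves the
    * predicate clear, and the -1 passes through unchanged.
    */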
1776
1777 case nir_op_find_lsb:
1778 assert(instr->def.bit_size == 32);
1779 assert(nir_src_bit_size(instr->src[0].src) == 32);
1780 assert(devinfo->ver >= 7);
1781 bld.FBL(result, op[0]);
1782 break;
1783
1784 case nir_op_ubitfield_extract:
1785 case nir_op_ibitfield_extract:
1786 unreachable("should have been lowered");
1787 case nir_op_ubfe:
1788 case nir_op_ibfe:
1789 assert(instr->def.bit_size < 64);
1790 bld.BFE(result, op[2], op[1], op[0]);
1791 break;
1792 case nir_op_bfm:
1793 assert(instr->def.bit_size < 64);
1794 bld.BFI1(result, op[0], op[1]);
1795 break;
1796 case nir_op_bfi:
1797 assert(instr->def.bit_size < 64);
1798
1799 /* bfi is ((...) | (~src0 & src2)). The second part is zero when src2 is
1800 * either 0 or src0. Replacing the 0 with another value can eliminate a
1801 * temporary register.
1802 */
1803 if (is_const_zero(instr->src[2].src))
1804 bld.BFI2(result, op[0], op[1], op[0]);
1805 else
1806 bld.BFI2(result, op[0], op[1], op[2]);
1807
1808 break;
1809
1810 case nir_op_bitfield_insert:
1811 unreachable("not reached: should have been lowered");
1812
1813 /* With regards to implicit masking of the shift counts for 8- and 16-bit
1814 * types, the PRMs are **incorrect**. They falsely state that on Gen9+ only
1815 * the low bits of src1 matching the size of src0 (e.g., 4-bits for W or UW
1816 * src0) are used. The Bspec (backed by data from experimentation) states
1817 * that 0x3f is used for Q and UQ types, and 0x1f is used for **all** other
1818 * types.
1819 *
1820 * To match the behavior expected for the NIR opcodes, explicit masks for
1821 * 8- and 16-bit types must be added.
1822 */
1823 case nir_op_ishl:
1824 if (instr->def.bit_size < 32) {
1825 bld.AND(result, op[1], elk_imm_ud(instr->def.bit_size - 1));
1826 bld.SHL(result, op[0], result);
1827 } else {
1828 bld.SHL(result, op[0], op[1]);
1829 }
1830
1831 break;
1832 case nir_op_ishr:
1833 if (instr->def.bit_size < 32) {
1834 bld.AND(result, op[1], elk_imm_ud(instr->def.bit_size - 1));
1835 bld.ASR(result, op[0], result);
1836 } else {
1837 bld.ASR(result, op[0], op[1]);
1838 }
1839
1840 break;
1841 case nir_op_ushr:
1842 if (instr->def.bit_size < 32) {
1843 bld.AND(result, op[1], elk_imm_ud(instr->def.bit_size - 1));
1844 bld.SHR(result, op[0], result);
1845 } else {
1846 bld.SHR(result, op[0], op[1]);
1847 }
1848
1849 break;
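   /* Worked example (values chosen for illustration): for a 16-bit ishl with
    * a shift count of 17, the NIR opcode expects a shift by 17 & 15 = 1, but
    * the hardware would mask the count with 0x1f and shift by 17.  The AND
    * with (bit_size - 1) in the 8- and 16-bit paths above restores the NIR
    * behavior; 32-bit shifts already match the hardware masking.
    */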
1850
1851 case nir_op_urol:
1852 bld.ROL(result, op[0], op[1]);
1853 break;
1854 case nir_op_uror:
1855 bld.ROR(result, op[0], op[1]);
1856 break;
1857
1858 case nir_op_pack_half_2x16_split:
1859 bld.emit(ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
1860 break;
1861
1862 case nir_op_sdot_4x8_iadd:
1863 case nir_op_sdot_4x8_iadd_sat:
1864 inst = bld.DP4A(retype(result, ELK_REGISTER_TYPE_D),
1865 retype(op[2], ELK_REGISTER_TYPE_D),
1866 retype(op[0], ELK_REGISTER_TYPE_D),
1867 retype(op[1], ELK_REGISTER_TYPE_D));
1868
1869 if (instr->op == nir_op_sdot_4x8_iadd_sat)
1870 inst->saturate = true;
1871 break;
1872
1873 case nir_op_udot_4x8_uadd:
1874 case nir_op_udot_4x8_uadd_sat:
1875 inst = bld.DP4A(retype(result, ELK_REGISTER_TYPE_UD),
1876 retype(op[2], ELK_REGISTER_TYPE_UD),
1877 retype(op[0], ELK_REGISTER_TYPE_UD),
1878 retype(op[1], ELK_REGISTER_TYPE_UD));
1879
1880 if (instr->op == nir_op_udot_4x8_uadd_sat)
1881 inst->saturate = true;
1882 break;
1883
1884 case nir_op_sudot_4x8_iadd:
1885 case nir_op_sudot_4x8_iadd_sat:
1886 inst = bld.DP4A(retype(result, ELK_REGISTER_TYPE_D),
1887 retype(op[2], ELK_REGISTER_TYPE_D),
1888 retype(op[0], ELK_REGISTER_TYPE_D),
1889 retype(op[1], ELK_REGISTER_TYPE_UD));
1890
1891 if (instr->op == nir_op_sudot_4x8_iadd_sat)
1892 inst->saturate = true;
1893 break;
1894
1895 case nir_op_ffma:
1896 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1897 elk_rnd_mode rnd =
1898 elk_rnd_mode_from_execution_mode(execution_mode);
1899 bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1900 elk_imm_d(rnd));
1901 }
1902
1903 inst = bld.MAD(result, op[2], op[1], op[0]);
1904 break;
1905
1906 case nir_op_flrp:
1907 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1908 elk_rnd_mode rnd =
1909 elk_rnd_mode_from_execution_mode(execution_mode);
1910 bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1911 elk_imm_d(rnd));
1912 }
1913
1914 inst = bld.LRP(result, op[0], op[1], op[2]);
1915 break;
1916
1917 case nir_op_b32csel:
1918 if (optimize_frontfacing_ternary(ntb, instr, result))
1919 return;
1920
1921 bld.CMP(bld.null_reg_d(), op[0], elk_imm_d(0), ELK_CONDITIONAL_NZ);
1922 inst = bld.SEL(result, op[1], op[2]);
1923 inst->predicate = ELK_PREDICATE_NORMAL;
1924 break;
1925
1926 case nir_op_extract_u8:
1927 case nir_op_extract_i8: {
1928 unsigned byte = nir_src_as_uint(instr->src[1].src);
1929
1930 /* The PRMs say:
1931 *
1932 * BDW+
1933 * There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
1934 * Use two instructions and a word or DWord intermediate integer type.
1935 */
1936 if (instr->def.bit_size == 64) {
1937 const elk_reg_type type = elk_int_type(1, instr->op == nir_op_extract_i8);
1938
1939 if (instr->op == nir_op_extract_i8) {
1940 /* If we need to sign extend, extract to a word first */
1941 elk_fs_reg w_temp = bld.vgrf(ELK_REGISTER_TYPE_W);
1942 bld.MOV(w_temp, subscript(op[0], type, byte));
1943 bld.MOV(result, w_temp);
1944 } else if (byte & 1) {
1945 /* Extract the high byte from the word containing the desired byte
1946 * offset.
1947 */
1948 bld.SHR(result,
1949 subscript(op[0], ELK_REGISTER_TYPE_UW, byte / 2),
1950 elk_imm_uw(8));
1951 } else {
1952 /* Otherwise use an AND with 0xff and a word type */
1953 bld.AND(result,
1954 subscript(op[0], ELK_REGISTER_TYPE_UW, byte / 2),
1955 elk_imm_uw(0xff));
1956 }
1957 } else {
1958 const elk_reg_type type = elk_int_type(1, instr->op == nir_op_extract_i8);
1959 bld.MOV(result, subscript(op[0], type, byte));
1960 }
1961 break;
1962 }
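   /* Worked example (values chosen for illustration): for a 64-bit
    * nir_op_extract_u8 with byte == 3, the byte sits in the high half of UW
    * word 1, so the SHR path reads subscript(op[0], UW, 1) and shifts right
    * by 8; for byte == 2 the AND path masks the same word with 0xff instead.
    */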
1963
1964 case nir_op_extract_u16:
1965 case nir_op_extract_i16: {
1966 const elk_reg_type type = elk_int_type(2, instr->op == nir_op_extract_i16);
1967 unsigned word = nir_src_as_uint(instr->src[1].src);
1968 bld.MOV(result, subscript(op[0], type, word));
1969 break;
1970 }
1971
1972 default:
1973 unreachable("unhandled instruction");
1974 }
1975
1976 /* If we need to do a boolean resolve, replace the result with -(x & 1)
1977 * to sign extend the low bit to 0/~0
1978 */
1979 if (devinfo->ver <= 5 &&
1980 !result.is_null() &&
1981 (instr->instr.pass_flags & ELK_NIR_BOOLEAN_MASK) == ELK_NIR_BOOLEAN_NEEDS_RESOLVE) {
1982 elk_fs_reg masked = s.vgrf(glsl_int_type());
1983 bld.AND(masked, result, elk_imm_d(1));
1984 masked.negate = true;
1985 bld.MOV(retype(result, ELK_REGISTER_TYPE_D), masked);
1986 }
1987 }
1988
1989 static void
1990 fs_nir_emit_load_const(nir_to_elk_state &ntb,
1991 nir_load_const_instr *instr)
1992 {
1993 const intel_device_info *devinfo = ntb.devinfo;
1994 const fs_builder &bld = ntb.bld;
1995
1996 const elk_reg_type reg_type =
1997 elk_reg_type_from_bit_size(instr->def.bit_size, ELK_REGISTER_TYPE_D);
1998 elk_fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
1999
2000 switch (instr->def.bit_size) {
2001 case 8:
2002 for (unsigned i = 0; i < instr->def.num_components; i++)
2003 bld.MOV(offset(reg, bld, i), elk_setup_imm_b(bld, instr->value[i].i8));
2004 break;
2005
2006 case 16:
2007 for (unsigned i = 0; i < instr->def.num_components; i++)
2008 bld.MOV(offset(reg, bld, i), elk_imm_w(instr->value[i].i16));
2009 break;
2010
2011 case 32:
2012 for (unsigned i = 0; i < instr->def.num_components; i++)
2013 bld.MOV(offset(reg, bld, i), elk_imm_d(instr->value[i].i32));
2014 break;
2015
2016 case 64:
2017 assert(devinfo->ver >= 7);
2018 if (!devinfo->has_64bit_int) {
2019 for (unsigned i = 0; i < instr->def.num_components; i++) {
2020 bld.MOV(retype(offset(reg, bld, i), ELK_REGISTER_TYPE_DF),
2021 elk_setup_imm_df(bld, instr->value[i].f64));
2022 }
2023 } else {
2024 for (unsigned i = 0; i < instr->def.num_components; i++)
2025 bld.MOV(offset(reg, bld, i), elk_imm_q(instr->value[i].i64));
2026 }
2027 break;
2028
2029 default:
2030 unreachable("Invalid bit size");
2031 }
2032
2033 ntb.ssa_values[instr->def.index] = reg;
2034 }
2035
2036 static bool
2037 get_nir_src_bindless(nir_to_elk_state &ntb, const nir_src &src)
2038 {
2039 return ntb.ssa_bind_infos[src.ssa->index].bindless;
2040 }
2041
2042 static bool
2043 is_resource_src(nir_src src)
2044 {
2045 return src.ssa->parent_instr->type == nir_instr_type_intrinsic &&
2046 nir_instr_as_intrinsic(src.ssa->parent_instr)->intrinsic == nir_intrinsic_resource_intel;
2047 }
2048
2049 static elk_fs_reg
2050 get_resource_nir_src(nir_to_elk_state &ntb, const nir_src &src)
2051 {
2052 if (!is_resource_src(src))
2053 return elk_fs_reg();
2054 return ntb.resource_values[src.ssa->index];
2055 }
2056
2057 static elk_fs_reg
2058 get_nir_src(nir_to_elk_state &ntb, const nir_src &src)
2059 {
2060 const intel_device_info *devinfo = ntb.devinfo;
2061
2062 nir_intrinsic_instr *load_reg = nir_load_reg_for_def(src.ssa);
2063
2064 elk_fs_reg reg;
2065 if (!load_reg) {
2066 if (nir_src_is_undef(src)) {
2067 const elk_reg_type reg_type =
2068 elk_reg_type_from_bit_size(src.ssa->bit_size,
2069 ELK_REGISTER_TYPE_D);
2070 reg = ntb.bld.vgrf(reg_type, src.ssa->num_components);
2071 } else {
2072 reg = ntb.ssa_values[src.ssa->index];
2073 }
2074 } else {
2075 nir_intrinsic_instr *decl_reg = nir_reg_get_decl(load_reg->src[0].ssa);
2076 /* We don't handle indirects on locals */
2077 assert(nir_intrinsic_base(load_reg) == 0);
2078 assert(load_reg->intrinsic != nir_intrinsic_load_reg_indirect);
2079 reg = ntb.ssa_values[decl_reg->def.index];
2080 }
2081
2082 if (nir_src_bit_size(src) == 64 && devinfo->ver == 7) {
2083 /* The only 64-bit type available on gfx7 is DF, so use that. */
2084 reg.type = ELK_REGISTER_TYPE_DF;
2085 } else {
2086 /* To avoid floating-point denorm flushing problems, set the type by
2087 * default to an integer type - instructions that need floating point
2088 * semantics will set this to F if they need to
2089 */
2090 reg.type = elk_reg_type_from_bit_size(nir_src_bit_size(src),
2091 ELK_REGISTER_TYPE_D);
2092 }
2093
2094 return reg;
2095 }
2096
2097 /**
2098 * Return an IMM for constants; otherwise call get_nir_src() as normal.
2099 *
2100 * This function should not be called on any value which may be 64 bits.
2101 * We could theoretically support 64-bit on gfx8+ but we choose not to
2102 * because it wouldn't work in general (no gfx7 support) and there are
2103 * enough restrictions in 64-bit immediates that you can't take the return
2104 * value and treat it the same as the result of get_nir_src().
2105 */
2106 static elk_fs_reg
2107 get_nir_src_imm(nir_to_elk_state &ntb, const nir_src &src)
2108 {
2109 assert(nir_src_bit_size(src) == 32);
2110 return nir_src_is_const(src) ?
2111 elk_fs_reg(elk_imm_d(nir_src_as_int(src))) : get_nir_src(ntb, src);
2112 }
2113
2114 static elk_fs_reg
2115 get_nir_def(nir_to_elk_state &ntb, const nir_def &def)
2116 {
2117 const fs_builder &bld = ntb.bld;
2118
2119 nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
2120 if (!store_reg) {
2121 const elk_reg_type reg_type =
2122 elk_reg_type_from_bit_size(def.bit_size,
2123 def.bit_size == 8 ?
2124 ELK_REGISTER_TYPE_D :
2125 ELK_REGISTER_TYPE_F);
2126 ntb.ssa_values[def.index] =
2127 bld.vgrf(reg_type, def.num_components);
2128 bld.UNDEF(ntb.ssa_values[def.index]);
2129 return ntb.ssa_values[def.index];
2130 } else {
2131 nir_intrinsic_instr *decl_reg =
2132 nir_reg_get_decl(store_reg->src[1].ssa);
2133 /* We don't handle indirects on locals */
2134 assert(nir_intrinsic_base(store_reg) == 0);
2135 assert(store_reg->intrinsic != nir_intrinsic_store_reg_indirect);
2136 return ntb.ssa_values[decl_reg->def.index];
2137 }
2138 }
2139
2140 static nir_component_mask_t
2141 get_nir_write_mask(const nir_def &def)
2142 {
2143 nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
2144 if (!store_reg) {
2145 return nir_component_mask(def.num_components);
2146 } else {
2147 return nir_intrinsic_write_mask(store_reg);
2148 }
2149 }
2150
2151 static elk_fs_inst *
2152 emit_pixel_interpolater_send(const fs_builder &bld,
2153 enum elk_opcode opcode,
2154 const elk_fs_reg &dst,
2155 const elk_fs_reg &src,
2156 const elk_fs_reg &desc,
2157 const elk_fs_reg &flag_reg,
2158 glsl_interp_mode interpolation)
2159 {
2160 struct elk_wm_prog_data *wm_prog_data =
2161 elk_wm_prog_data(bld.shader->stage_prog_data);
2162
2163 elk_fs_reg srcs[INTERP_NUM_SRCS];
2164 srcs[INTERP_SRC_OFFSET] = src;
2165 srcs[INTERP_SRC_MSG_DESC] = desc;
2166 srcs[INTERP_SRC_DYNAMIC_MODE] = flag_reg;
2167
2168 elk_fs_inst *inst = bld.emit(opcode, dst, srcs, INTERP_NUM_SRCS);
2169 /* 2 floats per slot returned */
2170 inst->size_written = 2 * dst.component_size(inst->exec_size);
2171 if (interpolation == INTERP_MODE_NOPERSPECTIVE) {
2172 inst->pi_noperspective = true;
2173 /* TGL BSpec says:
2174 * This field cannot be set to "Linear Interpolation"
2175 * unless Non-Perspective Barycentric Enable in 3DSTATE_CLIP is enabled"
2176 */
2177 wm_prog_data->uses_nonperspective_interp_modes = true;
2178 }
2179
2180 wm_prog_data->pulls_bary = true;
2181
2182 return inst;
2183 }
2184
2185 /**
2186 * Computes 1 << x, given a D/UD register containing some value x.
2187 */
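/* Worked example (values chosen for illustration): the helper below simply
 * materializes the constant 1 and shifts it left by x, so x = 5 yields 32.
 * It is used further down to turn a bit index (a cut bit or a control-data
 * DWord index) into a single-bit mask.
 */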
2188 static elk_fs_reg
2189 intexp2(const fs_builder &bld, const elk_fs_reg &x)
2190 {
2191 assert(x.type == ELK_REGISTER_TYPE_UD || x.type == ELK_REGISTER_TYPE_D);
2192
2193 elk_fs_reg result = bld.vgrf(x.type, 1);
2194 elk_fs_reg one = bld.vgrf(x.type, 1);
2195
2196 bld.MOV(one, retype(elk_imm_d(1), one.type));
2197 bld.SHL(result, one, x);
2198 return result;
2199 }
2200
2201 static void
2202 emit_gs_end_primitive(nir_to_elk_state &ntb, const nir_src &vertex_count_nir_src)
2203 {
2204 elk_fs_visitor &s = ntb.s;
2205 assert(s.stage == MESA_SHADER_GEOMETRY);
2206
2207 struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(s.prog_data);
2208
2209 if (s.gs_compile->control_data_header_size_bits == 0)
2210 return;
2211
2212 /* We can only do EndPrimitive() functionality when the control data
2213 * consists of cut bits. Fortunately, the only time it isn't is when the
2214 * output type is points, in which case EndPrimitive() is a no-op.
2215 */
2216 if (gs_prog_data->control_data_format !=
2217 GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
2218 return;
2219 }
2220
2221 /* Cut bits use one bit per vertex. */
2222 assert(s.gs_compile->control_data_bits_per_vertex == 1);
2223
2224 elk_fs_reg vertex_count = get_nir_src(ntb, vertex_count_nir_src);
2225 vertex_count.type = ELK_REGISTER_TYPE_UD;
2226
2227 /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
2228 * vertex n, 0 otherwise. So all we need to do here is mark bit
2229 * (vertex_count - 1) % 32 in the cut_bits register to indicate that
2230 * EndPrimitive() was called after emitting vertex (vertex_count - 1);
2231 * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
2232 *
2233 * Note that if EndPrimitive() is called before emitting any vertices, this
2234 * will cause us to set bit 31 of the control_data_bits register to 1.
2235 * That's fine because:
2236 *
2237 * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
2238 * output, so the hardware will ignore cut bit 31.
2239 *
2240 * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
2241 * last vertex, so setting cut bit 31 has no effect (since the primitive
2242 * is automatically ended when the GS terminates).
2243 *
2244 * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
2245 * control_data_bits register to 0 when the first vertex is emitted.
2246 */
2247
2248 const fs_builder abld = ntb.bld.annotate("end primitive");
2249
2250 /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
2251 elk_fs_reg prev_count = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2252 abld.ADD(prev_count, vertex_count, elk_imm_ud(0xffffffffu));
2253 elk_fs_reg mask = intexp2(abld, prev_count);
2254 /* Note: we're relying on the fact that the GEN SHL instruction only pays
2255 * attention to the lower 5 bits of its second source argument, so on this
2256 * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
2257 * ((vertex_count - 1) % 32).
2258 */
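   /* Worked example (values chosen for illustration): with vertex_count = 35,
    * prev_count is 34 and intexp2() produces 1 << (34 & 31) = 1 << 2, so cut
    * bit 2 of the current 32-bit batch of control data bits gets set below.
    */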
2259 abld.OR(s.control_data_bits, s.control_data_bits, mask);
2260 }
2261
2262 void
2263 elk_fs_visitor::emit_gs_control_data_bits(const elk_fs_reg &vertex_count)
2264 {
2265 assert(stage == MESA_SHADER_GEOMETRY);
2266 assert(gs_compile->control_data_bits_per_vertex != 0);
2267
2268 struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(prog_data);
2269
2270 const fs_builder bld = fs_builder(this).at_end();
2271 const fs_builder abld = bld.annotate("emit control data bits");
2272 const fs_builder fwa_bld = bld.exec_all();
2273
2274 /* We use a single UD register to accumulate control data bits (32 bits
2275 * for each of the SIMD8 channels). So we need to write a DWord (32 bits)
2276 * at a time.
2277 *
2278 * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
2279 * We have to select a 128-bit group via the Global and Per-Slot Offsets, then
2280 * use the Channel Mask phase to enable/disable which DWord within that
2281 * group to write. (Remember, different SIMD8 channels may have emitted
2282 * different numbers of vertices, so we may need per-slot offsets.)
2283 *
2284 * Channel masking presents an annoying problem: we may have to replicate
2285 * the data up to 4 times:
2286 *
2287 * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
2288 *
2289 * To avoid penalizing shaders that emit a small number of vertices, we
2290 * can avoid these sometimes: if the size of the control data header is
2291 * <= 128 bits, then there is only 1 OWord. All SIMD8 channels will land
2292 * in the same 128-bit group, so we can skip per-slot offsets.
2293 *
2294 * Similarly, if the control data header is <= 32 bits, there is only one
2295 * DWord, so we can skip channel masks.
2296 */
2297 elk_fs_reg channel_mask, per_slot_offset;
2298
2299 if (gs_compile->control_data_header_size_bits > 32)
2300 channel_mask = vgrf(glsl_uint_type());
2301
2302 if (gs_compile->control_data_header_size_bits > 128)
2303 per_slot_offset = vgrf(glsl_uint_type());
2304
2305 /* Figure out which DWord we're trying to write to using the formula:
2306 *
2307 * dword_index = (vertex_count - 1) * bits_per_vertex / 32
2308 *
2309 * Since bits_per_vertex is a power of two, and is known at compile
2310 * time, this can be optimized to:
2311 *
2312 * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
2313 */
2314 if (channel_mask.file != BAD_FILE || per_slot_offset.file != BAD_FILE) {
2315 elk_fs_reg dword_index = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2316 elk_fs_reg prev_count = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2317 abld.ADD(prev_count, vertex_count, elk_imm_ud(0xffffffffu));
2318 unsigned log2_bits_per_vertex =
2319 util_last_bit(gs_compile->control_data_bits_per_vertex);
2320 abld.SHR(dword_index, prev_count, elk_imm_ud(6u - log2_bits_per_vertex));
2321
2322 if (per_slot_offset.file != BAD_FILE) {
2323 /* Set the per-slot offset to dword_index / 4, so that we'll write to
2324 * the appropriate OWord within the control data header.
2325 */
2326 abld.SHR(per_slot_offset, dword_index, elk_imm_ud(2u));
2327 }
2328
2329 /* Set the channel masks to 1 << (dword_index % 4), so that we'll
2330 * write to the appropriate DWORD within the OWORD.
2331 */
2332 elk_fs_reg channel = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2333 fwa_bld.AND(channel, dword_index, elk_imm_ud(3u));
2334 channel_mask = intexp2(fwa_bld, channel);
2335 /* Then the channel masks need to be in bits 23:16. */
2336 fwa_bld.SHL(channel_mask, channel_mask, elk_imm_ud(16u));
2337 }
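   /* Worked example (values chosen for illustration): log2_bits_per_vertex
    * above is util_last_bit(), i.e. log2 + 1 for the power-of-two values 1
    * and 2, so the shift 6 - util_last_bit(bpv) equals 5 - log2(bpv).  With
    * 2 bits per vertex and vertex_count = 50: dword_index = 49 >> 4 = 3
    * (= 49 * 2 / 32), per_slot_offset = 3 >> 2 = 0, and channel_mask =
    * (1 << (3 & 3)) << 16 = 0x00080000.
    */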
2338
2339 /* If there are channel masks, add 3 extra copies of the data. */
2340 const unsigned length = 1 + 3 * unsigned(channel_mask.file != BAD_FILE);
2341 elk_fs_reg sources[4];
2342
2343 for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
2344 sources[i] = this->control_data_bits;
2345
2346 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2347 srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
2348 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offset;
2349 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = channel_mask;
2350 srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_F, length);
2351 srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(length);
2352 abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
2353
2354 elk_fs_inst *inst = abld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
2355 srcs, ARRAY_SIZE(srcs));
2356
2357 /* We need to increment Global Offset by 256-bits to make room for
2358 * Broadwell's extra "Vertex Count" payload at the beginning of the
2359 * URB entry. Since this is an OWord message, Global Offset is counted
2360 * in 128-bit units, so we must set it to 2.
2361 */
2362 if (gs_prog_data->static_vertex_count == -1)
2363 inst->offset = 2;
2364 }
2365
2366 static void
2367 set_gs_stream_control_data_bits(nir_to_elk_state &ntb, const elk_fs_reg &vertex_count,
2368 unsigned stream_id)
2369 {
2370 elk_fs_visitor &s = ntb.s;
2371
2372 /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
2373
2374 /* Note: we are calling this *before* increasing vertex_count, so
2375 * this->vertex_count == vertex_count - 1 in the formula above.
2376 */
2377
2378 /* Stream mode uses 2 bits per vertex */
2379 assert(s.gs_compile->control_data_bits_per_vertex == 2);
2380
2381 /* Must be a valid stream */
2382 assert(stream_id < 4); /* MAX_VERTEX_STREAMS */
2383
2384 /* Control data bits are initialized to 0 so we don't have to set any
2385 * bits when sending vertices to stream 0.
2386 */
2387 if (stream_id == 0)
2388 return;
2389
2390 const fs_builder abld = ntb.bld.annotate("set stream control data bits", NULL);
2391
2392 /* reg::sid = stream_id */
2393 elk_fs_reg sid = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2394 abld.MOV(sid, elk_imm_ud(stream_id));
2395
2396 /* reg:shift_count = 2 * (vertex_count - 1) */
2397 elk_fs_reg shift_count = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2398 abld.SHL(shift_count, vertex_count, elk_imm_ud(1u));
2399
2400 /* Note: we're relying on the fact that the GEN SHL instruction only pays
2401 * attention to the lower 5 bits of its second source argument, so on this
2402 * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
2403 * stream_id << ((2 * (vertex_count - 1)) % 32).
2404 */
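   /* Worked example (values chosen for illustration): with stream_id = 2 and
    * the incoming vertex_count register holding 2 (i.e. this is the third
    * vertex), shift_count = 2 * 2 = 4, so the OR below deposits 0b10 into
    * bits 5:4 of control_data_bits.
    */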
2405 elk_fs_reg mask = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2406 abld.SHL(mask, sid, shift_count);
2407 abld.OR(s.control_data_bits, s.control_data_bits, mask);
2408 }
2409
2410 static void
2411 emit_gs_vertex(nir_to_elk_state &ntb, const nir_src &vertex_count_nir_src,
2412 unsigned stream_id)
2413 {
2414 elk_fs_visitor &s = ntb.s;
2415
2416 assert(s.stage == MESA_SHADER_GEOMETRY);
2417
2418 struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(s.prog_data);
2419
2420 elk_fs_reg vertex_count = get_nir_src(ntb, vertex_count_nir_src);
2421 vertex_count.type = ELK_REGISTER_TYPE_UD;
2422
2423 /* Haswell and later hardware ignores the "Render Stream Select" bits
2424 * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
2425 * and instead sends all primitives down the pipeline for rasterization.
2426 * If the SOL stage is enabled, "Render Stream Select" is honored and
2427 * primitives bound to non-zero streams are discarded after stream output.
2428 *
2429 * Since the only purpose of primitives sent to non-zero streams is to
2430 * be recorded by transform feedback, we can simply discard all geometry
2431 * bound to these streams when transform feedback is disabled.
2432 */
2433 if (stream_id > 0 && !s.nir->info.has_transform_feedback_varyings)
2434 return;
2435
2436 /* If we're outputting 32 control data bits or less, then we can wait
2437 * until the shader is over to output them all. Otherwise we need to
2438 * output them as we go. Now is the time to do it, since we're about to
2439 * output the vertex_count'th vertex, so it's guaranteed that the
2440 * control data bits associated with the (vertex_count - 1)th vertex are
2441 * correct.
2442 */
2443 if (s.gs_compile->control_data_header_size_bits > 32) {
2444 const fs_builder abld =
2445 ntb.bld.annotate("emit vertex: emit control data bits");
2446
2447 /* Only emit control data bits if we've finished accumulating a batch
2448 * of 32 bits. This is the case when:
2449 *
2450 * (vertex_count * bits_per_vertex) % 32 == 0
2451 *
2452 * (in other words, when the last 5 bits of vertex_count *
2453 * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some
2454 * integer n (which is always the case, since bits_per_vertex is
2455 * always 1 or 2), this is equivalent to requiring that the last 5-n
2456 * bits of vertex_count are 0:
2457 *
2458 * vertex_count & (2^(5-n) - 1) == 0
2459 *
2460 * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
2461 * equivalent to:
2462 *
2463 * vertex_count & (32 / bits_per_vertex - 1) == 0
2464 *
2465 * TODO: If vertex_count is an immediate, we could do some of this math
2466 * at compile time...
2467 */
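      /* Worked example (values chosen for illustration): with 2 bits per
       * vertex the immediate below is 32 / 2 - 1 = 15, so the AND with
       * conditional mod fires the IF whenever vertex_count is a multiple of
       * 16, i.e. each time another full 32-bit batch of control data bits
       * has been accumulated.
       */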
2468 elk_fs_inst *inst =
2469 abld.AND(ntb.bld.null_reg_d(), vertex_count,
2470 elk_imm_ud(32u / s.gs_compile->control_data_bits_per_vertex - 1u));
2471 inst->conditional_mod = ELK_CONDITIONAL_Z;
2472
2473 abld.IF(ELK_PREDICATE_NORMAL);
2474 /* If vertex_count is 0, then no control data bits have been
2475 * accumulated yet, so we can skip emitting them.
2476 */
2477 abld.CMP(ntb.bld.null_reg_d(), vertex_count, elk_imm_ud(0u),
2478 ELK_CONDITIONAL_NEQ);
2479 abld.IF(ELK_PREDICATE_NORMAL);
2480 s.emit_gs_control_data_bits(vertex_count);
2481 abld.emit(ELK_OPCODE_ENDIF);
2482
2483 /* Reset control_data_bits to 0 so we can start accumulating a new
2484 * batch.
2485 *
2486 * Note: in the case where vertex_count == 0, this neutralizes the
2487 * effect of any call to EndPrimitive() that the shader may have
2488 * made before outputting its first vertex.
2489 */
2490 inst = abld.MOV(s.control_data_bits, elk_imm_ud(0u));
2491 inst->force_writemask_all = true;
2492 abld.emit(ELK_OPCODE_ENDIF);
2493 }
2494
2495 s.emit_urb_writes(vertex_count);
2496
2497 /* In stream mode we have to set control data bits for all vertices
2498 * unless we have disabled control data bits completely (which we do
2499 * for MESA_PRIM_POINTS outputs that don't use streams).
2500 */
2501 if (s.gs_compile->control_data_header_size_bits > 0 &&
2502 gs_prog_data->control_data_format ==
2503 GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
2504 set_gs_stream_control_data_bits(ntb, vertex_count, stream_id);
2505 }
2506 }
2507
2508 static void
2509 emit_gs_input_load(nir_to_elk_state &ntb, const elk_fs_reg &dst,
2510 const nir_src &vertex_src,
2511 unsigned base_offset,
2512 const nir_src &offset_src,
2513 unsigned num_components,
2514 unsigned first_component)
2515 {
2516 const intel_device_info *devinfo = ntb.devinfo;
2517 const fs_builder &bld = ntb.bld;
2518 elk_fs_visitor &s = ntb.s;
2519
2520 assert(type_sz(dst.type) == 4);
2521 struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(s.prog_data);
2522 const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
2523
2524 /* TODO: figure out push input layout for invocations > 1 */
2525 if (gs_prog_data->invocations == 1 &&
2526 nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) &&
2527 4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) {
2528 int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 +
2529 nir_src_as_uint(vertex_src) * push_reg_count;
2530 const elk_fs_reg attr = elk_fs_reg(ATTR, 0, dst.type);
2531 for (unsigned i = 0; i < num_components; i++) {
2532 ntb.bld.MOV(offset(dst, bld, i),
2533 offset(attr, bld, imm_offset + i + first_component));
2534 }
2535 return;
2536 }
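   /* Worked example (values chosen for illustration): in the push path above,
    * with urb_read_length = 2 (push_reg_count = 16), base_offset = 1, a
    * constant offset of 0 and vertex 2, imm_offset is 1 * 4 + 2 * 16 = 36,
    * so the MOVs read ATTR components starting at 36 + first_component.
    */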
2537
2538 /* Resort to the pull model. Ensure the VUE handles are provided. */
2539 assert(gs_prog_data->base.include_vue_handles);
2540
2541 elk_fs_reg start = s.gs_payload().icp_handle_start;
2542 elk_fs_reg icp_handle = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2543
2544 if (gs_prog_data->invocations == 1) {
2545 if (nir_src_is_const(vertex_src)) {
2546 /* The vertex index is constant; just select the proper URB handle. */
2547 icp_handle = offset(start, ntb.bld, nir_src_as_uint(vertex_src));
2548 } else {
2549 /* The vertex index is non-constant. We need to use indirect
2550 * addressing to fetch the proper URB handle.
2551 *
2552 * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
2553 * indicating that channel <n> should read the handle from
2554 * DWord <n>. We convert that to bytes by multiplying by 4.
2555 *
2556 * Next, we convert the vertex index to bytes by multiplying
2557 * by 32 (shifting by 5), and add the two together. This is
2558 * the final indirect byte offset.
2559 */
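         /* Worked example (values chosen for illustration): for channel 3
          * reading vertex 2, channel_offsets[3] = 3 * 4 = 12 bytes and
          * vertex_offset_bytes = 2 * 32 = 64 bytes, so MOV_INDIRECT fetches
          * the handle DWord at byte offset 76 from the start of the ICP
          * handles.
          */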
2560 elk_fs_reg sequence =
2561 ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
2562 elk_fs_reg channel_offsets = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2563 elk_fs_reg vertex_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2564 elk_fs_reg icp_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2565
2566 /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
2567 bld.SHL(channel_offsets, sequence, elk_imm_ud(2u));
2568 /* Convert vertex_index to bytes (multiply by 32) */
2569 bld.SHL(vertex_offset_bytes,
2570 retype(get_nir_src(ntb, vertex_src), ELK_REGISTER_TYPE_UD),
2571 elk_imm_ud(5u));
2572 bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2573
2574 /* Use the start of the ICP handles as the base offset. There is one register
2575 * of URB handles per vertex, so inform the register allocator that
2576 * we might read up to nir->info.gs.vertices_in registers.
2577 */
2578 bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
2579 elk_fs_reg(icp_offset_bytes),
2580 elk_imm_ud(s.nir->info.gs.vertices_in * REG_SIZE));
2581 }
2582 } else {
2583 assert(gs_prog_data->invocations > 1);
2584
2585 if (nir_src_is_const(vertex_src)) {
2586 unsigned vertex = nir_src_as_uint(vertex_src);
2587 assert(devinfo->ver >= 9 || vertex <= 5);
2588 bld.MOV(icp_handle, component(start, vertex));
2589 } else {
2590 /* The vertex index is non-constant. We need to use indirect
2591 * addressing to fetch the proper URB handle.
2592 *
2593 */
2594 elk_fs_reg icp_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2595
2596 /* Convert vertex_index to bytes (multiply by 4) */
2597 bld.SHL(icp_offset_bytes,
2598 retype(get_nir_src(ntb, vertex_src), ELK_REGISTER_TYPE_UD),
2599 elk_imm_ud(2u));
2600
2601 /* Use the start of the ICP handles as the base offset. There is one DWord
2602 * of URB handles per vertex, so inform the register allocator that
2603 * we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
2604 */
2605 bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
2606 elk_fs_reg(icp_offset_bytes),
2607 elk_imm_ud(DIV_ROUND_UP(s.nir->info.gs.vertices_in, 8) *
2608 REG_SIZE));
2609 }
2610 }
2611
2612 elk_fs_inst *inst;
2613 elk_fs_reg indirect_offset = get_nir_src(ntb, offset_src);
2614
2615 if (nir_src_is_const(offset_src)) {
2616 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2617 srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2618
2619 /* Constant indexing - use global offset. */
2620 if (first_component != 0) {
2621 unsigned read_components = num_components + first_component;
2622 elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2623 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs,
2624 ARRAY_SIZE(srcs));
2625 inst->size_written = read_components *
2626 tmp.component_size(inst->exec_size);
2627 for (unsigned i = 0; i < num_components; i++) {
2628 bld.MOV(offset(dst, bld, i),
2629 offset(tmp, bld, i + first_component));
2630 }
2631 } else {
2632 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs,
2633 ARRAY_SIZE(srcs));
2634 inst->size_written = num_components *
2635 dst.component_size(inst->exec_size);
2636 }
2637 inst->offset = base_offset + nir_src_as_uint(offset_src);
2638 } else {
2639 /* Indirect indexing - use per-slot offsets as well. */
2640 unsigned read_components = num_components + first_component;
2641 elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2642
2643 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2644 srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2645 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
2646
2647 if (first_component != 0) {
2648 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2649 srcs, ARRAY_SIZE(srcs));
2650 inst->size_written = read_components *
2651 tmp.component_size(inst->exec_size);
2652 for (unsigned i = 0; i < num_components; i++) {
2653 bld.MOV(offset(dst, bld, i),
2654 offset(tmp, bld, i + first_component));
2655 }
2656 } else {
2657 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst,
2658 srcs, ARRAY_SIZE(srcs));
2659 inst->size_written = num_components *
2660 dst.component_size(inst->exec_size);
2661 }
2662 inst->offset = base_offset;
2663 }
2664 }
2665
2666 static elk_fs_reg
2667 get_indirect_offset(nir_to_elk_state &ntb, nir_intrinsic_instr *instr)
2668 {
2669 nir_src *offset_src = nir_get_io_offset_src(instr);
2670
2671 if (nir_src_is_const(*offset_src)) {
2672 /* The only constant offset we should find is 0. elk_nir.c's
2673 * add_const_offset_to_base() will fold other constant offsets
2674 * into the "base" index.
2675 */
2676 assert(nir_src_as_uint(*offset_src) == 0);
2677 return elk_fs_reg();
2678 }
2679
2680 return get_nir_src(ntb, *offset_src);
2681 }
2682
2683 static void
2684 fs_nir_emit_vs_intrinsic(nir_to_elk_state &ntb,
2685 nir_intrinsic_instr *instr)
2686 {
2687 const fs_builder &bld = ntb.bld;
2688 elk_fs_visitor &s = ntb.s;
2689 assert(s.stage == MESA_SHADER_VERTEX);
2690
2691 elk_fs_reg dest;
2692 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2693 dest = get_nir_def(ntb, instr->def);
2694
2695 switch (instr->intrinsic) {
2696 case nir_intrinsic_load_vertex_id:
2697 case nir_intrinsic_load_base_vertex:
2698 unreachable("should be lowered by nir_lower_system_values()");
2699
2700 case nir_intrinsic_load_input: {
2701 assert(instr->def.bit_size == 32);
2702 const elk_fs_reg src = offset(elk_fs_reg(ATTR, 0, dest.type), bld,
2703 nir_intrinsic_base(instr) * 4 +
2704 nir_intrinsic_component(instr) +
2705 nir_src_as_uint(instr->src[0]));
2706
2707 for (unsigned i = 0; i < instr->num_components; i++)
2708 bld.MOV(offset(dest, bld, i), offset(src, bld, i));
2709 break;
2710 }
2711
2712 case nir_intrinsic_load_vertex_id_zero_base:
2713 case nir_intrinsic_load_instance_id:
2714 case nir_intrinsic_load_base_instance:
2715 case nir_intrinsic_load_draw_id:
2716 case nir_intrinsic_load_first_vertex:
2717 case nir_intrinsic_load_is_indexed_draw:
2718 unreachable("lowered by elk_nir_lower_vs_inputs");
2719
2720 default:
2721 fs_nir_emit_intrinsic(ntb, bld, instr);
2722 break;
2723 }
2724 }
2725
2726 static elk_fs_reg
2727 get_tcs_single_patch_icp_handle(nir_to_elk_state &ntb, const fs_builder &bld,
2728 nir_intrinsic_instr *instr)
2729 {
2730 elk_fs_visitor &s = ntb.s;
2731
2732 struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(s.prog_data);
2733 const nir_src &vertex_src = instr->src[0];
2734 nir_intrinsic_instr *vertex_intrin = nir_src_as_intrinsic(vertex_src);
2735
2736 const elk_fs_reg start = s.tcs_payload().icp_handle_start;
2737
2738 elk_fs_reg icp_handle;
2739
2740 if (nir_src_is_const(vertex_src)) {
2741 /* Emit a MOV to resolve <0,1,0> regioning. */
2742 icp_handle = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2743 unsigned vertex = nir_src_as_uint(vertex_src);
2744 bld.MOV(icp_handle, component(start, vertex));
2745 } else if (tcs_prog_data->instances == 1 && vertex_intrin &&
2746 vertex_intrin->intrinsic == nir_intrinsic_load_invocation_id) {
2747 /* For the common case of only 1 instance, an array index of
2748 * gl_InvocationID means reading the handles from the start. Skip all
2749 * the indirect work.
2750 */
2751 icp_handle = start;
2752 } else {
2753 /* The vertex index is non-constant. We need to use indirect
2754 * addressing to fetch the proper URB handle.
2755 */
2756 icp_handle = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2757
2758 /* Each ICP handle is a single DWord (4 bytes) */
2759 elk_fs_reg vertex_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2760 bld.SHL(vertex_offset_bytes,
2761 retype(get_nir_src(ntb, vertex_src), ELK_REGISTER_TYPE_UD),
2762 elk_imm_ud(2u));
2763
2764 /* We might read up to 4 registers. */
2765 bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2766 start, vertex_offset_bytes,
2767 elk_imm_ud(4 * REG_SIZE));
2768 }
2769
2770 return icp_handle;
2771 }
2772
2773 static elk_fs_reg
2774 get_tcs_multi_patch_icp_handle(nir_to_elk_state &ntb, const fs_builder &bld,
2775 nir_intrinsic_instr *instr)
2776 {
2777 elk_fs_visitor &s = ntb.s;
2778 const intel_device_info *devinfo = s.devinfo;
2779
2780 struct elk_tcs_prog_key *tcs_key = (struct elk_tcs_prog_key *) s.key;
2781 const nir_src &vertex_src = instr->src[0];
2782 const unsigned grf_size_bytes = REG_SIZE * reg_unit(devinfo);
2783
2784 const elk_fs_reg start = s.tcs_payload().icp_handle_start;
2785
2786 if (nir_src_is_const(vertex_src))
2787 return byte_offset(start, nir_src_as_uint(vertex_src) * grf_size_bytes);
2788
2789 /* The vertex index is non-constant. We need to use indirect
2790 * addressing to fetch the proper URB handle.
2791 *
2792 * First, we start with the sequence indicating that channel <n>
2793 * should read the handle from DWord <n>. We convert that to bytes
2794 * by multiplying by 4.
2795 *
2796 * Next, we convert the vertex index to bytes by multiplying
2797 * by the GRF size (by shifting), and add the two together. This is
2798 * the final indirect byte offset.
2799 */
2800 elk_fs_reg icp_handle = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2801 elk_fs_reg sequence = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
2802 elk_fs_reg channel_offsets = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2803 elk_fs_reg vertex_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2804 elk_fs_reg icp_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2805
2806 /* Offsets will be 0, 4, 8, ... */
2807 bld.SHL(channel_offsets, sequence, elk_imm_ud(2u));
2808 /* Convert vertex_index to a byte offset (multiply by the GRF size) */
2809 assert(util_is_power_of_two_nonzero(grf_size_bytes)); /* for ffs() */
2810 bld.SHL(vertex_offset_bytes,
2811 retype(get_nir_src(ntb, vertex_src), ELK_REGISTER_TYPE_UD),
2812 elk_imm_ud(ffs(grf_size_bytes) - 1));
2813 bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2814
2815 /* Use start of ICP handles as the base offset. There is one register
2816 * of URB handles per vertex, so inform the register allocator that
2817 * we might read up to elk_tcs_prog_key_input_vertices(tcs_key) registers.
2818 */
2819 bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
2820 icp_offset_bytes,
2821 elk_imm_ud(elk_tcs_prog_key_input_vertices(tcs_key) *
2822 grf_size_bytes));
2823
2824 return icp_handle;
2825 }
2826
2827 static void
2828 setup_barrier_message_payload_gfx125(const fs_builder &bld,
2829 const elk_fs_reg &msg_payload)
2830 {
2831 assert(bld.shader->devinfo->verx10 >= 125);
2832
2833 /* From BSpec: 54006, mov r0.2[31:24] into m0.2[31:24] and m0.2[23:16] */
2834 elk_fs_reg m0_10ub = component(retype(msg_payload, ELK_REGISTER_TYPE_UB), 10);
2835 elk_fs_reg r0_11ub =
2836 stride(suboffset(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UB), 11),
2837 0, 1, 0);
2838 bld.exec_all().group(2, 0).MOV(m0_10ub, r0_11ub);
2839 }
2840
2841 static void
2842 emit_barrier(nir_to_elk_state &ntb)
2843 {
2844 const intel_device_info *devinfo = ntb.devinfo;
2845 const fs_builder &bld = ntb.bld;
2846 elk_fs_visitor &s = ntb.s;
2847
2848 /* We are getting the barrier ID from the compute shader header */
2849 assert(gl_shader_stage_uses_workgroup(s.stage));
2850
2851 elk_fs_reg payload = elk_fs_reg(VGRF, s.alloc.allocate(1), ELK_REGISTER_TYPE_UD);
2852
2853 /* Clear the message payload */
2854 bld.exec_all().group(8, 0).MOV(payload, elk_imm_ud(0u));
2855
2856 if (devinfo->verx10 >= 125) {
2857 setup_barrier_message_payload_gfx125(bld, payload);
2858 } else {
2859 assert(gl_shader_stage_is_compute(s.stage));
2860
2861 uint32_t barrier_id_mask;
2862 switch (devinfo->ver) {
2863 case 7:
2864 case 8:
2865 barrier_id_mask = 0x0f000000u; break;
2866 case 9:
2867 barrier_id_mask = 0x8f000000u; break;
2868 case 11:
2869 case 12:
2870 barrier_id_mask = 0x7f000000u; break;
2871 default:
2872 unreachable("barrier is only available on gen >= 7");
2873 }
2874
2875 /* Copy the barrier id from r0.2 to the message payload reg.2 */
2876 elk_fs_reg r0_2 = elk_fs_reg(retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD));
2877 bld.exec_all().group(1, 0).AND(component(payload, 2), r0_2,
2878 elk_imm_ud(barrier_id_mask));
2879 }
2880
2881 /* Emit a gateway "barrier" message using the payload we set up, followed
2882 * by a wait instruction.
2883 */
2884 bld.exec_all().emit(ELK_SHADER_OPCODE_BARRIER, reg_undef, payload);
2885 }
2886
2887 static void
2888 emit_tcs_barrier(nir_to_elk_state &ntb)
2889 {
2890 const intel_device_info *devinfo = ntb.devinfo;
2891 const fs_builder &bld = ntb.bld;
2892 elk_fs_visitor &s = ntb.s;
2893
2894 assert(s.stage == MESA_SHADER_TESS_CTRL);
2895 struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(s.prog_data);
2896
2897 elk_fs_reg m0 = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2898 elk_fs_reg m0_2 = component(m0, 2);
2899
2900 const fs_builder chanbld = bld.exec_all().group(1, 0);
2901
2902 /* Zero the message header */
2903 bld.exec_all().MOV(m0, elk_imm_ud(0u));
2904
2905 if (devinfo->verx10 >= 125) {
2906 setup_barrier_message_payload_gfx125(bld, m0);
2907 } else if (devinfo->ver >= 11) {
2908 chanbld.AND(m0_2, retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD),
2909 elk_imm_ud(INTEL_MASK(30, 24)));
2910
2911 /* Set the Barrier Count and the enable bit */
2912 chanbld.OR(m0_2, m0_2,
2913 elk_imm_ud(tcs_prog_data->instances << 8 | (1 << 15)));
2914 } else {
2915 /* Copy "Barrier ID" from r0.2, bits 16:13 */
2916 chanbld.AND(m0_2, retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD),
2917 elk_imm_ud(INTEL_MASK(16, 13)));
2918
2919 /* Shift it up to bits 27:24. */
2920 chanbld.SHL(m0_2, m0_2, elk_imm_ud(11));
2921
2922 /* Set the Barrier Count and the enable bit */
2923 chanbld.OR(m0_2, m0_2,
2924 elk_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
2925 }
2926
2927 bld.emit(ELK_SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
2928 }
2929
2930 static void
2931 fs_nir_emit_tcs_intrinsic(nir_to_elk_state &ntb,
2932 nir_intrinsic_instr *instr)
2933 {
2934 const intel_device_info *devinfo = ntb.devinfo;
2935 const fs_builder &bld = ntb.bld;
2936 elk_fs_visitor &s = ntb.s;
2937
2938 assert(s.stage == MESA_SHADER_TESS_CTRL);
2939 struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(s.prog_data);
2940 struct elk_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
2941
2942 elk_fs_reg dst;
2943 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2944 dst = get_nir_def(ntb, instr->def);
2945
2946 switch (instr->intrinsic) {
2947 case nir_intrinsic_load_primitive_id:
2948 bld.MOV(dst, s.tcs_payload().primitive_id);
2949 break;
2950 case nir_intrinsic_load_invocation_id:
2951 bld.MOV(retype(dst, s.invocation_id.type), s.invocation_id);
2952 break;
2953
2954 case nir_intrinsic_barrier:
2955 if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
2956 fs_nir_emit_intrinsic(ntb, bld, instr);
2957 if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
2958 if (tcs_prog_data->instances != 1)
2959 emit_tcs_barrier(ntb);
2960 }
2961 break;
2962
2963 case nir_intrinsic_load_input:
2964 unreachable("nir_lower_io should never give us these.");
2965 break;
2966
2967 case nir_intrinsic_load_per_vertex_input: {
2968 assert(instr->def.bit_size == 32);
2969 elk_fs_reg indirect_offset = get_indirect_offset(ntb, instr);
2970 unsigned imm_offset = nir_intrinsic_base(instr);
2971 elk_fs_inst *inst;
2972
2973 const bool multi_patch =
2974 vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH;
2975
2976 elk_fs_reg icp_handle = multi_patch ?
2977 get_tcs_multi_patch_icp_handle(ntb, bld, instr) :
2978 get_tcs_single_patch_icp_handle(ntb, bld, instr);
2979
2980 /* We can only read two double components with each URB read, so
2981 * we send two read messages in that case, each one loading up to
2982 * two double components.
2983 */
2984 unsigned num_components = instr->num_components;
2985 unsigned first_component = nir_intrinsic_component(instr);
2986
2987 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2988 srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2989
2990 if (indirect_offset.file == BAD_FILE) {
2991 /* Constant indexing - use global offset. */
2992 if (first_component != 0) {
2993 unsigned read_components = num_components + first_component;
2994 elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2995 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs,
2996 ARRAY_SIZE(srcs));
2997 for (unsigned i = 0; i < num_components; i++) {
2998 bld.MOV(offset(dst, bld, i),
2999 offset(tmp, bld, i + first_component));
3000 }
3001 } else {
3002 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs,
3003 ARRAY_SIZE(srcs));
3004 }
3005 inst->offset = imm_offset;
3006 } else {
3007 /* Indirect indexing - use per-slot offsets as well. */
3008 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
3009
3010 if (first_component != 0) {
3011 unsigned read_components = num_components + first_component;
3012 elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
3013 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3014 srcs, ARRAY_SIZE(srcs));
3015 for (unsigned i = 0; i < num_components; i++) {
3016 bld.MOV(offset(dst, bld, i),
3017 offset(tmp, bld, i + first_component));
3018 }
3019 } else {
3020 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst,
3021 srcs, ARRAY_SIZE(srcs));
3022 }
3023 inst->offset = imm_offset;
3024 }
3025 inst->size_written = (num_components + first_component) *
3026 inst->dst.component_size(inst->exec_size);
3027
3028 /* Copy the temporary to the destination to deal with writemasking.
3029 *
3030 * Also attempt to deal with gl_PointSize being in the .w component.
3031 */
3032 if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
3033 assert(type_sz(dst.type) == 4);
3034 inst->dst = bld.vgrf(dst.type, 4);
3035 inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
3036 bld.MOV(dst, offset(inst->dst, bld, 3));
3037 }
3038 break;
3039 }
3040
3041 case nir_intrinsic_load_output:
3042 case nir_intrinsic_load_per_vertex_output: {
3043 assert(instr->def.bit_size == 32);
3044 elk_fs_reg indirect_offset = get_indirect_offset(ntb, instr);
3045 unsigned imm_offset = nir_intrinsic_base(instr);
3046 unsigned first_component = nir_intrinsic_component(instr);
3047
3048 elk_fs_inst *inst;
3049 if (indirect_offset.file == BAD_FILE) {
3050 /* This MOV replicates the output handle to all enabled channels
3051 * in SINGLE_PATCH mode.
3052 */
3053 elk_fs_reg patch_handle = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
3054 bld.MOV(patch_handle, s.tcs_payload().patch_urb_output);
3055
3056 {
3057 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
3058 srcs[URB_LOGICAL_SRC_HANDLE] = patch_handle;
3059
3060 if (first_component != 0) {
3061 unsigned read_components =
3062 instr->num_components + first_component;
3063 elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
3064 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3065 srcs, ARRAY_SIZE(srcs));
3066 inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
3067 for (unsigned i = 0; i < instr->num_components; i++) {
3068 bld.MOV(offset(dst, bld, i),
3069 offset(tmp, bld, i + first_component));
3070 }
3071 } else {
3072 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst,
3073 srcs, ARRAY_SIZE(srcs));
3074 inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
3075 }
3076 inst->offset = imm_offset;
3077 }
3078 } else {
3079 /* Indirect indexing - use per-slot offsets as well. */
3080 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
3081 srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output;
3082 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
3083
3084 if (first_component != 0) {
3085 unsigned read_components =
3086 instr->num_components + first_component;
3087 elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
3088 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3089 srcs, ARRAY_SIZE(srcs));
3090 inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
3091 for (unsigned i = 0; i < instr->num_components; i++) {
3092 bld.MOV(offset(dst, bld, i),
3093 offset(tmp, bld, i + first_component));
3094 }
3095 } else {
3096 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst,
3097 srcs, ARRAY_SIZE(srcs));
3098 inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
3099 }
3100 inst->offset = imm_offset;
3101 }
3102 break;
3103 }
3104
3105 case nir_intrinsic_store_output:
3106 case nir_intrinsic_store_per_vertex_output: {
3107 assert(nir_src_bit_size(instr->src[0]) == 32);
3108 elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
3109 elk_fs_reg indirect_offset = get_indirect_offset(ntb, instr);
3110 unsigned imm_offset = nir_intrinsic_base(instr);
3111 unsigned mask = nir_intrinsic_write_mask(instr);
3112
3113 if (mask == 0)
3114 break;
3115
3116 unsigned num_components = util_last_bit(mask);
3117 unsigned first_component = nir_intrinsic_component(instr);
3118 assert((first_component + num_components) <= 4);
3119
3120 mask = mask << first_component;
3121
3122 const bool has_urb_lsc = devinfo->ver >= 20;
3123
3124 elk_fs_reg mask_reg;
3125 if (mask != WRITEMASK_XYZW)
3126 mask_reg = elk_imm_ud(mask << 16);
3127
3128 elk_fs_reg sources[4];
3129
3130 unsigned m = has_urb_lsc ? 0 : first_component;
3131 for (unsigned i = 0; i < num_components; i++) {
3132 int c = i + first_component;
3133 if (mask & (1 << c)) {
3134 sources[m++] = offset(value, bld, i);
3135 } else if (devinfo->ver < 20) {
3136 m++;
3137 }
3138 }
3139
3140 assert(has_urb_lsc || m == (first_component + num_components));
3141
3142 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
3143 srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output;
3144 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
3145 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask_reg;
3146 srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_F, m);
3147 srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(m);
3148 bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, m, 0);
3149
3150 elk_fs_inst *inst = bld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
3151 srcs, ARRAY_SIZE(srcs));
3152 inst->offset = imm_offset;
3153 break;
3154 }
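   /* Worked example (values chosen for illustration): on pre-Gfx20 hardware
    * (has_urb_lsc == false), a writemask of .yz with first_component == 0
    * leaves m at 3 after the loop: sources[0] is skipped but still counted,
    * sources[1] and sources[2] hold the two written components, and mask_reg
    * becomes 0x6 << 16 for the URB channel-mask phase.
    */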
3155
3156 default:
3157 fs_nir_emit_intrinsic(ntb, bld, instr);
3158 break;
3159 }
3160 }
3161
3162 static void
3163 fs_nir_emit_tes_intrinsic(nir_to_elk_state &ntb,
3164 nir_intrinsic_instr *instr)
3165 {
3166 const intel_device_info *devinfo = ntb.devinfo;
3167 const fs_builder &bld = ntb.bld;
3168 elk_fs_visitor &s = ntb.s;
3169
3170 assert(s.stage == MESA_SHADER_TESS_EVAL);
3171 struct elk_tes_prog_data *tes_prog_data = elk_tes_prog_data(s.prog_data);
3172
3173 elk_fs_reg dest;
3174 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3175 dest = get_nir_def(ntb, instr->def);
3176
3177 switch (instr->intrinsic) {
3178 case nir_intrinsic_load_primitive_id:
3179 bld.MOV(dest, s.tes_payload().primitive_id);
3180 break;
3181
3182 case nir_intrinsic_load_tess_coord:
3183 for (unsigned i = 0; i < 3; i++)
3184 bld.MOV(offset(dest, bld, i), s.tes_payload().coords[i]);
3185 break;
3186
3187 case nir_intrinsic_load_input:
3188 case nir_intrinsic_load_per_vertex_input: {
3189 assert(instr->def.bit_size == 32);
3190 elk_fs_reg indirect_offset = get_indirect_offset(ntb, instr);
3191 unsigned imm_offset = nir_intrinsic_base(instr);
3192 unsigned first_component = nir_intrinsic_component(instr);
3193
3194 elk_fs_inst *inst;
3195 if (indirect_offset.file == BAD_FILE) {
3196 /* Arbitrarily only push up to 32 vec4 slots worth of data,
3197 * which is 16 registers (since each holds 2 vec4 slots).
3198 */
3199 const unsigned max_push_slots = 32;
3200 if (imm_offset < max_push_slots) {
3201 const elk_fs_reg src = horiz_offset(elk_fs_reg(ATTR, 0, dest.type),
3202 4 * imm_offset + first_component);
3203 for (int i = 0; i < instr->num_components; i++)
3204 bld.MOV(offset(dest, bld, i), component(src, i));
3205
3206 tes_prog_data->base.urb_read_length =
3207 MAX2(tes_prog_data->base.urb_read_length,
3208 (imm_offset / 2) + 1);
3209 } else {
3210 /* Replicate the patch handle to all enabled channels */
3211 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
3212 srcs[URB_LOGICAL_SRC_HANDLE] = s.tes_payload().patch_urb_input;
3213
3214 if (first_component != 0) {
3215 unsigned read_components =
3216 instr->num_components + first_component;
3217 elk_fs_reg tmp = bld.vgrf(dest.type, read_components);
3218 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3219 srcs, ARRAY_SIZE(srcs));
3220 inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
3221 for (unsigned i = 0; i < instr->num_components; i++) {
3222 bld.MOV(offset(dest, bld, i),
3223 offset(tmp, bld, i + first_component));
3224 }
3225 } else {
3226 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dest,
3227 srcs, ARRAY_SIZE(srcs));
3228 inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
3229 }
3230 inst->offset = imm_offset;
3231 }
3232 } else {
3233 /* Indirect indexing - use per-slot offsets as well. */
3234
3235 /* We can only read two double components with each URB read, so
3236 * we send two read messages in that case, each one loading up to
3237 * two double components.
3238 */
3239 unsigned num_components = instr->num_components;
3240
3241 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
3242 srcs[URB_LOGICAL_SRC_HANDLE] = s.tes_payload().patch_urb_input;
3243 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
3244
3245 if (first_component != 0) {
3246 unsigned read_components =
3247 num_components + first_component;
3248 elk_fs_reg tmp = bld.vgrf(dest.type, read_components);
3249 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3250 srcs, ARRAY_SIZE(srcs));
3251 for (unsigned i = 0; i < num_components; i++) {
3252 bld.MOV(offset(dest, bld, i),
3253 offset(tmp, bld, i + first_component));
3254 }
3255 } else {
3256 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dest,
3257 srcs, ARRAY_SIZE(srcs));
3258 }
3259 inst->offset = imm_offset;
3260 inst->size_written = (num_components + first_component) *
3261 inst->dst.component_size(inst->exec_size);
3262 }
3263 break;
3264 }
3265 default:
3266 fs_nir_emit_intrinsic(ntb, bld, instr);
3267 break;
3268 }
3269 }
3270
3271 static void
3272 fs_nir_emit_gs_intrinsic(nir_to_elk_state &ntb,
3273 nir_intrinsic_instr *instr)
3274 {
3275 const fs_builder &bld = ntb.bld;
3276 elk_fs_visitor &s = ntb.s;
3277
3278 assert(s.stage == MESA_SHADER_GEOMETRY);
3279 elk_fs_reg indirect_offset;
3280
3281 elk_fs_reg dest;
3282 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3283 dest = get_nir_def(ntb, instr->def);
3284
3285 switch (instr->intrinsic) {
3286 case nir_intrinsic_load_primitive_id:
3287 assert(s.stage == MESA_SHADER_GEOMETRY);
3288 assert(elk_gs_prog_data(s.prog_data)->include_primitive_id);
3289 bld.MOV(retype(dest, ELK_REGISTER_TYPE_UD), s.gs_payload().primitive_id);
3290 break;
3291
3292 case nir_intrinsic_load_input:
3293 unreachable("load_input intrinsics are invalid for the GS stage");
3294
3295 case nir_intrinsic_load_per_vertex_input:
3296 emit_gs_input_load(ntb, dest, instr->src[0], nir_intrinsic_base(instr),
3297 instr->src[1], instr->num_components,
3298 nir_intrinsic_component(instr));
3299 break;
3300
3301 case nir_intrinsic_emit_vertex_with_counter:
3302 emit_gs_vertex(ntb, instr->src[0], nir_intrinsic_stream_id(instr));
3303 break;
3304
3305 case nir_intrinsic_end_primitive_with_counter:
3306 emit_gs_end_primitive(ntb, instr->src[0]);
3307 break;
3308
3309 case nir_intrinsic_set_vertex_and_primitive_count:
3310 bld.MOV(s.final_gs_vertex_count, get_nir_src(ntb, instr->src[0]));
3311 break;
3312
3313 case nir_intrinsic_load_invocation_id: {
3314 elk_fs_reg val = ntb.system_values[SYSTEM_VALUE_INVOCATION_ID];
3315 assert(val.file != BAD_FILE);
3316 dest.type = val.type;
3317 bld.MOV(dest, val);
3318 break;
3319 }
3320
3321 default:
3322 fs_nir_emit_intrinsic(ntb, bld, instr);
3323 break;
3324 }
3325 }
3326
3327 /**
3328 * Fetch the current render target layer index.
3329 */
3330 static elk_fs_reg
3331 fetch_render_target_array_index(const fs_builder &bld)
3332 {
3333 const elk_fs_visitor *v = static_cast<const elk_fs_visitor *>(bld.shader);
3334
3335 if (bld.shader->devinfo->ver >= 20) {
3336 /* Gfx20+ has separate Render Target Array indices for each pair
3337 * of subspans in order to support multiple polygons, so we need
3338 * to use a <1;8,0> region in order to select the correct word
3339 * for each channel.
3340 */
3341 const elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_UD);
3342
3343 for (unsigned i = 0; i < DIV_ROUND_UP(bld.dispatch_width(), 16); i++) {
3344 const fs_builder hbld = bld.group(16, i);
3345 const struct elk_reg reg = retype(elk_vec1_grf(2 * i + 1, 1),
3346 ELK_REGISTER_TYPE_UW);
3347 hbld.AND(offset(idx, hbld, i), stride(reg, 1, 8, 0),
3348 elk_imm_uw(0x7ff));
3349 }
3350
3351 return idx;
3352 } else if (bld.shader->devinfo->ver >= 12 && v->max_polygons == 2) {
3353 /* According to the BSpec "PS Thread Payload for Normal
3354 * Dispatch", the render target array index is stored as bits
3355 * 26:16 of either the R1.1 or R1.6 poly info dwords, for the
3356 * first and second polygons respectively in multipolygon PS
3357 * dispatch mode.
3358 */
3359 assert(bld.dispatch_width() == 16);
3360 const elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_UD);
3361
3362 for (unsigned i = 0; i < v->max_polygons; i++) {
3363 const fs_builder hbld = bld.group(8, i);
3364 const struct elk_reg g1 = elk_uw1_reg(ELK_GENERAL_REGISTER_FILE, 1, 3 + 10 * i);
3365 hbld.AND(offset(idx, hbld, i), g1, elk_imm_uw(0x7ff));
3366 }
3367
3368 return idx;
3369 } else if (bld.shader->devinfo->ver >= 12) {
3370 /* The render target array index is provided in the thread payload as
3371 * bits 26:16 of r1.1.
3372 */
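/* Since bits 26:16 of the dword are bits 10:0 of its upper word, the
 * code below reads the payload as UW (word 3 of r1, i.e. the high half
 * of r1.1) and masks with 0x7ff to keep the 11-bit index.
 */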
3373 const elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_UD);
3374 bld.AND(idx, elk_uw1_reg(ELK_GENERAL_REGISTER_FILE, 1, 3),
3375 elk_imm_uw(0x7ff));
3376 return idx;
3377 } else if (bld.shader->devinfo->ver >= 6) {
3378 /* The render target array index is provided in the thread payload as
3379 * bits 26:16 of r0.0.
3380 */
3381 const elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_UD);
3382 bld.AND(idx, elk_uw1_reg(ELK_GENERAL_REGISTER_FILE, 0, 1),
3383 elk_imm_uw(0x7ff));
3384 return idx;
3385 } else {
3386 /* Pre-SNB we only ever render into the first layer of the framebuffer
3387 * since layered rendering is not implemented.
3388 */
3389 return elk_imm_ud(0);
3390 }
3391 }
3392
3393 /* Sample from the MCS surface attached to this multisample texture. */
3394 static elk_fs_reg
3395 emit_mcs_fetch(nir_to_elk_state &ntb, const elk_fs_reg &coordinate, unsigned components,
3396 const elk_fs_reg &texture,
3397 const elk_fs_reg &texture_handle)
3398 {
3399 const fs_builder &bld = ntb.bld;
3400
3401 const elk_fs_reg dest = ntb.s.vgrf(glsl_uvec4_type());
3402
3403 elk_fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
3404 srcs[TEX_LOGICAL_SRC_COORDINATE] = coordinate;
3405 srcs[TEX_LOGICAL_SRC_SURFACE] = texture;
3406 srcs[TEX_LOGICAL_SRC_SAMPLER] = elk_imm_ud(0);
3407 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = texture_handle;
3408 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = elk_imm_d(components);
3409 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = elk_imm_d(0);
3410 srcs[TEX_LOGICAL_SRC_RESIDENCY] = elk_imm_d(0);
3411
3412 elk_fs_inst *inst = bld.emit(ELK_SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs,
3413 ARRAY_SIZE(srcs));
3414
3415 /* We only care about one or two regs of response, but the sampler always
3416 * writes 4/8.
3417 */
3418 inst->size_written = 4 * dest.component_size(inst->exec_size);
3419
3420 return dest;
3421 }
3422
3423 /**
3424 * Fake non-coherent framebuffer read implemented using TXF to fetch from the
3425 * framebuffer at the current fragment coordinates and sample index.
3426 */
3427 static elk_fs_inst *
3428 emit_non_coherent_fb_read(nir_to_elk_state &ntb, const fs_builder &bld, const elk_fs_reg &dst,
3429 unsigned target)
3430 {
3431 elk_fs_visitor &s = ntb.s;
3432 const struct intel_device_info *devinfo = s.devinfo;
3433
3434 assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
3435 const elk_wm_prog_key *wm_key =
3436 reinterpret_cast<const elk_wm_prog_key *>(s.key);
3437 assert(!wm_key->coherent_fb_fetch);
3438
3439 /* Calculate the fragment coordinates. */
3440 const elk_fs_reg coords = bld.vgrf(ELK_REGISTER_TYPE_UD, 3);
3441 bld.MOV(offset(coords, bld, 0), s.pixel_x);
3442 bld.MOV(offset(coords, bld, 1), s.pixel_y);
3443 bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
3444
3445 /* Calculate the sample index and MCS payload when multisampling. Luckily
3446 * the MCS fetch message behaves deterministically for UMS surfaces, so it
3447 * shouldn't be necessary to recompile based on whether the framebuffer is
3448 * CMS or UMS.
3449 */
3450 assert(wm_key->multisample_fbo == ELK_ALWAYS ||
3451 wm_key->multisample_fbo == ELK_NEVER);
3452 if (wm_key->multisample_fbo &&
3453 ntb.system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
3454 ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(ntb);
3455
3456 const elk_fs_reg sample = ntb.system_values[SYSTEM_VALUE_SAMPLE_ID];
3457 const elk_fs_reg mcs = wm_key->multisample_fbo ?
3458 emit_mcs_fetch(ntb, coords, 3, elk_imm_ud(target), elk_fs_reg()) : elk_fs_reg();
3459
3460 /* Use either a normal or a CMS texel fetch message depending on whether
3461 * the framebuffer is single or multisample. On SKL+ use the wide CMS
3462 * message just in case the framebuffer uses 16x multisampling; it should
3463 * be equivalent to the normal CMS fetch for lower multisampling modes.
3464 */
3465 elk_opcode op;
3466 if (wm_key->multisample_fbo) {
3467 /* On Gfx12HP, only the CMS_W variant is available. */
3473 if (devinfo->verx10 >= 125)
3474 op = ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL;
3475 else if (devinfo->ver >= 9)
3476 op = ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL;
3477 else
3478 op = ELK_SHADER_OPCODE_TXF_CMS_LOGICAL;
3479 } else {
3480 op = ELK_SHADER_OPCODE_TXF_LOGICAL;
3481 }
3482
3483 /* Emit the instruction. */
3484 elk_fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
3485 srcs[TEX_LOGICAL_SRC_COORDINATE] = coords;
3486 srcs[TEX_LOGICAL_SRC_LOD] = elk_imm_ud(0);
3487 srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = sample;
3488 srcs[TEX_LOGICAL_SRC_MCS] = mcs;
3489 srcs[TEX_LOGICAL_SRC_SURFACE] = elk_imm_ud(target);
3490 srcs[TEX_LOGICAL_SRC_SAMPLER] = elk_imm_ud(0);
3491 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = elk_imm_ud(3);
3492 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = elk_imm_ud(0);
3493 srcs[TEX_LOGICAL_SRC_RESIDENCY] = elk_imm_ud(0);
3494
3495 elk_fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
3496 inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3497
3498 return inst;
3499 }
3500
3501 /**
3502 * Actual coherent framebuffer read implemented using the native render target
3503 * read message. Requires SKL+.
3504 */
3505 static elk_fs_inst *
3506 emit_coherent_fb_read(const fs_builder &bld, const elk_fs_reg &dst, unsigned target)
3507 {
3508 assert(bld.shader->devinfo->ver >= 9);
3509 elk_fs_inst *inst = bld.emit(ELK_FS_OPCODE_FB_READ_LOGICAL, dst);
3510 inst->target = target;
3511 inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3512
3513 return inst;
3514 }
3515
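/* Allocate a temporary shared by the n output registers: if the first
 * candidate has already been assigned a register, reuse it; otherwise
 * allocate a fresh float VGRF of the requested size and point every
 * candidate at it.
 */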
3516 static elk_fs_reg
3517 alloc_temporary(const fs_builder &bld, unsigned size, elk_fs_reg *regs, unsigned n)
3518 {
3519 if (n && regs[0].file != BAD_FILE) {
3520 return regs[0];
3521
3522 } else {
3523 const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_F, size);
3524
3525 for (unsigned i = 0; i < n; i++)
3526 regs[i] = tmp;
3527
3528 return tmp;
3529 }
3530 }
3531
3532 static elk_fs_reg
3533 alloc_frag_output(nir_to_elk_state &ntb, unsigned location)
3534 {
3535 elk_fs_visitor &s = ntb.s;
3536
3537 assert(s.stage == MESA_SHADER_FRAGMENT);
3538 const elk_wm_prog_key *const key =
3539 reinterpret_cast<const elk_wm_prog_key *>(s.key);
3540 const unsigned l = GET_FIELD(location, ELK_NIR_FRAG_OUTPUT_LOCATION);
3541 const unsigned i = GET_FIELD(location, ELK_NIR_FRAG_OUTPUT_INDEX);
3542
3543 if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
3544 return alloc_temporary(ntb.bld, 4, &s.dual_src_output, 1);
3545
3546 else if (l == FRAG_RESULT_COLOR)
3547 return alloc_temporary(ntb.bld, 4, s.outputs,
3548 MAX2(key->nr_color_regions, 1));
3549
3550 else if (l == FRAG_RESULT_DEPTH)
3551 return alloc_temporary(ntb.bld, 1, &s.frag_depth, 1);
3552
3553 else if (l == FRAG_RESULT_STENCIL)
3554 return alloc_temporary(ntb.bld, 1, &s.frag_stencil, 1);
3555
3556 else if (l == FRAG_RESULT_SAMPLE_MASK)
3557 return alloc_temporary(ntb.bld, 1, &s.sample_mask, 1);
3558
3559 else if (l >= FRAG_RESULT_DATA0 &&
3560 l < FRAG_RESULT_DATA0 + ELK_MAX_DRAW_BUFFERS)
3561 return alloc_temporary(ntb.bld, 4,
3562 &s.outputs[l - FRAG_RESULT_DATA0], 1);
3563
3564 else
3565 unreachable("Invalid location");
3566 }
3567
3568 static void
3569 emit_is_helper_invocation(nir_to_elk_state &ntb, elk_fs_reg result)
3570 {
3571 const fs_builder &bld = ntb.bld;
3572
3573 /* Unlike the regular gl_HelperInvocation, which is defined at dispatch
3574 * time, helperInvocationEXT() (aka SpvOpIsHelperInvocationEXT) takes
3575 * demoted invocations into consideration.
3576 */
3577 result.type = ELK_REGISTER_TYPE_UD;
3578
3579 bld.MOV(result, elk_imm_ud(0));
3580
3581 /* See elk_sample_mask_reg() for why we split SIMD32 into SIMD16 here. */
3582 unsigned width = bld.dispatch_width();
3583 for (unsigned i = 0; i < DIV_ROUND_UP(width, 16); i++) {
3584 const fs_builder b = bld.group(MIN2(width, 16), i);
3585
3586 elk_fs_inst *mov = b.MOV(offset(result, b, i), elk_imm_ud(~0));
3587
3588 /* The at() ensures that any code emitted to get the predicate happens
3589 * before the mov right above. This is not an issue elsewhere because
3590 * lowering code already set up the builder this way.
3591 */
3592 elk_emit_predicate_on_sample_mask(b.at(NULL, mov), mov);
3593 mov->predicate_inverse = true;
3594 }
3595 }
3596
3597 static void
3598 emit_fragcoord_interpolation(nir_to_elk_state &ntb, elk_fs_reg wpos)
3599 {
3600 const intel_device_info *devinfo = ntb.devinfo;
3601 const fs_builder &bld = ntb.bld;
3602 elk_fs_visitor &s = ntb.s;
3603
3604 assert(s.stage == MESA_SHADER_FRAGMENT);
3605
3606 /* gl_FragCoord.x */
3607 bld.MOV(wpos, s.pixel_x);
3608 wpos = offset(wpos, bld, 1);
3609
3610 /* gl_FragCoord.y */
3611 bld.MOV(wpos, s.pixel_y);
3612 wpos = offset(wpos, bld, 1);
3613
3614 /* gl_FragCoord.z */
3615 if (devinfo->ver >= 6) {
3616 bld.MOV(wpos, s.pixel_z);
3617 } else {
3618 bld.emit(ELK_FS_OPCODE_LINTERP, wpos,
3619 s.delta_xy[ELK_BARYCENTRIC_PERSPECTIVE_PIXEL],
3620 s.interp_reg(bld, VARYING_SLOT_POS, 2, 0));
3621 }
3622 wpos = offset(wpos, bld, 1);
3623
3624 /* gl_FragCoord.w: Already set up in emit_interpolation */
3625 bld.MOV(wpos, s.wpos_w);
3626 }
3627
3628 static elk_fs_reg
3629 emit_frontfacing_interpolation(nir_to_elk_state &ntb)
3630 {
3631 const intel_device_info *devinfo = ntb.devinfo;
3632 const fs_builder &bld = ntb.bld;
3633 elk_fs_visitor &s = ntb.s;
3634
3635 elk_fs_reg ff = bld.vgrf(ELK_REGISTER_TYPE_D);
3636
3637 if (devinfo->ver >= 20) {
3638 /* Gfx20+ has separate back-facing bits for each pair of
3639 * subspans in order to support multiple polygons, so we need to
3640 * use a <1;8,0> region in order to select the correct word for
3641 * each channel.
3642 */
3643 const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UW);
3644
3645 for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
3646 const fs_builder hbld = bld.group(16, i);
3647 const struct elk_reg gi_uw = retype(xe2_vec1_grf(i, 9),
3648 ELK_REGISTER_TYPE_UW);
3649 hbld.AND(offset(tmp, hbld, i), gi_uw, elk_imm_uw(0x800));
3650 }
3651
3652 bld.CMP(ff, tmp, elk_imm_uw(0), ELK_CONDITIONAL_Z);
3653
3654 } else if (devinfo->ver >= 12 && s.max_polygons == 2) {
3655 /* According to the BSpec "PS Thread Payload for Normal
3656 * Dispatch", the front/back facing interpolation bit is stored
3657 * as bit 15 of either the R1.1 or R1.6 poly info field, for the
3658 * first and second polygons respectively in multipolygon PS
3659 * dispatch mode.
3660 */
3661 assert(s.dispatch_width == 16);
3662 elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_W);
3663
3664 for (unsigned i = 0; i < s.max_polygons; i++) {
3665 const fs_builder hbld = bld.group(8, i);
3666 const struct elk_reg g1 = retype(elk_vec1_grf(1, 1 + 5 * i),
3667 ELK_REGISTER_TYPE_W);
3668 hbld.ASR(offset(tmp, hbld, i), g1, elk_imm_d(15));
3669 }
3670
3671 bld.NOT(ff, tmp);
3672
3673 } else if (devinfo->ver >= 12) {
3674 elk_fs_reg g1 = elk_fs_reg(retype(elk_vec1_grf(1, 1), ELK_REGISTER_TYPE_W));
3675
3676 elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_W);
3677 bld.ASR(tmp, g1, elk_imm_d(15));
3678 bld.NOT(ff, tmp);
3679 } else if (devinfo->ver >= 6) {
3680 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
3681 * a boolean result from this (~0/true or 0/false).
3682 *
3683 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
3684 * this task in only one instruction:
3685 * - a negation source modifier will flip the bit; and
3686 * - a W -> D type conversion will sign extend the bit into the high
3687 * word of the destination.
3688 *
3689 * An ASR 15 fills the low word of the destination.
3690 */
3691 elk_fs_reg g0 = elk_fs_reg(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_W));
3692 g0.negate = true;
3693
3694 bld.ASR(ff, g0, elk_imm_d(15));
3695 } else {
3696 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
3697 * a boolean result from this (1/true or 0/false).
3698 *
3699 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
3700 * the negation source modifier to flip it. Unfortunately the SHR
3701 * instruction only operates on UD (or D with an abs source modifier)
3702 * sources without negation.
3703 *
3704 * Instead, use ASR (which will give ~0/true or 0/false).
3705 */
3706 elk_fs_reg g1_6 = elk_fs_reg(retype(elk_vec1_grf(1, 6), ELK_REGISTER_TYPE_D));
3707 g1_6.negate = true;
3708
3709 bld.ASR(ff, g1_6, elk_imm_d(31));
3710 }
3711
3712 return ff;
3713 }
3714
3715 static elk_fs_reg
3716 emit_samplepos_setup(nir_to_elk_state &ntb)
3717 {
3718 const intel_device_info *devinfo = ntb.devinfo;
3719 const fs_builder &bld = ntb.bld;
3720 elk_fs_visitor &s = ntb.s;
3721
3722 assert(s.stage == MESA_SHADER_FRAGMENT);
3723 struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(s.prog_data);
3724 assert(devinfo->ver >= 6);
3725
3726 const fs_builder abld = bld.annotate("compute sample position");
3727 elk_fs_reg pos = abld.vgrf(ELK_REGISTER_TYPE_F, 2);
3728
3729 if (wm_prog_data->persample_dispatch == ELK_NEVER) {
3730 /* From the ARB_sample_shading specification:
3731 * "When rendering to a non-multisample buffer, or if multisample
3732 * rasterization is disabled, gl_SamplePosition will always be
3733 * (0.5, 0.5)."
3734 */
3735 bld.MOV(offset(pos, bld, 0), elk_imm_f(0.5f));
3736 bld.MOV(offset(pos, bld, 1), elk_imm_f(0.5f));
3737 return pos;
3738 }
3739
3740 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
3741 * mode will be enabled.
3742 *
3743 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
3744 * R31.1:0 Position Offset X/Y for Slot[3:0]
3745 * R31.3:2 Position Offset X/Y for Slot[7:4]
3746 * .....
3747 *
3748 * The X, Y sample positions come in as bytes in thread payload. So, read
3749 * the positions using vstride=16, width=8, hstride=2.
3750 */
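/* Illustration (assuming the 1/16-pixel fixed-point encoding implied by
 * the scale below): a payload byte value of 8 becomes 8 / 16 = 0.5,
 * i.e. the pixel center.
 */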
3751 const elk_fs_reg sample_pos_reg =
3752 fetch_payload_reg(abld, s.fs_payload().sample_pos_reg, ELK_REGISTER_TYPE_W);
3753
3754 for (unsigned i = 0; i < 2; i++) {
3755 elk_fs_reg tmp_d = bld.vgrf(ELK_REGISTER_TYPE_D);
3756 abld.MOV(tmp_d, subscript(sample_pos_reg, ELK_REGISTER_TYPE_B, i));
3757 /* Convert int_sample_pos to floating point */
3758 elk_fs_reg tmp_f = bld.vgrf(ELK_REGISTER_TYPE_F);
3759 abld.MOV(tmp_f, tmp_d);
3760 /* Scale to the range [0, 1] */
3761 abld.MUL(offset(pos, abld, i), tmp_f, elk_imm_f(1 / 16.0f));
3762 }
3763
3764 if (wm_prog_data->persample_dispatch == ELK_SOMETIMES) {
3765 check_dynamic_msaa_flag(abld, wm_prog_data,
3766 INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
3767 for (unsigned i = 0; i < 2; i++) {
3768 set_predicate(ELK_PREDICATE_NORMAL,
3769 bld.SEL(offset(pos, abld, i), offset(pos, abld, i),
3770 elk_imm_f(0.5f)));
3771 }
3772 }
3773
3774 return pos;
3775 }
3776
3777 static elk_fs_reg
3778 emit_sampleid_setup(nir_to_elk_state &ntb)
3779 {
3780 const intel_device_info *devinfo = ntb.devinfo;
3781 const fs_builder &bld = ntb.bld;
3782 elk_fs_visitor &s = ntb.s;
3783
3784 assert(s.stage == MESA_SHADER_FRAGMENT);
3785 ASSERTED elk_wm_prog_key *key = (elk_wm_prog_key*) s.key;
3786 struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(s.prog_data);
3787 assert(devinfo->ver >= 6);
3788
3789 const fs_builder abld = bld.annotate("compute sample id");
3790 elk_fs_reg sample_id = abld.vgrf(ELK_REGISTER_TYPE_UD);
3791
3792 assert(key->multisample_fbo != ELK_NEVER);
3793
3794 if (devinfo->ver >= 8) {
3795 /* Sample ID comes in as 4-bit numbers in g1.0:
3796 *
3797 * 15:12 Slot 3 SampleID (only used in SIMD16)
3798 * 11:8 Slot 2 SampleID (only used in SIMD16)
3799 * 7:4 Slot 1 SampleID
3800 * 3:0 Slot 0 SampleID
3801 *
3802 * Each slot corresponds to four channels, so we want to replicate each
3803 * half-byte value to 4 channels in a row:
3804 *
3805 * dst+0: .7 .6 .5 .4 .3 .2 .1 .0
3806 * 7:4 7:4 7:4 7:4 3:0 3:0 3:0 3:0
3807 *
3808 * dst+1: .7 .6 .5 .4 .3 .2 .1 .0 (if SIMD16)
3809 * 15:12 15:12 15:12 15:12 11:8 11:8 11:8 11:8
3810 *
3811 * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
3812 * channels to read the first byte (7:0), and the second group of 8
3813 * channels to read the second byte (15:8). Then, we shift right by
3814 * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
3815 * values into place. Finally, we AND with 0xf to keep the low nibble.
3816 *
3817 * shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
3818 * and(16) dst<1>D tmp<8,8,1>W 0xf:W
3819 *
3820 * TODO: These payload bits exist on Gfx7 too, but they appear to always
3821 * be zero, so this code fails to work. We should find out why.
3822 */
3823 const elk_fs_reg tmp = abld.vgrf(ELK_REGISTER_TYPE_UW);
3824
3825 for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
3826 const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
3827 /* According to the "PS Thread Payload for Normal Dispatch"
3828 * pages on the BSpec, the sample ids are stored in R0.8/R1.8
3829 * on gfx20+ and in R1.0/R2.0 on gfx8+.
3830 */
3831 const struct elk_reg id_reg = devinfo->ver >= 20 ? xe2_vec1_grf(i, 8) :
3832 elk_vec1_grf(i + 1, 0);
3833 hbld.SHR(offset(tmp, hbld, i),
3834 stride(retype(id_reg, ELK_REGISTER_TYPE_UB), 1, 8, 0),
3835 elk_imm_v(0x44440000));
3836 }
3837
3838 abld.AND(sample_id, tmp, elk_imm_w(0xf));
3839 } else {
3840 const elk_fs_reg t1 = component(abld.vgrf(ELK_REGISTER_TYPE_UD), 0);
3841 const elk_fs_reg t2 = abld.vgrf(ELK_REGISTER_TYPE_UW);
3842
3843 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
3844 * 8x multisampling, subspan 0 will represent sample N (where N
3845 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
3846 * 7. We can find the value of N by looking at R0.0 bits 7:6
3847 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
3848 * (since samples are always delivered in pairs). That is, we
3849 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
3850 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
3851 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
3852 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
3853 * populating a temporary variable with the sequence (0, 1, 2, 3),
3854 * and then reading from it using vstride=1, width=4, hstride=0.
3855 * These computations hold good for 4x multisampling as well.
3856 *
3857 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
3858 * the first four slots are sample 0 of subspan 0; the next four
3859 * are sample 1 of subspan 0; the third group is sample 0 of
3860 * subspan 1, and finally sample 1 of subspan 1.
3861 */
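/* Worked example of the formula above (illustrative numbers): if R0.0
 * bits 7:6 contain 0b10, then SSPI = 2 and N = (R0.0 & 0xc0) >> 5 = 4.
 * Adding the SIMD8 sequence (0, 0, 0, 0, 1, 1, 1, 1) yields sample 4
 * for subspan 0 and sample 5 for subspan 1, matching the pairwise
 * sample delivery described above.
 */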
3862
3863 /* SKL+ has an extra bit for the Starting Sample Pair Index to
3864 * accommodate 16x MSAA.
3865 */
3866 abld.exec_all().group(1, 0)
3867 .AND(t1, elk_fs_reg(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UD)),
3868 elk_imm_ud(0xc0));
3869 abld.exec_all().group(1, 0).SHR(t1, t1, elk_imm_d(5));
3870
3871 /* This works for SIMD8-SIMD16. It also works for SIMD32 but only if we
3872 * can assume 4x MSAA. Disallow it on IVB+
3873 *
3874 * FINISHME: One day, we could come up with a way to do this that
3875 * actually works on gfx7.
3876 */
3877 if (devinfo->ver >= 7)
3878 s.limit_dispatch_width(16, "gl_SampleId is unsupported in SIMD32 on gfx7");
3879 abld.exec_all().group(8, 0).MOV(t2, elk_imm_v(0x32103210));
3880
3881 /* This special instruction takes care of setting vstride=1,
3882 * width=4, hstride=0 of t2 during an ADD instruction.
3883 */
3884 abld.emit(ELK_FS_OPCODE_SET_SAMPLE_ID, sample_id, t1, t2);
3885 }
3886
3887 if (key->multisample_fbo == ELK_SOMETIMES) {
3888 check_dynamic_msaa_flag(abld, wm_prog_data,
3889 INTEL_MSAA_FLAG_MULTISAMPLE_FBO);
3890 set_predicate(ELK_PREDICATE_NORMAL,
3891 abld.SEL(sample_id, sample_id, elk_imm_ud(0)));
3892 }
3893
3894 return sample_id;
3895 }
3896
3897 static elk_fs_reg
3898 emit_samplemaskin_setup(nir_to_elk_state &ntb)
3899 {
3900 const intel_device_info *devinfo = ntb.devinfo;
3901 const fs_builder &bld = ntb.bld;
3902 elk_fs_visitor &s = ntb.s;
3903
3904 assert(s.stage == MESA_SHADER_FRAGMENT);
3905 struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(s.prog_data);
3906 assert(devinfo->ver >= 6);
3907
3908 /* The HW doesn't provide us with expected values. */
3909 assert(wm_prog_data->coarse_pixel_dispatch != ELK_ALWAYS);
3910
3911 elk_fs_reg coverage_mask =
3912 fetch_payload_reg(bld, s.fs_payload().sample_mask_in_reg, ELK_REGISTER_TYPE_D);
3913
3914 if (wm_prog_data->persample_dispatch == ELK_NEVER)
3915 return coverage_mask;
3916
3917 /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
3918 * and a mask representing which sample is being processed by the
3919 * current shader invocation.
3920 *
3921 * From the OES_sample_variables specification:
3922 * "When per-sample shading is active due to the use of a fragment input
3923 * qualified by "sample" or due to the use of the gl_SampleID or
3924 * gl_SamplePosition variables, only the bit for the current sample is
3925 * set in gl_SampleMaskIn."
3926 */
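/* Illustration of the computation below: with an input coverage mask of
 * 0b1011 and the current invocation shading sample 1, the per-sample
 * result is (1 << 1) & 0b1011 = 0b0010.
 */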
3927 const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
3928
3929 if (ntb.system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
3930 ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(ntb);
3931
3932 elk_fs_reg one = s.vgrf(glsl_int_type());
3933 elk_fs_reg enabled_mask = s.vgrf(glsl_int_type());
3934 abld.MOV(one, elk_imm_d(1));
3935 abld.SHL(enabled_mask, one, ntb.system_values[SYSTEM_VALUE_SAMPLE_ID]);
3936 elk_fs_reg mask = bld.vgrf(ELK_REGISTER_TYPE_D);
3937 abld.AND(mask, enabled_mask, coverage_mask);
3938
3939 if (wm_prog_data->persample_dispatch == ELK_ALWAYS)
3940 return mask;
3941
3942 check_dynamic_msaa_flag(abld, wm_prog_data,
3943 INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
3944 set_predicate(ELK_PREDICATE_NORMAL, abld.SEL(mask, mask, coverage_mask));
3945
3946 return mask;
3947 }
3948
3949 static elk_fs_reg
3950 emit_shading_rate_setup(nir_to_elk_state &ntb)
3951 {
3952 const intel_device_info *devinfo = ntb.devinfo;
3953 const fs_builder &bld = ntb.bld;
3954
3955 assert(devinfo->ver >= 11);
3956
3957 struct elk_wm_prog_data *wm_prog_data =
3958 elk_wm_prog_data(bld.shader->stage_prog_data);
3959
3960 /* The coarse pixel shading size fields overlap with other fields when not
3961 * in coarse pixel dispatch mode, so report 0 in that case.
3962 */
3963 if (wm_prog_data->coarse_pixel_dispatch == ELK_NEVER)
3964 return elk_imm_ud(0);
3965
3966 const fs_builder abld = bld.annotate("compute fragment shading rate");
3967
3968 /* The shading rates provided in the shader are the actual 2D shading
3969 * rate while the SPIR-V built-in is the enum value that has the shading
3970 * rate encoded as a bitfield. Fortunately, the bitfield value is just
3971 * the shading rate divided by two and shifted.
3972 */
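/* Worked example of the arithmetic below: for an actual coarse pixel
 * size of 4x2, int_rate_x = (4 >> 1) << 2 = 8 and int_rate_y = 2 >> 1
 * = 1, so rate = 0x9, with the x rate in bits 3:2 and the y rate in
 * bits 1:0.
 */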
3973
3974 /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */
3975 elk_fs_reg actual_x = elk_fs_reg(retype(elk_vec1_grf(1, 0), ELK_REGISTER_TYPE_UB));
3976 /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
3977 elk_fs_reg actual_y = byte_offset(actual_x, 1);
3978
3979 elk_fs_reg int_rate_x = bld.vgrf(ELK_REGISTER_TYPE_UD);
3980 elk_fs_reg int_rate_y = bld.vgrf(ELK_REGISTER_TYPE_UD);
3981
3982 abld.SHR(int_rate_y, actual_y, elk_imm_ud(1));
3983 abld.SHR(int_rate_x, actual_x, elk_imm_ud(1));
3984 abld.SHL(int_rate_x, int_rate_x, elk_imm_ud(2));
3985
3986 elk_fs_reg rate = abld.vgrf(ELK_REGISTER_TYPE_UD);
3987 abld.OR(rate, int_rate_x, int_rate_y);
3988
3989 if (wm_prog_data->coarse_pixel_dispatch == ELK_ALWAYS)
3990 return rate;
3991
3992 check_dynamic_msaa_flag(abld, wm_prog_data,
3993 INTEL_MSAA_FLAG_COARSE_RT_WRITES);
3994 set_predicate(ELK_PREDICATE_NORMAL, abld.SEL(rate, rate, elk_imm_ud(0)));
3995
3996 return rate;
3997 }
3998
3999 static void
4000 fs_nir_emit_fs_intrinsic(nir_to_elk_state &ntb,
4001 nir_intrinsic_instr *instr)
4002 {
4003 const intel_device_info *devinfo = ntb.devinfo;
4004 const fs_builder &bld = ntb.bld;
4005 elk_fs_visitor &s = ntb.s;
4006
4007 assert(s.stage == MESA_SHADER_FRAGMENT);
4008
4009 elk_fs_reg dest;
4010 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4011 dest = get_nir_def(ntb, instr->def);
4012
4013 switch (instr->intrinsic) {
4014 case nir_intrinsic_load_front_face:
4015 bld.MOV(retype(dest, ELK_REGISTER_TYPE_D),
4016 emit_frontfacing_interpolation(ntb));
4017 break;
4018
4019 case nir_intrinsic_load_sample_pos:
4020 case nir_intrinsic_load_sample_pos_or_center: {
4021 elk_fs_reg sample_pos = ntb.system_values[SYSTEM_VALUE_SAMPLE_POS];
4022 assert(sample_pos.file != BAD_FILE);
4023 dest.type = sample_pos.type;
4024 bld.MOV(dest, sample_pos);
4025 bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
4026 break;
4027 }
4028
4029 case nir_intrinsic_load_layer_id:
4030 dest.type = ELK_REGISTER_TYPE_UD;
4031 bld.MOV(dest, fetch_render_target_array_index(bld));
4032 break;
4033
4034 case nir_intrinsic_is_helper_invocation:
4035 emit_is_helper_invocation(ntb, dest);
4036 break;
4037
4038 case nir_intrinsic_load_helper_invocation:
4039 case nir_intrinsic_load_sample_mask_in:
4040 case nir_intrinsic_load_sample_id:
4041 case nir_intrinsic_load_frag_shading_rate: {
4042 gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
4043 elk_fs_reg val = ntb.system_values[sv];
4044 assert(val.file != BAD_FILE);
4045 dest.type = val.type;
4046 bld.MOV(dest, val);
4047 break;
4048 }
4049
4050 case nir_intrinsic_store_output: {
4051 const elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
4052 const unsigned store_offset = nir_src_as_uint(instr->src[1]);
4053 const unsigned location = nir_intrinsic_base(instr) +
4054 SET_FIELD(store_offset, ELK_NIR_FRAG_OUTPUT_LOCATION);
4055 const elk_fs_reg new_dest = retype(alloc_frag_output(ntb, location),
4056 src.type);
4057
4058 for (unsigned j = 0; j < instr->num_components; j++)
4059 bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
4060 offset(src, bld, j));
4061
4062 break;
4063 }
4064
4065 case nir_intrinsic_load_output: {
4066 const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
4067 ELK_NIR_FRAG_OUTPUT_LOCATION);
4068 assert(l >= FRAG_RESULT_DATA0);
4069 const unsigned load_offset = nir_src_as_uint(instr->src[0]);
4070 const unsigned target = l - FRAG_RESULT_DATA0 + load_offset;
4071 const elk_fs_reg tmp = bld.vgrf(dest.type, 4);
4072
4073 if (reinterpret_cast<const elk_wm_prog_key *>(s.key)->coherent_fb_fetch)
4074 emit_coherent_fb_read(bld, tmp, target);
4075 else
4076 emit_non_coherent_fb_read(ntb, bld, tmp, target);
4077
4078 for (unsigned j = 0; j < instr->num_components; j++) {
4079 bld.MOV(offset(dest, bld, j),
4080 offset(tmp, bld, nir_intrinsic_component(instr) + j));
4081 }
4082
4083 break;
4084 }
4085
4086 case nir_intrinsic_demote:
4087 case nir_intrinsic_discard:
4088 case nir_intrinsic_terminate:
4089 case nir_intrinsic_demote_if:
4090 case nir_intrinsic_discard_if:
4091 case nir_intrinsic_terminate_if: {
4092 /* We track our discarded pixels in f0.1/f1.0. By predicating on it, we
4093 * can update just the flag bits that aren't yet discarded. If there's
4094 * no condition, we emit a CMP of g0 != g0, so all currently executing
4095 * channels will get turned off.
4096 */
4097 elk_fs_inst *cmp = NULL;
4098 if (instr->intrinsic == nir_intrinsic_demote_if ||
4099 instr->intrinsic == nir_intrinsic_discard_if ||
4100 instr->intrinsic == nir_intrinsic_terminate_if) {
4101 nir_alu_instr *alu = nir_src_as_alu_instr(instr->src[0]);
4102
4103 if (alu != NULL &&
4104 alu->op != nir_op_bcsel &&
4105 (devinfo->ver > 5 ||
4106 (alu->instr.pass_flags & ELK_NIR_BOOLEAN_MASK) != ELK_NIR_BOOLEAN_NEEDS_RESOLVE ||
4107 alu->op == nir_op_fneu32 || alu->op == nir_op_feq32 ||
4108 alu->op == nir_op_flt32 || alu->op == nir_op_fge32 ||
4109 alu->op == nir_op_ine32 || alu->op == nir_op_ieq32 ||
4110 alu->op == nir_op_ilt32 || alu->op == nir_op_ige32 ||
4111 alu->op == nir_op_ult32 || alu->op == nir_op_uge32)) {
4112 /* Re-emit the instruction that generated the Boolean value, but
4113 * do not store it. Since this instruction will be conditional,
4114 * other instructions that want to use the real Boolean value may
4115 * get garbage. This was a problem for piglit's fs-discard-exit-2
4116 * test.
4117 *
4118 * Ideally we'd detect that the instruction cannot have a
4119 * conditional modifier before emitting the instructions. Alas,
4120 * that is nigh impossible. Instead, we're going to assume the
4121 * instruction (or last instruction) generated can have a
4122 * conditional modifier. If it cannot, fall back to the old-style
4123 * compare, and hope dead code elimination will clean up the
4124 * extra instructions generated.
4125 */
4126 fs_nir_emit_alu(ntb, alu, false);
4127
4128 cmp = (elk_fs_inst *) s.instructions.get_tail();
4129 if (cmp->conditional_mod == ELK_CONDITIONAL_NONE) {
4130 if (cmp->can_do_cmod())
4131 cmp->conditional_mod = ELK_CONDITIONAL_Z;
4132 else
4133 cmp = NULL;
4134 } else {
4135 /* The old sequence that would have been generated is,
4136 * basically, bool_result == false. This is equivalent to
4137 * !bool_result, so negate the old modifier.
4138 */
4139 cmp->conditional_mod = elk_negate_cmod(cmp->conditional_mod);
4140 }
4141 }
4142
4143 if (cmp == NULL) {
4144 cmp = bld.CMP(bld.null_reg_f(), get_nir_src(ntb, instr->src[0]),
4145 elk_imm_d(0), ELK_CONDITIONAL_Z);
4146 }
4147 } else {
4148 elk_fs_reg some_reg = elk_fs_reg(retype(elk_vec8_grf(0, 0),
4149 ELK_REGISTER_TYPE_UW));
4150 cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, ELK_CONDITIONAL_NZ);
4151 }
4152
4153 cmp->predicate = ELK_PREDICATE_NORMAL;
4154 cmp->flag_subreg = sample_mask_flag_subreg(s);
4155
4156 elk_fs_inst *jump = bld.emit(ELK_OPCODE_HALT);
4157 jump->flag_subreg = sample_mask_flag_subreg(s);
4158 jump->predicate_inverse = true;
4159
4160 if (instr->intrinsic == nir_intrinsic_terminate ||
4161 instr->intrinsic == nir_intrinsic_terminate_if) {
4162 jump->predicate = ELK_PREDICATE_NORMAL;
4163 } else {
4164 /* Only jump when the whole quad is demoted. For historical
4165 * reasons this is also used for discard.
4166 */
4167 jump->predicate = (devinfo->ver >= 20 ? XE2_PREDICATE_ANY :
4168 ELK_PREDICATE_ALIGN1_ANY4H);
4169 }
4170
4171 if (devinfo->ver < 7)
4172 s.limit_dispatch_width(
4173 16, "Fragment discard/demote not implemented in SIMD32 mode.\n");
4174 break;
4175 }
4176
4177 case nir_intrinsic_load_input: {
4178 /* In Fragment Shaders load_input is used either for flat inputs or
4179 * per-primitive inputs.
4180 */
4181 assert(instr->def.bit_size == 32);
4182 unsigned base = nir_intrinsic_base(instr);
4183 unsigned comp = nir_intrinsic_component(instr);
4184 unsigned num_components = instr->num_components;
4185
4186 /* Special case fields in the VUE header */
4187 if (base == VARYING_SLOT_LAYER)
4188 comp = 1;
4189 else if (base == VARYING_SLOT_VIEWPORT)
4190 comp = 2;
4191
4192 if (BITFIELD64_BIT(base) & s.nir->info.per_primitive_inputs) {
4193 assert(base != VARYING_SLOT_PRIMITIVE_INDICES);
4194 for (unsigned int i = 0; i < num_components; i++) {
4195 bld.MOV(offset(dest, bld, i),
4196 retype(s.per_primitive_reg(bld, base, comp + i), dest.type));
4197 }
4198 } else {
4199 /* Gfx20+ packs the plane parameters of a single logical
4200 * input in a vec3 format instead of the previously used vec4
4201 * format.
4202 */
4203 const unsigned k = devinfo->ver >= 20 ? 0 : 3;
4204 for (unsigned int i = 0; i < num_components; i++) {
4205 bld.MOV(offset(dest, bld, i),
4206 retype(s.interp_reg(bld, base, comp + i, k), dest.type));
4207 }
4208 }
4209 break;
4210 }
4211
4212 case nir_intrinsic_load_fs_input_interp_deltas: {
4213 assert(s.stage == MESA_SHADER_FRAGMENT);
4214 assert(nir_src_as_uint(instr->src[0]) == 0);
4215 const unsigned base = nir_intrinsic_base(instr);
4216 const unsigned comp = nir_intrinsic_component(instr);
4217 dest.type = ELK_REGISTER_TYPE_F;
4218
4219 /* Gfx20+ packs the plane parameters of a single logical
4220 * input in a vec3 format instead of the previously used vec4
4221 * format.
4222 */
4223 if (devinfo->ver >= 20) {
4224 bld.MOV(offset(dest, bld, 0), s.interp_reg(bld, base, comp, 0));
4225 bld.MOV(offset(dest, bld, 1), s.interp_reg(bld, base, comp, 2));
4226 bld.MOV(offset(dest, bld, 2), s.interp_reg(bld, base, comp, 1));
4227 } else {
4228 bld.MOV(offset(dest, bld, 0), s.interp_reg(bld, base, comp, 3));
4229 bld.MOV(offset(dest, bld, 1), s.interp_reg(bld, base, comp, 1));
4230 bld.MOV(offset(dest, bld, 2), s.interp_reg(bld, base, comp, 0));
4231 }
4232
4233 break;
4234 }
4235
4236 case nir_intrinsic_load_barycentric_pixel:
4237 case nir_intrinsic_load_barycentric_centroid:
4238 case nir_intrinsic_load_barycentric_sample: {
4239 /* Use the delta_xy values computed from the payload */
4240 enum elk_barycentric_mode bary = elk_barycentric_mode(instr);
4241 const elk_fs_reg srcs[] = { offset(s.delta_xy[bary], bld, 0),
4242 offset(s.delta_xy[bary], bld, 1) };
4243 bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
4244 break;
4245 }
4246
4247 case nir_intrinsic_load_barycentric_at_sample: {
4248 const glsl_interp_mode interpolation =
4249 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
4250
4251 elk_fs_reg msg_data;
4252 if (nir_src_is_const(instr->src[0])) {
4253 msg_data = elk_imm_ud(nir_src_as_uint(instr->src[0]) << 4);
4254 } else {
4255 const elk_fs_reg sample_src = retype(get_nir_src(ntb, instr->src[0]),
4256 ELK_REGISTER_TYPE_UD);
4257 const elk_fs_reg sample_id = bld.emit_uniformize(sample_src);
4258 msg_data = component(bld.group(8, 0).vgrf(ELK_REGISTER_TYPE_UD), 0);
4259 bld.exec_all().group(1, 0).SHL(msg_data, sample_id, elk_imm_ud(4u));
4260 }
4261
4262 elk_fs_reg flag_reg;
4263 struct elk_wm_prog_key *wm_prog_key = (struct elk_wm_prog_key *) s.key;
4264 if (wm_prog_key->multisample_fbo == ELK_SOMETIMES) {
4265 struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(s.prog_data);
4266
4267 check_dynamic_msaa_flag(bld.exec_all().group(8, 0),
4268 wm_prog_data,
4269 INTEL_MSAA_FLAG_MULTISAMPLE_FBO);
4270 flag_reg = elk_flag_reg(0, 0);
4271 }
4272
4273 emit_pixel_interpolater_send(bld,
4274 ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE,
4275 dest,
4276 elk_fs_reg(), /* src */
4277 msg_data,
4278 flag_reg,
4279 interpolation);
4280 break;
4281 }
4282
4283 case nir_intrinsic_load_barycentric_at_offset: {
4284 const glsl_interp_mode interpolation =
4285 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
4286
4287 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
4288
4289 if (const_offset) {
4290 assert(nir_src_bit_size(instr->src[0]) == 32);
4291 unsigned off_x = const_offset[0].u32 & 0xf;
4292 unsigned off_y = const_offset[1].u32 & 0xf;
4293
4294 emit_pixel_interpolater_send(bld,
4295 ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
4296 dest,
4297 elk_fs_reg(), /* src */
4298 elk_imm_ud(off_x | (off_y << 4)),
4299 elk_fs_reg(), /* flag_reg */
4300 interpolation);
4301 } else {
4302 elk_fs_reg src = retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_D);
4303 const enum elk_opcode opcode = ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
4304 emit_pixel_interpolater_send(bld,
4305 opcode,
4306 dest,
4307 src,
4308 elk_imm_ud(0u),
4309 elk_fs_reg(), /* flag_reg */
4310 interpolation);
4311 }
4312 break;
4313 }
4314
4315 case nir_intrinsic_load_frag_coord:
4316 emit_fragcoord_interpolation(ntb, dest);
4317 break;
4318
4319 case nir_intrinsic_load_interpolated_input: {
4320 assert(instr->src[0].ssa &&
4321 instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
4322 nir_intrinsic_instr *bary_intrinsic =
4323 nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
4324 nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
4325 enum glsl_interp_mode interp_mode =
4326 (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);
4327 elk_fs_reg dst_xy;
4328
4329 if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
4330 bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
4331 /* Use the result of the PI message. */
4332 dst_xy = retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_F);
4333 } else {
4334 /* Use the delta_xy values computed from the payload */
4335 enum elk_barycentric_mode bary = elk_barycentric_mode(bary_intrinsic);
4336 dst_xy = s.delta_xy[bary];
4337 }
4338
4339 for (unsigned int i = 0; i < instr->num_components; i++) {
4340 elk_fs_reg interp =
4341 s.interp_reg(bld, nir_intrinsic_base(instr),
4342 nir_intrinsic_component(instr) + i, 0);
4343 interp.type = ELK_REGISTER_TYPE_F;
4344 dest.type = ELK_REGISTER_TYPE_F;
4345
4346 if (devinfo->ver < 6 && interp_mode == INTERP_MODE_SMOOTH) {
4347 elk_fs_reg tmp = s.vgrf(glsl_float_type());
4348 bld.emit(ELK_FS_OPCODE_LINTERP, tmp, dst_xy, interp);
4349 bld.MUL(offset(dest, bld, i), tmp, s.pixel_w);
4350 } else {
4351 bld.emit(ELK_FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
4352 }
4353 }
4354 break;
4355 }
4356
4357 default:
4358 fs_nir_emit_intrinsic(ntb, bld, instr);
4359 break;
4360 }
4361 }
4362
4363 static void
4364 fs_nir_emit_cs_intrinsic(nir_to_elk_state &ntb,
4365 nir_intrinsic_instr *instr)
4366 {
4367 const intel_device_info *devinfo = ntb.devinfo;
4368 const fs_builder &bld = ntb.bld;
4369 elk_fs_visitor &s = ntb.s;
4370
4371 assert(gl_shader_stage_uses_workgroup(s.stage));
4372 struct elk_cs_prog_data *cs_prog_data = elk_cs_prog_data(s.prog_data);
4373
4374 elk_fs_reg dest;
4375 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4376 dest = get_nir_def(ntb, instr->def);
4377
4378 switch (instr->intrinsic) {
4379 case nir_intrinsic_barrier:
4380 if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
4381 fs_nir_emit_intrinsic(ntb, bld, instr);
4382 if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
4383 /* The whole workgroup fits in a single HW thread, so all the
4384 * invocations are already executed in lock-step. Instead of an actual
4385 * barrier, just emit a scheduling fence, which generates no code.
4386 */
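/* Illustrative example: a fixed 8x4x1 workgroup has 32 invocations,
 * so with a SIMD32 dispatch they all live in one HW thread and the
 * scheduling fence is sufficient; no barrier message is emitted.
 */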
4387 if (!s.nir->info.workgroup_size_variable &&
4388 s.workgroup_size() <= s.dispatch_width) {
4389 bld.exec_all().group(1, 0).emit(ELK_FS_OPCODE_SCHEDULING_FENCE);
4390 break;
4391 }
4392
4393 emit_barrier(ntb);
4394 cs_prog_data->uses_barrier = true;
4395 }
4396 break;
4397
4398 case nir_intrinsic_load_subgroup_id:
4399 s.cs_payload().load_subgroup_id(bld, dest);
4400 break;
4401
4402 case nir_intrinsic_load_local_invocation_id:
4403 /* This is only used for hardware generated local IDs. */
4404 assert(cs_prog_data->generate_local_id);
4405
4406 dest.type = ELK_REGISTER_TYPE_UD;
4407
4408 for (unsigned i = 0; i < 3; i++)
4409 bld.MOV(offset(dest, bld, i), s.cs_payload().local_invocation_id[i]);
4410 break;
4411
4412 case nir_intrinsic_load_workgroup_id:
4413 case nir_intrinsic_load_workgroup_id_zero_base: {
4414 elk_fs_reg val = ntb.system_values[SYSTEM_VALUE_WORKGROUP_ID];
4415 assert(val.file != BAD_FILE);
4416 dest.type = val.type;
4417 for (unsigned i = 0; i < 3; i++)
4418 bld.MOV(offset(dest, bld, i), offset(val, bld, i));
4419 break;
4420 }
4421
4422 case nir_intrinsic_load_num_workgroups: {
4423 assert(instr->def.bit_size == 32);
4424
4425 cs_prog_data->uses_num_work_groups = true;
4426
4427 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4428 srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(0);
4429 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
4430 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(3); /* num components */
4431 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = elk_imm_ud(0);
4432 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
4433 elk_fs_inst *inst =
4434 bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4435 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4436 inst->size_written = 3 * s.dispatch_width * 4;
4437 break;
4438 }
4439
4440 case nir_intrinsic_shared_atomic:
4441 case nir_intrinsic_shared_atomic_swap:
4442 fs_nir_emit_surface_atomic(ntb, bld, instr, elk_imm_ud(GFX7_BTI_SLM),
4443 false /* bindless */);
4444 break;
4445
4446 case nir_intrinsic_load_shared: {
4447 assert(devinfo->ver >= 7);
4448
4449 const unsigned bit_size = instr->def.bit_size;
4450 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4451 srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(GFX7_BTI_SLM);
4452
4453 elk_fs_reg addr = get_nir_src(ntb, instr->src[0]);
4454 int base = nir_intrinsic_base(instr);
4455 if (base) {
4456 elk_fs_reg addr_off = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
4457 bld.ADD(addr_off, addr, elk_imm_d(base));
4458 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr_off;
4459 } else {
4460 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr;
4461 }
4462
4463 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
4464 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
4465
4466 /* Make dest unsigned because that's what the temporary will be */
4467 dest.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
4468
4469 /* Read the vector */
4470 assert(bit_size <= 32);
4471 assert(nir_intrinsic_align(instr) > 0);
4472 if (bit_size == 32 &&
4473 nir_intrinsic_align(instr) >= 4) {
4474 assert(instr->def.num_components <= 4);
4475 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4476 elk_fs_inst *inst =
4477 bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4478 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4479 inst->size_written = instr->num_components * s.dispatch_width * 4;
4480 } else {
4481 assert(instr->def.num_components == 1);
4482 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
4483
4484 elk_fs_reg read_result = bld.vgrf(ELK_REGISTER_TYPE_UD);
4485 bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
4486 read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
4487 bld.MOV(dest, subscript(read_result, dest.type, 0));
4488 }
4489 break;
4490 }
4491
4492 case nir_intrinsic_store_shared: {
4493 assert(devinfo->ver >= 7);
4494
4495 const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4496 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4497 srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(GFX7_BTI_SLM);
4498
4499 elk_fs_reg addr = get_nir_src(ntb, instr->src[1]);
4500 int base = nir_intrinsic_base(instr);
4501 if (base) {
4502 elk_fs_reg addr_off = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
4503 bld.ADD(addr_off, addr, elk_imm_d(base));
4504 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr_off;
4505 } else {
4506 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr;
4507 }
4508
4509 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
4510 /* No point in masking with the sample mask; here we're handling compute
4511 * intrinsics.
4512 */
4513 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
4514
4515 elk_fs_reg data = get_nir_src(ntb, instr->src[0]);
4516 data.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
4517
4518 assert(bit_size <= 32);
4519 assert(nir_intrinsic_write_mask(instr) ==
4520 (1u << instr->num_components) - 1);
4521 assert(nir_intrinsic_align(instr) > 0);
4522 if (bit_size == 32 &&
4523 nir_intrinsic_align(instr) >= 4) {
4524 assert(nir_src_num_components(instr->src[0]) <= 4);
4525 srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4526 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4527 bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
4528 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4529 } else {
4530 assert(nir_src_num_components(instr->src[0]) == 1);
4531 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
4532
4533 srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_UD);
4534 bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
4535
4536 bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
4537 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4538 }
4539 break;
4540 }
4541
4542 case nir_intrinsic_load_workgroup_size: {
4543 /* Should have been lowered by elk_nir_lower_cs_intrinsics() or
4544 * crocus/iris_setup_uniforms() for the variable group size case.
4545 */
4546 unreachable("Should have been lowered");
4547 break;
4548 }
4549
4550 case nir_intrinsic_dpas_intel: {
4551 const unsigned sdepth = nir_intrinsic_systolic_depth(instr);
4552 const unsigned rcount = nir_intrinsic_repeat_count(instr);
4553
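/* Rough sketch of the operation (judging from how the sources are wired
 * below): DPAS performs a systolic dot-product-accumulate with systolic
 * depth sdepth repeated rcount times, with instr->src[2] providing the
 * accumulator input.
 */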
4554 const elk_reg_type dest_type =
4555 elk_type_for_nir_type(devinfo, nir_intrinsic_dest_type(instr));
4556 const elk_reg_type src_type =
4557 elk_type_for_nir_type(devinfo, nir_intrinsic_src_type(instr));
4558
4559 dest = retype(dest, dest_type);
4560 elk_fs_reg src2 = retype(get_nir_src(ntb, instr->src[2]), dest_type);
4561 const elk_fs_reg dest_hf = dest;
4562
4563 fs_builder bld8 = bld.exec_all().group(8, 0);
4564 fs_builder bld16 = bld.exec_all().group(16, 0);
4565
4566 /* DG2 cannot have the destination or source 0 of DPAS be float16, but it
4567 * is still advantageous to support these formats for memory and bandwidth
4568 * savings.
4569 *
4570 * The float16 source must therefore be expanded to float32.
4571 */
4572 if (devinfo->verx10 == 125 && dest_type == ELK_REGISTER_TYPE_HF &&
4573 !s.compiler->lower_dpas) {
4574 dest = bld8.vgrf(ELK_REGISTER_TYPE_F, rcount);
4575
4576 if (src2.file != ARF) {
4577 const elk_fs_reg src2_hf = src2;
4578
4579 src2 = bld8.vgrf(ELK_REGISTER_TYPE_F, rcount);
4580
4581 for (unsigned i = 0; i < 4; i++) {
4582 bld16.MOV(byte_offset(src2, REG_SIZE * i * 2),
4583 byte_offset(src2_hf, REG_SIZE * i));
4584 }
4585 } else {
4586 src2 = retype(src2, ELK_REGISTER_TYPE_F);
4587 }
4588 }
4589
4590 bld8.DPAS(dest,
4591 src2,
4592 retype(get_nir_src(ntb, instr->src[1]), src_type),
4593 retype(get_nir_src(ntb, instr->src[0]), src_type),
4594 sdepth,
4595 rcount)
4596 ->saturate = nir_intrinsic_saturate(instr);
4597
4598 /* Compact the destination to float16 (from float32). */
4599 if (!dest.equals(dest_hf)) {
4600 for (unsigned i = 0; i < 4; i++) {
4601 bld16.MOV(byte_offset(dest_hf, REG_SIZE * i),
4602 byte_offset(dest, REG_SIZE * i * 2));
4603 }
4604 }
4605
4606 cs_prog_data->uses_systolic = true;
4607 break;
4608 }
4609
4610 default:
4611 fs_nir_emit_intrinsic(ntb, bld, instr);
4612 break;
4613 }
4614 }
4615
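/* Identity value for the given reduction operator, as provided by
 * nir_alu_binop_identity(), e.g. iadd -> 0, imul -> 1, iand -> ~0,
 * ior/ixor -> 0, umin -> UINT_MAX, umax -> 0, fmin -> +INF,
 * fmax -> -INF.
 */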
4616 static elk_fs_reg
4617 elk_nir_reduction_op_identity(const fs_builder &bld,
4618 nir_op op, elk_reg_type type)
4619 {
4620 nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);
4621 switch (type_sz(type)) {
4622 case 1:
4623 if (type == ELK_REGISTER_TYPE_UB) {
4624 return elk_imm_uw(value.u8);
4625 } else {
4626 assert(type == ELK_REGISTER_TYPE_B);
4627 return elk_imm_w(value.i8);
4628 }
4629 case 2:
4630 return retype(elk_imm_uw(value.u16), type);
4631 case 4:
4632 return retype(elk_imm_ud(value.u32), type);
4633 case 8:
4634 if (type == ELK_REGISTER_TYPE_DF)
4635 return elk_setup_imm_df(bld, value.f64);
4636 else
4637 return retype(elk_imm_u64(value.u64), type);
4638 default:
4639 unreachable("Invalid type size");
4640 }
4641 }
4642
4643 static elk_opcode
4644 elk_op_for_nir_reduction_op(nir_op op)
4645 {
4646 switch (op) {
4647 case nir_op_iadd: return ELK_OPCODE_ADD;
4648 case nir_op_fadd: return ELK_OPCODE_ADD;
4649 case nir_op_imul: return ELK_OPCODE_MUL;
4650 case nir_op_fmul: return ELK_OPCODE_MUL;
4651 case nir_op_imin: return ELK_OPCODE_SEL;
4652 case nir_op_umin: return ELK_OPCODE_SEL;
4653 case nir_op_fmin: return ELK_OPCODE_SEL;
4654 case nir_op_imax: return ELK_OPCODE_SEL;
4655 case nir_op_umax: return ELK_OPCODE_SEL;
4656 case nir_op_fmax: return ELK_OPCODE_SEL;
4657 case nir_op_iand: return ELK_OPCODE_AND;
4658 case nir_op_ior: return ELK_OPCODE_OR;
4659 case nir_op_ixor: return ELK_OPCODE_XOR;
4660 default:
4661 unreachable("Invalid reduction operation");
4662 }
4663 }
4664
4665 static elk_conditional_mod
4666 elk_cond_mod_for_nir_reduction_op(nir_op op)
4667 {
4668 switch (op) {
4669 case nir_op_iadd: return ELK_CONDITIONAL_NONE;
4670 case nir_op_fadd: return ELK_CONDITIONAL_NONE;
4671 case nir_op_imul: return ELK_CONDITIONAL_NONE;
4672 case nir_op_fmul: return ELK_CONDITIONAL_NONE;
4673 case nir_op_imin: return ELK_CONDITIONAL_L;
4674 case nir_op_umin: return ELK_CONDITIONAL_L;
4675 case nir_op_fmin: return ELK_CONDITIONAL_L;
4676 case nir_op_imax: return ELK_CONDITIONAL_GE;
4677 case nir_op_umax: return ELK_CONDITIONAL_GE;
4678 case nir_op_fmax: return ELK_CONDITIONAL_GE;
4679 case nir_op_iand: return ELK_CONDITIONAL_NONE;
4680 case nir_op_ior: return ELK_CONDITIONAL_NONE;
4681 case nir_op_ixor: return ELK_CONDITIONAL_NONE;
4682 default:
4683 unreachable("Invalid reduction operation");
4684 }
4685 }
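/* Taken together, the two tables above turn e.g. nir_op_imin into a SEL
 * with the .l conditional modifier (select the lesser source) and
 * nir_op_umax into SEL with .ge, while the arithmetic and logic
 * reductions use no conditional modifier at all.
 */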
4686
4687 struct rebuild_resource {
4688 unsigned idx;
4689 std::vector<nir_def *> array;
4690 };
4691
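/* nir_foreach_src() callback that gathers, depth first, the SSA defs
 * feeding a resource handle. Each def is appended exactly once and only
 * after its own sources, so the resulting array is in dependency order.
 */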
4692 static bool
4693 add_rebuild_src(nir_src *src, void *state)
4694 {
4695 struct rebuild_resource *res = (struct rebuild_resource *) state;
4696
4697 for (nir_def *def : res->array) {
4698 if (def == src->ssa)
4699 return true;
4700 }
4701
4702 nir_foreach_src(src->ssa->parent_instr, add_rebuild_src, state);
4703 res->array.push_back(src->ssa);
4704 return true;
4705 }
4706
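/* Try to re-materialize the SSA chain feeding a resource_intel
 * intrinsic as a short sequence of uniform (SIMD8, exec_all)
 * instructions, e.g. load_uniform -> iadd -> resource_intel. Only the
 * handful of patterns handled in the switch below are supported;
 * anything else makes the function return a BAD_FILE register.
 */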
4707 static elk_fs_reg
4708 try_rebuild_resource(nir_to_elk_state &ntb, const elk::fs_builder &bld, nir_def *resource_def)
4709 {
4710 /* Create a builder at the location of the resource_intel intrinsic */
4711 fs_builder ubld8 = bld.exec_all().group(8, 0);
4712
4713 struct rebuild_resource resources = {};
4714 resources.idx = 0;
4715
4716 if (!nir_foreach_src(resource_def->parent_instr,
4717 add_rebuild_src, &resources))
4718 return elk_fs_reg();
4719 resources.array.push_back(resource_def);
4720
4721 if (resources.array.size() == 1) {
4722 nir_def *def = resources.array[0];
4723
4724 if (def->parent_instr->type == nir_instr_type_load_const) {
4725 nir_load_const_instr *load_const =
4726 nir_instr_as_load_const(def->parent_instr);
4727 return elk_imm_ud(load_const->value[0].i32);
4728 } else {
4729 assert(def->parent_instr->type == nir_instr_type_intrinsic &&
4730 (nir_instr_as_intrinsic(def->parent_instr)->intrinsic ==
4731 nir_intrinsic_load_uniform));
4732 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(def->parent_instr);
4733 unsigned base_offset = nir_intrinsic_base(intrin);
4734 unsigned load_offset = nir_src_as_uint(intrin->src[0]);
4735 elk_fs_reg src(UNIFORM, base_offset / 4, ELK_REGISTER_TYPE_UD);
4736 src.offset = load_offset + base_offset % 4;
4737 return src;
4738 }
4739 }
4740
4741 for (unsigned i = 0; i < resources.array.size(); i++) {
4742 nir_def *def = resources.array[i];
4743
4744 nir_instr *instr = def->parent_instr;
4745 switch (instr->type) {
4746 case nir_instr_type_load_const: {
4747 nir_load_const_instr *load_const =
4748 nir_instr_as_load_const(instr);
4749 elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4750 ntb.resource_insts[def->index] =
4751 ubld8.MOV(dst, elk_imm_ud(load_const->value[0].i32));
4752 break;
4753 }
4754
4755 case nir_instr_type_alu: {
4756 nir_alu_instr *alu = nir_instr_as_alu(instr);
4757
4758 if (nir_op_infos[alu->op].num_inputs == 2) {
4759 if (alu->src[0].swizzle[0] != 0 ||
4760 alu->src[1].swizzle[0] != 0)
4761 break;
4762 } else if (nir_op_infos[alu->op].num_inputs == 3) {
4763 if (alu->src[0].swizzle[0] != 0 ||
4764 alu->src[1].swizzle[0] != 0 ||
4765 alu->src[2].swizzle[0] != 0)
4766 break;
4767 } else {
4768 /* Unsupported ALU input count */
4769 break;
4770 }
4771
4772 switch (alu->op) {
4773 case nir_op_iadd: {
4774 elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4775 elk_fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst;
4776 elk_fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst;
4777 assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
4778 assert(src0.type == ELK_REGISTER_TYPE_UD);
4779 ntb.resource_insts[def->index] =
4780 ubld8.ADD(dst,
4781 src0.file != IMM ? src0 : src1,
4782 src0.file != IMM ? src1 : src0);
4783 break;
4784 }
4785 case nir_op_iadd3: {
4786 elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4787 elk_fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst;
4788 elk_fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst;
4789 elk_fs_reg src2 = ntb.resource_insts[alu->src[2].src.ssa->index]->dst;
4790 assert(src0.file != BAD_FILE && src1.file != BAD_FILE && src2.file != BAD_FILE);
4791 assert(src0.type == ELK_REGISTER_TYPE_UD);
4792 ntb.resource_insts[def->index] =
4793 ubld8.ADD3(dst,
4794 src1.file == IMM ? src1 : src0,
4795 src1.file == IMM ? src0 : src1,
4796 src2);
4797 break;
4798 }
4799 case nir_op_ushr: {
4800 elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4801 elk_fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst;
4802 elk_fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst;
4803 assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
4804 assert(src0.type == ELK_REGISTER_TYPE_UD);
4805 ntb.resource_insts[def->index] = ubld8.SHR(dst, src0, src1);
4806 break;
4807 }
4808 case nir_op_ishl: {
4809 elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4810 elk_fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst;
4811 elk_fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst;
4812 assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
4813 assert(src0.type == ELK_REGISTER_TYPE_UD);
4814 ntb.resource_insts[def->index] = ubld8.SHL(dst, src0, src1);
4815 break;
4816 }
4817 case nir_op_mov: {
4818 break;
4819 }
4820 default:
4821 break;
4822 }
4823 break;
4824 }
4825
4826 case nir_instr_type_intrinsic: {
4827 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
4828 switch (intrin->intrinsic) {
4829 case nir_intrinsic_resource_intel:
4830 ntb.resource_insts[def->index] =
4831 ntb.resource_insts[intrin->src[1].ssa->index];
4832 break;
4833
4834 case nir_intrinsic_load_uniform: {
4835 if (!nir_src_is_const(intrin->src[0]))
4836 break;
4837
4838 unsigned base_offset = nir_intrinsic_base(intrin);
4839 unsigned load_offset = nir_src_as_uint(intrin->src[0]);
4840 elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4841 elk_fs_reg src(UNIFORM, base_offset / 4, ELK_REGISTER_TYPE_UD);
4842 src.offset = load_offset + base_offset % 4;
4843 ntb.resource_insts[def->index] = ubld8.MOV(dst, src);
4844 break;
4845 }
4846
4847 default:
4848 break;
4849 }
4850 break;
4851 }
4852
4853 default:
4854 break;
4855 }
4856
4857 if (ntb.resource_insts[def->index] == NULL)
4858 return elk_fs_reg();
4859 }
4860
4861 assert(ntb.resource_insts[resource_def->index] != NULL);
4862 return component(ntb.resource_insts[resource_def->index]->dst, 0);
4863 }
4864
4865 static elk_fs_reg
4866 get_nir_image_intrinsic_image(nir_to_elk_state &ntb, const elk::fs_builder &bld,
4867 nir_intrinsic_instr *instr)
4868 {
4869 if (is_resource_src(instr->src[0])) {
4870 elk_fs_reg surf_index = get_resource_nir_src(ntb, instr->src[0]);
4871 if (surf_index.file != BAD_FILE)
4872 return surf_index;
4873 }
4874
4875 elk_fs_reg image = retype(get_nir_src_imm(ntb, instr->src[0]), ELK_REGISTER_TYPE_UD);
4876 elk_fs_reg surf_index = image;
4877
4878 return bld.emit_uniformize(surf_index);
4879 }
4880
4881 static elk_fs_reg
4882 get_nir_buffer_intrinsic_index(nir_to_elk_state &ntb, const elk::fs_builder &bld,
4883 nir_intrinsic_instr *instr)
4884 {
4885 /* SSBO stores are weird in that their index is in src[1] */
4886 const bool is_store =
4887 instr->intrinsic == nir_intrinsic_store_ssbo ||
4888 instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
4889 nir_src src = is_store ? instr->src[1] : instr->src[0];
4890
4891 if (nir_src_is_const(src)) {
4892 return elk_imm_ud(nir_src_as_uint(src));
4893 } else if (is_resource_src(src)) {
4894 elk_fs_reg surf_index = get_resource_nir_src(ntb, src);
4895 if (surf_index.file != BAD_FILE)
4896 return surf_index;
4897 }
4898 return bld.emit_uniformize(get_nir_src(ntb, src));
4899 }
4900
4901 /**
4902 * The offsets we get from NIR act as if each SIMD channel has its own blob
4903 * of contiguous space. However, if we actually place each SIMD channel in
4904 * its own space, we end up with terrible cache performance because each SIMD
4905 * channel accesses a different cache line even when they're all accessing the
4906 * same byte offset. To deal with this problem, we swizzle the address using
4907 * a simple algorithm which ensures that any time a SIMD message reads or
4908 * writes the same address, it's all in the same cache line. We have to keep
4909 * the bottom two bits fixed so that we can read/write up to a dword at a time
4910 * and the individual element is contiguous. We do this by splitting the
4911 * address as follows:
4912 *
4913 * 31 4-6 2 0
4914 * +-------------------------------+------------+----------+
4915 * | Hi address bits | chan index | addr low |
4916 * +-------------------------------+------------+----------+
4917 *
4918 * In other words, the bottom two address bits stay, and the top 30 get
4919 * shifted up so that we can stick the SIMD channel index in the middle. This
4920 * way, we can access 8, 16, or 32-bit elements and, when all channels access
4921 * a 32-bit element at the same logical offset, the scratch read/write
4922 * instruction acts on contiguous elements and we get good cache locality.
4923 */
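/* Added worked example (illustrative only): in SIMD16 dispatch,
 * chan_index_bits = ffs(16) - 1 = 4. For channel 5 reading byte address
 * 0x43 (low two bits = 0x3, dword-aligned part = 0x40):
 *
 *    addr_hi   = (0x43 & ~3) << 4 = 0x400
 *    chan_addr = 5 << 2           = 0x14
 *    addr      = (0x43 & 3) | addr_hi | chan_addr = 0x417
 *
 * so all 16 channels touching byte 0x43 land in the same 64-byte cache line
 * while each channel's dword stays contiguous.
 */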
4924 static elk_fs_reg
4925 swizzle_nir_scratch_addr(nir_to_elk_state &ntb,
4926 const elk::fs_builder &bld,
4927 const elk_fs_reg &nir_addr,
4928 bool in_dwords)
4929 {
4930 elk_fs_visitor &s = ntb.s;
4931
4932 const elk_fs_reg &chan_index =
4933 ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
4934 const unsigned chan_index_bits = ffs(s.dispatch_width) - 1;
4935
4936 elk_fs_reg addr = bld.vgrf(ELK_REGISTER_TYPE_UD);
4937 if (in_dwords) {
4938 /* In this case, we know the address is aligned to a DWORD and we want
4939 * the final address in DWORDs.
4940 */
4941 bld.SHL(addr, nir_addr, elk_imm_ud(chan_index_bits - 2));
4942 bld.OR(addr, addr, chan_index);
4943 } else {
4944 /* This case is substantially more annoying because we have to pay
4945 * attention to those pesky two bottom bits.
4946 */
4947 elk_fs_reg addr_hi = bld.vgrf(ELK_REGISTER_TYPE_UD);
4948 bld.AND(addr_hi, nir_addr, elk_imm_ud(~0x3u));
4949 bld.SHL(addr_hi, addr_hi, elk_imm_ud(chan_index_bits));
4950 elk_fs_reg chan_addr = bld.vgrf(ELK_REGISTER_TYPE_UD);
4951 bld.SHL(chan_addr, chan_index, elk_imm_ud(2));
4952 bld.AND(addr, nir_addr, elk_imm_ud(0x3u));
4953 bld.OR(addr, addr, addr_hi);
4954 bld.OR(addr, addr, chan_addr);
4955 }
4956 return addr;
4957 }
4958
4959 static unsigned
4960 choose_oword_block_size_dwords(const struct intel_device_info *devinfo,
4961 unsigned dwords)
4962 {
4963 unsigned block;
4964 if (devinfo->has_lsc && dwords >= 64) {
4965 block = 64;
4966 } else if (dwords >= 32) {
4967 block = 32;
4968 } else if (dwords >= 16) {
4969 block = 16;
4970 } else {
4971 block = 8;
4972 }
4973 assert(block <= dwords);
4974 return block;
4975 }
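/* Added usage note (illustrative only): callers below invoke this in a loop,
 * so e.g. 24 remaining dwords on a non-LSC part get split into a 16-dword
 * block followed by an 8-dword block, and 72 dwords on an LSC part into
 * 64 + 8.
 */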
4976
4977 static void
4978 increment_a64_address(const fs_builder &bld, elk_fs_reg address, uint32_t v)
4979 {
4980 if (bld.shader->devinfo->has_64bit_int) {
4981 bld.ADD(address, address, elk_imm_ud(v));
4982 } else {
4983 elk_fs_reg low = retype(address, ELK_REGISTER_TYPE_UD);
4984 elk_fs_reg high = offset(low, bld, 1);
4985
4986 /* Add low and if that overflows, add carry to high. */
4987 bld.ADD(low, low, elk_imm_ud(v))->conditional_mod = ELK_CONDITIONAL_O;
4988 bld.ADD(high, high, elk_imm_ud(0x1))->predicate = ELK_PREDICATE_NORMAL;
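      /* Added example (illustrative only): low = 0xfffffffc and v = 8 gives
       * low = 0x00000004 with the overflow condition set, so the predicated
       * ADD above bumps high by 1 and the full 64-bit address still advances
       * by v.
       */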
4989 }
4990 }
4991
4992 static elk_fs_reg
4993 emit_fence(const fs_builder &bld, enum elk_opcode opcode,
4994 uint8_t sfid, uint32_t desc,
4995 bool commit_enable, uint8_t bti)
4996 {
4997 assert(opcode == ELK_SHADER_OPCODE_INTERLOCK ||
4998 opcode == ELK_SHADER_OPCODE_MEMORY_FENCE);
4999
5000 elk_fs_reg dst = bld.vgrf(ELK_REGISTER_TYPE_UD);
5001 elk_fs_inst *fence = bld.emit(opcode, dst, elk_vec8_grf(0, 0),
5002 elk_imm_ud(commit_enable),
5003 elk_imm_ud(bti));
5004 fence->sfid = sfid;
5005 fence->desc = desc;
5006
5007 return dst;
5008 }
5009
5010 static uint32_t
5011 lsc_fence_descriptor_for_intrinsic(const struct intel_device_info *devinfo,
5012 nir_intrinsic_instr *instr)
5013 {
5014 assert(devinfo->has_lsc);
5015
5016 enum lsc_fence_scope scope = LSC_FENCE_LOCAL;
5017 enum lsc_flush_type flush_type = LSC_FLUSH_TYPE_NONE;
5018
5019 if (nir_intrinsic_has_memory_scope(instr)) {
5020 switch (nir_intrinsic_memory_scope(instr)) {
5021 case SCOPE_DEVICE:
5022 case SCOPE_QUEUE_FAMILY:
5023 scope = LSC_FENCE_TILE;
5024 flush_type = LSC_FLUSH_TYPE_EVICT;
5025 break;
5026 case SCOPE_WORKGROUP:
5027 scope = LSC_FENCE_THREADGROUP;
5028 break;
5029 case SCOPE_SHADER_CALL:
5030 case SCOPE_INVOCATION:
5031 case SCOPE_SUBGROUP:
5032 case SCOPE_NONE:
5033 break;
5034 }
5035 } else {
5036 /* No scope defined. */
5037 scope = LSC_FENCE_TILE;
5038 flush_type = LSC_FLUSH_TYPE_EVICT;
5039 }
5040 return lsc_fence_msg_desc(devinfo, scope, flush_type, true);
5041 }
5042
5043 /**
5044 * Create a MOV to read the timestamp register.
5045 */
5046 static elk_fs_reg
5047 get_timestamp(const fs_builder &bld)
5048 {
5049 elk_fs_visitor &s = *bld.shader;
5050 const intel_device_info *devinfo = s.devinfo;
5051
5052 assert(devinfo->ver >= 7);
5053
5054 elk_fs_reg ts = elk_fs_reg(retype(elk_vec4_reg(ELK_ARCHITECTURE_REGISTER_FILE,
5055 ELK_ARF_TIMESTAMP,
5056 0),
5057 ELK_REGISTER_TYPE_UD));
5058
5059 elk_fs_reg dst = elk_fs_reg(VGRF, s.alloc.allocate(1), ELK_REGISTER_TYPE_UD);
5060
5061 /* We want to read the 3 fields we care about even if it's not enabled in
5062 * the dispatch.
5063 */
5064 bld.group(4, 0).exec_all().MOV(dst, ts);
5065
5066 return dst;
5067 }
5068
5069 static void
5070 fs_nir_emit_intrinsic(nir_to_elk_state &ntb,
5071 const fs_builder &bld, nir_intrinsic_instr *instr)
5072 {
5073 const intel_device_info *devinfo = ntb.devinfo;
5074 elk_fs_visitor &s = ntb.s;
5075
5076 /* We handle this as a special case */
5077 if (instr->intrinsic == nir_intrinsic_decl_reg) {
5078 assert(nir_intrinsic_num_array_elems(instr) == 0);
5079 unsigned bit_size = nir_intrinsic_bit_size(instr);
5080 unsigned num_components = nir_intrinsic_num_components(instr);
5081 const elk_reg_type reg_type =
5082 elk_reg_type_from_bit_size(bit_size, bit_size == 8 ?
5083 ELK_REGISTER_TYPE_D :
5084 ELK_REGISTER_TYPE_F);
5085
5086 /* Re-use the destination's slot in the table for the register */
5087 ntb.ssa_values[instr->def.index] =
5088 bld.vgrf(reg_type, num_components);
5089 return;
5090 }
5091
5092 elk_fs_reg dest;
5093 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5094 dest = get_nir_def(ntb, instr->def);
5095
5096 switch (instr->intrinsic) {
5097 case nir_intrinsic_resource_intel:
5098 ntb.ssa_bind_infos[instr->def.index].valid = true;
5099 ntb.ssa_bind_infos[instr->def.index].bindless =
5100 (nir_intrinsic_resource_access_intel(instr) &
5101 nir_resource_intel_bindless) != 0;
5102 ntb.ssa_bind_infos[instr->def.index].block =
5103 nir_intrinsic_resource_block_intel(instr);
5104 ntb.ssa_bind_infos[instr->def.index].set =
5105 nir_intrinsic_desc_set(instr);
5106 ntb.ssa_bind_infos[instr->def.index].binding =
5107 nir_intrinsic_binding(instr);
5108
5109 if (nir_intrinsic_resource_access_intel(instr) &
5110 nir_resource_intel_non_uniform) {
5111 ntb.resource_values[instr->def.index] = elk_fs_reg();
5112 } else {
5113 ntb.resource_values[instr->def.index] =
5114 try_rebuild_resource(ntb, bld, instr->src[1].ssa);
5115 }
5116 ntb.ssa_values[instr->def.index] =
5117 ntb.ssa_values[instr->src[1].ssa->index];
5118 break;
5119
5120 case nir_intrinsic_load_reg:
5121 case nir_intrinsic_store_reg:
5122 /* Nothing to do with these. */
5123 break;
5124
5125 case nir_intrinsic_image_load:
5126 case nir_intrinsic_image_store:
5127 case nir_intrinsic_image_atomic:
5128 case nir_intrinsic_image_atomic_swap:
5129 case nir_intrinsic_bindless_image_load:
5130 case nir_intrinsic_bindless_image_store:
5131 case nir_intrinsic_bindless_image_atomic:
5132 case nir_intrinsic_bindless_image_atomic_swap: {
5133 /* Get some metadata from the image intrinsic. */
5134 const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
5135
5136 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5137
5138 switch (instr->intrinsic) {
5139 case nir_intrinsic_image_load:
5140 case nir_intrinsic_image_store:
5141 case nir_intrinsic_image_atomic:
5142 case nir_intrinsic_image_atomic_swap:
5143 srcs[SURFACE_LOGICAL_SRC_SURFACE] =
5144 get_nir_image_intrinsic_image(ntb, bld, instr);
5145 break;
5146
5147 default:
5148 /* Bindless */
5149 srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] =
5150 get_nir_image_intrinsic_image(ntb, bld, instr);
5151 break;
5152 }
5153
5154 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
5155 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] =
5156 elk_imm_ud(nir_image_intrinsic_coord_components(instr));
5157
5158 /* Emit an image load, store or atomic op. */
5159 if (instr->intrinsic == nir_intrinsic_image_load ||
5160 instr->intrinsic == nir_intrinsic_bindless_image_load) {
5161 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
5162 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
5163 elk_fs_inst *inst =
5164 bld.emit(ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
5165 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5166 inst->size_written = instr->num_components * s.dispatch_width * 4;
5167 } else if (instr->intrinsic == nir_intrinsic_image_store ||
5168 instr->intrinsic == nir_intrinsic_bindless_image_store) {
5169 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
5170 srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(ntb, instr->src[3]);
5171 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
5172 bld.emit(ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
5173 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5174 } else {
5175 unsigned num_srcs = info->num_srcs;
5176 enum elk_lsc_opcode op = elk_lsc_aop_for_nir_intrinsic(instr);
5177 if (op == LSC_OP_ATOMIC_INC || op == LSC_OP_ATOMIC_DEC) {
5178 assert(num_srcs == 4);
5179 num_srcs = 3;
5180 }
5181
5182 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(op);
5183
5184 elk_fs_reg data;
5185 if (num_srcs >= 4)
5186 data = get_nir_src(ntb, instr->src[3]);
5187 if (num_srcs >= 5) {
5188 elk_fs_reg tmp = bld.vgrf(data.type, 2);
5189 elk_fs_reg sources[2] = { data, get_nir_src(ntb, instr->src[4]) };
5190 bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5191 data = tmp;
5192 }
5193 srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5194 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
5195
5196 bld.emit(ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
5197 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5198 }
5199 break;
5200 }
5201
5202 case nir_intrinsic_image_size:
5203 case nir_intrinsic_bindless_image_size: {
5204 /* Cube image sizes should have previously been lowered to a 2D array */
5205 assert(nir_intrinsic_image_dim(instr) != GLSL_SAMPLER_DIM_CUBE);
5206
5207 /* Unlike the [un]typed load and store opcodes, the TXS that this turns
5208 * into will handle the binding table index for us in the generator.
5209 * Incidentally, this means that we can handle bindless with exactly the
5210 * same code.
5211 */
5212 elk_fs_reg image = retype(get_nir_src_imm(ntb, instr->src[0]),
5213 ELK_REGISTER_TYPE_UD);
5214 image = bld.emit_uniformize(image);
5215
5216 assert(nir_src_as_uint(instr->src[1]) == 0);
5217
5218 elk_fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
5219 if (instr->intrinsic == nir_intrinsic_image_size)
5220 srcs[TEX_LOGICAL_SRC_SURFACE] = image;
5221 else
5222 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image;
5223 srcs[TEX_LOGICAL_SRC_SAMPLER] = elk_imm_d(0);
5224 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = elk_imm_d(0);
5225 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = elk_imm_d(0);
5226 srcs[TEX_LOGICAL_SRC_RESIDENCY] = elk_imm_d(0);
5227
5228 /* Since the image size is always uniform, we can just emit a SIMD8
5229 * query instruction and splat the result out.
5230 */
5231 const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
5232
5233 elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD, 4);
5234 elk_fs_inst *inst = ubld.emit(ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL,
5235 tmp, srcs, ARRAY_SIZE(srcs));
5236 inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
5237
5238 for (unsigned c = 0; c < instr->def.num_components; ++c) {
5239 bld.MOV(offset(retype(dest, tmp.type), bld, c),
5240 component(offset(tmp, ubld, c), 0));
5241 }
5242 break;
5243 }
5244
5245 case nir_intrinsic_image_load_raw_intel: {
5246 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5247 srcs[SURFACE_LOGICAL_SRC_SURFACE] =
5248 get_nir_image_intrinsic_image(ntb, bld, instr);
5249 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
5250 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
5251 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
5252 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
5253
5254 elk_fs_inst *inst =
5255 bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
5256 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5257 inst->size_written = instr->num_components * s.dispatch_width * 4;
5258 break;
5259 }
5260
5261 case nir_intrinsic_image_store_raw_intel: {
5262 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5263 srcs[SURFACE_LOGICAL_SRC_SURFACE] =
5264 get_nir_image_intrinsic_image(ntb, bld, instr);
5265 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
5266 srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(ntb, instr->src[2]);
5267 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
5268 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
5269 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
5270
5271 bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
5272 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5273 break;
5274 }
5275
5276 case nir_intrinsic_barrier:
5277 case nir_intrinsic_begin_invocation_interlock:
5278 case nir_intrinsic_end_invocation_interlock: {
5279 bool ugm_fence, slm_fence, tgm_fence, urb_fence;
5280 enum elk_opcode opcode = ELK_OPCODE_NOP;
5281
5282 /* Handling interlock intrinsics here will allow the logic for IVB
5283 * render cache (see below) to be reused.
5284 */
5285
5286 switch (instr->intrinsic) {
5287 case nir_intrinsic_barrier: {
5288 /* Note we only care about the memory part of the
5289 * barrier. The execution part will be taken care
5290 * of by the stage specific intrinsic handler functions.
5291 */
5292 nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
5293 ugm_fence = modes & (nir_var_mem_ssbo | nir_var_mem_global);
5294 slm_fence = modes & nir_var_mem_shared;
5295 tgm_fence = modes & nir_var_image;
5296 urb_fence = modes & (nir_var_shader_out | nir_var_mem_task_payload);
5297 if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
5298 opcode = ELK_SHADER_OPCODE_MEMORY_FENCE;
5299 break;
5300 }
5301
5302 case nir_intrinsic_begin_invocation_interlock:
5303 /* For beginInvocationInterlockARB(), we will generate a memory fence
5304 * but with a different opcode so that the generator can pick SENDC
5305 * instead of SEND.
5306 */
5307 assert(s.stage == MESA_SHADER_FRAGMENT);
5308 ugm_fence = tgm_fence = true;
5309 slm_fence = urb_fence = false;
5310 opcode = ELK_SHADER_OPCODE_INTERLOCK;
5311 break;
5312
5313 case nir_intrinsic_end_invocation_interlock:
5314 /* For endInvocationInterlockARB(), we need to insert a memory fence which
5315 * stalls in the shader until the memory transactions prior to that
5316 * fence are complete. This ensures that the shader does not end before
5317 * any writes from its critical section have landed. Otherwise, you can
5318 * end up with a case where the next invocation on that pixel properly
5319 * stalls for previous FS invocation on its pixel to complete but
5320 * doesn't actually wait for the dataport memory transactions from that
5321 * thread to land before submitting its own.
5322 */
5323 assert(s.stage == MESA_SHADER_FRAGMENT);
5324 ugm_fence = tgm_fence = true;
5325 slm_fence = urb_fence = false;
5326 opcode = ELK_SHADER_OPCODE_MEMORY_FENCE;
5327 break;
5328
5329 default:
5330 unreachable("invalid intrinsic");
5331 }
5332
5333 if (opcode == ELK_OPCODE_NOP)
5334 break;
5335
5336 if (s.nir->info.shared_size > 0) {
5337 assert(gl_shader_stage_uses_workgroup(s.stage));
5338 } else {
5339 slm_fence = false;
5340 }
5341
5342 /* If the workgroup fits in a single HW thread, the messages for SLM are
5343 * processed in-order and the shader itself is already synchronized so
5344 * the memory fence is not necessary.
5345 *
5346 * TODO: Check whether this applies when many HW threads share the same Data Port.
5347 */
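      /* Added example (illustrative only): a fixed 16x1x1 workgroup has 16
       * invocations, so with SIMD16 dispatch workgroup_size() equals
       * dispatch_width and the SLM fence is dropped here.
       */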
5348 if (!s.nir->info.workgroup_size_variable &&
5349 slm_fence && s.workgroup_size() <= s.dispatch_width)
5350 slm_fence = false;
5351
5352 switch (s.stage) {
5353 case MESA_SHADER_TESS_CTRL:
5354 break;
5355 default:
5356 urb_fence = false;
5357 break;
5358 }
5359
5360 unsigned fence_regs_count = 0;
5361 elk_fs_reg fence_regs[4] = {};
5362
5363 const fs_builder ubld = bld.group(8, 0);
5364
5365 /* A memory barrier with acquire semantics requires us to
5366 * guarantee that memory operations of the specified storage
5367 * class sequenced-after the barrier aren't reordered before the
5368 * barrier, nor before any previous atomic operation
5369 * sequenced-before the barrier which may be synchronizing this
5370 * acquire barrier with a prior release sequence.
5371 *
5372 * In order to guarantee the latter we must make sure that any
5373 * such previous operation has completed execution before
5374 * invalidating the relevant caches, since otherwise some cache
5375 * could be polluted by a concurrent thread after its
5376 * invalidation but before the previous atomic completes, which
5377 * could lead to a violation of the expected memory ordering if
5378 * a subsequent memory read hits the polluted cacheline, which
5379 * would return a stale value read from memory before the
5380 * completion of the atomic sequenced-before the barrier.
5381 *
5382 * This ordering inversion can be avoided trivially if the
5383 * operations we need to order are all handled by a single
5384 * in-order cache, since the flush implied by the memory fence
5385 * occurs after any pending operations have completed, however
5386 * that doesn't help us when dealing with multiple caches
5387 * processing requests out of order, in which case we need to
5388 * explicitly stall the EU until any pending memory operations
5389 * have executed.
5390 *
5391 * Note that that might be somewhat heavy handed in some cases.
5392 * In particular when this memory fence was inserted by
5393 * spirv_to_nir() lowering an atomic with acquire semantics into
5394 * an atomic+barrier sequence we could do a better job by
5395 * synchronizing with respect to that one atomic *only*, but
5396 * that would require additional information not currently
5397 * available to the backend.
5398 *
5399 * XXX - Use an alternative workaround on IVB and ICL, since
5400 * SYNC.ALLWR is only available on Gfx12+.
5401 */
5402 if (devinfo->ver >= 12 &&
5403 (!nir_intrinsic_has_memory_scope(instr) ||
5404 (nir_intrinsic_memory_semantics(instr) & NIR_MEMORY_ACQUIRE))) {
5405 ubld.exec_all().group(1, 0).emit(
5406 ELK_OPCODE_SYNC, ubld.null_reg_ud(), elk_imm_ud(TGL_SYNC_ALLWR));
5407 }
5408
5409 if (devinfo->has_lsc) {
5410 assert(devinfo->verx10 >= 125);
5411 uint32_t desc =
5412 lsc_fence_descriptor_for_intrinsic(devinfo, instr);
5413 if (ugm_fence) {
5414 fence_regs[fence_regs_count++] =
5415 emit_fence(ubld, opcode, GFX12_SFID_UGM, desc,
5416 true /* commit_enable */,
5417 0 /* bti; ignored for LSC */);
5418 }
5419
5420 if (tgm_fence) {
5421 fence_regs[fence_regs_count++] =
5422 emit_fence(ubld, opcode, GFX12_SFID_TGM, desc,
5423 true /* commit_enable */,
5424 0 /* bti; ignored for LSC */);
5425 }
5426
5427 if (slm_fence) {
5428 assert(opcode == ELK_SHADER_OPCODE_MEMORY_FENCE);
5429 if (intel_needs_workaround(devinfo, 14014063774)) {
5430 /* Wa_14014063774
5431 *
5432 * Before an SLM fence the compiler needs to insert SYNC.ALLWR in
5433 * order to avoid the SLM data race.
5434 */
5435 ubld.exec_all().group(1, 0).emit(
5436 ELK_OPCODE_SYNC, ubld.null_reg_ud(),
5437 elk_imm_ud(TGL_SYNC_ALLWR));
5438 }
5439 fence_regs[fence_regs_count++] =
5440 emit_fence(ubld, opcode, GFX12_SFID_SLM, desc,
5441 true /* commit_enable */,
5442 0 /* BTI; ignored for LSC */);
5443 }
5444
5445 if (urb_fence) {
5446 assert(opcode == ELK_SHADER_OPCODE_MEMORY_FENCE);
5447 fence_regs[fence_regs_count++] =
5448 emit_fence(ubld, opcode, ELK_SFID_URB, desc,
5449 true /* commit_enable */,
5450 0 /* BTI; ignored for LSC */);
5451 }
5452 } else if (devinfo->ver >= 11) {
5453 if (tgm_fence || ugm_fence || urb_fence) {
5454 fence_regs[fence_regs_count++] =
5455 emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
5456 true /* commit_enable HSD ES # 1404612949 */,
5457 0 /* BTI = 0 means data cache */);
5458 }
5459
5460 if (slm_fence) {
5461 assert(opcode == ELK_SHADER_OPCODE_MEMORY_FENCE);
5462 fence_regs[fence_regs_count++] =
5463 emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
5464 true /* commit_enable HSD ES # 1404612949 */,
5465 GFX7_BTI_SLM);
5466 }
5467 } else {
5468 /* Prior to Icelake, they're all lumped into a single cache except on
5469 * Ivy Bridge and Bay Trail where typed messages actually go through
5470 * the render cache. There, we need both fences because we may
5471 * access storage images as either typed or untyped.
5472 */
5473 const bool render_fence = tgm_fence && devinfo->verx10 == 70;
5474
5475 /* Simulation also complains on Gfx9 if we do not enable commit.
5476 */
5477 const bool commit_enable = render_fence ||
5478 instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
5479 devinfo->ver == 9;
5480
5481 if (tgm_fence || ugm_fence || slm_fence || urb_fence) {
5482 fence_regs[fence_regs_count++] =
5483 emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
5484 commit_enable, 0 /* BTI */);
5485 }
5486
5487 if (render_fence) {
5488 fence_regs[fence_regs_count++] =
5489 emit_fence(ubld, opcode, GFX6_SFID_DATAPORT_RENDER_CACHE, 0,
5490 commit_enable, /* bti */ 0);
5491 }
5492 }
5493
5494 assert(fence_regs_count <= ARRAY_SIZE(fence_regs));
5495
5496 /* Be conservative on Gen11+ and always stall after a fence, since
5497 * there are two different fences and the shader might want to
5498 * synchronize between them.
5499 *
5500 * TODO: Use scope and visibility information for the barriers from NIR
5501 * to make a better decision on whether we need to stall.
5502 */
5503 bool force_stall = devinfo->ver >= 11;
5504
5505 /* There are four cases where we want to insert a stall:
5506 *
5507 * 1. If we're a nir_intrinsic_end_invocation_interlock. This is
5508 * required to ensure that the shader EOT doesn't happen until
5509 * after the fence returns. Otherwise, we might end up with the
5510 * next shader invocation for that pixel not respecting our fence
5511 * because it may happen on a different HW thread.
5512 *
5513 * 2. If we have multiple fences. This is required to ensure that
5514 * they all complete and nothing gets weirdly out-of-order.
5515 *
5516 * 3. If we have no fences. In this case, we need at least a
5517 * scheduling barrier to keep the compiler from moving things
5518 * around in an invalid way.
5519 *
5520 * 4. On Gen11+ and platforms with LSC, we have multiple fence types
5521 * and, without further information about the fence, we need to
5522 * force a stall.
5523 */
5524 if (instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
5525 fence_regs_count != 1 || devinfo->has_lsc || force_stall) {
5526 ubld.exec_all().group(1, 0).emit(
5527 ELK_FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
5528 fence_regs, fence_regs_count);
5529 }
5530
5531 break;
5532 }
5533
5534 case nir_intrinsic_shader_clock: {
5535 /* We cannot do anything if there is an event, so ignore it for now */
5536 const elk_fs_reg shader_clock = get_timestamp(bld);
5537 const elk_fs_reg srcs[] = { component(shader_clock, 0),
5538 component(shader_clock, 1) };
5539 bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
5540 break;
5541 }
5542
5543 case nir_intrinsic_load_reloc_const_intel: {
5544 uint32_t id = nir_intrinsic_param_idx(instr);
5545
5546 /* Emit the reloc in the smallest SIMD size to limit register usage. */
5547 const fs_builder ubld = bld.exec_all().group(1, 0);
5548 elk_fs_reg small_dest = ubld.vgrf(dest.type);
5549 ubld.UNDEF(small_dest);
5550 ubld.exec_all().group(1, 0).emit(ELK_SHADER_OPCODE_MOV_RELOC_IMM,
5551 small_dest, elk_imm_ud(id));
5552
5553 /* Copy propagation will get rid of this MOV. */
5554 bld.MOV(dest, component(small_dest, 0));
5555 break;
5556 }
5557
5558 case nir_intrinsic_load_uniform: {
5559 /* Offsets are in bytes but they should always be aligned to
5560 * the type size
5561 */
5562 unsigned base_offset = nir_intrinsic_base(instr);
5563 assert(base_offset % 4 == 0 || base_offset % type_sz(dest.type) == 0);
5564
5565 elk_fs_reg src(UNIFORM, base_offset / 4, dest.type);
5566
5567 if (nir_src_is_const(instr->src[0])) {
5568 unsigned load_offset = nir_src_as_uint(instr->src[0]);
5569 assert(load_offset % type_sz(dest.type) == 0);
5570 /* The base offset can only handle 32-bit units, so for 16-bit
5571 * data take the modulo of the offset with 4 bytes and add it to
5572 * the offset to read from within the source register.
5573 */
5574 src.offset = load_offset + base_offset % 4;
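         /* Added worked example (illustrative only): a 16-bit uniform at
          * base_offset = 6 with load_offset = 0 becomes UNIFORM register
          * 6 / 4 = 1 with src.offset = 0 + 6 % 4 = 2 bytes, i.e. the second
          * 16-bit half of that 32-bit uniform slot.
          */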
5575
5576 for (unsigned j = 0; j < instr->num_components; j++) {
5577 bld.MOV(offset(dest, bld, j), offset(src, bld, j));
5578 }
5579 } else {
5580 elk_fs_reg indirect = retype(get_nir_src(ntb, instr->src[0]),
5581 ELK_REGISTER_TYPE_UD);
5582
5583 /* We need to pass a size to the MOV_INDIRECT but we don't want it to
5584 * go past the end of the uniform. In order to keep the n'th
5585 * component from running past, we subtract off the size of all but
5586 * one component of the vector.
5587 */
5588 assert(nir_intrinsic_range(instr) >=
5589 instr->num_components * type_sz(dest.type));
5590 unsigned read_size = nir_intrinsic_range(instr) -
5591 (instr->num_components - 1) * type_sz(dest.type);
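         /* Added worked example (illustrative only): for a vec4 of 32-bit
          * values with nir_intrinsic_range(instr) = 64 bytes, read_size is
          * 64 - 3 * 4 = 52, so the indirect for the last component (which is
          * based 12 bytes into the range) can reach at most the end of the
          * 64-byte range.
          */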
5592
5593 bool supports_64bit_indirects =
5594 devinfo->platform != INTEL_PLATFORM_CHV && !intel_device_info_is_9lp(devinfo);
5595
5596 if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
5597 for (unsigned j = 0; j < instr->num_components; j++) {
5598 bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT,
5599 offset(dest, bld, j), offset(src, bld, j),
5600 indirect, elk_imm_ud(read_size));
5601 }
5602 } else {
5603 const unsigned num_mov_indirects =
5604 type_sz(dest.type) / type_sz(ELK_REGISTER_TYPE_UD);
5605 /* We read a little bit less per MOV INDIRECT, as they are now
5606 * 32-bit ones instead of 64-bit ones. Adjust read_size accordingly.
5607 */
5608 const unsigned read_size_32bit = read_size -
5609 (num_mov_indirects - 1) * type_sz(ELK_REGISTER_TYPE_UD);
5610 for (unsigned j = 0; j < instr->num_components; j++) {
5611 for (unsigned i = 0; i < num_mov_indirects; i++) {
5612 bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT,
5613 subscript(offset(dest, bld, j), ELK_REGISTER_TYPE_UD, i),
5614 subscript(offset(src, bld, j), ELK_REGISTER_TYPE_UD, i),
5615 indirect, elk_imm_ud(read_size_32bit));
5616 }
5617 }
5618 }
5619 }
5620 break;
5621 }
5622
5623 case nir_intrinsic_load_ubo:
5624 case nir_intrinsic_load_ubo_uniform_block_intel: {
5625 elk_fs_reg surface, surface_handle;
5626
5627 if (get_nir_src_bindless(ntb, instr->src[0]))
5628 surface_handle = get_nir_buffer_intrinsic_index(ntb, bld, instr);
5629 else
5630 surface = get_nir_buffer_intrinsic_index(ntb, bld, instr);
5631
5632 if (!nir_src_is_const(instr->src[1])) {
5633 if (instr->intrinsic == nir_intrinsic_load_ubo) {
5634 /* load_ubo with non-uniform offset */
5635 elk_fs_reg base_offset = retype(get_nir_src(ntb, instr->src[1]),
5636 ELK_REGISTER_TYPE_UD);
5637
5638 const unsigned comps_per_load = type_sz(dest.type) == 8 ? 2 : 4;
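            /* Added note (illustrative only): each VARYING_PULL_CONSTANT_LOAD
             * below fetches at most 16 bytes per channel, so a dvec4 load is
             * split into two loads of 2 components while a vec4 of 32-bit
             * values still fits in a single load.
             */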
5639
5640 for (int i = 0; i < instr->num_components; i += comps_per_load) {
5641 const unsigned remaining = instr->num_components - i;
5642 s.VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i),
5643 surface, surface_handle,
5644 base_offset,
5645 i * type_sz(dest.type),
5646 instr->def.bit_size / 8,
5647 MIN2(remaining, comps_per_load));
5648 }
5649
5650 s.prog_data->has_ubo_pull = true;
5651 } else {
5652 /* load_ubo with uniform offset */
5653 const fs_builder ubld1 = bld.exec_all().group(1, 0);
5654 const fs_builder ubld8 = bld.exec_all().group(8, 0);
5655 const fs_builder ubld16 = bld.exec_all().group(16, 0);
5656
5657 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5658
5659 srcs[SURFACE_LOGICAL_SRC_SURFACE] = surface;
5660 srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = surface_handle;
5661
5662 const nir_src load_offset = instr->src[1];
5663 if (nir_src_is_const(load_offset)) {
5664 elk_fs_reg addr = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
5665 ubld8.MOV(addr, elk_imm_ud(nir_src_as_uint(load_offset)));
5666 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = component(addr, 0);
5667 } else {
5668 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5669 bld.emit_uniformize(get_nir_src(ntb, load_offset));
5670 }
5671
5672 const unsigned total_dwords =
5673 ALIGN(instr->num_components, REG_SIZE * reg_unit(devinfo) / 4);
5674 unsigned loaded_dwords = 0;
5675
5676 const elk_fs_reg packed_consts =
5677 ubld1.vgrf(ELK_REGISTER_TYPE_UD, total_dwords);
5678
5679 while (loaded_dwords < total_dwords) {
5680 const unsigned block =
5681 choose_oword_block_size_dwords(devinfo,
5682 total_dwords - loaded_dwords);
5683 const unsigned block_bytes = block * 4;
5684
5685 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(block);
5686
5687 const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
5688 ubld.emit(ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
5689 retype(byte_offset(packed_consts, loaded_dwords * 4), ELK_REGISTER_TYPE_UD),
5690 srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written =
5691 align(block_bytes, REG_SIZE * reg_unit(devinfo));
5692
5693 loaded_dwords += block;
5694
5695 ubld1.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5696 srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5697 elk_imm_ud(block_bytes));
5698 }
5699
5700 for (unsigned c = 0; c < instr->num_components; c++) {
5701 bld.MOV(retype(offset(dest, bld, c), ELK_REGISTER_TYPE_UD),
5702 component(packed_consts, c));
5703 }
5704
5705 s.prog_data->has_ubo_pull = true;
5706 }
5707 } else {
5708 /* Even if we are loading doubles, a pull constant load will load
5709 * a 32-bit vec4, so we should only reserve vgrf space for that. If we
5710 * need to load a full dvec4 we will have to emit 2 loads. This is
5711 * similar to demote_pull_constants(), except that in that case we
5712 * see individual accesses to each component of the vector and then
5713 * we let CSE deal with duplicate loads. Here we see a vector access
5714 * and we have to split it if necessary.
5715 */
5716 const unsigned type_size = type_sz(dest.type);
5717 const unsigned load_offset = nir_src_as_uint(instr->src[1]);
5718 const unsigned ubo_block =
5719 elk_nir_ubo_surface_index_get_push_block(instr->src[0]);
5720 const unsigned offset_256b = load_offset / 32;
5721 const unsigned end_256b =
5722 DIV_ROUND_UP(load_offset + type_size * instr->num_components, 32);
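         /* Added worked example (illustrative only): a vec4 of 32-bit values
          * at load_offset = 96 spans bytes [96, 112), i.e. 256-bit blocks
          * [3, 4), so it is pushable when some ubo_ranges[i] for this UBO
          * block covers block 3.
          */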
5723
5724 /* See if we've selected this as a push constant candidate */
5725 elk_fs_reg push_reg;
5726 for (int i = 0; i < 4; i++) {
5727 const struct elk_ubo_range *range = &s.prog_data->ubo_ranges[i];
5728 if (range->block == ubo_block &&
5729 offset_256b >= range->start &&
5730 end_256b <= range->start + range->length) {
5731
5732 push_reg = elk_fs_reg(UNIFORM, UBO_START + i, dest.type);
5733 push_reg.offset = load_offset - 32 * range->start;
5734 break;
5735 }
5736 }
5737
5738 if (push_reg.file != BAD_FILE) {
5739 for (unsigned i = 0; i < instr->num_components; i++) {
5740 bld.MOV(offset(dest, bld, i),
5741 byte_offset(push_reg, i * type_size));
5742 }
5743 break;
5744 }
5745
5746 s.prog_data->has_ubo_pull = true;
5747
5748 const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
5749 const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
5750
5751 for (unsigned c = 0; c < instr->num_components;) {
5752 const unsigned base = load_offset + c * type_size;
5753 /* Number of usable components in the next block-aligned load. */
5754 const unsigned count = MIN2(instr->num_components - c,
5755 (block_sz - base % block_sz) / type_size);
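            /* Added worked example (illustrative only): with block_sz = 64,
             * type_size = 4 and a load starting at byte 56, the first
             * iteration can only use (64 - 56) / 4 = 2 components of the
             * cacheline-aligned fetch; the remaining components come from
             * the next 64-byte block.
             */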
5756
5757 const elk_fs_reg packed_consts = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5758 elk_fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
5759 srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = surface;
5760 srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
5761 srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = elk_imm_ud(base & ~(block_sz - 1));
5762 srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = elk_imm_ud(block_sz);
5763
5764 ubld.emit(ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
5765 srcs, PULL_UNIFORM_CONSTANT_SRCS);
5766
5767 const elk_fs_reg consts =
5768 retype(byte_offset(packed_consts, base & (block_sz - 1)),
5769 dest.type);
5770
5771 for (unsigned d = 0; d < count; d++)
5772 bld.MOV(offset(dest, bld, c + d), component(consts, d));
5773
5774 c += count;
5775 }
5776 }
5777 break;
5778 }
5779
5780 case nir_intrinsic_load_global:
5781 case nir_intrinsic_load_global_constant: {
5782 assert(devinfo->ver >= 8);
5783
5784 assert(instr->def.bit_size <= 32);
5785 assert(nir_intrinsic_align(instr) > 0);
5786 elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
5787 srcs[A64_LOGICAL_ADDRESS] = get_nir_src(ntb, instr->src[0]);
5788 srcs[A64_LOGICAL_SRC] = elk_fs_reg(); /* No source data */
5789 srcs[A64_LOGICAL_ENABLE_HELPERS] =
5790 elk_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS);
5791
5792 if (instr->def.bit_size == 32 &&
5793 nir_intrinsic_align(instr) >= 4) {
5794 assert(instr->def.num_components <= 4);
5795
5796 srcs[A64_LOGICAL_ARG] = elk_imm_ud(instr->num_components);
5797
5798 elk_fs_inst *inst =
5799 bld.emit(ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL, dest,
5800 srcs, A64_LOGICAL_NUM_SRCS);
5801 inst->size_written = instr->num_components *
5802 inst->dst.component_size(inst->exec_size);
5803 } else {
5804 const unsigned bit_size = instr->def.bit_size;
5805 assert(instr->def.num_components == 1);
5806 elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
5807
5808 srcs[A64_LOGICAL_ARG] = elk_imm_ud(bit_size);
5809
5810 bld.emit(ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL, tmp,
5811 srcs, A64_LOGICAL_NUM_SRCS);
5812 bld.MOV(dest, subscript(tmp, dest.type, 0));
5813 }
5814 break;
5815 }
5816
5817 case nir_intrinsic_store_global: {
5818 assert(devinfo->ver >= 8);
5819
5820 assert(nir_src_bit_size(instr->src[0]) <= 32);
5821 assert(nir_intrinsic_write_mask(instr) ==
5822 (1u << instr->num_components) - 1);
5823 assert(nir_intrinsic_align(instr) > 0);
5824
5825 elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
5826 srcs[A64_LOGICAL_ADDRESS] = get_nir_src(ntb, instr->src[1]);
5827 srcs[A64_LOGICAL_ENABLE_HELPERS] =
5828 elk_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS);
5829
5830 if (nir_src_bit_size(instr->src[0]) == 32 &&
5831 nir_intrinsic_align(instr) >= 4) {
5832 assert(nir_src_num_components(instr->src[0]) <= 4);
5833
5834 srcs[A64_LOGICAL_SRC] = get_nir_src(ntb, instr->src[0]); /* Data */
5835 srcs[A64_LOGICAL_ARG] = elk_imm_ud(instr->num_components);
5836
5837 bld.emit(ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL, elk_fs_reg(),
5838 srcs, A64_LOGICAL_NUM_SRCS);
5839 } else {
5840 assert(nir_src_num_components(instr->src[0]) == 1);
5841 const unsigned bit_size = nir_src_bit_size(instr->src[0]);
5842 elk_reg_type data_type =
5843 elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
5844 elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
5845 bld.MOV(tmp, retype(get_nir_src(ntb, instr->src[0]), data_type));
5846
5847 srcs[A64_LOGICAL_SRC] = tmp;
5848 srcs[A64_LOGICAL_ARG] = elk_imm_ud(bit_size);
5849
5850 bld.emit(ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL, elk_fs_reg(),
5851 srcs, A64_LOGICAL_NUM_SRCS);
5852 }
5853 break;
5854 }
5855
5856 case nir_intrinsic_global_atomic:
5857 case nir_intrinsic_global_atomic_swap:
5858 fs_nir_emit_global_atomic(ntb, bld, instr);
5859 break;
5860
5861 case nir_intrinsic_load_global_const_block_intel: {
5862 assert(instr->def.bit_size == 32);
5863 assert(instr->num_components == 8 || instr->num_components == 16);
5864
5865 const fs_builder ubld = bld.exec_all().group(instr->num_components, 0);
5866 elk_fs_reg load_val;
5867
5868 bool is_pred_const = nir_src_is_const(instr->src[1]);
5869 if (is_pred_const && nir_src_as_uint(instr->src[1]) == 0) {
5870 /* In this case, we don't want the UBO load at all. We really
5871 * shouldn't get here but it's possible.
5872 */
5873 load_val = elk_imm_ud(0);
5874 } else {
5875 /* The uniform process may stomp the flag so do this first */
5876 elk_fs_reg addr = bld.emit_uniformize(get_nir_src(ntb, instr->src[0]));
5877
5878 load_val = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5879
5880 /* If the predicate is constant and we got here, then it's non-zero
5881 * and we don't need the predicate at all.
5882 */
5883 if (!is_pred_const) {
5884 /* Load the predicate */
5885 elk_fs_reg pred = bld.emit_uniformize(get_nir_src(ntb, instr->src[1]));
5886 elk_fs_inst *mov = ubld.MOV(bld.null_reg_d(), pred);
5887 mov->conditional_mod = ELK_CONDITIONAL_NZ;
5888
5889 /* Stomp the destination with 0 if we're OOB */
5890 mov = ubld.MOV(load_val, elk_imm_ud(0));
5891 mov->predicate = ELK_PREDICATE_NORMAL;
5892 mov->predicate_inverse = true;
5893 }
5894
5895 elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
5896 srcs[A64_LOGICAL_ADDRESS] = addr;
5897 srcs[A64_LOGICAL_SRC] = elk_fs_reg(); /* No source data */
5898 srcs[A64_LOGICAL_ARG] = elk_imm_ud(instr->num_components);
5899 /* This intrinsic loads memory from a uniform address, sometimes
5900 * shared across lanes. We never need to mask it.
5901 */
5902 srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(0);
5903
5904 elk_fs_inst *load = ubld.emit(ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL,
5905 load_val, srcs, A64_LOGICAL_NUM_SRCS);
5906 if (!is_pred_const)
5907 load->predicate = ELK_PREDICATE_NORMAL;
5908 }
5909
5910 /* From the HW perspective, we just did a single SIMD16 instruction
5911 * which loaded a dword in each SIMD channel. From NIR's perspective,
5912 * this instruction returns a vec16. Any users of this data in the
5913 * back-end will expect a vec16 per SIMD channel so we have to emit a
5914 * pile of MOVs to resolve this discrepancy. Fortunately, copy-prop
5915 * will generally clean them up for us.
5916 */
5917 for (unsigned i = 0; i < instr->num_components; i++) {
5918 bld.MOV(retype(offset(dest, bld, i), ELK_REGISTER_TYPE_UD),
5919 component(load_val, i));
5920 }
5921 break;
5922 }
5923
5924 case nir_intrinsic_load_global_constant_uniform_block_intel: {
5925 const unsigned total_dwords = ALIGN(instr->num_components,
5926 REG_SIZE * reg_unit(devinfo) / 4);
5927 unsigned loaded_dwords = 0;
5928
5929 const fs_builder ubld1 = bld.exec_all().group(1, 0);
5930 const fs_builder ubld8 = bld.exec_all().group(8, 0);
5931 const fs_builder ubld16 = bld.exec_all().group(16, 0);
5932
5933 const elk_fs_reg packed_consts =
5934 ubld1.vgrf(ELK_REGISTER_TYPE_UD, total_dwords);
5935 elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[0]));
5936
5937 while (loaded_dwords < total_dwords) {
5938 const unsigned block =
5939 choose_oword_block_size_dwords(devinfo,
5940 total_dwords - loaded_dwords);
5941 const unsigned block_bytes = block * 4;
5942
5943 const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
5944
5945 elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
5946 srcs[A64_LOGICAL_ADDRESS] = address;
5947 srcs[A64_LOGICAL_SRC] = elk_fs_reg(); /* No source data */
5948 srcs[A64_LOGICAL_ARG] = elk_imm_ud(block);
5949 srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(0);
5950 ubld.emit(ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
5951 retype(byte_offset(packed_consts, loaded_dwords * 4), ELK_REGISTER_TYPE_UD),
5952 srcs, A64_LOGICAL_NUM_SRCS)->size_written =
5953 align(block_bytes, REG_SIZE * reg_unit(devinfo));
5954
5955 increment_a64_address(ubld1, address, block_bytes);
5956 loaded_dwords += block;
5957 }
5958
5959 for (unsigned c = 0; c < instr->num_components; c++)
5960 bld.MOV(retype(offset(dest, bld, c), ELK_REGISTER_TYPE_UD),
5961 component(packed_consts, c));
5962
5963 break;
5964 }
5965
5966 case nir_intrinsic_load_ssbo: {
5967 assert(devinfo->ver >= 7);
5968
5969 const unsigned bit_size = instr->def.bit_size;
5970 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5971 srcs[get_nir_src_bindless(ntb, instr->src[0]) ?
5972 SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
5973 SURFACE_LOGICAL_SRC_SURFACE] =
5974 get_nir_buffer_intrinsic_index(ntb, bld, instr);
5975 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
5976 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
5977 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
5978
5979 /* Make dest unsigned because that's what the temporary will be */
5980 dest.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
5981
5982 /* Read the vector */
5983 assert(bit_size <= 32);
5984 assert(nir_intrinsic_align(instr) > 0);
5985 if (bit_size == 32 &&
5986 nir_intrinsic_align(instr) >= 4) {
5987 assert(instr->def.num_components <= 4);
5988 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
5989 elk_fs_inst *inst =
5990 bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
5991 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5992 inst->size_written = instr->num_components * s.dispatch_width * 4;
5993 } else {
5994 assert(instr->def.num_components == 1);
5995 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
5996
5997 elk_fs_reg read_result = bld.vgrf(ELK_REGISTER_TYPE_UD);
5998 bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
5999 read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
6000 bld.MOV(dest, subscript(read_result, dest.type, 0));
6001 }
6002 break;
6003 }
6004
6005 case nir_intrinsic_store_ssbo: {
6006 assert(devinfo->ver >= 7);
6007
6008 const unsigned bit_size = nir_src_bit_size(instr->src[0]);
6009 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6010 srcs[get_nir_src_bindless(ntb, instr->src[1]) ?
6011 SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
6012 SURFACE_LOGICAL_SRC_SURFACE] =
6013 get_nir_buffer_intrinsic_index(ntb, bld, instr);
6014 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[2]);
6015 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
6016 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
6017
6018 elk_fs_reg data = get_nir_src(ntb, instr->src[0]);
6019 data.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
6020
6021 assert(bit_size <= 32);
6022 assert(nir_intrinsic_write_mask(instr) ==
6023 (1u << instr->num_components) - 1);
6024 assert(nir_intrinsic_align(instr) > 0);
6025 if (bit_size == 32 &&
6026 nir_intrinsic_align(instr) >= 4) {
6027 assert(nir_src_num_components(instr->src[0]) <= 4);
6028 srcs[SURFACE_LOGICAL_SRC_DATA] = data;
6029 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
6030 bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
6031 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
6032 } else {
6033 assert(nir_src_num_components(instr->src[0]) == 1);
6034 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
6035
6036 srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_UD);
6037 bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
6038
6039 bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
6040 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
6041 }
6042 break;
6043 }
6044
6045 case nir_intrinsic_load_ssbo_uniform_block_intel:
6046 case nir_intrinsic_load_shared_uniform_block_intel: {
6047 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6048
6049 const bool is_ssbo =
6050 instr->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel;
6051 if (is_ssbo) {
6052 srcs[get_nir_src_bindless(ntb, instr->src[0]) ?
6053 SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
6054 SURFACE_LOGICAL_SRC_SURFACE] =
6055 get_nir_buffer_intrinsic_index(ntb, bld, instr);
6056 } else {
6057 srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_fs_reg(elk_imm_ud(GFX7_BTI_SLM));
6058 }
6059
6060 const unsigned total_dwords = ALIGN(instr->num_components,
6061 REG_SIZE * reg_unit(devinfo) / 4);
6062 unsigned loaded_dwords = 0;
6063
6064 const fs_builder ubld1 = bld.exec_all().group(1, 0);
6065 const fs_builder ubld8 = bld.exec_all().group(8, 0);
6066 const fs_builder ubld16 = bld.exec_all().group(16, 0);
6067
6068 const elk_fs_reg packed_consts =
6069 ubld1.vgrf(ELK_REGISTER_TYPE_UD, total_dwords);
6070
6071 const nir_src load_offset = is_ssbo ? instr->src[1] : instr->src[0];
6072 if (nir_src_is_const(load_offset)) {
6073 elk_fs_reg addr = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
6074 ubld8.MOV(addr, elk_imm_ud(nir_src_as_uint(load_offset)));
6075 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = component(addr, 0);
6076 } else {
6077 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
6078 bld.emit_uniformize(get_nir_src(ntb, load_offset));
6079 }
6080
6081 while (loaded_dwords < total_dwords) {
6082 const unsigned block =
6083 choose_oword_block_size_dwords(devinfo,
6084 total_dwords - loaded_dwords);
6085 const unsigned block_bytes = block * 4;
6086
6087 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(block);
6088
6089 const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
6090 ubld.emit(ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
6091 retype(byte_offset(packed_consts, loaded_dwords * 4), ELK_REGISTER_TYPE_UD),
6092 srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written =
6093 align(block_bytes, REG_SIZE * reg_unit(devinfo));
6094
6095 loaded_dwords += block;
6096
6097 ubld1.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
6098 srcs[SURFACE_LOGICAL_SRC_ADDRESS],
6099 elk_imm_ud(block_bytes));
6100 }
6101
6102 for (unsigned c = 0; c < instr->num_components; c++)
6103 bld.MOV(retype(offset(dest, bld, c), ELK_REGISTER_TYPE_UD),
6104 component(packed_consts, c));
6105
6106 break;
6107 }
6108
6109 case nir_intrinsic_store_output: {
6110 assert(nir_src_bit_size(instr->src[0]) == 32);
6111 elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
6112
6113 unsigned store_offset = nir_src_as_uint(instr->src[1]);
6114 unsigned num_components = instr->num_components;
6115 unsigned first_component = nir_intrinsic_component(instr);
6116
6117 elk_fs_reg new_dest = retype(offset(s.outputs[instr->const_index[0]], bld,
6118 4 * store_offset), src.type);
6119 for (unsigned j = 0; j < num_components; j++) {
6120 bld.MOV(offset(new_dest, bld, j + first_component),
6121 offset(src, bld, j));
6122 }
6123 break;
6124 }
6125
6126 case nir_intrinsic_ssbo_atomic:
6127 case nir_intrinsic_ssbo_atomic_swap:
6128 fs_nir_emit_surface_atomic(ntb, bld, instr,
6129 get_nir_buffer_intrinsic_index(ntb, bld, instr),
6130 get_nir_src_bindless(ntb, instr->src[0]));
6131 break;
6132
6133 case nir_intrinsic_get_ssbo_size: {
6134 assert(nir_src_num_components(instr->src[0]) == 1);
6135
6136 /* A resinfo's sampler message is used to get the buffer size. The
6137 * SIMD8's writeback message consists of four registers and SIMD16's
6138 * writeback message consists of 8 destination registers (two per
6139 * component). Because we are only interested in the first channel of
6140 * the first returned component, where resinfo returns the buffer size
6141 * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
6142 * the dispatch width.
6143 */
6144 const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
6145 elk_fs_reg src_payload = ubld.vgrf(ELK_REGISTER_TYPE_UD);
6146 elk_fs_reg ret_payload = ubld.vgrf(ELK_REGISTER_TYPE_UD, 4);
6147
6148 /* Set LOD = 0 */
6149 ubld.MOV(src_payload, elk_imm_d(0));
6150
6151 elk_fs_reg srcs[GET_BUFFER_SIZE_SRCS];
6152 srcs[get_nir_src_bindless(ntb, instr->src[0]) ?
6153 GET_BUFFER_SIZE_SRC_SURFACE_HANDLE :
6154 GET_BUFFER_SIZE_SRC_SURFACE] =
6155 get_nir_buffer_intrinsic_index(ntb, bld, instr);
6156 srcs[GET_BUFFER_SIZE_SRC_LOD] = src_payload;
6157 elk_fs_inst *inst = ubld.emit(ELK_SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
6158 srcs, GET_BUFFER_SIZE_SRCS);
6159 inst->header_size = 0;
6160 inst->mlen = reg_unit(devinfo);
6161 inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
6162
6163 /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
6164 *
6165 * "Out-of-bounds checking is always performed at a DWord granularity. If
6166 * any part of the DWord is out-of-bounds then the whole DWord is
6167 * considered out-of-bounds."
6168 *
6169 * This implies that types smaller than 4 bytes need to be padded if they
6170 * don't complete the last dword of the buffer. But since we need to
6171 * maintain the original size, we have to reverse the padding calculation
6172 * to return the correct size and know the number of elements of an
6173 * unsized array. Since the needed padding for the buffer is stored in the
6174 * last two bits of the surface size, we recover the original buffer_size
6175 * here by reversing the surface_size calculation:
6176 *
6177 * surface_size = isl_align(buffer_size, 4) +
6178 *                (isl_align(buffer_size, 4) - buffer_size)
6179 *
6180 * buffer_size = (surface_size & ~3) - (surface_size & 3)
6181 */
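      /* Added worked example (illustrative only): buffer_size = 7 is stored
       * as surface_size = 8 + (8 - 7) = 9; the code below then computes
       * (9 & ~3) - (9 & 3) = 8 - 1 = 7, recovering the original size.
       */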
6182
6183 elk_fs_reg size_aligned4 = ubld.vgrf(ELK_REGISTER_TYPE_UD);
6184 elk_fs_reg size_padding = ubld.vgrf(ELK_REGISTER_TYPE_UD);
6185 elk_fs_reg buffer_size = ubld.vgrf(ELK_REGISTER_TYPE_UD);
6186
6187 ubld.AND(size_padding, ret_payload, elk_imm_ud(3));
6188 ubld.AND(size_aligned4, ret_payload, elk_imm_ud(~3));
6189 ubld.ADD(buffer_size, size_aligned4, negate(size_padding));
6190
6191 bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
6192 break;
6193 }
6194
6195 case nir_intrinsic_load_scratch: {
6196 assert(devinfo->ver >= 7);
6197
6198 assert(instr->def.num_components == 1);
6199 const unsigned bit_size = instr->def.bit_size;
6200 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6201
6202 if (devinfo->verx10 >= 125) {
6203 const fs_builder ubld = bld.exec_all().group(1, 0);
6204 elk_fs_reg handle = component(ubld.vgrf(ELK_REGISTER_TYPE_UD), 0);
6205 ubld.AND(handle, retype(elk_vec1_grf(0, 5), ELK_REGISTER_TYPE_UD),
6206 elk_imm_ud(INTEL_MASK(31, 10)));
6207 srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(GFX125_NON_BINDLESS);
6208 srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle;
6209 } else if (devinfo->ver >= 8) {
6210 srcs[SURFACE_LOGICAL_SRC_SURFACE] =
6211 elk_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
6212 } else {
6213 srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(ELK_BTI_STATELESS);
6214 }
6215
6216 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
6217 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
6218 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
6219 const elk_fs_reg nir_addr = get_nir_src(ntb, instr->src[0]);
6220
6221 /* Make dest unsigned because that's what the temporary will be */
6222 dest.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
6223
6224 /* Read the vector */
6225 assert(instr->def.num_components == 1);
6226 assert(bit_size <= 32);
6227 assert(nir_intrinsic_align(instr) > 0);
6228 if (bit_size == 32 &&
6229 nir_intrinsic_align(instr) >= 4) {
6230 if (devinfo->verx10 >= 125) {
6231 assert(bit_size == 32 &&
6232 nir_intrinsic_align(instr) >= 4);
6233
6234 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
6235 swizzle_nir_scratch_addr(ntb, bld, nir_addr, false);
6236 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(1);
6237
6238 bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
6239 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
6240 } else {
6241 /* The offset for a DWORD scattered message is in dwords. */
6242 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
6243 swizzle_nir_scratch_addr(ntb, bld, nir_addr, true);
6244
6245 bld.emit(ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL,
6246 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
6247 }
6248 } else {
6249 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
6250 swizzle_nir_scratch_addr(ntb, bld, nir_addr, false);
6251
6252 elk_fs_reg read_result = bld.vgrf(ELK_REGISTER_TYPE_UD);
6253 bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
6254 read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
6255 bld.MOV(dest, read_result);
6256 }
6257
6258 s.shader_stats.fill_count += DIV_ROUND_UP(s.dispatch_width, 16);
6259 break;
6260 }
6261
6262 case nir_intrinsic_store_scratch: {
6263 assert(devinfo->ver >= 7);
6264
6265 assert(nir_src_num_components(instr->src[0]) == 1);
6266 const unsigned bit_size = nir_src_bit_size(instr->src[0]);
6267 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6268
6269 if (devinfo->verx10 >= 125) {
6270 const fs_builder ubld = bld.exec_all().group(1, 0);
6271 elk_fs_reg handle = component(ubld.vgrf(ELK_REGISTER_TYPE_UD), 0);
6272 ubld.AND(handle, retype(elk_vec1_grf(0, 5), ELK_REGISTER_TYPE_UD),
6273 elk_imm_ud(INTEL_MASK(31, 10)));
6274 srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(GFX125_NON_BINDLESS);
6275 srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle;
6276 } else if (devinfo->ver >= 8) {
6277 srcs[SURFACE_LOGICAL_SRC_SURFACE] =
6278 elk_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
6279 } else {
6280 srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(ELK_BTI_STATELESS);
6281 }
6282
6283 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
6284 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
6285 /**
6286 * While this instruction has side-effects, it should not be predicated
6287 * on sample mask, because otherwise fs helper invocations would
6288 * load undefined values from scratch memory. And scratch memory
6289 * load-stores are produced from operations without side-effects, thus
6290 * they should not have different behaviour in the helper invocations.
6291 */
6292 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
6293 const elk_fs_reg nir_addr = get_nir_src(ntb, instr->src[1]);
6294
6295 elk_fs_reg data = get_nir_src(ntb, instr->src[0]);
6296 data.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
6297
6298 assert(nir_src_num_components(instr->src[0]) == 1);
6299 assert(bit_size <= 32);
6300 assert(nir_intrinsic_write_mask(instr) == 1);
6301 assert(nir_intrinsic_align(instr) > 0);
6302 if (bit_size == 32 &&
6303 nir_intrinsic_align(instr) >= 4) {
6304 if (devinfo->verx10 >= 125) {
6305 srcs[SURFACE_LOGICAL_SRC_DATA] = data;
6306
6307 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
6308 swizzle_nir_scratch_addr(ntb, bld, nir_addr, false);
6309 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(1);
6310
6311 bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
6312 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
6313 } else {
6314 srcs[SURFACE_LOGICAL_SRC_DATA] = data;
6315
6316 /* The offset for a DWORD scattered message is in dwords. */
6317 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
6318 swizzle_nir_scratch_addr(ntb, bld, nir_addr, true);
6319
6320 bld.emit(ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL,
6321 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
6322 }
6323 } else {
6324 srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_UD);
6325 bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
6326
6327 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
6328 swizzle_nir_scratch_addr(ntb, bld, nir_addr, false);
6329
6330 bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
6331 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
6332 }
6333 s.shader_stats.spill_count += DIV_ROUND_UP(s.dispatch_width, 16);
6334 break;
6335 }
6336
6337 case nir_intrinsic_load_subgroup_size:
6338    /* This should only happen for fragment shaders because every other case
6339     * is lowered in NIR, where it can be optimized.
6340 */
6341 assert(s.stage == MESA_SHADER_FRAGMENT);
6342 bld.MOV(retype(dest, ELK_REGISTER_TYPE_D), elk_imm_d(s.dispatch_width));
6343 break;
6344
6345 case nir_intrinsic_load_subgroup_invocation:
6346 bld.MOV(retype(dest, ELK_REGISTER_TYPE_D),
6347 ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
6348 break;
6349
6350 case nir_intrinsic_load_subgroup_eq_mask:
6351 case nir_intrinsic_load_subgroup_ge_mask:
6352 case nir_intrinsic_load_subgroup_gt_mask:
6353 case nir_intrinsic_load_subgroup_le_mask:
6354 case nir_intrinsic_load_subgroup_lt_mask:
6355 unreachable("not reached");
6356
6357 case nir_intrinsic_vote_any: {
6358 const fs_builder ubld1 = bld.exec_all().group(1, 0);
6359
6360 /* The any/all predicates do not consider channel enables. To prevent
6361       * dead channels from affecting the result, we initialize the flag with
6362       * the identity value for the logical operation.
6363 */
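/* Illustrative: for vote_any the combining operation across channels is
 * effectively an OR, so 0 is the identity; a disabled channel whose flag
 * bit is left at 0 cannot turn the "any" result into true.
 */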
6364 if (s.dispatch_width == 32) {
6365 /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
6366 ubld1.MOV(retype(elk_flag_reg(0, 0), ELK_REGISTER_TYPE_UD),
6367 elk_imm_ud(0));
6368 } else {
6369 ubld1.MOV(elk_flag_reg(0, 0), elk_imm_uw(0));
6370 }
6371 bld.CMP(bld.null_reg_d(), get_nir_src(ntb, instr->src[0]), elk_imm_d(0), ELK_CONDITIONAL_NZ);
6372
6373 /* For some reason, the any/all predicates don't work properly with
6374 * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
6375 * doesn't read the correct subset of the flag register and you end up
6376 * getting garbage in the second half. Work around this by using a pair
6377 * of 1-wide MOVs and scattering the result.
6378 */
6379 const fs_builder ubld = devinfo->ver >= 20 ? bld.exec_all() : ubld1;
6380 elk_fs_reg res1 = ubld.vgrf(ELK_REGISTER_TYPE_D);
6381 ubld.MOV(res1, elk_imm_d(0));
6382 set_predicate(devinfo->ver >= 20 ? XE2_PREDICATE_ANY :
6383 s.dispatch_width == 8 ? ELK_PREDICATE_ALIGN1_ANY8H :
6384 s.dispatch_width == 16 ? ELK_PREDICATE_ALIGN1_ANY16H :
6385 ELK_PREDICATE_ALIGN1_ANY32H,
6386 ubld.MOV(res1, elk_imm_d(-1)));
6387
6388 bld.MOV(retype(dest, ELK_REGISTER_TYPE_D), component(res1, 0));
6389 break;
6390 }
6391 case nir_intrinsic_vote_all: {
6392 const fs_builder ubld1 = bld.exec_all().group(1, 0);
6393
6394 /* The any/all predicates do not consider channel enables. To prevent
6395       * dead channels from affecting the result, we initialize the flag with
6396       * the identity value for the logical operation.
6397 */
6398 if (s.dispatch_width == 32) {
6399 /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
6400 ubld1.MOV(retype(elk_flag_reg(0, 0), ELK_REGISTER_TYPE_UD),
6401 elk_imm_ud(0xffffffff));
6402 } else {
6403 ubld1.MOV(elk_flag_reg(0, 0), elk_imm_uw(0xffff));
6404 }
6405 bld.CMP(bld.null_reg_d(), get_nir_src(ntb, instr->src[0]), elk_imm_d(0), ELK_CONDITIONAL_NZ);
6406
6407 /* For some reason, the any/all predicates don't work properly with
6408 * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
6409 * doesn't read the correct subset of the flag register and you end up
6410 * getting garbage in the second half. Work around this by using a pair
6411 * of 1-wide MOVs and scattering the result.
6412 */
6413 const fs_builder ubld = devinfo->ver >= 20 ? bld.exec_all() : ubld1;
6414 elk_fs_reg res1 = ubld.vgrf(ELK_REGISTER_TYPE_D);
6415 ubld.MOV(res1, elk_imm_d(0));
6416 set_predicate(devinfo->ver >= 20 ? XE2_PREDICATE_ALL :
6417 s.dispatch_width == 8 ? ELK_PREDICATE_ALIGN1_ALL8H :
6418 s.dispatch_width == 16 ? ELK_PREDICATE_ALIGN1_ALL16H :
6419 ELK_PREDICATE_ALIGN1_ALL32H,
6420 ubld.MOV(res1, elk_imm_d(-1)));
6421
6422 bld.MOV(retype(dest, ELK_REGISTER_TYPE_D), component(res1, 0));
6423 break;
6424 }
6425 case nir_intrinsic_vote_feq:
6426 case nir_intrinsic_vote_ieq: {
6427 elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
6428 if (instr->intrinsic == nir_intrinsic_vote_feq) {
6429 const unsigned bit_size = nir_src_bit_size(instr->src[0]);
6430 value.type = bit_size == 8 ? ELK_REGISTER_TYPE_B :
6431 elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_F);
6432 }
6433
6434 elk_fs_reg uniformized = bld.emit_uniformize(value);
6435 const fs_builder ubld1 = bld.exec_all().group(1, 0);
6436
6437 /* The any/all predicates do not consider channel enables. To prevent
6438       * dead channels from affecting the result, we initialize the flag with
6439       * the identity value for the logical operation.
6440 */
6441 if (s.dispatch_width == 32) {
6442 /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
6443 ubld1.MOV(retype(elk_flag_reg(0, 0), ELK_REGISTER_TYPE_UD),
6444 elk_imm_ud(0xffffffff));
6445 } else {
6446 ubld1.MOV(elk_flag_reg(0, 0), elk_imm_uw(0xffff));
6447 }
6448 bld.CMP(bld.null_reg_d(), value, uniformized, ELK_CONDITIONAL_Z);
6449
6450 /* For some reason, the any/all predicates don't work properly with
6451 * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
6452 * doesn't read the correct subset of the flag register and you end up
6453 * getting garbage in the second half. Work around this by using a pair
6454 * of 1-wide MOVs and scattering the result.
6455 */
6456 const fs_builder ubld = devinfo->ver >= 20 ? bld.exec_all() : ubld1;
6457 elk_fs_reg res1 = ubld.vgrf(ELK_REGISTER_TYPE_D);
6458 ubld.MOV(res1, elk_imm_d(0));
6459 set_predicate(devinfo->ver >= 20 ? XE2_PREDICATE_ALL :
6460 s.dispatch_width == 8 ? ELK_PREDICATE_ALIGN1_ALL8H :
6461 s.dispatch_width == 16 ? ELK_PREDICATE_ALIGN1_ALL16H :
6462 ELK_PREDICATE_ALIGN1_ALL32H,
6463 ubld.MOV(res1, elk_imm_d(-1)));
6464
6465 bld.MOV(retype(dest, ELK_REGISTER_TYPE_D), component(res1, 0));
6466 break;
6467 }
6468
6469 case nir_intrinsic_ballot: {
6470 const elk_fs_reg value = retype(get_nir_src(ntb, instr->src[0]),
6471 ELK_REGISTER_TYPE_UD);
6472 struct elk_reg flag = elk_flag_reg(0, 0);
6473 /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well
6474 * as f0.0. This is a problem for fragment programs as we currently use
6475 * f0.1 for discards. Fortunately, we don't support SIMD32 fragment
6476 * programs yet so this isn't a problem. When we do, something will
6477 * have to change.
6478 */
6479 if (s.dispatch_width == 32)
6480 flag.type = ELK_REGISTER_TYPE_UD;
6481
6482 bld.exec_all().group(1, 0).MOV(flag, elk_imm_ud(0u));
6483 bld.CMP(bld.null_reg_ud(), value, elk_imm_ud(0u), ELK_CONDITIONAL_NZ);
6484
6485 if (instr->def.bit_size > 32) {
6486 dest.type = ELK_REGISTER_TYPE_UQ;
6487 } else {
6488 dest.type = ELK_REGISTER_TYPE_UD;
6489 }
6490 bld.MOV(dest, flag);
6491 break;
6492 }
6493
6494 case nir_intrinsic_read_invocation: {
6495 const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
6496 const elk_fs_reg invocation = get_nir_src(ntb, instr->src[1]);
6497
6498 elk_fs_reg tmp = bld.vgrf(value.type);
6499
6500       /* When the subgroup_size picked by NIR is larger than the dispatch
6501        * size picked by the backend (this can happen in RT and FS), clamp
6502        * the invocation index to the dispatch size.
6503 */
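/* Illustrative example (made-up numbers): with a dispatch width of 16 and
 * an API subgroup size of 32, an invocation index of 20 gets masked to
 * 20 & (16 - 1) = 4, keeping the BROADCAST source index in range.
 */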
6504 elk_fs_reg bound_invocation;
6505 if (s.api_subgroup_size == 0 ||
6506 bld.dispatch_width() < s.api_subgroup_size) {
6507 bound_invocation = bld.vgrf(ELK_REGISTER_TYPE_UD);
6508 bld.AND(bound_invocation, invocation, elk_imm_ud(s.dispatch_width - 1));
6509 } else {
6510 bound_invocation = invocation;
6511 }
6512 bld.exec_all().emit(ELK_SHADER_OPCODE_BROADCAST, tmp, value,
6513 bld.emit_uniformize(bound_invocation));
6514
6515 bld.MOV(retype(dest, value.type), elk_fs_reg(component(tmp, 0)));
6516 break;
6517 }
6518
6519 case nir_intrinsic_read_first_invocation: {
6520 const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
6521 bld.MOV(retype(dest, value.type), bld.emit_uniformize(value));
6522 break;
6523 }
6524
6525 case nir_intrinsic_shuffle: {
6526 const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
6527 const elk_fs_reg index = get_nir_src(ntb, instr->src[1]);
6528
6529 bld.emit(ELK_SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
6530 break;
6531 }
6532
6533 case nir_intrinsic_first_invocation: {
6534 elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
6535 bld.exec_all().emit(ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
6536 bld.MOV(retype(dest, ELK_REGISTER_TYPE_UD),
6537 elk_fs_reg(component(tmp, 0)));
6538 break;
6539 }
6540
6541 case nir_intrinsic_last_invocation: {
6542 elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
6543 bld.exec_all().emit(ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL, tmp);
6544 bld.MOV(retype(dest, ELK_REGISTER_TYPE_UD),
6545 elk_fs_reg(component(tmp, 0)));
6546 break;
6547 }
6548
6549 case nir_intrinsic_quad_broadcast: {
6550 const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
6551 const unsigned index = nir_src_as_uint(instr->src[1]);
6552
6553 bld.emit(ELK_SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),
6554 value, elk_imm_ud(index), elk_imm_ud(4));
6555 break;
6556 }
6557
6558 case nir_intrinsic_quad_swap_horizontal: {
6559 const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
6560 const elk_fs_reg tmp = bld.vgrf(value.type);
6561 if (devinfo->ver <= 7) {
6562 /* The hardware doesn't seem to support these crazy regions with
6563 * compressed instructions on gfx7 and earlier so we fall back to
6564 * using quad swizzles. Fortunately, we don't support 64-bit
6565 * anything in Vulkan on gfx7.
6566 */
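/* Illustrative: the (1,0,3,2) swizzle swaps neighbours within each quad,
 * so lanes (0,1,2,3) read the values of lanes (1,0,3,2) respectively.
 */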
6567 assert(nir_src_bit_size(instr->src[0]) == 32);
6568 const fs_builder ubld = bld.exec_all();
6569 ubld.emit(ELK_SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
6570 elk_imm_ud(ELK_SWIZZLE4(1,0,3,2)));
6571 bld.MOV(retype(dest, value.type), tmp);
6572 } else {
6573 const fs_builder ubld = bld.exec_all().group(s.dispatch_width / 2, 0);
6574
6575 const elk_fs_reg src_left = horiz_stride(value, 2);
6576 const elk_fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
6577 const elk_fs_reg tmp_left = horiz_stride(tmp, 2);
6578 const elk_fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);
6579
6580 ubld.MOV(tmp_left, src_right);
6581 ubld.MOV(tmp_right, src_left);
6582
6583 }
6584 bld.MOV(retype(dest, value.type), tmp);
6585 break;
6586 }
6587
6588 case nir_intrinsic_quad_swap_vertical: {
6589 const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
6590 if (nir_src_bit_size(instr->src[0]) == 32) {
6591 /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
6592 const elk_fs_reg tmp = bld.vgrf(value.type);
6593 const fs_builder ubld = bld.exec_all();
6594 ubld.emit(ELK_SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
6595 elk_imm_ud(ELK_SWIZZLE4(2,3,0,1)));
6596 bld.MOV(retype(dest, value.type), tmp);
6597 } else {
6598 /* For larger data types, we have to either emit dispatch_width many
6599 * MOVs or else fall back to doing indirects.
6600 */
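/* Illustrative: XORing the subgroup invocation with 2 makes lane i read
 * lane i ^ 2, so a quad (0,1,2,3) reads from lanes (2,3,0,1), i.e. the
 * vertical swap.
 */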
6601 elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_W);
6602 bld.XOR(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
6603 elk_imm_w(0x2));
6604 bld.emit(ELK_SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
6605 }
6606 break;
6607 }
6608
6609 case nir_intrinsic_quad_swap_diagonal: {
6610 const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
6611 if (nir_src_bit_size(instr->src[0]) == 32) {
6612 /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
6613 const elk_fs_reg tmp = bld.vgrf(value.type);
6614 const fs_builder ubld = bld.exec_all();
6615 ubld.emit(ELK_SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
6616 elk_imm_ud(ELK_SWIZZLE4(3,2,1,0)));
6617 bld.MOV(retype(dest, value.type), tmp);
6618 } else {
6619 /* For larger data types, we have to either emit dispatch_width many
6620 * MOVs or else fall back to doing indirects.
6621 */
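/* Illustrative: XORing with 3 makes lane i read lane i ^ 3, so a quad
 * (0,1,2,3) reads from lanes (3,2,1,0), i.e. the diagonal swap.
 */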
6622 elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_W);
6623 bld.XOR(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
6624 elk_imm_w(0x3));
6625 bld.emit(ELK_SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
6626 }
6627 break;
6628 }
6629
6630 case nir_intrinsic_reduce: {
6631 elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
6632 nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
6633 unsigned cluster_size = nir_intrinsic_cluster_size(instr);
6634 if (cluster_size == 0 || cluster_size > s.dispatch_width)
6635 cluster_size = s.dispatch_width;
6636
6637 /* Figure out the source type */
6638 src.type = elk_type_for_nir_type(devinfo,
6639 (nir_alu_type)(nir_op_infos[redop].input_types[0] |
6640 nir_src_bit_size(instr->src[0])));
6641
6642 elk_fs_reg identity = elk_nir_reduction_op_identity(bld, redop, src.type);
6643 elk_opcode elk_op = elk_op_for_nir_reduction_op(redop);
6644 elk_conditional_mod cond_mod = elk_cond_mod_for_nir_reduction_op(redop);
6645
6646 /* Set up a register for all of our scratching around and initialize it
6647       * to the reduction operation's identity value.
6648 */
6649 elk_fs_reg scan = bld.vgrf(src.type);
6650 bld.exec_all().emit(ELK_SHADER_OPCODE_SEL_EXEC, scan, src, identity);
6651
6652 bld.emit_scan(elk_op, scan, cluster_size, cond_mod);
6653
6654 dest.type = src.type;
6655 if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {
6656          /* The CLUSTER_BROADCAST instruction isn't needed here because the
6657           * distance between clusters is at least 2 GRFs, so we don't need its
6658           * weird striding and can just use regular MOVs to pick out the last
6659           * element of each cluster.
6660 */
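/* Worked example (illustrative, assuming a 32-byte GRF so REG_SIZE = 32):
 * SIMD32 with a 32-bit type and cluster_size = 16 gives
 * groups = (32 * 4) / 64 = 2 and group_size = 16, so group 0 reads
 * component 15 of scan and group 1 reads component 31.
 */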
6661 assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);
6662 const unsigned groups =
6663 (s.dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);
6664 const unsigned group_size = s.dispatch_width / groups;
6665 for (unsigned i = 0; i < groups; i++) {
6666 const unsigned cluster = (i * group_size) / cluster_size;
6667 const unsigned comp = cluster * cluster_size + (cluster_size - 1);
6668 bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),
6669 component(scan, comp));
6670 }
6671 } else {
6672 bld.emit(ELK_SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,
6673 elk_imm_ud(cluster_size - 1), elk_imm_ud(cluster_size));
6674 }
6675 break;
6676 }
6677
6678 case nir_intrinsic_inclusive_scan:
6679 case nir_intrinsic_exclusive_scan: {
6680 elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
6681 nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
6682
6683 /* Figure out the source type */
6684 src.type = elk_type_for_nir_type(devinfo,
6685 (nir_alu_type)(nir_op_infos[redop].input_types[0] |
6686 nir_src_bit_size(instr->src[0])));
6687
6688 elk_fs_reg identity = elk_nir_reduction_op_identity(bld, redop, src.type);
6689 elk_opcode elk_op = elk_op_for_nir_reduction_op(redop);
6690 elk_conditional_mod cond_mod = elk_cond_mod_for_nir_reduction_op(redop);
6691
6692 /* Set up a register for all of our scratching around and initialize it
6693       * to the reduction operation's identity value.
6694 */
6695 elk_fs_reg scan = bld.vgrf(src.type);
6696 const fs_builder allbld = bld.exec_all();
6697 allbld.emit(ELK_SHADER_OPCODE_SEL_EXEC, scan, src, identity);
6698
6699 if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
6700 /* Exclusive scan is a bit harder because we have to do an annoying
6701 * shift of the contents before we can begin. To make things worse,
6702 * we can't do this with a normal stride; we have to use indirects.
6703 */
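/* Illustrative: after the SHUFFLE below, lane i of "shifted" holds the
 * value that was in lane i - 1 of "scan", and lane 0 is overwritten with
 * the identity, so running the usual inclusive scan on "shifted" yields
 * the exclusive scan of the original values.
 */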
6704 elk_fs_reg shifted = bld.vgrf(src.type);
6705 elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_W);
6706 allbld.ADD(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
6707 elk_imm_w(-1));
6708 allbld.emit(ELK_SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
6709 allbld.group(1, 0).MOV(component(shifted, 0), identity);
6710 scan = shifted;
6711 }
6712
6713 bld.emit_scan(elk_op, scan, s.dispatch_width, cond_mod);
6714
6715 bld.MOV(retype(dest, src.type), scan);
6716 break;
6717 }
6718
6719 case nir_intrinsic_load_global_block_intel: {
6720 assert(instr->def.bit_size == 32);
6721
6722 elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[0]));
6723
6724 const fs_builder ubld1 = bld.exec_all().group(1, 0);
6725 const fs_builder ubld8 = bld.exec_all().group(8, 0);
6726 const fs_builder ubld16 = bld.exec_all().group(16, 0);
6727
6728 const unsigned total = instr->num_components * s.dispatch_width;
6729 unsigned loaded = 0;
6730
6731 while (loaded < total) {
6732 const unsigned block =
6733 choose_oword_block_size_dwords(devinfo, total - loaded);
6734 const unsigned block_bytes = block * 4;
6735
6736 const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
6737
6738 elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
6739 srcs[A64_LOGICAL_ADDRESS] = address;
6740 srcs[A64_LOGICAL_SRC] = elk_fs_reg(); /* No source data */
6741 srcs[A64_LOGICAL_ARG] = elk_imm_ud(block);
6742 srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(1);
6743 ubld.emit(ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
6744 retype(byte_offset(dest, loaded * 4), ELK_REGISTER_TYPE_UD),
6745 srcs, A64_LOGICAL_NUM_SRCS)->size_written = block_bytes;
6746
6747 increment_a64_address(ubld1, address, block_bytes);
6748 loaded += block;
6749 }
6750
6751 assert(loaded == total);
6752 break;
6753 }
6754
6755 case nir_intrinsic_store_global_block_intel: {
6756 assert(nir_src_bit_size(instr->src[0]) == 32);
6757
6758 elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[1]));
6759 elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
6760
6761 const fs_builder ubld1 = bld.exec_all().group(1, 0);
6762 const fs_builder ubld8 = bld.exec_all().group(8, 0);
6763 const fs_builder ubld16 = bld.exec_all().group(16, 0);
6764
6765 const unsigned total = instr->num_components * s.dispatch_width;
6766 unsigned written = 0;
6767
6768 while (written < total) {
6769 const unsigned block =
6770 choose_oword_block_size_dwords(devinfo, total - written);
6771
6772 elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
6773 srcs[A64_LOGICAL_ADDRESS] = address;
6774 srcs[A64_LOGICAL_SRC] = retype(byte_offset(src, written * 4),
6775 ELK_REGISTER_TYPE_UD);
6776 srcs[A64_LOGICAL_ARG] = elk_imm_ud(block);
6777 srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(0);
6778
6779 const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
6780 ubld.emit(ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL, elk_fs_reg(),
6781 srcs, A64_LOGICAL_NUM_SRCS);
6782
6783 const unsigned block_bytes = block * 4;
6784 increment_a64_address(ubld1, address, block_bytes);
6785 written += block;
6786 }
6787
6788 assert(written == total);
6789 break;
6790 }
6791
6792 case nir_intrinsic_load_shared_block_intel:
6793 case nir_intrinsic_load_ssbo_block_intel: {
6794 assert(instr->def.bit_size == 32);
6795
6796 const bool is_ssbo =
6797 instr->intrinsic == nir_intrinsic_load_ssbo_block_intel;
6798 elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[is_ssbo ? 1 : 0]));
6799
6800 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6801 srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ?
6802 get_nir_buffer_intrinsic_index(ntb, bld, instr) :
6803 elk_fs_reg(elk_imm_ud(GFX7_BTI_SLM));
6804 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address;
6805
6806 const fs_builder ubld1 = bld.exec_all().group(1, 0);
6807 const fs_builder ubld8 = bld.exec_all().group(8, 0);
6808 const fs_builder ubld16 = bld.exec_all().group(16, 0);
6809
6810 const unsigned total = instr->num_components * s.dispatch_width;
6811 unsigned loaded = 0;
6812
6813 while (loaded < total) {
6814 const unsigned block =
6815 choose_oword_block_size_dwords(devinfo, total - loaded);
6816 const unsigned block_bytes = block * 4;
6817
6818 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(block);
6819
6820 const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
6821 ubld.emit(ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
6822 retype(byte_offset(dest, loaded * 4), ELK_REGISTER_TYPE_UD),
6823 srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = block_bytes;
6824
6825 ubld1.ADD(address, address, elk_imm_ud(block_bytes));
6826 loaded += block;
6827 }
6828
6829 assert(loaded == total);
6830 break;
6831 }
6832
6833 case nir_intrinsic_store_shared_block_intel:
6834 case nir_intrinsic_store_ssbo_block_intel: {
6835 assert(nir_src_bit_size(instr->src[0]) == 32);
6836
6837 const bool is_ssbo =
6838 instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
6839
6840 elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[is_ssbo ? 2 : 1]));
6841 elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
6842
6843 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6844 srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ?
6845 get_nir_buffer_intrinsic_index(ntb, bld, instr) :
6846 elk_fs_reg(elk_imm_ud(GFX7_BTI_SLM));
6847 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address;
6848
6849 const fs_builder ubld1 = bld.exec_all().group(1, 0);
6850 const fs_builder ubld8 = bld.exec_all().group(8, 0);
6851 const fs_builder ubld16 = bld.exec_all().group(16, 0);
6852
6853 const unsigned total = instr->num_components * s.dispatch_width;
6854 unsigned written = 0;
6855
6856 while (written < total) {
6857 const unsigned block =
6858 choose_oword_block_size_dwords(devinfo, total - written);
6859
6860 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(block);
6861 srcs[SURFACE_LOGICAL_SRC_DATA] =
6862 retype(byte_offset(src, written * 4), ELK_REGISTER_TYPE_UD);
6863
6864 const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
6865 ubld.emit(ELK_SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL,
6866 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
6867
6868 const unsigned block_bytes = block * 4;
6869 ubld1.ADD(address, address, elk_imm_ud(block_bytes));
6870 written += block;
6871 }
6872
6873 assert(written == total);
6874 break;
6875 }
6876
6877 case nir_intrinsic_load_topology_id_intel: {
6878 /* These move around basically every hardware generation, so don't
6879 * do any unbounded checks and fail if the platform hasn't explicitly
6880 * been enabled here.
6881 */
6882 assert(devinfo->ver >= 12 && devinfo->ver <= 20);
6883
6884 /* Here is what the layout of SR0 looks like on Gfx12
6885 * https://gfxspecs.intel.com/Predator/Home/Index/47256
6886 * [13:11] : Slice ID.
6887 * [10:9] : Dual-SubSlice ID
6888 * [8] : SubSlice ID
6889 * [7] : EUID[2] (aka EU Row ID)
6890 * [6] : Reserved
6891 * [5:4] : EUID[1:0]
6892 * [2:0] : Thread ID
6893 *
6894 * Xe2: Engine 3D and GPGPU Programs, EU Overview, Registers and
6895 * Register Regions, ARF Registers, State Register,
6896 * https://gfxspecs.intel.com/Predator/Home/Index/56623
6897 * [15:11] : Slice ID.
6898 * [9:8] : SubSlice ID
6899 * [6:4] : EUID
6900 * [2:0] : Thread ID
6901 */
6902 elk_fs_reg raw_id = bld.vgrf(ELK_REGISTER_TYPE_UD);
6903 bld.emit(ELK_SHADER_OPCODE_READ_SR_REG, raw_id, elk_imm_ud(0));
6904 switch (nir_intrinsic_base(instr)) {
6905 case ELK_TOPOLOGY_ID_DSS:
6906 if (devinfo->ver >= 20) {
6907 /* Xe2+: 3D and GPGPU Programs, Shared Functions, Ray Tracing:
6908 * https://gfxspecs.intel.com/Predator/Home/Index/56936
6909 *
6910 * Note: DSSID in all formulas below is a logical identifier of an
6911           * XeCore (a value that goes from 0 to number_of_slices *
6912           * number_of_XeCores_per_slice - 1). SW can get this value from
6913 * either:
6914 *
6915 * - Message Control Register LogicalSSID field (only in shaders
6916 * eligible for Mid-Thread Preemption).
6917 * - Calculated based of State Register with the following formula:
6918 * DSSID = StateRegister.SliceID * GT_ARCH_SS_PER_SLICE +
6919           *   StateRegister.SubSliceID, where GT_ARCH_SS_PER_SLICE is an
6920 * architectural parameter defined per product SKU.
6921 *
6922 * We are using the state register to calculate the DSSID.
6923 */
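/* Worked example (illustrative register values): if raw_id has SliceID
 * bits [15:11] = 2 and SubSliceID bits [9:8] = 1, and
 * max_subslices_per_slice is 4, the code below computes
 * DSSID = 2 * 4 + 1 = 9.
 */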
6924 elk_fs_reg slice_id = bld.vgrf(ELK_REGISTER_TYPE_UD);
6925 elk_fs_reg subslice_id = bld.vgrf(ELK_REGISTER_TYPE_UD);
6926 bld.AND(slice_id, raw_id, elk_imm_ud(INTEL_MASK(15, 11)));
6927 bld.SHR(slice_id, slice_id, elk_imm_ud(11));
6928
6929          /* Assert that max_subslices_per_slice covers the 2 bits we use for
6930           * the subslice ID.
6931 */
6932 assert(devinfo->max_subslices_per_slice >= (1 << 2));
6933 bld.MUL(slice_id, slice_id,
6934 elk_imm_ud(devinfo->max_subslices_per_slice));
6935 bld.AND(subslice_id, raw_id, elk_imm_ud(INTEL_MASK(9, 8)));
6936 bld.SHR(subslice_id, subslice_id, elk_imm_ud(8));
6937 bld.ADD(retype(dest, ELK_REGISTER_TYPE_UD), slice_id,
6938 subslice_id);
6939 } else {
6940 bld.AND(raw_id, raw_id, elk_imm_ud(0x3fff));
6941 /* Get rid of anything below dualsubslice */
6942 bld.SHR(retype(dest, ELK_REGISTER_TYPE_UD), raw_id, elk_imm_ud(9));
6943 }
6944 break;
6945 case ELK_TOPOLOGY_ID_EU_THREAD_SIMD: {
6946 s.limit_dispatch_width(16, "Topology helper for Ray queries, "
6947 "not supported in SIMD32 mode.");
6948 elk_fs_reg dst = retype(dest, ELK_REGISTER_TYPE_UD);
6949
6950 if (devinfo->ver >= 20) {
6951 /* Xe2+: Graphics Engine, 3D and GPGPU Programs, Shared Functions
6952 * Ray Tracing,
6953 * https://gfxspecs.intel.com/Predator/Home/Index/56936
6954 *
6955 * SyncStackID = (EUID[2:0] << 8) | (ThreadID[2:0] << 4) |
6956 * SIMDLaneID[3:0];
6957 *
6958 * This section just deals with the EUID part.
6959 *
6960           * The 3-bit EU[2:0] we need to build for ray query memory address
6961           * computations is a bit odd:
6962 *
6963 * EU[2:0] = raw_id[6:4] (identified as EUID[2:0])
6964 */
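/* The AND/SHL pair below keeps raw_id[6:4] and shifts it left by 4, which
 * lands EUID[2:0] in bits [10:8] of the result, i.e. EUID << 8 as required
 * by the SyncStackID layout above.
 */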
6965 bld.AND(dst, raw_id, elk_imm_ud(INTEL_MASK(6, 4)));
6966 bld.SHL(dst, dst, elk_imm_ud(4));
6967 } else {
6968 /* EU[3:0] << 7
6969 *
6970           * The 4-bit EU[3:0] we need to build for ray query memory address
6971           * computations is a bit odd:
6972 *
6973 * EU[1:0] = raw_id[5:4] (identified as EUID[1:0])
6974 * EU[2] = raw_id[8] (identified as SubSlice ID)
6975 * EU[3] = raw_id[7] (identified as EUID[2] or Row ID)
6976 */
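/* The sequence below assembles those bits so that EU[3:0] ends up in bits
 * [10:7] of dst: raw_id[7] << 3 lands in bit 10 (EU[3]), raw_id[8] << 1
 * lands in bit 9 (EU[2]), and raw_id[5:4] << 3 land in bits [8:7]
 * (EU[1:0]).
 */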
6977 elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
6978 bld.AND(tmp, raw_id, elk_imm_ud(INTEL_MASK(7, 7)));
6979 bld.SHL(dst, tmp, elk_imm_ud(3));
6980 bld.AND(tmp, raw_id, elk_imm_ud(INTEL_MASK(8, 8)));
6981 bld.SHL(tmp, tmp, elk_imm_ud(1));
6982 bld.OR(dst, dst, tmp);
6983 bld.AND(tmp, raw_id, elk_imm_ud(INTEL_MASK(5, 4)));
6984 bld.SHL(tmp, tmp, elk_imm_ud(3));
6985 bld.OR(dst, dst, tmp);
6986 }
6987
6988 /* ThreadID[2:0] << 4 (ThreadID comes from raw_id[2:0]) */
6989 {
6990 bld.AND(raw_id, raw_id, elk_imm_ud(INTEL_MASK(2, 0)));
6991 bld.SHL(raw_id, raw_id, elk_imm_ud(4));
6992 bld.OR(dst, dst, raw_id);
6993 }
6994
6995       /* LaneID[3:0] << 0 (Use nir SYSTEM_VALUE_SUBGROUP_INVOCATION) */
6996 assert(bld.dispatch_width() <= 16); /* Limit to 4 bits */
6997 bld.ADD(dst, dst,
6998 ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
6999 break;
7000 }
7001 default:
7002 unreachable("Invalid topology id type");
7003 }
7004 break;
7005 }
7006
7007 default:
7008 #ifndef NDEBUG
7009 assert(instr->intrinsic < nir_num_intrinsics);
7010 fprintf(stderr, "intrinsic: %s\n", nir_intrinsic_infos[instr->intrinsic].name);
7011 #endif
7012 unreachable("unknown intrinsic");
7013 }
7014 }
7015
7016 static elk_fs_reg
7017 expand_to_32bit(const fs_builder &bld, const elk_fs_reg &src)
7018 {
7019 if (type_sz(src.type) == 2) {
7020 elk_fs_reg src32 = bld.vgrf(ELK_REGISTER_TYPE_UD);
7021 bld.MOV(src32, retype(src, ELK_REGISTER_TYPE_UW));
7022 return src32;
7023 } else {
7024 return src;
7025 }
7026 }
7027
7028 static void
7029 fs_nir_emit_surface_atomic(nir_to_elk_state &ntb, const fs_builder &bld,
7030 nir_intrinsic_instr *instr,
7031 elk_fs_reg surface,
7032 bool bindless)
7033 {
7034 const intel_device_info *devinfo = ntb.devinfo;
7035 elk_fs_visitor &s = ntb.s;
7036
7037 enum elk_lsc_opcode op = elk_lsc_aop_for_nir_intrinsic(instr);
7038 int num_data = lsc_op_num_data_values(op);
7039
7040 bool shared = surface.file == IMM && surface.ud == GFX7_BTI_SLM;
7041
7042    /* The BTI untyped atomic messages only support 32-bit atomics. Qword
7043     * atomics appear to exist if you just look at the big table of messages
7044     * in Vol 7 of the SKL PRM, but Vol 2a provides no message descriptors
7045     * for Qword atomic ops except for the A64 messages.
7046 *
7047 * 16-bit float atomics are supported, however.
7048 */
7049 assert(instr->def.bit_size == 32 ||
7050 (instr->def.bit_size == 64 && devinfo->has_lsc) ||
7051 (instr->def.bit_size == 16 &&
7052 (devinfo->has_lsc || elk_lsc_opcode_is_atomic_float(op))));
7053
7054 elk_fs_reg dest = get_nir_def(ntb, instr->def);
7055
7056 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
7057 srcs[bindless ?
7058 SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
7059 SURFACE_LOGICAL_SRC_SURFACE] = surface;
7060 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
7061 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(op);
7062 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
7063
7064 if (shared) {
7065 /* SLM - Get the offset */
7066 if (nir_src_is_const(instr->src[0])) {
7067 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
7068 elk_imm_ud(nir_intrinsic_base(instr) +
7069 nir_src_as_uint(instr->src[0]));
7070 } else {
7071 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = s.vgrf(glsl_uint_type());
7072 bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
7073 retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_UD),
7074 elk_imm_ud(nir_intrinsic_base(instr)));
7075 }
7076 } else {
7077 /* SSBOs */
7078 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
7079 }
7080
7081 elk_fs_reg data;
7082 if (num_data >= 1)
7083 data = expand_to_32bit(bld, get_nir_src(ntb, instr->src[shared ? 1 : 2]));
7084
7085 if (num_data >= 2) {
7086 elk_fs_reg tmp = bld.vgrf(data.type, 2);
7087 elk_fs_reg sources[2] = {
7088 data,
7089 expand_to_32bit(bld, get_nir_src(ntb, instr->src[shared ? 2 : 3]))
7090 };
7091 bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
7092 data = tmp;
7093 }
7094 srcs[SURFACE_LOGICAL_SRC_DATA] = data;
7095
7096 /* Emit the actual atomic operation */
7097
7098 switch (instr->def.bit_size) {
7099 case 16: {
7100 elk_fs_reg dest32 = bld.vgrf(ELK_REGISTER_TYPE_UD);
7101 bld.emit(ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
7102 retype(dest32, dest.type),
7103 srcs, SURFACE_LOGICAL_NUM_SRCS);
7104 bld.MOV(retype(dest, ELK_REGISTER_TYPE_UW),
7105 retype(dest32, ELK_REGISTER_TYPE_UD));
7106 break;
7107 }
7108
7109 case 32:
7110 case 64:
7111 bld.emit(ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
7112 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
7113 break;
7114 default:
7115 unreachable("Unsupported bit size");
7116 }
7117 }
7118
7119 static void
7120 fs_nir_emit_global_atomic(nir_to_elk_state &ntb, const fs_builder &bld,
7121 nir_intrinsic_instr *instr)
7122 {
7123 enum elk_lsc_opcode op = elk_lsc_aop_for_nir_intrinsic(instr);
7124 int num_data = lsc_op_num_data_values(op);
7125
7126 elk_fs_reg dest = get_nir_def(ntb, instr->def);
7127
7128 elk_fs_reg addr = get_nir_src(ntb, instr->src[0]);
7129
7130 elk_fs_reg data;
7131 if (num_data >= 1)
7132 data = expand_to_32bit(bld, get_nir_src(ntb, instr->src[1]));
7133
7134 if (num_data >= 2) {
7135 elk_fs_reg tmp = bld.vgrf(data.type, 2);
7136 elk_fs_reg sources[2] = {
7137 data,
7138 expand_to_32bit(bld, get_nir_src(ntb, instr->src[2]))
7139 };
7140 bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
7141 data = tmp;
7142 }
7143
7144 elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
7145 srcs[A64_LOGICAL_ADDRESS] = addr;
7146 srcs[A64_LOGICAL_SRC] = data;
7147 srcs[A64_LOGICAL_ARG] = elk_imm_ud(op);
7148 srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(0);
7149
7150 switch (instr->def.bit_size) {
7151 case 16: {
7152 elk_fs_reg dest32 = bld.vgrf(ELK_REGISTER_TYPE_UD);
7153 bld.emit(ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
7154 retype(dest32, dest.type),
7155 srcs, A64_LOGICAL_NUM_SRCS);
7156 bld.MOV(retype(dest, ELK_REGISTER_TYPE_UW), dest32);
7157 break;
7158 }
7159 case 32:
7160 case 64:
7161 bld.emit(ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, dest,
7162 srcs, A64_LOGICAL_NUM_SRCS);
7163 break;
7164 default:
7165 unreachable("Unsupported bit size");
7166 }
7167 }
7168
7169 static void
7170 fs_nir_emit_texture(nir_to_elk_state &ntb,
7171 nir_tex_instr *instr)
7172 {
7173 const intel_device_info *devinfo = ntb.devinfo;
7174 const fs_builder &bld = ntb.bld;
7175 elk_fs_visitor &s = ntb.s;
7176
7177 elk_fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
7178
7179 /* SKL PRMs: Volume 7: 3D-Media-GPGPU:
7180 *
7181 * "The Pixel Null Mask field, when enabled via the Pixel Null Mask
7182    *    Enable will be incorrect for sample_c when applied to a surface with
7183 * 64-bit per texel format such as R16G16BA16_UNORM. Pixel Null mask
7184 * Enable may incorrectly report pixels as referencing a Null surface."
7185 *
7186 * We'll take care of this in NIR.
7187 */
7188 assert(!instr->is_sparse || srcs[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE);
7189
7190 srcs[TEX_LOGICAL_SRC_RESIDENCY] = elk_imm_ud(instr->is_sparse);
7191
7192 int lod_components = 0;
7193
7194 /* The hardware requires a LOD for buffer textures */
7195 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
7196 srcs[TEX_LOGICAL_SRC_LOD] = elk_imm_d(0);
7197
7198 ASSERTED bool got_lod = false;
7199 ASSERTED bool got_bias = false;
7200 uint32_t header_bits = 0;
7201 for (unsigned i = 0; i < instr->num_srcs; i++) {
7202 nir_src nir_src = instr->src[i].src;
7203 elk_fs_reg src = get_nir_src(ntb, nir_src);
7204 switch (instr->src[i].src_type) {
7205 case nir_tex_src_bias:
7206 assert(!got_lod);
7207 got_bias = true;
7208
7209 srcs[TEX_LOGICAL_SRC_LOD] =
7210 retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_F);
7211 break;
7212 case nir_tex_src_comparator:
7213 srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, ELK_REGISTER_TYPE_F);
7214 break;
7215 case nir_tex_src_coord:
7216 switch (instr->op) {
7217 case nir_texop_txf:
7218 case nir_texop_txf_ms:
7219 case nir_texop_txf_ms_mcs_intel:
7220 case nir_texop_samples_identical:
7221 srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, ELK_REGISTER_TYPE_D);
7222 break;
7223 default:
7224 srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, ELK_REGISTER_TYPE_F);
7225 break;
7226 }
7227 break;
7228 case nir_tex_src_ddx:
7229 srcs[TEX_LOGICAL_SRC_LOD] = retype(src, ELK_REGISTER_TYPE_F);
7230 lod_components = nir_tex_instr_src_size(instr, i);
7231 break;
7232 case nir_tex_src_ddy:
7233 srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, ELK_REGISTER_TYPE_F);
7234 break;
7235 case nir_tex_src_lod:
7236 assert(!got_bias);
7237 got_lod = true;
7238
7239 switch (instr->op) {
7240 case nir_texop_txs:
7241 srcs[TEX_LOGICAL_SRC_LOD] =
7242 retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_UD);
7243 break;
7244 case nir_texop_txf:
7245 srcs[TEX_LOGICAL_SRC_LOD] =
7246 retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_D);
7247 break;
7248 default:
7249 srcs[TEX_LOGICAL_SRC_LOD] =
7250 retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_F);
7251 break;
7252 }
7253 break;
7254 case nir_tex_src_min_lod:
7255 srcs[TEX_LOGICAL_SRC_MIN_LOD] =
7256 retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_F);
7257 break;
7258 case nir_tex_src_ms_index:
7259 srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, ELK_REGISTER_TYPE_UD);
7260 break;
7261
7262 case nir_tex_src_offset: {
7263 uint32_t offset_bits = 0;
7264 if (elk_texture_offset(instr, i, &offset_bits)) {
7265 header_bits |= offset_bits;
7266 } else {
7267 /* On gfx12.5+, if the offsets are not both constant and in the
7268 * {-8,7} range, nir_lower_tex() will have already lowered the
7269 * source offset. So we should never reach this point.
7270 */
7271 assert(devinfo->verx10 < 125);
7272 srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
7273 retype(src, ELK_REGISTER_TYPE_D);
7274 }
7275 break;
7276 }
7277
7278 case nir_tex_src_projector:
7279 unreachable("should be lowered");
7280
7281 case nir_tex_src_texture_offset: {
7282 assert(srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE);
7283 /* Emit code to evaluate the actual indexing expression */
7284 if (instr->texture_index == 0 && is_resource_src(nir_src))
7285 srcs[TEX_LOGICAL_SRC_SURFACE] = get_resource_nir_src(ntb, nir_src);
7286 if (srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE) {
7287 elk_fs_reg tmp = s.vgrf(glsl_uint_type());
7288 bld.ADD(tmp, src, elk_imm_ud(instr->texture_index));
7289 srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
7290 }
7291 assert(srcs[TEX_LOGICAL_SRC_SURFACE].file != BAD_FILE);
7292 break;
7293 }
7294
7295 case nir_tex_src_sampler_offset: {
7296 /* Emit code to evaluate the actual indexing expression */
7297 if (instr->sampler_index == 0 && is_resource_src(nir_src))
7298 srcs[TEX_LOGICAL_SRC_SAMPLER] = get_resource_nir_src(ntb, nir_src);
7299 if (srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE) {
7300 elk_fs_reg tmp = s.vgrf(glsl_uint_type());
7301 bld.ADD(tmp, src, elk_imm_ud(instr->sampler_index));
7302 srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
7303 }
7304 break;
7305 }
7306
7307 case nir_tex_src_texture_handle:
7308 assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1);
7309 srcs[TEX_LOGICAL_SRC_SURFACE] = elk_fs_reg();
7310 if (is_resource_src(nir_src))
7311 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = get_resource_nir_src(ntb, nir_src);
7312 if (srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE].file == BAD_FILE)
7313 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src);
7314 break;
7315
7316 case nir_tex_src_sampler_handle:
7317 assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
7318 srcs[TEX_LOGICAL_SRC_SAMPLER] = elk_fs_reg();
7319 if (is_resource_src(nir_src))
7320 srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = get_resource_nir_src(ntb, nir_src);
7321 if (srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE].file == BAD_FILE)
7322 srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src);
7323 break;
7324
7325 case nir_tex_src_ms_mcs_intel:
7326 assert(instr->op == nir_texop_txf_ms);
7327 srcs[TEX_LOGICAL_SRC_MCS] = retype(src, ELK_REGISTER_TYPE_D);
7328 break;
7329
7330 /* If this parameter is present, we are packing either the explicit LOD
7331 * or LOD bias and the array index into a single (32-bit) value when
7332 * 32-bit texture coordinates are used.
7333 */
7334 case nir_tex_src_backend1:
7335 assert(!got_lod && !got_bias);
7336 got_lod = true;
7337
7338 assert(instr->op == nir_texop_txl || instr->op == nir_texop_txb);
7339 srcs[TEX_LOGICAL_SRC_LOD] =
7340 retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_F);
7341 break;
7342
7343 default:
7344 unreachable("unknown texture source");
7345 }
7346 }
7347
7348 /* If the surface or sampler were not specified through sources, use the
7349 * instruction index.
7350 */
7351 if (srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE &&
7352 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE].file == BAD_FILE)
7353 srcs[TEX_LOGICAL_SRC_SURFACE] = elk_imm_ud(instr->texture_index);
7354 if (srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE &&
7355 srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE].file == BAD_FILE)
7356 srcs[TEX_LOGICAL_SRC_SAMPLER] = elk_imm_ud(instr->sampler_index);
7357
7358 if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
7359 (instr->op == nir_texop_txf_ms ||
7360 instr->op == nir_texop_samples_identical)) {
7361 if (devinfo->ver >= 7) {
7362 srcs[TEX_LOGICAL_SRC_MCS] =
7363 emit_mcs_fetch(ntb, srcs[TEX_LOGICAL_SRC_COORDINATE],
7364 instr->coord_components,
7365 srcs[TEX_LOGICAL_SRC_SURFACE],
7366 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]);
7367 } else {
7368 srcs[TEX_LOGICAL_SRC_MCS] = elk_imm_ud(0u);
7369 }
7370 }
7371
7372 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = elk_imm_d(instr->coord_components);
7373 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = elk_imm_d(lod_components);
7374
7375 enum elk_opcode opcode;
7376 switch (instr->op) {
7377 case nir_texop_tex:
7378 opcode = ELK_SHADER_OPCODE_TEX_LOGICAL;
7379 break;
7380 case nir_texop_txb:
7381 opcode = ELK_FS_OPCODE_TXB_LOGICAL;
7382 break;
7383 case nir_texop_txl:
7384 opcode = ELK_SHADER_OPCODE_TXL_LOGICAL;
7385 break;
7386 case nir_texop_txd:
7387 opcode = ELK_SHADER_OPCODE_TXD_LOGICAL;
7388 break;
7389 case nir_texop_txf:
7390 opcode = ELK_SHADER_OPCODE_TXF_LOGICAL;
7391 break;
7392 case nir_texop_txf_ms:
7393 /* On Gfx12HP there is only CMS_W available. From the Bspec: Shared
7394 * Functions - 3D Sampler - Messages - Message Format:
7395 *
7396 * ld2dms REMOVEDBY(GEN:HAS:1406788836)
7397 */
7398 if (devinfo->verx10 >= 125)
7399 opcode = ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL;
7400 else if (devinfo->ver >= 9)
7401 opcode = ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL;
7402 else
7403 opcode = ELK_SHADER_OPCODE_TXF_CMS_LOGICAL;
7404 break;
7405 case nir_texop_txf_ms_mcs_intel:
7406 opcode = ELK_SHADER_OPCODE_TXF_MCS_LOGICAL;
7407 break;
7408 case nir_texop_query_levels:
7409 case nir_texop_txs:
7410 opcode = ELK_SHADER_OPCODE_TXS_LOGICAL;
7411 break;
7412 case nir_texop_lod:
7413 opcode = ELK_SHADER_OPCODE_LOD_LOGICAL;
7414 break;
7415 case nir_texop_tg4:
7416 if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
7417 opcode = ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL;
7418 else
7419 opcode = ELK_SHADER_OPCODE_TG4_LOGICAL;
7420 break;
7421 case nir_texop_texture_samples:
7422 opcode = ELK_SHADER_OPCODE_SAMPLEINFO_LOGICAL;
7423 break;
7424 case nir_texop_samples_identical: {
7425 elk_fs_reg dst = retype(get_nir_def(ntb, instr->def), ELK_REGISTER_TYPE_D);
7426
7427 /* If mcs is an immediate value, it means there is no MCS. In that case
7428 * just return false.
7429 */
7430 if (srcs[TEX_LOGICAL_SRC_MCS].file == ELK_IMMEDIATE_VALUE) {
7431 bld.MOV(dst, elk_imm_ud(0u));
7432 } else if (devinfo->ver >= 9) {
7433 elk_fs_reg tmp = s.vgrf(glsl_uint_type());
7434 bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS],
7435 offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
7436 bld.CMP(dst, tmp, elk_imm_ud(0u), ELK_CONDITIONAL_EQ);
7437 } else {
7438 bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], elk_imm_ud(0u),
7439 ELK_CONDITIONAL_EQ);
7440 }
7441 return;
7442 }
7443 default:
7444 unreachable("unknown texture opcode");
7445 }
7446
7447 if (instr->op == nir_texop_tg4) {
7448 if (instr->component == 1 &&
7449 s.key_tex->gather_channel_quirk_mask & (1 << instr->texture_index)) {
7450 /* gather4 sampler is broken for green channel on RG32F --
7451 * we must ask for blue instead.
7452 */
7453 header_bits |= 2 << 16;
7454 } else {
7455 header_bits |= instr->component << 16;
7456 }
7457 }
7458
7459 elk_fs_reg dst = bld.vgrf(elk_type_for_nir_type(devinfo, instr->dest_type), 4 + instr->is_sparse);
7460 elk_fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
7461 inst->offset = header_bits;
7462
7463 const unsigned dest_size = nir_tex_instr_dest_size(instr);
7464 if (devinfo->ver >= 9 &&
7465 instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
7466 unsigned write_mask = nir_def_components_read(&instr->def);
7467 assert(write_mask != 0); /* dead code should have been eliminated */
7468 if (instr->is_sparse) {
7469 inst->size_written = (util_last_bit(write_mask) - 1) *
7470 inst->dst.component_size(inst->exec_size) +
7471 (reg_unit(devinfo) * REG_SIZE);
7472 } else {
7473 inst->size_written = util_last_bit(write_mask) *
7474 inst->dst.component_size(inst->exec_size);
7475 }
7476 } else {
7477 inst->size_written = 4 * inst->dst.component_size(inst->exec_size) +
7478 (instr->is_sparse ? (reg_unit(devinfo) * REG_SIZE) : 0);
7479 }
7480
7481 if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
7482 inst->shadow_compare = true;
7483
7484 /* Wa_14012688258:
7485 *
7486 * Don't trim zeros at the end of payload for sample operations
7487 * in cube and cube arrays.
7488 */
7489 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
7490 intel_needs_workaround(devinfo, 14012688258)) {
7491
7492 /* Compiler should send U,V,R parameters even if V,R are 0. */
7493 if (srcs[TEX_LOGICAL_SRC_COORDINATE].file != BAD_FILE)
7494 assert(instr->coord_components >= 3u);
7495
7496 /* See opt_zero_samples(). */
7497 inst->keep_payload_trailing_zeros = true;
7498 }
7499
7500 elk_fs_reg nir_dest[5];
7501 for (unsigned i = 0; i < dest_size; i++)
7502 nir_dest[i] = offset(dst, bld, i);
7503
7504 if (instr->op == nir_texop_query_levels) {
7505 /* # levels is in .w */
7506 if (devinfo->ver <= 9) {
7507 /**
7508 * Wa_1940217:
7509 *
7510 * When a surface of type SURFTYPE_NULL is accessed by resinfo, the
7511 * MIPCount returned is undefined instead of 0.
7512 */
7513 elk_fs_inst *mov = bld.MOV(bld.null_reg_d(), dst);
7514 mov->conditional_mod = ELK_CONDITIONAL_NZ;
7515 nir_dest[0] = bld.vgrf(ELK_REGISTER_TYPE_D);
7516 elk_fs_inst *sel = bld.SEL(nir_dest[0], offset(dst, bld, 3), elk_imm_d(0));
7517 sel->predicate = ELK_PREDICATE_NORMAL;
7518 } else {
7519 nir_dest[0] = offset(dst, bld, 3);
7520 }
7521 } else if (instr->op == nir_texop_txs &&
7522 dest_size >= 3 && devinfo->ver < 7) {
7523 /* Gfx4-6 return 0 instead of 1 for single layer surfaces. */
7524 elk_fs_reg depth = offset(dst, bld, 2);
7525 nir_dest[2] = s.vgrf(glsl_int_type());
7526 bld.emit_minmax(nir_dest[2], depth, elk_imm_d(1), ELK_CONDITIONAL_GE);
7527 }
7528
7529 /* The residency bits are only in the first component. */
7530 if (instr->is_sparse)
7531 nir_dest[dest_size - 1] = component(offset(dst, bld, dest_size - 1), 0);
7532
7533 bld.LOAD_PAYLOAD(get_nir_def(ntb, instr->def), nir_dest, dest_size, 0);
7534 }
7535
7536 static void
7537 fs_nir_emit_jump(nir_to_elk_state &ntb, nir_jump_instr *instr)
7538 {
7539 switch (instr->type) {
7540 case nir_jump_break:
7541 ntb.bld.emit(ELK_OPCODE_BREAK);
7542 break;
7543 case nir_jump_continue:
7544 ntb.bld.emit(ELK_OPCODE_CONTINUE);
7545 break;
7546 case nir_jump_halt:
7547 ntb.bld.emit(ELK_OPCODE_HALT);
7548 break;
7549 case nir_jump_return:
7550 default:
7551 unreachable("unknown jump");
7552 }
7553 }
7554
7555 /*
7556 * This helper takes a source register and un/shuffles it into the destination
7557 * register.
7558 *
7559  * If the source type size is smaller than the destination type size, the
7560  * operation needed is a component shuffle; the opposite case is an
7561  * unshuffle. If the source and destination type sizes are equal, the
7562  * shuffle is equivalent to a simple MOV.
7563 *
7564  * For example, if the source is a 16-bit type and the destination is
7565  * 32-bit, a 3-component .xyz 16-bit vector on SIMD8 would be:
7566 *
7567 * |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
7568 * |z1|z2|z3|z4|z5|z6|z7|z8| | | | | | | | |
7569 *
7570 * This helper will return the following 2 32-bit components with the 16-bit
7571 * values shuffled:
7572 *
7573 * |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
7574 * |z1 |z2 |z3 |z4 |z5 |z6 |z7 |z8 |
7575 *
7576  * For unshuffle, the example would be the opposite: a 64-bit source type
7577  * and a 32-bit destination. A 2-component .xy 64-bit vector on SIMD8
7578  * would be:
7579 *
7580 * | x1l x1h | x2l x2h | x3l x3h | x4l x4h |
7581 * | x5l x5h | x6l x6h | x7l x7h | x8l x8h |
7582 * | y1l y1h | y2l y2h | y3l y3h | y4l y4h |
7583 * | y5l y5h | y6l y6h | y7l y7h | y8l y8h |
7584 *
7585 * The returned result would be the following 4 32-bit components unshuffled:
7586 *
7587 * | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
7588 * | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
7589 * | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
7590 * | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
7591 *
7592  * - Source and destination registers must not overlap.
7593  * - Component counts are measured in units of the smaller type between
7594  *   source and destination because we are un/shuffling the smaller
7595  *   components from/into the bigger ones.
7596 * - first_component parameter allows skipping source components.
7597 */
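/* A minimal usage sketch (hypothetical registers, not taken from this
 * file): packing four 16-bit components into the low/high halves of two
 * 32-bit components might look like:
 *
 *    elk_fs_reg src_w = bld.vgrf(ELK_REGISTER_TYPE_W, 4);
 *    elk_fs_reg dst_d = bld.vgrf(ELK_REGISTER_TYPE_D, 2);
 *    elk_shuffle_src_to_dst(bld, dst_d, src_w, 0, 4);
 *
 * Here "components" is 4 because it is counted in units of the smaller
 * (16-bit) type.
 */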
7598 void
7599 elk_shuffle_src_to_dst(const fs_builder &bld,
7600 const elk_fs_reg &dst,
7601 const elk_fs_reg &src,
7602 uint32_t first_component,
7603 uint32_t components)
7604 {
7605 if (type_sz(src.type) == type_sz(dst.type)) {
7606 assert(!regions_overlap(dst,
7607 type_sz(dst.type) * bld.dispatch_width() * components,
7608 offset(src, bld, first_component),
7609 type_sz(src.type) * bld.dispatch_width() * components));
7610 for (unsigned i = 0; i < components; i++) {
7611 bld.MOV(retype(offset(dst, bld, i), src.type),
7612 offset(src, bld, i + first_component));
7613 }
7614 } else if (type_sz(src.type) < type_sz(dst.type)) {
7615 /* Source is shuffled into destination */
7616 unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
7617 assert(!regions_overlap(dst,
7618 type_sz(dst.type) * bld.dispatch_width() *
7619 DIV_ROUND_UP(components, size_ratio),
7620 offset(src, bld, first_component),
7621 type_sz(src.type) * bld.dispatch_width() * components));
7622
7623 elk_reg_type shuffle_type =
7624 elk_reg_type_from_bit_size(8 * type_sz(src.type),
7625 ELK_REGISTER_TYPE_D);
7626 for (unsigned i = 0; i < components; i++) {
7627 elk_fs_reg shuffle_component_i =
7628 subscript(offset(dst, bld, i / size_ratio),
7629 shuffle_type, i % size_ratio);
7630 bld.MOV(shuffle_component_i,
7631 retype(offset(src, bld, i + first_component), shuffle_type));
7632 }
7633 } else {
7634 /* Source is unshuffled into destination */
7635 unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
7636 assert(!regions_overlap(dst,
7637 type_sz(dst.type) * bld.dispatch_width() * components,
7638 offset(src, bld, first_component / size_ratio),
7639 type_sz(src.type) * bld.dispatch_width() *
7640 DIV_ROUND_UP(components + (first_component % size_ratio),
7641 size_ratio)));
7642
7643 elk_reg_type shuffle_type =
7644 elk_reg_type_from_bit_size(8 * type_sz(dst.type),
7645 ELK_REGISTER_TYPE_D);
7646 for (unsigned i = 0; i < components; i++) {
7647 elk_fs_reg shuffle_component_i =
7648 subscript(offset(src, bld, (first_component + i) / size_ratio),
7649 shuffle_type, (first_component + i) % size_ratio);
7650 bld.MOV(retype(offset(dst, bld, i), shuffle_type),
7651 shuffle_component_i);
7652 }
7653 }
7654 }
7655
7656 void
7657 elk_shuffle_from_32bit_read(const fs_builder &bld,
7658 const elk_fs_reg &dst,
7659 const elk_fs_reg &src,
7660 uint32_t first_component,
7661 uint32_t components)
7662 {
7663 assert(type_sz(src.type) == 4);
7664
7665 /* This function takes components in units of the destination type while
7666 * elk_shuffle_src_to_dst takes components in units of the smallest type
7667 */
7668 if (type_sz(dst.type) > 4) {
7669 assert(type_sz(dst.type) == 8);
7670 first_component *= 2;
7671 components *= 2;
7672 }
7673
7674 elk_shuffle_src_to_dst(bld, dst, src, first_component, components);
7675 }
7676
7677 elk_fs_reg
7678 elk_setup_imm_df(const fs_builder &bld, double v)
7679 {
7680 const struct intel_device_info *devinfo = bld.shader->devinfo;
7681 assert(devinfo->ver >= 7);
7682
7683 if (devinfo->ver >= 8)
7684 return elk_imm_df(v);
7685
7686    /* gfx7.5 does not support DF immediates directly, but the DIM
7687     * instruction allows setting a 64-bit immediate value.
7688 */
7689 if (devinfo->platform == INTEL_PLATFORM_HSW) {
7690 const fs_builder ubld = bld.exec_all().group(1, 0);
7691 elk_fs_reg dst = ubld.vgrf(ELK_REGISTER_TYPE_DF, 1);
7692 ubld.DIM(dst, elk_imm_df(v));
7693 return component(dst, 0);
7694 }
7695
7696    /* gfx7 does not support DF immediates, so we generate a 64-bit constant by
7697     * writing the low 32 bits of the constant to suboffset 0 of a VGRF, the
7698     * high 32 bits to suboffset 4, and then applying a stride of 0.
7699     *
7700     * Alternatively, we could produce a normal VGRF (without stride 0) by
7701     * writing to all the channels in the VGRF. However, that would hit the
7702     * gfx7 restriction where writes that span more than one register have to
7703     * be split into instructions with a width of 4 (otherwise the write to
7704     * the second register runs into an execmask hardware bug), which isn't
7705     * very nice.
7706 */
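/* Worked example (illustrative, assuming a little-endian host): for
 * v = 1.0 the IEEE-754 encoding is 0x3ff0000000000000, so di.i1 (the low
 * dword) is 0x00000000 and di.i2 (the high dword) is 0x3ff00000; the
 * stride-0 DF read of the pair below then yields 1.0 in every channel.
 */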
7707 union {
7708 double d;
7709 struct {
7710 uint32_t i1;
7711 uint32_t i2;
7712 };
7713 } di;
7714
7715 di.d = v;
7716
7717 const fs_builder ubld = bld.exec_all().group(1, 0);
7718 const elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD, 2);
7719 ubld.MOV(tmp, elk_imm_ud(di.i1));
7720 ubld.MOV(horiz_offset(tmp, 1), elk_imm_ud(di.i2));
7721
7722 return component(retype(tmp, ELK_REGISTER_TYPE_DF), 0);
7723 }
7724
7725 elk_fs_reg
7726 elk_setup_imm_b(const fs_builder &bld, int8_t v)
7727 {
7728 const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_B);
7729 bld.MOV(tmp, elk_imm_w(v));
7730 return tmp;
7731 }
7732
7733 elk_fs_reg
7734 elk_setup_imm_ub(const fs_builder &bld, uint8_t v)
7735 {
7736 const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UB);
7737 bld.MOV(tmp, elk_imm_uw(v));
7738 return tmp;
7739 }
7740
7741 static void
7742 fs_nir_emit_instr(nir_to_elk_state &ntb, nir_instr *instr)
7743 {
7744 ntb.bld = ntb.bld.annotate(NULL, instr);
7745
7746 switch (instr->type) {
7747 case nir_instr_type_alu:
7748 fs_nir_emit_alu(ntb, nir_instr_as_alu(instr), true);
7749 break;
7750
7751 case nir_instr_type_deref:
7752 unreachable("All derefs should've been lowered");
7753 break;
7754
7755 case nir_instr_type_intrinsic:
7756 switch (ntb.s.stage) {
7757 case MESA_SHADER_VERTEX:
7758 fs_nir_emit_vs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7759 break;
7760 case MESA_SHADER_TESS_CTRL:
7761 fs_nir_emit_tcs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7762 break;
7763 case MESA_SHADER_TESS_EVAL:
7764 fs_nir_emit_tes_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7765 break;
7766 case MESA_SHADER_GEOMETRY:
7767 fs_nir_emit_gs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7768 break;
7769 case MESA_SHADER_FRAGMENT:
7770 fs_nir_emit_fs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7771 break;
7772 case MESA_SHADER_COMPUTE:
7773 fs_nir_emit_cs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
7774 break;
7775 default:
7776 unreachable("unsupported shader stage");
7777 }
7778 break;
7779
7780 case nir_instr_type_tex:
7781 fs_nir_emit_texture(ntb, nir_instr_as_tex(instr));
7782 break;
7783
7784 case nir_instr_type_load_const:
7785 fs_nir_emit_load_const(ntb, nir_instr_as_load_const(instr));
7786 break;
7787
7788 case nir_instr_type_undef:
7789 /* We create a new VGRF for undefs on every use (by handling
7790 * them in get_nir_src()), rather than for each definition.
7791 * This helps register coalescing eliminate MOVs from undef.
7792 */
7793 break;
7794
7795 case nir_instr_type_jump:
7796 fs_nir_emit_jump(ntb, nir_instr_as_jump(instr));
7797 break;
7798
7799 default:
7800 unreachable("unknown instruction type");
7801 }
7802 }
7803
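/* Translate NIR float_controls execution-mode bits into a cr0 value and a
 * mask of the cr0 bits that value is meant to control.
 *
 * Flush-to-zero modes only contribute to the mask (the corresponding
 * denorm-preserve bit must end up cleared), and the default mode claims the
 * whole FP mode field.  For example, RTZ rounding for fp32 combined with
 * fp16 flush-to-zero yields ELK_RND_MODE_RTZ in the rounding-mode field and
 * a mask covering both ELK_CR0_RND_MODE_MASK and
 * ELK_CR0_FP16_DENORM_PRESERVE.
 */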
static unsigned
elk_rnd_mode_from_nir(unsigned mode, unsigned *mask)
{
   unsigned elk_mode = 0;
   *mask = 0;

   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
       mode) {
      elk_mode |= ELK_RND_MODE_RTZ << ELK_CR0_RND_MODE_SHIFT;
      *mask |= ELK_CR0_RND_MODE_MASK;
   }
   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
       mode) {
      elk_mode |= ELK_RND_MODE_RTNE << ELK_CR0_RND_MODE_SHIFT;
      *mask |= ELK_CR0_RND_MODE_MASK;
   }
   if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) {
      elk_mode |= ELK_CR0_FP16_DENORM_PRESERVE;
      *mask |= ELK_CR0_FP16_DENORM_PRESERVE;
   }
   if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) {
      elk_mode |= ELK_CR0_FP32_DENORM_PRESERVE;
      *mask |= ELK_CR0_FP32_DENORM_PRESERVE;
   }
   if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) {
      elk_mode |= ELK_CR0_FP64_DENORM_PRESERVE;
      *mask |= ELK_CR0_FP64_DENORM_PRESERVE;
   }
   if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16)
      *mask |= ELK_CR0_FP16_DENORM_PRESERVE;
   if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32)
      *mask |= ELK_CR0_FP32_DENORM_PRESERVE;
   if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64)
      *mask |= ELK_CR0_FP64_DENORM_PRESERVE;
   if (mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE)
      *mask |= ELK_CR0_FP_MODE_MASK;

   if (*mask != 0)
      assert((*mask & elk_mode) == elk_mode);

   return elk_mode;
}

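/* If the shader requests non-default float controls, emit a single
 * FLOAT_CONTROL_MODE instruction, before the rest of the shader is emitted,
 * to program the requested rounding/denorm behaviour into cr0.
 */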
static void
emit_shader_float_controls_execution_mode(nir_to_elk_state &ntb)
{
   const fs_builder &bld = ntb.bld;
   elk_fs_visitor &s = ntb.s;

   unsigned execution_mode = s.nir->info.float_controls_execution_mode;
   if (execution_mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE)
      return;

   fs_builder ubld = bld.exec_all().group(1, 0);
   fs_builder abld = ubld.annotate("shader float controls execution mode");
   unsigned mask, mode = elk_rnd_mode_from_nir(execution_mode, &mask);

   if (mask == 0)
      return;

   abld.emit(ELK_SHADER_OPCODE_FLOAT_CONTROL_MODE, bld.null_reg_ud(),
             elk_imm_d(mode), elk_imm_d(mask));
}

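/* Top-level entry point: translate the visitor's NIR shader into elk IR,
 * appending instructions through the visitor's builder.
 */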
void
nir_to_elk(elk_fs_visitor *s)
{
   nir_to_elk_state ntb = {
      .s = *s,
      .nir = s->nir,
      .devinfo = s->devinfo,
      .mem_ctx = ralloc_context(NULL),
      .bld = fs_builder(s).at_end(),
   };

   emit_shader_float_controls_execution_mode(ntb);

   /* Emit the arrays used for inputs and outputs - load/store intrinsics
    * will be converted to reads/writes of these arrays.
    */
   fs_nir_setup_outputs(ntb);
   fs_nir_setup_uniforms(ntb.s);
   fs_nir_emit_system_values(ntb);
   ntb.s.last_scratch = ALIGN(ntb.nir->scratch_size, 4) * ntb.s.dispatch_width;

   fs_nir_emit_impl(ntb, nir_shader_get_entrypoint((nir_shader *)ntb.nir));

   ntb.bld.emit(ELK_SHADER_OPCODE_HALT_TARGET);

   ralloc_free(ntb.mem_ctx);
}