1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "elk_fs.h"
25 #include "elk_fs_builder.h"
26 #include "elk_nir.h"
27 #include "elk_nir_private.h"
28 #include "elk_eu.h"
29 #include "nir.h"
30 #include "nir_intrinsics.h"
31 #include "nir_search_helpers.h"
32 #include "util/u_math.h"
33 #include "util/bitscan.h"
34
35 #include <vector>
36
37 using namespace elk;
38
39 struct elk_fs_bind_info {
40 bool valid;
41 bool bindless;
42 unsigned block;
43 unsigned set;
44 unsigned binding;
45 };
46
47 struct nir_to_elk_state {
48 elk_fs_visitor &s;
49 const nir_shader *nir;
50 const intel_device_info *devinfo;
51 void *mem_ctx;
52
53 /* Points to the end of the program. Annotated with the current NIR
54 * instruction when applicable.
55 */
56 fs_builder bld;
57
58 elk_fs_reg *ssa_values;
59 elk_fs_inst **resource_insts;
60 struct elk_fs_bind_info *ssa_bind_infos;
61 elk_fs_reg *resource_values;
62 elk_fs_reg *system_values;
63 };
64
65 static elk_fs_reg get_nir_src(nir_to_elk_state &ntb, const nir_src &src);
66 static elk_fs_reg get_nir_def(nir_to_elk_state &ntb, const nir_def &def);
67 static nir_component_mask_t get_nir_write_mask(const nir_def &def);
68
69 static void fs_nir_emit_intrinsic(nir_to_elk_state &ntb, const fs_builder &bld, nir_intrinsic_instr *instr);
70 static elk_fs_reg emit_samplepos_setup(nir_to_elk_state &ntb);
71 static elk_fs_reg emit_sampleid_setup(nir_to_elk_state &ntb);
72 static elk_fs_reg emit_samplemaskin_setup(nir_to_elk_state &ntb);
73
74 static void fs_nir_emit_impl(nir_to_elk_state &ntb, nir_function_impl *impl);
75 static void fs_nir_emit_cf_list(nir_to_elk_state &ntb, exec_list *list);
76 static void fs_nir_emit_if(nir_to_elk_state &ntb, nir_if *if_stmt);
77 static void fs_nir_emit_loop(nir_to_elk_state &ntb, nir_loop *loop);
78 static void fs_nir_emit_block(nir_to_elk_state &ntb, nir_block *block);
79 static void fs_nir_emit_instr(nir_to_elk_state &ntb, nir_instr *instr);
80
81 static void fs_nir_emit_surface_atomic(nir_to_elk_state &ntb,
82 const fs_builder &bld,
83 nir_intrinsic_instr *instr,
84 elk_fs_reg surface,
85 bool bindless);
86 static void fs_nir_emit_global_atomic(nir_to_elk_state &ntb,
87 const fs_builder &bld,
88 nir_intrinsic_instr *instr);
89
90 static void
91 fs_nir_setup_outputs(nir_to_elk_state &ntb)
92 {
93 elk_fs_visitor &s = ntb.s;
94
95 if (s.stage == MESA_SHADER_TESS_CTRL ||
96 s.stage == MESA_SHADER_FRAGMENT)
97 return;
98
99 unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
100
101 /* Calculate the size of output registers in a separate pass, before
102 * allocating them. With ARB_enhanced_layouts, multiple output variables
103 * may occupy the same slot, but have different type sizes.
104 */
105 nir_foreach_shader_out_variable(var, s.nir) {
106 const int loc = var->data.driver_location;
107 const unsigned var_vec4s = nir_variable_count_slots(var, var->type);
108 vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
109 }
110
111 for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
112 if (vec4s[loc] == 0) {
113 loc++;
114 continue;
115 }
116
117 unsigned reg_size = vec4s[loc];
118
119 /* Check if there are any ranges that start within this range and extend
120 * past it. If so, include them in this allocation.
121 */
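/* Illustrative example (values chosen arbitrarily, not from the code): if
 * vec4s[loc] == 2 and vec4s[loc + 1] == 3, the range starting at loc + 1
 * extends one slot past this one, so the loop below computes
 * reg_size = MAX2(3 + 1, 2) == 4 and a single 4-slot allocation covers both
 * overlapping ranges.
 */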
122 for (unsigned i = 1; i < reg_size; i++) {
123 assert(i + loc < ARRAY_SIZE(vec4s));
124 reg_size = MAX2(vec4s[i + loc] + i, reg_size);
125 }
126
127 elk_fs_reg reg = ntb.bld.vgrf(ELK_REGISTER_TYPE_F, 4 * reg_size);
128 for (unsigned i = 0; i < reg_size; i++) {
129 assert(loc + i < ARRAY_SIZE(s.outputs));
130 s.outputs[loc + i] = offset(reg, ntb.bld, 4 * i);
131 }
132
133 loc += reg_size;
134 }
135 }
136
137 static void
138 fs_nir_setup_uniforms(elk_fs_visitor &s)
139 {
140 /* Only the first compile gets to set up uniforms. */
141 if (s.push_constant_loc)
142 return;
143
144 s.uniforms = s.nir->num_uniforms / 4;
145
146 if (gl_shader_stage_is_compute(s.stage)) {
147 /* Add uniforms for builtins after regular NIR uniforms. */
148 assert(s.uniforms == s.prog_data->nr_params);
149
150 /* Subgroup ID must be the last uniform on the list. This will make
151 * it easier later to split between cross thread and per thread
152 * uniforms.
153 */
154 uint32_t *param = elk_stage_prog_data_add_params(s.prog_data, 1);
155 *param = ELK_PARAM_BUILTIN_SUBGROUP_ID;
156 s.uniforms++;
157 }
158 }
159
160 static elk_fs_reg
161 emit_work_group_id_setup(nir_to_elk_state &ntb)
162 {
163 elk_fs_visitor &s = ntb.s;
164 const fs_builder &bld = ntb.bld;
165
166 assert(gl_shader_stage_is_compute(s.stage));
167
168 elk_fs_reg id = bld.vgrf(ELK_REGISTER_TYPE_UD, 3);
169
170 struct elk_reg r0_1(retype(elk_vec1_grf(0, 1), ELK_REGISTER_TYPE_UD));
171 bld.MOV(id, r0_1);
172
173 struct elk_reg r0_6(retype(elk_vec1_grf(0, 6), ELK_REGISTER_TYPE_UD));
174 struct elk_reg r0_7(retype(elk_vec1_grf(0, 7), ELK_REGISTER_TYPE_UD));
175 bld.MOV(offset(id, bld, 1), r0_6);
176 bld.MOV(offset(id, bld, 2), r0_7);
177
178 return id;
179 }
180
181 static bool
182 emit_system_values_block(nir_to_elk_state &ntb, nir_block *block)
183 {
184 elk_fs_visitor &s = ntb.s;
185 elk_fs_reg *reg;
186
187 nir_foreach_instr(instr, block) {
188 if (instr->type != nir_instr_type_intrinsic)
189 continue;
190
191 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
192 switch (intrin->intrinsic) {
193 case nir_intrinsic_load_vertex_id:
194 case nir_intrinsic_load_base_vertex:
195 unreachable("should be lowered by nir_lower_system_values().");
196
197 case nir_intrinsic_load_vertex_id_zero_base:
198 case nir_intrinsic_load_is_indexed_draw:
199 case nir_intrinsic_load_first_vertex:
200 case nir_intrinsic_load_instance_id:
201 case nir_intrinsic_load_base_instance:
202 unreachable("should be lowered by elk_nir_lower_vs_inputs().");
203 break;
204
205 case nir_intrinsic_load_draw_id:
206 unreachable("should be lowered by elk_nir_lower_vs_inputs().");
207 break;
208
209 case nir_intrinsic_load_invocation_id:
210 if (s.stage == MESA_SHADER_TESS_CTRL)
211 break;
212 assert(s.stage == MESA_SHADER_GEOMETRY);
213 reg = &ntb.system_values[SYSTEM_VALUE_INVOCATION_ID];
214 if (reg->file == BAD_FILE) {
215 *reg = s.gs_payload().instance_id;
216 }
217 break;
218
219 case nir_intrinsic_load_sample_pos:
220 case nir_intrinsic_load_sample_pos_or_center:
221 assert(s.stage == MESA_SHADER_FRAGMENT);
222 reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_POS];
223 if (reg->file == BAD_FILE)
224 *reg = emit_samplepos_setup(ntb);
225 break;
226
227 case nir_intrinsic_load_sample_id:
228 assert(s.stage == MESA_SHADER_FRAGMENT);
229 reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_ID];
230 if (reg->file == BAD_FILE)
231 *reg = emit_sampleid_setup(ntb);
232 break;
233
234 case nir_intrinsic_load_sample_mask_in:
235 assert(s.stage == MESA_SHADER_FRAGMENT);
236 assert(s.devinfo->ver >= 7);
237 reg = &ntb.system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
238 if (reg->file == BAD_FILE)
239 *reg = emit_samplemaskin_setup(ntb);
240 break;
241
242 case nir_intrinsic_load_workgroup_id:
243 assert(gl_shader_stage_is_compute(s.stage));
244 reg = &ntb.system_values[SYSTEM_VALUE_WORKGROUP_ID];
245 if (reg->file == BAD_FILE)
246 *reg = emit_work_group_id_setup(ntb);
247 break;
248
249 case nir_intrinsic_load_helper_invocation:
250 assert(s.stage == MESA_SHADER_FRAGMENT);
251 reg = &ntb.system_values[SYSTEM_VALUE_HELPER_INVOCATION];
252 if (reg->file == BAD_FILE) {
253 const fs_builder abld =
254 ntb.bld.annotate("gl_HelperInvocation", NULL);
255
256 /* On Gfx6+ (gl_HelperInvocation is only exposed on Gfx7+) the
257 * pixel mask is in g1.7 of the thread payload.
258 *
259 * We move the per-channel pixel enable bit to the low bit of each
260 * channel by shifting the byte containing the pixel mask by the
261 * vector immediate 0x76543210UV.
262 *
263 * The region of <1,8,0> reads only 1 byte (the pixel masks for
264 * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
265 * masks for 2 and 3) in SIMD16.
266 */
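/* Worked example for illustration: if the pixel-mask byte is 0b10110101,
 * channel 0 shifts it right by 0 (bit 0 == 1, covered), channel 1 shifts it
 * right by 1 (bit 0 == 0, helper), channel 2 by 2 (bit 0 == 1), and so on,
 * so each channel ends up with its own enable bit in bit 0 of "shifted".
 */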
267 elk_fs_reg shifted = abld.vgrf(ELK_REGISTER_TYPE_UW, 1);
268
269 for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
270 const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
271 /* According to the "PS Thread Payload for Normal
272 * Dispatch" pages on the BSpec, the dispatch mask is
273 * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on
274 * gfx6+.
275 */
276 const struct elk_reg reg = elk_vec1_grf(i + 1, 7);
277 hbld.SHR(offset(shifted, hbld, i),
278 stride(retype(reg, ELK_REGISTER_TYPE_UB), 1, 8, 0),
279 elk_imm_v(0x76543210));
280 }
281
282 /* A set bit in the pixel mask means the channel is enabled, but
283 * that is the opposite of gl_HelperInvocation so we need to invert
284 * the mask.
285 *
286 * The negate source-modifier bit of logical instructions on Gfx8+
287 * performs 1's complement negation, so we can use that instead of
288 * a NOT instruction.
289 */
290 elk_fs_reg inverted = negate(shifted);
291 if (s.devinfo->ver < 8) {
292 inverted = abld.vgrf(ELK_REGISTER_TYPE_UW);
293 abld.NOT(inverted, shifted);
294 }
295
296 /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
297 * with 1 and negating.
298 */
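/* For illustration: a helper channel has a 0 pixel-mask bit, so after the
 * inversion above bit 0 is 1; the AND with 1 gives 1 and the negated MOV
 * produces -1 == ~0 (true). A covered channel gives 0 (false).
 */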
299 elk_fs_reg anded = abld.vgrf(ELK_REGISTER_TYPE_UD, 1);
300 abld.AND(anded, inverted, elk_imm_uw(1));
301
302 elk_fs_reg dst = abld.vgrf(ELK_REGISTER_TYPE_D, 1);
303 abld.MOV(dst, negate(retype(anded, ELK_REGISTER_TYPE_D)));
304 *reg = dst;
305 }
306 break;
307
308 default:
309 break;
310 }
311 }
312
313 return true;
314 }
315
316 static void
317 fs_nir_emit_system_values(nir_to_elk_state &ntb)
318 {
319 const fs_builder &bld = ntb.bld;
320 elk_fs_visitor &s = ntb.s;
321
322 ntb.system_values = ralloc_array(ntb.mem_ctx, elk_fs_reg, SYSTEM_VALUE_MAX);
323 for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
324 ntb.system_values[i] = elk_fs_reg();
325 }
326
327 /* Always emit SUBGROUP_INVOCATION. Dead code will clean it up if we
328 * never end up using it.
329 */
330 {
331 const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
332 elk_fs_reg &reg = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
333 reg = abld.vgrf(ELK_REGISTER_TYPE_UW);
334 abld.UNDEF(reg);
335
336 const fs_builder allbld8 = abld.group(8, 0).exec_all();
337 allbld8.MOV(reg, elk_imm_v(0x76543210));
338 if (s.dispatch_width > 8)
339 allbld8.ADD(byte_offset(reg, 16), reg, elk_imm_uw(8u));
340 if (s.dispatch_width > 16) {
341 const fs_builder allbld16 = abld.group(16, 0).exec_all();
342 allbld16.ADD(byte_offset(reg, 32), reg, elk_imm_uw(16u));
343 }
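/* For illustration: after these MOV/ADDs the UW register holds the lane
 * indices 0..7 (from the 0x76543210 vector immediate), 8..15 when the
 * dispatch width exceeds 8, and 16..31 when it exceeds 16, i.e. each
 * channel's gl_SubgroupInvocation value.
 */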
344 }
345
346 nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)s.nir);
347 nir_foreach_block(block, impl)
348 emit_system_values_block(ntb, block);
349 }
350
351 static void
352 fs_nir_emit_impl(nir_to_elk_state &ntb, nir_function_impl *impl)
353 {
354 ntb.ssa_values = rzalloc_array(ntb.mem_ctx, elk_fs_reg, impl->ssa_alloc);
355 ntb.resource_insts = rzalloc_array(ntb.mem_ctx, elk_fs_inst *, impl->ssa_alloc);
356 ntb.ssa_bind_infos = rzalloc_array(ntb.mem_ctx, struct elk_fs_bind_info, impl->ssa_alloc);
357 ntb.resource_values = rzalloc_array(ntb.mem_ctx, elk_fs_reg, impl->ssa_alloc);
358
359 fs_nir_emit_cf_list(ntb, &impl->body);
360 }
361
362 static void
363 fs_nir_emit_cf_list(nir_to_elk_state &ntb, exec_list *list)
364 {
365 exec_list_validate(list);
366 foreach_list_typed(nir_cf_node, node, node, list) {
367 switch (node->type) {
368 case nir_cf_node_if:
369 fs_nir_emit_if(ntb, nir_cf_node_as_if(node));
370 break;
371
372 case nir_cf_node_loop:
373 fs_nir_emit_loop(ntb, nir_cf_node_as_loop(node));
374 break;
375
376 case nir_cf_node_block:
377 fs_nir_emit_block(ntb, nir_cf_node_as_block(node));
378 break;
379
380 default:
381 unreachable("Invalid CFG node block");
382 }
383 }
384 }
385
386 static void
387 fs_nir_emit_if(nir_to_elk_state &ntb, nir_if *if_stmt)
388 {
389 const intel_device_info *devinfo = ntb.devinfo;
390 const fs_builder &bld = ntb.bld;
391
392 bool invert;
393 elk_fs_reg cond_reg;
394
395 /* If the condition has the form !other_condition, use other_condition as
396 * the source, but invert the predicate on the if instruction.
397 */
398 nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition);
399 if (cond != NULL && cond->op == nir_op_inot) {
400 invert = true;
401 cond_reg = get_nir_src(ntb, cond->src[0].src);
402 cond_reg = offset(cond_reg, bld, cond->src[0].swizzle[0]);
403
404 if (devinfo->ver <= 5 &&
405 (cond->instr.pass_flags & ELK_NIR_BOOLEAN_MASK) == ELK_NIR_BOOLEAN_NEEDS_RESOLVE) {
406 /* redo boolean resolve on gen5 */
407 elk_fs_reg masked = ntb.s.vgrf(glsl_int_type());
408 bld.AND(masked, cond_reg, elk_imm_d(1));
409 masked.negate = true;
410 elk_fs_reg tmp = bld.vgrf(cond_reg.type);
411 bld.MOV(retype(tmp, ELK_REGISTER_TYPE_D), masked);
412 cond_reg = tmp;
413 }
414 } else {
415 invert = false;
416 cond_reg = get_nir_src(ntb, if_stmt->condition);
417 }
418
419 /* first, put the condition into f0 */
420 elk_fs_inst *inst = bld.MOV(bld.null_reg_d(),
421 retype(cond_reg, ELK_REGISTER_TYPE_D));
422 inst->conditional_mod = ELK_CONDITIONAL_NZ;
423
424 bld.IF(ELK_PREDICATE_NORMAL)->predicate_inverse = invert;
425
426 fs_nir_emit_cf_list(ntb, &if_stmt->then_list);
427
428 if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
429 bld.emit(ELK_OPCODE_ELSE);
430 fs_nir_emit_cf_list(ntb, &if_stmt->else_list);
431 }
432
433 bld.emit(ELK_OPCODE_ENDIF);
434
435 if (devinfo->ver < 7)
436 ntb.s.limit_dispatch_width(16, "Non-uniform control flow unsupported "
437 "in SIMD32 mode.");
438 }
439
440 static void
441 fs_nir_emit_loop(nir_to_elk_state &ntb, nir_loop *loop)
442 {
443 const intel_device_info *devinfo = ntb.devinfo;
444 const fs_builder &bld = ntb.bld;
445
446 assert(!nir_loop_has_continue_construct(loop));
447 bld.emit(ELK_OPCODE_DO);
448
449 fs_nir_emit_cf_list(ntb, &loop->body);
450
451 bld.emit(ELK_OPCODE_WHILE);
452
453 if (devinfo->ver < 7)
454 ntb.s.limit_dispatch_width(16, "Non-uniform control flow unsupported "
455 "in SIMD32 mode.");
456 }
457
458 static void
459 fs_nir_emit_block(nir_to_elk_state &ntb, nir_block *block)
460 {
461 fs_builder bld = ntb.bld;
462
463 nir_foreach_instr(instr, block) {
464 fs_nir_emit_instr(ntb, instr);
465 }
466
467 ntb.bld = bld;
468 }
469
470 /**
471 * Recognizes a parent instruction of nir_op_extract_* and changes the type to
472 * match instr.
473 */
474 static bool
475 optimize_extract_to_float(nir_to_elk_state &ntb, nir_alu_instr *instr,
476 const elk_fs_reg &result)
477 {
478 const intel_device_info *devinfo = ntb.devinfo;
479 const fs_builder &bld = ntb.bld;
480
481 /* No fast path for f16 or f64. */
482 assert(instr->op == nir_op_i2f32 || instr->op == nir_op_u2f32);
483
484 if (!instr->src[0].src.ssa->parent_instr)
485 return false;
486
487 if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
488 return false;
489
490 nir_alu_instr *src0 =
491 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
492
493 unsigned bytes;
494 bool is_signed;
495
496 switch (src0->op) {
497 case nir_op_extract_u8:
498 case nir_op_extract_u16:
499 bytes = src0->op == nir_op_extract_u8 ? 1 : 2;
500
501 /* i2f(extract_u8(a, b)) and u2f(extract_u8(a, b)) produce the same
502 * result. Ditto for extract_u16.
503 */
504 is_signed = false;
505 break;
506
507 case nir_op_extract_i8:
508 case nir_op_extract_i16:
509 bytes = src0->op == nir_op_extract_i8 ? 1 : 2;
510
511 /* The fast path can't handle u2f(extract_i8(a, b)) because the implicit
512 * sign extension of the extract_i8 is lost. For example,
513 * u2f(extract_i8(0x0000ff00, 1)) should produce 4294967295.0, but a
514 * fast path could either give 255.0 (by implementing the fast path as
515 * u2f(extract_u8(x))) or -1.0 (by implementing the fast path as
516 * i2f(extract_i8(x))). At one point in time, we incorrectly implemented
517 * the former.
518 */
519 if (instr->op != nir_op_i2f32)
520 return false;
521
522 is_signed = true;
523 break;
524
525 default:
526 return false;
527 }
528
529 unsigned element = nir_src_as_uint(src0->src[1].src);
530
531 /* Element type to extract.*/
532 const elk_reg_type type = elk_int_type(bytes, is_signed);
533
534 elk_fs_reg op0 = get_nir_src(ntb, src0->src[0].src);
535 op0.type = elk_type_for_nir_type(devinfo,
536 (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
537 nir_src_bit_size(src0->src[0].src)));
538 op0 = offset(op0, bld, src0->src[0].swizzle[0]);
539
540 bld.MOV(result, subscript(op0, type, element));
541 return true;
542 }
543
544 static bool
545 optimize_frontfacing_ternary(nir_to_elk_state &ntb,
546 nir_alu_instr *instr,
547 const elk_fs_reg &result)
548 {
549 const intel_device_info *devinfo = ntb.devinfo;
550 elk_fs_visitor &s = ntb.s;
551
552 nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src);
553 if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face)
554 return false;
555
556 if (!nir_src_is_const(instr->src[1].src) ||
557 !nir_src_is_const(instr->src[2].src))
558 return false;
559
560 const float value1 = nir_src_as_float(instr->src[1].src);
561 const float value2 = nir_src_as_float(instr->src[2].src);
562 if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
563 return false;
564
565 /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
566 assert(value1 == -value2);
567
568 elk_fs_reg tmp = s.vgrf(glsl_int_type());
569
570 if (devinfo->ver >= 6) {
571 /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
572 elk_fs_reg g0 = elk_fs_reg(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_W));
573
574 /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
575 *
576 * or(8) tmp.1<2>W g0.0<0,1,0>W 0x00003f80W
577 * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
578 *
579 * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
580 *
581 * This negation looks like it's safe in practice, because bits 0:4 will
582 * surely be TRIANGLES
583 */
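/* Worked example for illustration: the OR forces the exponent bits of 1.0f
 * into the high word of tmp, and the AND with 0xbf800000 below keeps only
 * the sign bit (bit 15 of g0.0) plus those exponent bits, so the result is
 * 0x3f800000 == 1.0f when front facing (bit 15 clear) and
 * 0xbf800000 == -1.0f when back facing (bit 15 set).
 */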
584
585 if (value1 == -1.0f) {
586 g0.negate = true;
587 }
588
589 ntb.bld.OR(subscript(tmp, ELK_REGISTER_TYPE_W, 1),
590 g0, elk_imm_uw(0x3f80));
591 } else {
592 /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
593 elk_fs_reg g1_6 = elk_fs_reg(retype(elk_vec1_grf(1, 6), ELK_REGISTER_TYPE_D));
594
595 /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
596 *
597 * or(8) tmp<1>D g1.6<0,1,0>D 0x3f800000D
598 * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D
599 *
600 * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
601 *
602 * This negation looks like it's safe in practice, because bits 0:4 will
603 * surely be TRIANGLES
604 */
605
606 if (value1 == -1.0f) {
607 g1_6.negate = true;
608 }
609
610 ntb.bld.OR(tmp, g1_6, elk_imm_d(0x3f800000));
611 }
612 ntb.bld.AND(retype(result, ELK_REGISTER_TYPE_D), tmp, elk_imm_d(0xbf800000));
613
614 return true;
615 }
616
617 static elk_rnd_mode
618 elk_rnd_mode_from_nir_op(const nir_op op) {
619 switch (op) {
620 case nir_op_f2f16_rtz:
621 return ELK_RND_MODE_RTZ;
622 case nir_op_f2f16_rtne:
623 return ELK_RND_MODE_RTNE;
624 default:
625 unreachable("Operation doesn't support rounding mode");
626 }
627 }
628
629 static elk_rnd_mode
630 elk_rnd_mode_from_execution_mode(unsigned execution_mode)
631 {
632 if (nir_has_any_rounding_mode_rtne(execution_mode))
633 return ELK_RND_MODE_RTNE;
634 if (nir_has_any_rounding_mode_rtz(execution_mode))
635 return ELK_RND_MODE_RTZ;
636 return ELK_RND_MODE_UNSPECIFIED;
637 }
638
639 static elk_fs_reg
640 prepare_alu_destination_and_sources(nir_to_elk_state &ntb,
641 const fs_builder &bld,
642 nir_alu_instr *instr,
643 elk_fs_reg *op,
644 bool need_dest)
645 {
646 const intel_device_info *devinfo = ntb.devinfo;
647
648 elk_fs_reg result =
649 need_dest ? get_nir_def(ntb, instr->def) : bld.null_reg_ud();
650
651 result.type = elk_type_for_nir_type(devinfo,
652 (nir_alu_type)(nir_op_infos[instr->op].output_type |
653 instr->def.bit_size));
654
655 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
656 op[i] = get_nir_src(ntb, instr->src[i].src);
657 op[i].type = elk_type_for_nir_type(devinfo,
658 (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
659 nir_src_bit_size(instr->src[i].src)));
660 }
661
662 /* Move and vecN instructions may still be vectored. Return the raw,
663 * vectored source and destination so that elk_fs_visitor::nir_emit_alu can
664 * handle it. Other callers should not have to handle these kinds of
665 * instructions.
666 */
667 switch (instr->op) {
668 case nir_op_mov:
669 case nir_op_vec2:
670 case nir_op_vec3:
671 case nir_op_vec4:
672 case nir_op_vec8:
673 case nir_op_vec16:
674 return result;
675 default:
676 break;
677 }
678
679 /* At this point, we have dealt with any instruction that operates on
680 * more than a single channel. Therefore, we can just adjust the source
681 * and destination registers for that channel and emit the instruction.
682 */
683 unsigned channel = 0;
684 if (nir_op_infos[instr->op].output_size == 0) {
685 /* Since NIR is doing the scalarizing for us, we should only ever see
686 * vectorized operations with a single channel.
687 */
688 nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
689 assert(util_bitcount(write_mask) == 1);
690 channel = ffs(write_mask) - 1;
691
692 result = offset(result, bld, channel);
693 }
694
695 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
696 assert(nir_op_infos[instr->op].input_sizes[i] < 2);
697 op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
698 }
699
700 return result;
701 }
702
703 static elk_fs_reg
704 resolve_source_modifiers(const fs_builder &bld, const elk_fs_reg &src)
705 {
706 if (!src.abs && !src.negate)
707 return src;
708
709 elk_fs_reg temp = bld.vgrf(src.type);
710 bld.MOV(temp, src);
711
712 return temp;
713 }
714
715 static void
716 resolve_inot_sources(nir_to_elk_state &ntb, const fs_builder &bld, nir_alu_instr *instr,
717 elk_fs_reg *op)
718 {
719 for (unsigned i = 0; i < 2; i++) {
720 nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src);
721
722 if (inot_instr != NULL && inot_instr->op == nir_op_inot) {
723 /* The source of the inot is now the source of instr. */
724 prepare_alu_destination_and_sources(ntb, bld, inot_instr, &op[i], false);
725
726 assert(!op[i].negate);
727 op[i].negate = true;
728 } else {
729 op[i] = resolve_source_modifiers(bld, op[i]);
730 }
731 }
732 }
733
734 static bool
735 try_emit_b2fi_of_inot(nir_to_elk_state &ntb, const fs_builder &bld,
736 elk_fs_reg result,
737 nir_alu_instr *instr)
738 {
739 const intel_device_info *devinfo = bld.shader->devinfo;
740
741 if (devinfo->ver < 6)
742 return false;
743
744 nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src);
745
746 if (inot_instr == NULL || inot_instr->op != nir_op_inot)
747 return false;
748
749 /* HF is also possible as a destination on BDW+. For nir_op_b2i, the set
750 * of valid size-changing combinations is a bit more complex.
751 *
752 * The source restriction is just because I was lazy about generating the
753 * constant below.
754 */
755 if (instr->def.bit_size != 32 ||
756 nir_src_bit_size(inot_instr->src[0].src) != 32)
757 return false;
758
759 /* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0. Since a can only be 0 or -1,
760 * this is float(1 + a).
761 */
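/* For illustration: a == 0 (false) gives 1 + 0 == 1, and a == -1 (true)
 * gives 1 + (-1) == 0, which is exactly b2f/b2i of !a.
 */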
762 elk_fs_reg op;
763
764 prepare_alu_destination_and_sources(ntb, bld, inot_instr, &op, false);
765
766 /* Ignore the saturate modifier, if there is one. The result of the
767 * arithmetic can only be 0 or 1, so the clamping will do nothing anyway.
768 */
769 bld.ADD(result, op, elk_imm_d(1));
770
771 return true;
772 }
773
774 /**
775 * Emit code for nir_op_fsign possibly fused with a nir_op_fmul
776 *
777 * If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of
778 * the source of \c instr that is a \c nir_op_fsign.
779 */
780 static void
781 emit_fsign(nir_to_elk_state &ntb, const fs_builder &bld, const nir_alu_instr *instr,
782 elk_fs_reg result, elk_fs_reg *op, unsigned fsign_src)
783 {
784 const intel_device_info *devinfo = ntb.devinfo;
785
786 elk_fs_inst *inst;
787
788 assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul);
789 assert(fsign_src < nir_op_infos[instr->op].num_inputs);
790
791 if (instr->op != nir_op_fsign) {
792 const nir_alu_instr *const fsign_instr =
793 nir_src_as_alu_instr(instr->src[fsign_src].src);
794
795 /* op[fsign_src] has the nominal result of the fsign, and op[1 -
796 * fsign_src] has the other multiply source. This must be rearranged so
797 * that op[0] is the source of the fsign and op[1] is the other multiply
798 * source.
799 */
800 if (fsign_src != 0)
801 op[1] = op[0];
802
803 op[0] = get_nir_src(ntb, fsign_instr->src[0].src);
804
805 const nir_alu_type t =
806 (nir_alu_type)(nir_op_infos[instr->op].input_types[0] |
807 nir_src_bit_size(fsign_instr->src[0].src));
808
809 op[0].type = elk_type_for_nir_type(devinfo, t);
810
811 unsigned channel = 0;
812 if (nir_op_infos[instr->op].output_size == 0) {
813 /* Since NIR is doing the scalarizing for us, we should only ever see
814 * vectorized operations with a single channel.
815 */
816 nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
817 assert(util_bitcount(write_mask) == 1);
818 channel = ffs(write_mask) - 1;
819 }
820
821 op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]);
822 }
823
824 if (type_sz(op[0].type) == 2) {
825 /* AND(val, 0x8000) gives the sign bit.
826 *
827 * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero.
828 */
829 elk_fs_reg zero = retype(elk_imm_uw(0), ELK_REGISTER_TYPE_HF);
830 bld.CMP(bld.null_reg_f(), op[0], zero, ELK_CONDITIONAL_NZ);
831
832 op[0].type = ELK_REGISTER_TYPE_UW;
833 result.type = ELK_REGISTER_TYPE_UW;
834 bld.AND(result, op[0], elk_imm_uw(0x8000u));
835
836 if (instr->op == nir_op_fsign)
837 inst = bld.OR(result, result, elk_imm_uw(0x3c00u));
838 else {
839 /* Use XOR here to get the result sign correct. */
840 inst = bld.XOR(result, result, retype(op[1], ELK_REGISTER_TYPE_UW));
841 }
842
843 inst->predicate = ELK_PREDICATE_NORMAL;
844 } else if (type_sz(op[0].type) == 4) {
845 /* AND(val, 0x80000000) gives the sign bit.
846 *
847 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
848 * zero.
849 */
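/* Worked example for illustration: op[0] == -2.5f has sign bit 0x80000000,
 * so the AND produces 0x80000000 and the predicated OR below yields
 * 0xbf800000 == -1.0f. op[0] == 0.0f fails the CMP, the predicated OR is
 * skipped, and the result stays 0x00000000 == 0.0f.
 */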
850 bld.CMP(bld.null_reg_f(), op[0], elk_imm_f(0.0f), ELK_CONDITIONAL_NZ);
851
852 op[0].type = ELK_REGISTER_TYPE_UD;
853 result.type = ELK_REGISTER_TYPE_UD;
854 bld.AND(result, op[0], elk_imm_ud(0x80000000u));
855
856 if (instr->op == nir_op_fsign)
857 inst = bld.OR(result, result, elk_imm_ud(0x3f800000u));
858 else {
859 /* Use XOR here to get the result sign correct. */
860 inst = bld.XOR(result, result, retype(op[1], ELK_REGISTER_TYPE_UD));
861 }
862
863 inst->predicate = ELK_PREDICATE_NORMAL;
864 } else {
865 unreachable("Should have been lowered by nir_opt_algebraic.");
866 }
867 }
868
869 /**
870 * Determine whether sources of a nir_op_fmul can be fused with a nir_op_fsign
871 *
872 * Checks the operands of a \c nir_op_fmul to determine whether or not
873 * \c emit_fsign could fuse the multiplication with the \c sign() calculation.
874 *
875 * \param instr The multiplication instruction
876 *
877 * \param fsign_src The source of \c instr that may or may not be a
878 * \c nir_op_fsign
879 */
880 static bool
881 can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src)
882 {
883 assert(instr->op == nir_op_fmul);
884
885 nir_alu_instr *const fsign_instr =
886 nir_src_as_alu_instr(instr->src[fsign_src].src);
887
888 /* Rules:
889 *
890 * 1. instr->src[fsign_src] must be a nir_op_fsign.
891 * 2. The nir_op_fsign can only be used by this multiplication.
892 * 3. The source that is the nir_op_fsign does not have source modifiers.
893 * \c emit_fsign only examines the source modifiers of the source of the
894 * \c nir_op_fsign.
895 *
896 * The nir_op_fsign must also not have the saturate modifier, but steps
897 * have already been taken (in nir_opt_algebraic) to ensure that.
898 */
899 return fsign_instr != NULL && fsign_instr->op == nir_op_fsign &&
900 is_used_once(fsign_instr);
901 }
902
903 static bool
904 is_const_zero(const nir_src &src)
905 {
906 return nir_src_is_const(src) && nir_src_as_int(src) == 0;
907 }
908
909 static void
910 fs_nir_emit_alu(nir_to_elk_state &ntb, nir_alu_instr *instr,
911 bool need_dest)
912 {
913 const intel_device_info *devinfo = ntb.devinfo;
914 const fs_builder &bld = ntb.bld;
915 elk_fs_visitor &s = ntb.s;
916
917 elk_fs_inst *inst;
918 unsigned execution_mode =
919 bld.shader->nir->info.float_controls_execution_mode;
920
921 elk_fs_reg op[NIR_MAX_VEC_COMPONENTS];
922 elk_fs_reg result = prepare_alu_destination_and_sources(ntb, bld, instr, op, need_dest);
923
924 #ifndef NDEBUG
925 /* Everything except raw moves, some type conversions, iabs, and ineg
926 * should have 8-bit sources lowered by nir_lower_bit_size in
927 * elk_preprocess_nir or by elk_nir_lower_conversions in
928 * elk_postprocess_nir.
929 */
930 switch (instr->op) {
931 case nir_op_mov:
932 case nir_op_vec2:
933 case nir_op_vec3:
934 case nir_op_vec4:
935 case nir_op_vec8:
936 case nir_op_vec16:
937 case nir_op_i2f16:
938 case nir_op_i2f32:
939 case nir_op_i2i16:
940 case nir_op_i2i32:
941 case nir_op_u2f16:
942 case nir_op_u2f32:
943 case nir_op_u2u16:
944 case nir_op_u2u32:
945 case nir_op_iabs:
946 case nir_op_ineg:
947 case nir_op_pack_32_4x8_split:
948 break;
949
950 default:
951 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
952 assert(type_sz(op[i].type) > 1);
953 }
954 }
955 #endif
956
957 switch (instr->op) {
958 case nir_op_mov:
959 case nir_op_vec2:
960 case nir_op_vec3:
961 case nir_op_vec4:
962 case nir_op_vec8:
963 case nir_op_vec16: {
964 elk_fs_reg temp = result;
965 bool need_extra_copy = false;
966
967 nir_intrinsic_instr *store_reg =
968 nir_store_reg_for_def(&instr->def);
969 if (store_reg != NULL) {
970 nir_def *dest_reg = store_reg->src[1].ssa;
971 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
972 nir_intrinsic_instr *load_reg =
973 nir_load_reg_for_def(instr->src[i].src.ssa);
974 if (load_reg == NULL)
975 continue;
976
977 if (load_reg->src[0].ssa == dest_reg) {
978 need_extra_copy = true;
979 temp = bld.vgrf(result.type, 4);
980 break;
981 }
982 }
983 }
984
985 nir_component_mask_t write_mask = get_nir_write_mask(instr->def);
986 unsigned last_bit = util_last_bit(write_mask);
987
988 for (unsigned i = 0; i < last_bit; i++) {
989 if (!(write_mask & (1 << i)))
990 continue;
991
992 if (instr->op == nir_op_mov) {
993 bld.MOV(offset(temp, bld, i),
994 offset(op[0], bld, instr->src[0].swizzle[i]));
995 } else {
996 bld.MOV(offset(temp, bld, i),
997 offset(op[i], bld, instr->src[i].swizzle[0]));
998 }
999 }
1000
1001 /* In this case the source and destination registers were the same,
1002 * so we need to insert an extra set of moves in order to deal with
1003 * any swizzling.
1004 */
1005 if (need_extra_copy) {
1006 for (unsigned i = 0; i < last_bit; i++) {
1007 if (!(write_mask & (1 << i)))
1008 continue;
1009
1010 bld.MOV(offset(result, bld, i), offset(temp, bld, i));
1011 }
1012 }
1013 return;
1014 }
1015
1016 case nir_op_i2f32:
1017 case nir_op_u2f32:
1018 if (optimize_extract_to_float(ntb, instr, result))
1019 return;
1020 inst = bld.MOV(result, op[0]);
1021 break;
1022
1023 case nir_op_f2f16_rtne:
1024 case nir_op_f2f16_rtz:
1025 case nir_op_f2f16: {
1026 elk_rnd_mode rnd = ELK_RND_MODE_UNSPECIFIED;
1027
1028 if (nir_op_f2f16 == instr->op)
1029 rnd = elk_rnd_mode_from_execution_mode(execution_mode);
1030 else
1031 rnd = elk_rnd_mode_from_nir_op(instr->op);
1032
1033 if (ELK_RND_MODE_UNSPECIFIED != rnd)
1034 bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), elk_imm_d(rnd));
1035
1036 assert(type_sz(op[0].type) < 8); /* elk_nir_lower_conversions */
1037 inst = bld.F32TO16(result, op[0]);
1038 break;
1039 }
1040
1041 case nir_op_b2i8:
1042 case nir_op_b2i16:
1043 case nir_op_b2i32:
1044 case nir_op_b2i64:
1045 case nir_op_b2f16:
1046 case nir_op_b2f32:
1047 case nir_op_b2f64:
1048 if (try_emit_b2fi_of_inot(ntb, bld, result, instr))
1049 break;
1050 op[0].type = ELK_REGISTER_TYPE_D;
1051 op[0].negate = !op[0].negate;
1052 FALLTHROUGH;
1053 case nir_op_i2f64:
1054 case nir_op_i2i64:
1055 case nir_op_u2f64:
1056 case nir_op_u2u64:
1057 case nir_op_f2f64:
1058 case nir_op_f2i64:
1059 case nir_op_f2u64:
1060 case nir_op_i2i32:
1061 case nir_op_u2u32:
1062 case nir_op_f2i32:
1063 case nir_op_f2u32:
1064 case nir_op_i2f16:
1065 case nir_op_u2f16:
1066 case nir_op_f2i16:
1067 case nir_op_f2u16:
1068 case nir_op_f2i8:
1069 case nir_op_f2u8:
1070 if (result.type == ELK_REGISTER_TYPE_B ||
1071 result.type == ELK_REGISTER_TYPE_UB ||
1072 result.type == ELK_REGISTER_TYPE_HF)
1073 assert(type_sz(op[0].type) < 8); /* elk_nir_lower_conversions */
1074
1075 if (op[0].type == ELK_REGISTER_TYPE_B ||
1076 op[0].type == ELK_REGISTER_TYPE_UB ||
1077 op[0].type == ELK_REGISTER_TYPE_HF)
1078 assert(type_sz(result.type) < 8); /* elk_nir_lower_conversions */
1079
1080 inst = bld.MOV(result, op[0]);
1081 break;
1082
1083 case nir_op_i2i8:
1084 case nir_op_u2u8:
1085 assert(type_sz(op[0].type) < 8); /* elk_nir_lower_conversions */
1086 FALLTHROUGH;
1087 case nir_op_i2i16:
1088 case nir_op_u2u16: {
1089 /* Emit better code for u2u8(extract_u8(a, b)) and similar patterns.
1090 * Emitting the instructions one by one results in two MOV instructions
1091 * that won't be propagated. By handling both instructions here, a
1092 * single MOV is emitted.
1093 */
1094 nir_alu_instr *extract_instr = nir_src_as_alu_instr(instr->src[0].src);
1095 if (extract_instr != NULL) {
1096 if (extract_instr->op == nir_op_extract_u8 ||
1097 extract_instr->op == nir_op_extract_i8) {
1098 prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false);
1099
1100 const unsigned byte = nir_src_as_uint(extract_instr->src[1].src);
1101 const elk_reg_type type =
1102 elk_int_type(1, extract_instr->op == nir_op_extract_i8);
1103
1104 op[0] = subscript(op[0], type, byte);
1105 } else if (extract_instr->op == nir_op_extract_u16 ||
1106 extract_instr->op == nir_op_extract_i16) {
1107 prepare_alu_destination_and_sources(ntb, bld, extract_instr, op, false);
1108
1109 const unsigned word = nir_src_as_uint(extract_instr->src[1].src);
1110 const elk_reg_type type =
1111 elk_int_type(2, extract_instr->op == nir_op_extract_i16);
1112
1113 op[0] = subscript(op[0], type, word);
1114 }
1115 }
1116
1117 inst = bld.MOV(result, op[0]);
1118 break;
1119 }
1120
1121 case nir_op_fsat:
1122 inst = bld.MOV(result, op[0]);
1123 inst->saturate = true;
1124 break;
1125
1126 case nir_op_fneg:
1127 case nir_op_ineg:
1128 op[0].negate = true;
1129 inst = bld.MOV(result, op[0]);
1130 break;
1131
1132 case nir_op_fabs:
1133 case nir_op_iabs:
1134 op[0].negate = false;
1135 op[0].abs = true;
1136 inst = bld.MOV(result, op[0]);
1137 break;
1138
1139 case nir_op_f2f32:
1140 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1141 elk_rnd_mode rnd =
1142 elk_rnd_mode_from_execution_mode(execution_mode);
1143 bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1144 elk_imm_d(rnd));
1145 }
1146
1147 if (op[0].type == ELK_REGISTER_TYPE_HF)
1148 assert(type_sz(result.type) < 8); /* elk_nir_lower_conversions */
1149
1150 inst = bld.MOV(result, op[0]);
1151 break;
1152
1153 case nir_op_fsign:
1154 emit_fsign(ntb, bld, instr, result, op, 0);
1155 break;
1156
1157 case nir_op_frcp:
1158 inst = bld.emit(ELK_SHADER_OPCODE_RCP, result, op[0]);
1159 break;
1160
1161 case nir_op_fexp2:
1162 inst = bld.emit(ELK_SHADER_OPCODE_EXP2, result, op[0]);
1163 break;
1164
1165 case nir_op_flog2:
1166 inst = bld.emit(ELK_SHADER_OPCODE_LOG2, result, op[0]);
1167 break;
1168
1169 case nir_op_fsin:
1170 inst = bld.emit(ELK_SHADER_OPCODE_SIN, result, op[0]);
1171 break;
1172
1173 case nir_op_fcos:
1174 inst = bld.emit(ELK_SHADER_OPCODE_COS, result, op[0]);
1175 break;
1176
1177 case nir_op_fadd:
1178 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1179 elk_rnd_mode rnd =
1180 elk_rnd_mode_from_execution_mode(execution_mode);
1181 bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1182 elk_imm_d(rnd));
1183 }
1184 FALLTHROUGH;
1185 case nir_op_iadd:
1186 inst = bld.ADD(result, op[0], op[1]);
1187 break;
1188
1189 case nir_op_iadd_sat:
1190 case nir_op_uadd_sat:
1191 inst = bld.ADD(result, op[0], op[1]);
1192 inst->saturate = true;
1193 break;
1194
1195 case nir_op_isub_sat:
1196 bld.emit(ELK_SHADER_OPCODE_ISUB_SAT, result, op[0], op[1]);
1197 break;
1198
1199 case nir_op_usub_sat:
1200 bld.emit(ELK_SHADER_OPCODE_USUB_SAT, result, op[0], op[1]);
1201 break;
1202
1203 case nir_op_irhadd:
1204 case nir_op_urhadd:
1205 assert(instr->def.bit_size < 64);
1206 inst = bld.AVG(result, op[0], op[1]);
1207 break;
1208
1209 case nir_op_ihadd:
1210 case nir_op_uhadd: {
1211 assert(instr->def.bit_size < 64);
1212 elk_fs_reg tmp = bld.vgrf(result.type);
1213
1214 if (devinfo->ver >= 8) {
1215 op[0] = resolve_source_modifiers(bld, op[0]);
1216 op[1] = resolve_source_modifiers(bld, op[1]);
1217 }
1218
1219 /* AVG(x, y) - ((x ^ y) & 1) */
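/* For illustration: x == 3, y == 6 gives AVG == 5 (AVG rounds up), the XOR
 * term is 1 because x + y is odd, and 5 - 1 == 4 == (3 + 6) >> 1, the
 * round-down average the NIR opcode expects.
 */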
1220 bld.XOR(tmp, op[0], op[1]);
1221 bld.AND(tmp, tmp, retype(elk_imm_ud(1), result.type));
1222 bld.AVG(result, op[0], op[1]);
1223 inst = bld.ADD(result, result, tmp);
1224 inst->src[1].negate = true;
1225 break;
1226 }
1227
1228 case nir_op_fmul:
1229 for (unsigned i = 0; i < 2; i++) {
1230 if (can_fuse_fmul_fsign(instr, i)) {
1231 emit_fsign(ntb, bld, instr, result, op, i);
1232 return;
1233 }
1234 }
1235
1236 /* We emit the rounding mode after the previous fsign optimization since
1237 * it won't result in a MUL, but will try to negate the value by other
1238 * means.
1239 */
1240 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1241 elk_rnd_mode rnd =
1242 elk_rnd_mode_from_execution_mode(execution_mode);
1243 bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1244 elk_imm_d(rnd));
1245 }
1246
1247 inst = bld.MUL(result, op[0], op[1]);
1248 break;
1249
1250 case nir_op_imul_2x32_64:
1251 case nir_op_umul_2x32_64:
1252 bld.MUL(result, op[0], op[1]);
1253 break;
1254
1255 case nir_op_imul_32x16:
1256 case nir_op_umul_32x16: {
1257 const bool ud = instr->op == nir_op_umul_32x16;
1258 const enum elk_reg_type word_type =
1259 ud ? ELK_REGISTER_TYPE_UW : ELK_REGISTER_TYPE_W;
1260 const enum elk_reg_type dword_type =
1261 ud ? ELK_REGISTER_TYPE_UD : ELK_REGISTER_TYPE_D;
1262
1263 assert(instr->def.bit_size == 32);
1264
1265 /* Before copy propagation there are no immediate values. */
1266 assert(op[0].file != IMM && op[1].file != IMM);
1267
1268 op[1] = subscript(op[1], word_type, 0);
1269
1270 if (devinfo->ver >= 7)
1271 bld.MUL(result, retype(op[0], dword_type), op[1]);
1272 else
1273 bld.MUL(result, op[1], retype(op[0], dword_type));
1274
1275 break;
1276 }
1277
1278 case nir_op_imul:
1279 assert(instr->def.bit_size < 64);
1280 bld.MUL(result, op[0], op[1]);
1281 break;
1282
1283 case nir_op_imul_high:
1284 case nir_op_umul_high:
1285 assert(instr->def.bit_size < 64);
1286 if (instr->def.bit_size == 32) {
1287 bld.emit(ELK_SHADER_OPCODE_MULH, result, op[0], op[1]);
1288 } else {
1289 elk_fs_reg tmp = bld.vgrf(elk_reg_type_from_bit_size(32, op[0].type));
1290 bld.MUL(tmp, op[0], op[1]);
1291 bld.MOV(result, subscript(tmp, result.type, 1));
1292 }
1293 break;
1294
1295 case nir_op_idiv:
1296 case nir_op_udiv:
1297 assert(instr->def.bit_size < 64);
1298 bld.emit(ELK_SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
1299 break;
1300
1301 case nir_op_uadd_carry:
1302 unreachable("Should have been lowered by carry_to_arith().");
1303
1304 case nir_op_usub_borrow:
1305 unreachable("Should have been lowered by borrow_to_arith().");
1306
1307 case nir_op_umod:
1308 case nir_op_irem:
1309 /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
1310 * appears that our hardware just does the right thing for signed
1311 * remainder.
1312 */
1313 assert(instr->def.bit_size < 64);
1314 bld.emit(ELK_SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1315 break;
1316
1317 case nir_op_imod: {
1318 /* Get a regular C-style remainder. If a % b == 0, set the predicate. */
1319 bld.emit(ELK_SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1320
1321 /* Math instructions don't support conditional mod */
1322 inst = bld.MOV(bld.null_reg_d(), result);
1323 inst->conditional_mod = ELK_CONDITIONAL_NZ;
1324
1325 /* Now, we need to determine if signs of the sources are different.
1326 * When we XOR the sources, the top bit is 0 if they are the same and 1
1327 * if they are different. We can then use a conditional modifier to
1328 * turn that into a predicate. This leads us to an XOR.l instruction.
1329 *
1330 * Technically, according to the PRM, you're not allowed to use .l on a
1331 * XOR instruction. However, empirical experiments and Curro's reading
1332 * of the simulator source both indicate that it's safe.
1333 */
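/* Worked example for illustration: op[0] == -7, op[1] == 3. The remainder
 * is -1 (non-zero, so the first predicate is set), the signs differ (the
 * XOR is negative, so the .l conditional keeps the predicate), and the
 * final predicated ADD gives -1 + 3 == 2, the GLSL-style modulus.
 */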
1334 elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_D);
1335 inst = bld.XOR(tmp, op[0], op[1]);
1336 inst->predicate = ELK_PREDICATE_NORMAL;
1337 inst->conditional_mod = ELK_CONDITIONAL_L;
1338
1339 /* If the result of the initial remainder operation is non-zero and the
1340 * two sources have different signs, add in a copy of op[1] to get the
1341 * final integer modulus value.
1342 */
1343 inst = bld.ADD(result, result, op[1]);
1344 inst->predicate = ELK_PREDICATE_NORMAL;
1345 break;
1346 }
1347
1348 case nir_op_flt32:
1349 case nir_op_fge32:
1350 case nir_op_feq32:
1351 case nir_op_fneu32: {
1352 elk_fs_reg dest = result;
1353
1354 const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1355 if (bit_size != 32) {
1356 dest = bld.vgrf(op[0].type, 1);
1357 bld.UNDEF(dest);
1358 }
1359
1360 bld.CMP(dest, op[0], op[1], elk_cmod_for_nir_comparison(instr->op));
1361
1362 if (bit_size > 32) {
1363 bld.MOV(result, subscript(dest, ELK_REGISTER_TYPE_UD, 0));
1364 } else if (bit_size < 32) {
1365 /* When we convert the result to 32-bit we need to be careful and do
1366 * it as a signed conversion to get sign extension (for 32-bit true)
1367 */
1368 const elk_reg_type src_type =
1369 elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_D);
1370
1371 bld.MOV(retype(result, ELK_REGISTER_TYPE_D), retype(dest, src_type));
1372 }
1373 break;
1374 }
1375
1376 case nir_op_ilt32:
1377 case nir_op_ult32:
1378 case nir_op_ige32:
1379 case nir_op_uge32:
1380 case nir_op_ieq32:
1381 case nir_op_ine32: {
1382 elk_fs_reg dest = result;
1383
1384 const uint32_t bit_size = type_sz(op[0].type) * 8;
1385 if (bit_size != 32) {
1386 dest = bld.vgrf(op[0].type, 1);
1387 bld.UNDEF(dest);
1388 }
1389
1390 bld.CMP(dest, op[0], op[1],
1391 elk_cmod_for_nir_comparison(instr->op));
1392
1393 if (bit_size > 32) {
1394 bld.MOV(result, subscript(dest, ELK_REGISTER_TYPE_UD, 0));
1395 } else if (bit_size < 32) {
1396 /* When we convert the result to 32-bit we need to be careful and do
1397 * it as a signed conversion to get sign extension (for 32-bit true)
1398 */
1399 const elk_reg_type src_type =
1400 elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_D);
1401
1402 bld.MOV(retype(result, ELK_REGISTER_TYPE_D), retype(dest, src_type));
1403 }
1404 break;
1405 }
1406
1407 case nir_op_inot:
1408 if (devinfo->ver >= 8) {
1409 nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src);
1410
1411 if (inot_src_instr != NULL &&
1412 (inot_src_instr->op == nir_op_ior ||
1413 inot_src_instr->op == nir_op_ixor ||
1414 inot_src_instr->op == nir_op_iand)) {
1415 /* The sources of the source logical instruction are now the
1416 * sources of the instruction that will be generated.
1417 */
1418 prepare_alu_destination_and_sources(ntb, bld, inot_src_instr, op, false);
1419 resolve_inot_sources(ntb, bld, inot_src_instr, op);
1420
1421 /* Smash all of the sources and destination to be signed. This
1422 * doesn't matter for the operation of the instruction, but cmod
1423 * propagation fails on unsigned sources with negation (due to
1424 * elk_fs_inst::can_do_cmod returning false).
1425 */
1426 result.type =
1427 elk_type_for_nir_type(devinfo,
1428 (nir_alu_type)(nir_type_int |
1429 instr->def.bit_size));
1430 op[0].type =
1431 elk_type_for_nir_type(devinfo,
1432 (nir_alu_type)(nir_type_int |
1433 nir_src_bit_size(inot_src_instr->src[0].src)));
1434 op[1].type =
1435 elk_type_for_nir_type(devinfo,
1436 (nir_alu_type)(nir_type_int |
1437 nir_src_bit_size(inot_src_instr->src[1].src)));
1438
1439 /* For XOR, only invert one of the sources. Arbitrarily choose
1440 * the first source.
1441 */
1442 op[0].negate = !op[0].negate;
1443 if (inot_src_instr->op != nir_op_ixor)
1444 op[1].negate = !op[1].negate;
1445
1446 switch (inot_src_instr->op) {
1447 case nir_op_ior:
1448 bld.AND(result, op[0], op[1]);
1449 return;
1450
1451 case nir_op_iand:
1452 bld.OR(result, op[0], op[1]);
1453 return;
1454
1455 case nir_op_ixor:
1456 bld.XOR(result, op[0], op[1]);
1457 return;
1458
1459 default:
1460 unreachable("impossible opcode");
1461 }
1462 }
1463 op[0] = resolve_source_modifiers(bld, op[0]);
1464 }
1465 bld.NOT(result, op[0]);
1466 break;
1467 case nir_op_ixor:
1468 if (devinfo->ver >= 8) {
1469 resolve_inot_sources(ntb, bld, instr, op);
1470 }
1471 bld.XOR(result, op[0], op[1]);
1472 break;
1473 case nir_op_ior:
1474 if (devinfo->ver >= 8) {
1475 resolve_inot_sources(ntb, bld, instr, op);
1476 }
1477 bld.OR(result, op[0], op[1]);
1478 break;
1479 case nir_op_iand:
1480 if (devinfo->ver >= 8) {
1481 resolve_inot_sources(ntb, bld, instr, op);
1482 }
1483 bld.AND(result, op[0], op[1]);
1484 break;
1485
1486 case nir_op_fdot2:
1487 case nir_op_fdot3:
1488 case nir_op_fdot4:
1489 case nir_op_b32all_fequal2:
1490 case nir_op_b32all_iequal2:
1491 case nir_op_b32all_fequal3:
1492 case nir_op_b32all_iequal3:
1493 case nir_op_b32all_fequal4:
1494 case nir_op_b32all_iequal4:
1495 case nir_op_b32any_fnequal2:
1496 case nir_op_b32any_inequal2:
1497 case nir_op_b32any_fnequal3:
1498 case nir_op_b32any_inequal3:
1499 case nir_op_b32any_fnequal4:
1500 case nir_op_b32any_inequal4:
1501 unreachable("Lowered by nir_lower_alu_reductions");
1502
1503 case nir_op_ldexp:
1504 unreachable("not reached: should be handled by ldexp_to_arith()");
1505
1506 case nir_op_fsqrt:
1507 inst = bld.emit(ELK_SHADER_OPCODE_SQRT, result, op[0]);
1508 break;
1509
1510 case nir_op_frsq:
1511 inst = bld.emit(ELK_SHADER_OPCODE_RSQ, result, op[0]);
1512 break;
1513
1514 case nir_op_ftrunc:
1515 inst = bld.RNDZ(result, op[0]);
1516 if (devinfo->ver < 6) {
1517 set_condmod(ELK_CONDITIONAL_R, inst);
1518 set_predicate(ELK_PREDICATE_NORMAL,
1519 bld.ADD(result, result, elk_imm_f(1.0f)));
1520 inst = bld.MOV(result, result); /* for potential saturation */
1521 }
1522 break;
1523
1524 case nir_op_fceil: {
1525 op[0].negate = !op[0].negate;
1526 elk_fs_reg temp = s.vgrf(glsl_float_type());
1527 bld.RNDD(temp, op[0]);
1528 temp.negate = true;
1529 inst = bld.MOV(result, temp);
1530 break;
1531 }
1532 case nir_op_ffloor:
1533 inst = bld.RNDD(result, op[0]);
1534 break;
1535 case nir_op_ffract:
1536 inst = bld.FRC(result, op[0]);
1537 break;
1538 case nir_op_fround_even:
1539 inst = bld.RNDE(result, op[0]);
1540 if (devinfo->ver < 6) {
1541 set_condmod(ELK_CONDITIONAL_R, inst);
1542 set_predicate(ELK_PREDICATE_NORMAL,
1543 bld.ADD(result, result, elk_imm_f(1.0f)));
1544 inst = bld.MOV(result, result); /* for potential saturation */
1545 }
1546 break;
1547
1548 case nir_op_fquantize2f16: {
1549 elk_fs_reg tmp16 = bld.vgrf(ELK_REGISTER_TYPE_D);
1550 elk_fs_reg tmp32 = bld.vgrf(ELK_REGISTER_TYPE_F);
1551 elk_fs_reg zero = bld.vgrf(ELK_REGISTER_TYPE_F);
1552
1553 /* The destination stride must be at least as big as the source stride. */
1554 tmp16 = subscript(tmp16, ELK_REGISTER_TYPE_HF, 0);
1555
1556 /* Check for denormal */
1557 elk_fs_reg abs_src0 = op[0];
1558 abs_src0.abs = true;
1559 bld.CMP(bld.null_reg_f(), abs_src0, elk_imm_f(ldexpf(1.0, -14)),
1560 ELK_CONDITIONAL_L);
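/* For reference: ldexpf(1.0, -14) == 2^-14 is the smallest normal
 * half-precision magnitude, so anything below it would quantize to a half
 * denormal and is instead flushed to a correctly signed zero by the
 * predicated SEL below.
 */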
1561 /* Get the appropriately signed zero */
1562 bld.AND(retype(zero, ELK_REGISTER_TYPE_UD),
1563 retype(op[0], ELK_REGISTER_TYPE_UD),
1564 elk_imm_ud(0x80000000));
1565 /* Do the actual F32 -> F16 -> F32 conversion */
1566 bld.F32TO16(tmp16, op[0]);
1567 bld.F16TO32(tmp32, tmp16);
1568 /* Select that or zero based on normal status */
1569 inst = bld.SEL(result, zero, tmp32);
1570 inst->predicate = ELK_PREDICATE_NORMAL;
1571 break;
1572 }
1573
1574 case nir_op_imin:
1575 case nir_op_umin:
1576 case nir_op_fmin:
1577 inst = bld.emit_minmax(result, op[0], op[1], ELK_CONDITIONAL_L);
1578 break;
1579
1580 case nir_op_imax:
1581 case nir_op_umax:
1582 case nir_op_fmax:
1583 inst = bld.emit_minmax(result, op[0], op[1], ELK_CONDITIONAL_GE);
1584 break;
1585
1586 case nir_op_pack_snorm_2x16:
1587 case nir_op_pack_snorm_4x8:
1588 case nir_op_pack_unorm_2x16:
1589 case nir_op_pack_unorm_4x8:
1590 case nir_op_unpack_snorm_2x16:
1591 case nir_op_unpack_snorm_4x8:
1592 case nir_op_unpack_unorm_2x16:
1593 case nir_op_unpack_unorm_4x8:
1594 case nir_op_unpack_half_2x16:
1595 case nir_op_pack_half_2x16:
1596 unreachable("not reached: should be handled by lower_packing_builtins");
1597
1598 case nir_op_unpack_half_2x16_split_x:
1599 inst = bld.F16TO32(result, subscript(op[0], ELK_REGISTER_TYPE_HF, 0));
1600 break;
1601
1602 case nir_op_unpack_half_2x16_split_y:
1603 inst = bld.F16TO32(result, subscript(op[0], ELK_REGISTER_TYPE_HF, 1));
1604 break;
1605
1606 case nir_op_pack_64_2x32_split:
1607 case nir_op_pack_32_2x16_split:
1608 bld.emit(ELK_FS_OPCODE_PACK, result, op[0], op[1]);
1609 break;
1610
1611 case nir_op_pack_32_4x8_split:
1612 bld.emit(ELK_FS_OPCODE_PACK, result, op, 4);
1613 break;
1614
1615 case nir_op_unpack_64_2x32_split_x:
1616 case nir_op_unpack_64_2x32_split_y: {
1617 if (instr->op == nir_op_unpack_64_2x32_split_x)
1618 bld.MOV(result, subscript(op[0], ELK_REGISTER_TYPE_UD, 0));
1619 else
1620 bld.MOV(result, subscript(op[0], ELK_REGISTER_TYPE_UD, 1));
1621 break;
1622 }
1623
1624 case nir_op_unpack_32_2x16_split_x:
1625 case nir_op_unpack_32_2x16_split_y: {
1626 if (instr->op == nir_op_unpack_32_2x16_split_x)
1627 bld.MOV(result, subscript(op[0], ELK_REGISTER_TYPE_UW, 0));
1628 else
1629 bld.MOV(result, subscript(op[0], ELK_REGISTER_TYPE_UW, 1));
1630 break;
1631 }
1632
1633 case nir_op_fpow:
1634 inst = bld.emit(ELK_SHADER_OPCODE_POW, result, op[0], op[1]);
1635 break;
1636
1637 case nir_op_bitfield_reverse:
1638 assert(instr->def.bit_size == 32);
1639 assert(nir_src_bit_size(instr->src[0].src) == 32);
1640 bld.BFREV(result, op[0]);
1641 break;
1642
1643 case nir_op_bit_count:
1644 assert(instr->def.bit_size == 32);
1645 assert(nir_src_bit_size(instr->src[0].src) < 64);
1646 bld.CBIT(result, op[0]);
1647 break;
1648
1649 case nir_op_uclz:
1650 assert(instr->def.bit_size == 32);
1651 assert(nir_src_bit_size(instr->src[0].src) == 32);
1652 bld.LZD(retype(result, ELK_REGISTER_TYPE_UD), op[0]);
1653 break;
1654
1655 case nir_op_ifind_msb: {
1656 assert(instr->def.bit_size == 32);
1657 assert(nir_src_bit_size(instr->src[0].src) == 32);
1658 assert(devinfo->ver >= 7);
1659
1660 bld.FBH(retype(result, ELK_REGISTER_TYPE_UD), op[0]);
1661
1662 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1663 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1664 * subtract the result from 31 to convert the MSB count into an LSB
1665 * count.
1666 */
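/* Worked example for illustration: op[0] == 0x00000010 makes FBH return 27
 * (counting from the MSB side); the CMP sees it is not the 0xFFFFFFFF error
 * value, so the predicated ADD computes 31 - 27 == 4, the LSB-side index
 * findMSB() expects. An all-zero input keeps 0xFFFFFFFF (-1).
 */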
1667 bld.CMP(bld.null_reg_d(), result, elk_imm_d(-1), ELK_CONDITIONAL_NZ);
1668
1669 inst = bld.ADD(result, result, elk_imm_d(31));
1670 inst->predicate = ELK_PREDICATE_NORMAL;
1671 inst->src[0].negate = true;
1672 break;
1673 }
1674
1675 case nir_op_find_lsb:
1676 assert(instr->def.bit_size == 32);
1677 assert(nir_src_bit_size(instr->src[0].src) == 32);
1678 assert(devinfo->ver >= 7);
1679 bld.FBL(result, op[0]);
1680 break;
1681
1682 case nir_op_ubitfield_extract:
1683 case nir_op_ibitfield_extract:
1684 unreachable("should have been lowered");
1685 case nir_op_ubfe:
1686 case nir_op_ibfe:
1687 assert(instr->def.bit_size < 64);
1688 bld.BFE(result, op[2], op[1], op[0]);
1689 break;
1690 case nir_op_bfm:
1691 assert(instr->def.bit_size < 64);
1692 bld.BFI1(result, op[0], op[1]);
1693 break;
1694 case nir_op_bfi:
1695 assert(instr->def.bit_size < 64);
1696
1697 /* bfi is ((...) | (~src0 & src2)). The second part is zero when src2 is
1698 * either 0 or src0. Replacing the 0 with another value can eliminate a
1699 * temporary register.
1700 */
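/* For illustration: when src2 is the constant 0, the (~src0 & src2) term is
 * 0 no matter what, and (~src0 & src0) is also 0, so reusing op[0] as the
 * third source gives the same result while eliminating the temporary
 * register the immediate 0 would otherwise need, as noted above.
 */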
1701 if (is_const_zero(instr->src[2].src))
1702 bld.BFI2(result, op[0], op[1], op[0]);
1703 else
1704 bld.BFI2(result, op[0], op[1], op[2]);
1705
1706 break;
1707
1708 case nir_op_bitfield_insert:
1709 unreachable("not reached: should have been lowered");
1710
1711 /* With regards to implicit masking of the shift counts for 8- and 16-bit
1712 * types, the PRMs are **incorrect**. They falsely state that on Gen9+ only
1713 * the low bits of src1 matching the size of src0 (e.g., 4-bits for W or UW
1714 * src0) are used. The Bspec (backed by data from experimentation) states
1715 * that 0x3f is used for Q and UQ types, and 0x1f is used for **all** other
1716 * types.
1717 *
1718 * To match the behavior expected for the NIR opcodes, explicit masks for
1719 * 8- and 16-bit types must be added.
1720 */
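/* For illustration: a 16-bit ishl with a shift count of 17 must wrap to
 * 17 & 15 == 1 per NIR semantics, whereas the hardware would apply its 0x1f
 * mask and shift by 17, producing 0. The explicit AND with
 * (bit_size - 1) below restores the NIR behavior for 8- and 16-bit types.
 */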
1721 case nir_op_ishl:
1722 if (instr->def.bit_size < 32) {
1723 bld.AND(result, op[1], elk_imm_ud(instr->def.bit_size - 1));
1724 bld.SHL(result, op[0], result);
1725 } else {
1726 bld.SHL(result, op[0], op[1]);
1727 }
1728
1729 break;
1730 case nir_op_ishr:
1731 if (instr->def.bit_size < 32) {
1732 bld.AND(result, op[1], elk_imm_ud(instr->def.bit_size - 1));
1733 bld.ASR(result, op[0], result);
1734 } else {
1735 bld.ASR(result, op[0], op[1]);
1736 }
1737
1738 break;
1739 case nir_op_ushr:
1740 if (instr->def.bit_size < 32) {
1741 bld.AND(result, op[1], elk_imm_ud(instr->def.bit_size - 1));
1742 bld.SHR(result, op[0], result);
1743 } else {
1744 bld.SHR(result, op[0], op[1]);
1745 }
1746
1747 break;
1748
1749 case nir_op_pack_half_2x16_split:
1750 bld.emit(ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
1751 break;
1752
1753 case nir_op_ffma:
1754 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1755 elk_rnd_mode rnd =
1756 elk_rnd_mode_from_execution_mode(execution_mode);
1757 bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1758 elk_imm_d(rnd));
1759 }
1760
1761 inst = bld.MAD(result, op[2], op[1], op[0]);
1762 break;
1763
1764 case nir_op_flrp:
1765 if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1766 elk_rnd_mode rnd =
1767 elk_rnd_mode_from_execution_mode(execution_mode);
1768 bld.exec_all().emit(ELK_SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1769 elk_imm_d(rnd));
1770 }
1771
1772 inst = bld.LRP(result, op[0], op[1], op[2]);
1773 break;
1774
1775 case nir_op_b32csel:
1776 if (optimize_frontfacing_ternary(ntb, instr, result))
1777 return;
1778
1779 bld.CMP(bld.null_reg_d(), op[0], elk_imm_d(0), ELK_CONDITIONAL_NZ);
1780 inst = bld.SEL(result, op[1], op[2]);
1781 inst->predicate = ELK_PREDICATE_NORMAL;
1782 break;
1783
1784 case nir_op_extract_u8:
1785 case nir_op_extract_i8: {
1786 unsigned byte = nir_src_as_uint(instr->src[1].src);
1787
1788 /* The PRMs say:
1789 *
1790 * BDW+
1791 * There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
1792 * Use two instructions and a word or DWord intermediate integer type.
1793 */
1794 if (instr->def.bit_size == 64) {
1795 const elk_reg_type type = elk_int_type(1, instr->op == nir_op_extract_i8);
1796
1797 if (instr->op == nir_op_extract_i8) {
1798 /* If we need to sign extend, extract to a word first */
1799 elk_fs_reg w_temp = bld.vgrf(ELK_REGISTER_TYPE_W);
1800 bld.MOV(w_temp, subscript(op[0], type, byte));
1801 bld.MOV(result, w_temp);
1802 } else if (byte & 1) {
1803 /* Extract the high byte from the word containing the desired byte
1804 * offset.
1805 */
1806 bld.SHR(result,
1807 subscript(op[0], ELK_REGISTER_TYPE_UW, byte / 2),
1808 elk_imm_uw(8));
1809 } else {
1810 /* Otherwise use an AND with 0xff and a word type */
1811 bld.AND(result,
1812 subscript(op[0], ELK_REGISTER_TYPE_UW, byte / 2),
1813 elk_imm_uw(0xff));
1814 }
1815 } else {
1816 const elk_reg_type type = elk_int_type(1, instr->op == nir_op_extract_i8);
1817 bld.MOV(result, subscript(op[0], type, byte));
1818 }
1819 break;
1820 }
1821
1822 case nir_op_extract_u16:
1823 case nir_op_extract_i16: {
1824 const elk_reg_type type = elk_int_type(2, instr->op == nir_op_extract_i16);
1825 unsigned word = nir_src_as_uint(instr->src[1].src);
1826 bld.MOV(result, subscript(op[0], type, word));
1827 break;
1828 }
1829
1830 default:
1831 unreachable("unhandled instruction");
1832 }
1833
1834 /* If we need to do a boolean resolve, replace the result with -(x & 1)
1835 * to sign extend the low bit to 0/~0
1836 */
1837 if (devinfo->ver <= 5 &&
1838 !result.is_null() &&
1839 (instr->instr.pass_flags & ELK_NIR_BOOLEAN_MASK) == ELK_NIR_BOOLEAN_NEEDS_RESOLVE) {
1840 elk_fs_reg masked = s.vgrf(glsl_int_type());
1841 bld.AND(masked, result, elk_imm_d(1));
1842 masked.negate = true;
1843 bld.MOV(retype(result, ELK_REGISTER_TYPE_D), masked);
1844 }
1845 }
1846
1847 static void
1848 fs_nir_emit_load_const(nir_to_elk_state &ntb,
1849 nir_load_const_instr *instr)
1850 {
1851 const intel_device_info *devinfo = ntb.devinfo;
1852 const fs_builder &bld = ntb.bld;
1853
1854 const elk_reg_type reg_type =
1855 elk_reg_type_from_bit_size(instr->def.bit_size, ELK_REGISTER_TYPE_D);
1856 elk_fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
1857
1858 switch (instr->def.bit_size) {
1859 case 8:
1860 for (unsigned i = 0; i < instr->def.num_components; i++)
1861 bld.MOV(offset(reg, bld, i), elk_setup_imm_b(bld, instr->value[i].i8));
1862 break;
1863
1864 case 16:
1865 for (unsigned i = 0; i < instr->def.num_components; i++)
1866 bld.MOV(offset(reg, bld, i), elk_imm_w(instr->value[i].i16));
1867 break;
1868
1869 case 32:
1870 for (unsigned i = 0; i < instr->def.num_components; i++)
1871 bld.MOV(offset(reg, bld, i), elk_imm_d(instr->value[i].i32));
1872 break;
1873
1874 case 64:
1875 assert(devinfo->ver >= 7);
1876 if (!devinfo->has_64bit_int) {
1877 for (unsigned i = 0; i < instr->def.num_components; i++) {
1878 bld.MOV(retype(offset(reg, bld, i), ELK_REGISTER_TYPE_DF),
1879 elk_setup_imm_df(bld, instr->value[i].f64));
1880 }
1881 } else {
1882 for (unsigned i = 0; i < instr->def.num_components; i++)
1883 bld.MOV(offset(reg, bld, i), elk_imm_q(instr->value[i].i64));
1884 }
1885 break;
1886
1887 default:
1888 unreachable("Invalid bit size");
1889 }
1890
1891 ntb.ssa_values[instr->def.index] = reg;
1892 }
1893
1894 static bool
1895 get_nir_src_bindless(nir_to_elk_state &ntb, const nir_src &src)
1896 {
1897 return ntb.ssa_bind_infos[src.ssa->index].bindless;
1898 }
1899
1900 static bool
1901 is_resource_src(nir_src src)
1902 {
1903 return src.ssa->parent_instr->type == nir_instr_type_intrinsic &&
1904 nir_instr_as_intrinsic(src.ssa->parent_instr)->intrinsic == nir_intrinsic_resource_intel;
1905 }
1906
1907 static elk_fs_reg
1908 get_resource_nir_src(nir_to_elk_state &ntb, const nir_src &src)
1909 {
1910 if (!is_resource_src(src))
1911 return elk_fs_reg();
1912 return ntb.resource_values[src.ssa->index];
1913 }
1914
1915 static elk_fs_reg
1916 get_nir_src(nir_to_elk_state &ntb, const nir_src &src)
1917 {
1918 const intel_device_info *devinfo = ntb.devinfo;
1919
1920 nir_intrinsic_instr *load_reg = nir_load_reg_for_def(src.ssa);
1921
1922 elk_fs_reg reg;
1923 if (!load_reg) {
1924 if (nir_src_is_undef(src)) {
1925 const elk_reg_type reg_type =
1926 elk_reg_type_from_bit_size(src.ssa->bit_size,
1927 ELK_REGISTER_TYPE_D);
1928 reg = ntb.bld.vgrf(reg_type, src.ssa->num_components);
1929 } else {
1930 reg = ntb.ssa_values[src.ssa->index];
1931 }
1932 } else {
1933 nir_intrinsic_instr *decl_reg = nir_reg_get_decl(load_reg->src[0].ssa);
1934 /* We don't handle indirects on locals */
1935 assert(nir_intrinsic_base(load_reg) == 0);
1936 assert(load_reg->intrinsic != nir_intrinsic_load_reg_indirect);
1937 reg = ntb.ssa_values[decl_reg->def.index];
1938 }
1939
1940 if (nir_src_bit_size(src) == 64 && devinfo->ver == 7) {
1941 /* The only 64-bit type available on gfx7 is DF, so use that. */
1942 reg.type = ELK_REGISTER_TYPE_DF;
1943 } else {
1944 /* To avoid floating-point denorm flushing problems, set the type by
1945 * default to an integer type - instructions that need floating point
1946 * semantics will set this to F if they need to
1947 */
1948 reg.type = elk_reg_type_from_bit_size(nir_src_bit_size(src),
1949 ELK_REGISTER_TYPE_D);
1950 }
1951
1952 return reg;
1953 }
1954
1955 /**
1956 * Return an IMM for constants; otherwise call get_nir_src() as normal.
1957 *
1958 * This function should not be called on any value which may be 64 bits.
1959 * We could theoretically support 64-bit on gfx8+ but we choose not to
1960 * because it wouldn't work in general (no gfx7 support) and there are
1961 * enough restrictions in 64-bit immediates that you can't take the return
1962 * value and treat it the same as the result of get_nir_src().
1963 */
1964 static elk_fs_reg
1965 get_nir_src_imm(nir_to_elk_state &ntb, const nir_src &src)
1966 {
1967 assert(nir_src_bit_size(src) == 32);
1968 return nir_src_is_const(src) ?
1969 elk_fs_reg(elk_imm_d(nir_src_as_int(src))) : get_nir_src(ntb, src);
1970 }
1971
1972 static elk_fs_reg
1973 get_nir_def(nir_to_elk_state &ntb, const nir_def &def)
1974 {
1975 const fs_builder &bld = ntb.bld;
1976
1977 nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
1978 if (!store_reg) {
1979 const elk_reg_type reg_type =
1980 elk_reg_type_from_bit_size(def.bit_size,
1981 def.bit_size == 8 ?
1982 ELK_REGISTER_TYPE_D :
1983 ELK_REGISTER_TYPE_F);
1984 ntb.ssa_values[def.index] =
1985 bld.vgrf(reg_type, def.num_components);
1986 bld.UNDEF(ntb.ssa_values[def.index]);
1987 return ntb.ssa_values[def.index];
1988 } else {
1989 nir_intrinsic_instr *decl_reg =
1990 nir_reg_get_decl(store_reg->src[1].ssa);
1991 /* We don't handle indirects on locals */
1992 assert(nir_intrinsic_base(store_reg) == 0);
1993 assert(store_reg->intrinsic != nir_intrinsic_store_reg_indirect);
1994 return ntb.ssa_values[decl_reg->def.index];
1995 }
1996 }
1997
1998 static nir_component_mask_t
1999 get_nir_write_mask(const nir_def &def)
2000 {
2001 nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def);
2002 if (!store_reg) {
2003 return nir_component_mask(def.num_components);
2004 } else {
2005 return nir_intrinsic_write_mask(store_reg);
2006 }
2007 }
2008
2009 static elk_fs_inst *
2010 emit_pixel_interpolater_send(const fs_builder &bld,
2011 enum elk_opcode opcode,
2012 const elk_fs_reg &dst,
2013 const elk_fs_reg &src,
2014 const elk_fs_reg &desc,
2015 const elk_fs_reg &flag_reg,
2016 glsl_interp_mode interpolation)
2017 {
2018 struct elk_wm_prog_data *wm_prog_data =
2019 elk_wm_prog_data(bld.shader->stage_prog_data);
2020
2021 elk_fs_reg srcs[INTERP_NUM_SRCS];
2022 srcs[INTERP_SRC_OFFSET] = src;
2023 srcs[INTERP_SRC_MSG_DESC] = desc;
2024 srcs[INTERP_SRC_DYNAMIC_MODE] = flag_reg;
2025
2026 elk_fs_inst *inst = bld.emit(opcode, dst, srcs, INTERP_NUM_SRCS);
2027 /* 2 floats per slot returned */
2028 inst->size_written = 2 * dst.component_size(inst->exec_size);
2029 if (interpolation == INTERP_MODE_NOPERSPECTIVE) {
2030 inst->pi_noperspective = true;
2031 /* TGL BSpec says:
2032 * This field cannot be set to "Linear Interpolation"
2033 * unless Non-Perspective Barycentric Enable in 3DSTATE_CLIP is enabled.
2034 */
2035 wm_prog_data->uses_nonperspective_interp_modes = true;
2036 }
2037
2038 wm_prog_data->pulls_bary = true;
2039
2040 return inst;
2041 }
2042
2043 /**
2044 * Computes 1 << x, given a D/UD register containing some value x.
2045 */
2046 static elk_fs_reg
2047 intexp2(const fs_builder &bld, const elk_fs_reg &x)
2048 {
2049 assert(x.type == ELK_REGISTER_TYPE_UD || x.type == ELK_REGISTER_TYPE_D);
2050
2051 elk_fs_reg result = bld.vgrf(x.type, 1);
2052 elk_fs_reg one = bld.vgrf(x.type, 1);
2053
2054 bld.MOV(one, retype(elk_imm_d(1), one.type));
2055 bld.SHL(result, one, x);
2056 return result;
2057 }
2058
2059 static void
2060 emit_gs_end_primitive(nir_to_elk_state &ntb, const nir_src &vertex_count_nir_src)
2061 {
2062 elk_fs_visitor &s = ntb.s;
2063 assert(s.stage == MESA_SHADER_GEOMETRY);
2064
2065 struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(s.prog_data);
2066
2067 if (s.gs_compile->control_data_header_size_bits == 0)
2068 return;
2069
2070 /* We can only do EndPrimitive() functionality when the control data
2071 * consists of cut bits. Fortunately, the only time it isn't is when the
2072 * output type is points, in which case EndPrimitive() is a no-op.
2073 */
2074 if (gs_prog_data->control_data_format !=
2075 GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
2076 return;
2077 }
2078
2079 /* Cut bits use one bit per vertex. */
2080 assert(s.gs_compile->control_data_bits_per_vertex == 1);
2081
2082 elk_fs_reg vertex_count = get_nir_src(ntb, vertex_count_nir_src);
2083 vertex_count.type = ELK_REGISTER_TYPE_UD;
2084
2085 /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
2086 * vertex n, 0 otherwise. So all we need to do here is mark bit
2087 * (vertex_count - 1) % 32 in the cut_bits register to indicate that
2088 * EndPrimitive() was called after emitting vertex (vertex_count - 1);
2089 * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
2090 *
2091 * Note that if EndPrimitive() is called before emitting any vertices, this
2092 * will cause us to set bit 31 of the control_data_bits register to 1.
2093 * That's fine because:
2094 *
2095 * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
2096 * output, so the hardware will ignore cut bit 31.
2097 *
2098 * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
2099 * last vertex, so setting cut bit 31 has no effect (since the primitive
2100 * is automatically ended when the GS terminates).
2101 *
2102 * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
2103 * control_data_bits register to 0 when the first vertex is emitted.
2104 */
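/* For example, if EndPrimitive() is called right after emitting the third
 * vertex (vertex_count == 3), this sets cut bit (3 - 1) % 32 == 2.
 */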
2105
2106 const fs_builder abld = ntb.bld.annotate("end primitive");
2107
2108 /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
2109 elk_fs_reg prev_count = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2110 abld.ADD(prev_count, vertex_count, elk_imm_ud(0xffffffffu));
2111 elk_fs_reg mask = intexp2(abld, prev_count);
2112 /* Note: we're relying on the fact that the GEN SHL instruction only pays
2113 * attention to the lower 5 bits of its second source argument, so on this
2114 * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
2115 * ((vertex_count - 1) % 32).
2116 */
2117 abld.OR(s.control_data_bits, s.control_data_bits, mask);
2118 }
2119
2120 void
2121 elk_fs_visitor::emit_gs_control_data_bits(const elk_fs_reg &vertex_count)
2122 {
2123 assert(stage == MESA_SHADER_GEOMETRY);
2124 assert(gs_compile->control_data_bits_per_vertex != 0);
2125
2126 struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(prog_data);
2127
2128 const fs_builder bld = fs_builder(this).at_end();
2129 const fs_builder abld = bld.annotate("emit control data bits");
2130 const fs_builder fwa_bld = bld.exec_all();
2131
2132 /* We use a single UD register to accumulate control data bits (32 bits
2133 * for each of the SIMD8 channels). So we need to write a DWord (32 bits)
2134 * at a time.
2135 *
2136 * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
2137 * We have to select a 128-bit group via the Global and Per-Slot Offsets, then
2138 * use the Channel Mask phase to enable/disable which DWord within that
2139 * group to write. (Remember, different SIMD8 channels may have emitted
2140 * different numbers of vertices, so we may need per-slot offsets.)
2141 *
2142 * Channel masking presents an annoying problem: we may have to replicate
2143 * the data up to 4 times:
2144 *
2145 * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
2146 *
2147 * To avoid penalizing shaders that emit a small number of vertices, we
2148 * can avoid these sometimes: if the size of the control data header is
2149 * <= 128 bits, then there is only 1 OWord. All SIMD8 channels will
2150 * land in the same 128-bit group, so we can skip per-slot offsets.
2151 *
2152 * Similarly, if the control data header is <= 32 bits, there is only one
2153 * DWord, so we can skip channel masks.
2154 */
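/* For example, with one cut bit per vertex and 64 output vertices, the
 * control data header is 64 bits: larger than 32, so channel masks are
 * needed, but not larger than 128, so per-slot offsets can be skipped.
 */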
2155 elk_fs_reg channel_mask, per_slot_offset;
2156
2157 if (gs_compile->control_data_header_size_bits > 32)
2158 channel_mask = vgrf(glsl_uint_type());
2159
2160 if (gs_compile->control_data_header_size_bits > 128)
2161 per_slot_offset = vgrf(glsl_uint_type());
2162
2163 /* Figure out which DWord we're trying to write to using the formula:
2164 *
2165 * dword_index = (vertex_count - 1) * bits_per_vertex / 32
2166 *
2167 * Since bits_per_vertex is a power of two, and is known at compile
2168 * time, this can be optimized to:
2169 *
2170 * dword_index = (vertex_count - 1) >> (5 - log2(bits_per_vertex))
2171 */
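/* For example, with 2 bits per vertex, the 20th vertex (vertex_count == 20)
 * lands in dword_index = 19 * 2 / 32 == 1, i.e. (vertex_count - 1) >> 4.
 */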
2172 if (channel_mask.file != BAD_FILE || per_slot_offset.file != BAD_FILE) {
2173 elk_fs_reg dword_index = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2174 elk_fs_reg prev_count = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2175 abld.ADD(prev_count, vertex_count, elk_imm_ud(0xffffffffu));
2176 unsigned log2_bits_per_vertex =
2177 util_last_bit(gs_compile->control_data_bits_per_vertex);
2178 abld.SHR(dword_index, prev_count, elk_imm_ud(6u - log2_bits_per_vertex));
2179
2180 if (per_slot_offset.file != BAD_FILE) {
2181 /* Set the per-slot offset to dword_index / 4, so that we'll write to
2182 * the appropriate OWord within the control data header.
2183 */
2184 abld.SHR(per_slot_offset, dword_index, elk_imm_ud(2u));
2185 }
2186
2187 /* Set the channel masks to 1 << (dword_index % 4), so that we'll
2188 * write to the appropriate DWORD within the OWORD.
2189 */
2190 elk_fs_reg channel = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2191 fwa_bld.AND(channel, dword_index, elk_imm_ud(3u));
2192 channel_mask = intexp2(fwa_bld, channel);
2193 /* Then the channel masks need to be in bits 23:16. */
2194 fwa_bld.SHL(channel_mask, channel_mask, elk_imm_ud(16u));
2195 }
2196
2197 /* If there are channel masks, add 3 extra copies of the data. */
2198 const unsigned length = 1 + 3 * unsigned(channel_mask.file != BAD_FILE);
2199 elk_fs_reg sources[4];
2200
2201 for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
2202 sources[i] = this->control_data_bits;
2203
2204 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2205 srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
2206 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offset;
2207 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = channel_mask;
2208 srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_F, length);
2209 srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(length);
2210 abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
2211
2212 elk_fs_inst *inst = abld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
2213 srcs, ARRAY_SIZE(srcs));
2214
2215 /* We need to increment Global Offset by 256-bits to make room for
2216 * Broadwell's extra "Vertex Count" payload at the beginning of the
2217 * URB entry. Since this is an OWord message, Global Offset is counted
2218 * in 128-bit units, so we must set it to 2.
2219 */
2220 if (gs_prog_data->static_vertex_count == -1)
2221 inst->offset = 2;
2222 }
2223
2224 static void
2225 set_gs_stream_control_data_bits(nir_to_elk_state &ntb, const elk_fs_reg &vertex_count,
2226 unsigned stream_id)
2227 {
2228 elk_fs_visitor &s = ntb.s;
2229
2230 /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
2231
2232 /* Note: this is called *before* the vertex count is incremented, so the
2233 * vertex_count value passed in equals (vertex_count - 1) in the formula above.
2234 */
2235
2236 /* Stream mode uses 2 bits per vertex */
2237 assert(s.gs_compile->control_data_bits_per_vertex == 2);
2238
2239 /* Must be a valid stream */
2240 assert(stream_id < 4); /* MAX_VERTEX_STREAMS */
2241
2242 /* Control data bits are initialized to 0 so we don't have to set any
2243 * bits when sending vertices to stream 0.
2244 */
2245 if (stream_id == 0)
2246 return;
2247
2248 const fs_builder abld = ntb.bld.annotate("set stream control data bits", NULL);
2249
2250 /* reg::sid = stream_id */
2251 elk_fs_reg sid = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2252 abld.MOV(sid, elk_imm_ud(stream_id));
2253
2254 /* reg:shift_count = 2 * (vertex_count - 1) */
2255 elk_fs_reg shift_count = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2256 abld.SHL(shift_count, vertex_count, elk_imm_ud(1u));
2257
2258 /* Note: we're relying on the fact that the GEN SHL instruction only pays
2259 * attention to the lower 5 bits of its second source argument, so on this
2260 * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
2261 * stream_id << ((2 * (vertex_count - 1)) % 32).
2262 */
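/* For example, emitting the third vertex (vertex_count == 2 here) to
 * stream 1 shifts 1 left by 4, setting bits 5:4 of control_data_bits
 * to 0b01.
 */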
2263 elk_fs_reg mask = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2264 abld.SHL(mask, sid, shift_count);
2265 abld.OR(s.control_data_bits, s.control_data_bits, mask);
2266 }
2267
2268 static void
2269 emit_gs_vertex(nir_to_elk_state &ntb, const nir_src &vertex_count_nir_src,
2270 unsigned stream_id)
2271 {
2272 elk_fs_visitor &s = ntb.s;
2273
2274 assert(s.stage == MESA_SHADER_GEOMETRY);
2275
2276 struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(s.prog_data);
2277
2278 elk_fs_reg vertex_count = get_nir_src(ntb, vertex_count_nir_src);
2279 vertex_count.type = ELK_REGISTER_TYPE_UD;
2280
2281 /* Haswell and later hardware ignores the "Render Stream Select" bits
2282 * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
2283 * and instead sends all primitives down the pipeline for rasterization.
2284 * If the SOL stage is enabled, "Render Stream Select" is honored and
2285 * primitives bound to non-zero streams are discarded after stream output.
2286 *
2287 * Since the only purpose of primitives sent to non-zero streams is to
2288 * be recorded by transform feedback, we can simply discard all geometry
2289 * bound to these streams when transform feedback is disabled.
2290 */
2291 if (stream_id > 0 && !s.nir->info.has_transform_feedback_varyings)
2292 return;
2293
2294 /* If we're outputting 32 control data bits or less, then we can wait
2295 * until the shader is over to output them all. Otherwise we need to
2296 * output them as we go. Now is the time to do it, since we're about to
2297 * output the vertex_count'th vertex, so it's guaranteed that the
2298 * control data bits associated with the (vertex_count - 1)th vertex are
2299 * correct.
2300 */
2301 if (s.gs_compile->control_data_header_size_bits > 32) {
2302 const fs_builder abld =
2303 ntb.bld.annotate("emit vertex: emit control data bits");
2304
2305 /* Only emit control data bits if we've finished accumulating a batch
2306 * of 32 bits. This is the case when:
2307 *
2308 * (vertex_count * bits_per_vertex) % 32 == 0
2309 *
2310 * (in other words, when the last 5 bits of vertex_count *
2311 * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some
2312 * integer n (which is always the case, since bits_per_vertex is
2313 * always 1 or 2), this is equivalent to requiring that the last 5-n
2314 * bits of vertex_count are 0:
2315 *
2316 * vertex_count & (2^(5-n) - 1) == 0
2317 *
2318 * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
2319 * equivalent to:
2320 *
2321 * vertex_count & (32 / bits_per_vertex - 1) == 0
2322 *
2323 * TODO: If vertex_count is an immediate, we could do some of this math
2324 * at compile time...
2325 */
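/* For example, with 2 bits per vertex the test below is
 * vertex_count & 15 == 0, i.e. a batch is flushed every 16 vertices.
 */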
2326 elk_fs_inst *inst =
2327 abld.AND(ntb.bld.null_reg_d(), vertex_count,
2328 elk_imm_ud(32u / s.gs_compile->control_data_bits_per_vertex - 1u));
2329 inst->conditional_mod = ELK_CONDITIONAL_Z;
2330
2331 abld.IF(ELK_PREDICATE_NORMAL);
2332 /* If vertex_count is 0, then no control data bits have been
2333 * accumulated yet, so we can skip emitting them.
2334 */
2335 abld.CMP(ntb.bld.null_reg_d(), vertex_count, elk_imm_ud(0u),
2336 ELK_CONDITIONAL_NEQ);
2337 abld.IF(ELK_PREDICATE_NORMAL);
2338 s.emit_gs_control_data_bits(vertex_count);
2339 abld.emit(ELK_OPCODE_ENDIF);
2340
2341 /* Reset control_data_bits to 0 so we can start accumulating a new
2342 * batch.
2343 *
2344 * Note: in the case where vertex_count == 0, this neutralizes the
2345 * effect of any call to EndPrimitive() that the shader may have
2346 * made before outputting its first vertex.
2347 */
2348 inst = abld.MOV(s.control_data_bits, elk_imm_ud(0u));
2349 inst->force_writemask_all = true;
2350 abld.emit(ELK_OPCODE_ENDIF);
2351 }
2352
2353 s.emit_urb_writes(vertex_count);
2354
2355 /* In stream mode we have to set control data bits for all vertices
2356 * unless we have disabled control data bits completely (which we do
2357 * for MESA_PRIM_POINTS outputs that don't use streams).
2358 */
2359 if (s.gs_compile->control_data_header_size_bits > 0 &&
2360 gs_prog_data->control_data_format ==
2361 GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
2362 set_gs_stream_control_data_bits(ntb, vertex_count, stream_id);
2363 }
2364 }
2365
2366 static void
2367 emit_gs_input_load(nir_to_elk_state &ntb, const elk_fs_reg &dst,
2368 const nir_src &vertex_src,
2369 unsigned base_offset,
2370 const nir_src &offset_src,
2371 unsigned num_components,
2372 unsigned first_component)
2373 {
2374 const fs_builder &bld = ntb.bld;
2375 elk_fs_visitor &s = ntb.s;
2376
2377 assert(type_sz(dst.type) == 4);
2378 struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(s.prog_data);
2379 const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
2380
2381 /* TODO: figure out push input layout for invocations == 1 */
2382 if (gs_prog_data->invocations == 1 &&
2383 nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) &&
2384 4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) {
2385 int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 +
2386 nir_src_as_uint(vertex_src) * push_reg_count;
2387 const elk_fs_reg attr = elk_fs_reg(ATTR, 0, dst.type);
2388 for (unsigned i = 0; i < num_components; i++) {
2389 ntb.bld.MOV(offset(dst, bld, i),
2390 offset(attr, bld, imm_offset + i + first_component));
2391 }
2392 return;
2393 }
2394
2395 /* Resort to the pull model. Ensure the VUE handles are provided. */
2396 assert(gs_prog_data->base.include_vue_handles);
2397
2398 elk_fs_reg start = s.gs_payload().icp_handle_start;
2399 elk_fs_reg icp_handle = ntb.bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2400
2401 if (gs_prog_data->invocations == 1) {
2402 if (nir_src_is_const(vertex_src)) {
2403 /* The vertex index is constant; just select the proper URB handle. */
2404 icp_handle = offset(start, ntb.bld, nir_src_as_uint(vertex_src));
2405 } else {
2406 /* The vertex index is non-constant. We need to use indirect
2407 * addressing to fetch the proper URB handle.
2408 *
2409 * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
2410 * indicating that channel <n> should read the handle from
2411 * DWord <n>. We convert that to bytes by multiplying by 4.
2412 *
2413 * Next, we convert the vertex index to bytes by multiplying
2414 * by 32 (shifting by 5), and add the two together. This is
2415 * the final indirect byte offset.
2416 */
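/* For example, channel 3 reading vertex 2 ends up with an indirect
 * byte offset of 3 * 4 + 2 * 32 == 76 into the ICP handle area.
 */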
2417 elk_fs_reg sequence =
2418 ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
2419 elk_fs_reg channel_offsets = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2420 elk_fs_reg vertex_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2421 elk_fs_reg icp_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2422
2423 /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
2424 bld.SHL(channel_offsets, sequence, elk_imm_ud(2u));
2425 /* Convert vertex_index to bytes (multiply by 32) */
2426 bld.SHL(vertex_offset_bytes,
2427 retype(get_nir_src(ntb, vertex_src), ELK_REGISTER_TYPE_UD),
2428 elk_imm_ud(5u));
2429 bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2430
2431 /* Use first_icp_handle as the base offset. There is one register
2432 * of URB handles per vertex, so inform the register allocator that
2433 * we might read up to nir->info.gs.vertices_in registers.
2434 */
2435 bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
2436 elk_fs_reg(icp_offset_bytes),
2437 elk_imm_ud(s.nir->info.gs.vertices_in * REG_SIZE));
2438 }
2439 } else {
2440 assert(gs_prog_data->invocations > 1);
2441
2442 if (nir_src_is_const(vertex_src)) {
2443 unsigned vertex = nir_src_as_uint(vertex_src);
2444 assert(vertex <= 5);
2445 bld.MOV(icp_handle, component(start, vertex));
2446 } else {
2447 /* The vertex index is non-constant. We need to use indirect
2448 * addressing to fetch the proper URB handle.
2449 *
2450 */
2451 elk_fs_reg icp_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2452
2453 /* Convert vertex_index to bytes (multiply by 4) */
2454 bld.SHL(icp_offset_bytes,
2455 retype(get_nir_src(ntb, vertex_src), ELK_REGISTER_TYPE_UD),
2456 elk_imm_ud(2u));
2457
2458 /* Use first_icp_handle as the base offset. There is one DWord
2459 * of URB handles per vertex, so inform the register allocator that
2460 * we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
2461 */
2462 bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
2463 elk_fs_reg(icp_offset_bytes),
2464 elk_imm_ud(DIV_ROUND_UP(s.nir->info.gs.vertices_in, 8) *
2465 REG_SIZE));
2466 }
2467 }
2468
2469 elk_fs_inst *inst;
2470 elk_fs_reg indirect_offset = get_nir_src(ntb, offset_src);
2471
2472 if (nir_src_is_const(offset_src)) {
2473 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2474 srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2475
2476 /* Constant indexing - use global offset. */
2477 if (first_component != 0) {
2478 unsigned read_components = num_components + first_component;
2479 elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2480 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs,
2481 ARRAY_SIZE(srcs));
2482 inst->size_written = read_components *
2483 tmp.component_size(inst->exec_size);
2484 for (unsigned i = 0; i < num_components; i++) {
2485 bld.MOV(offset(dst, bld, i),
2486 offset(tmp, bld, i + first_component));
2487 }
2488 } else {
2489 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs,
2490 ARRAY_SIZE(srcs));
2491 inst->size_written = num_components *
2492 dst.component_size(inst->exec_size);
2493 }
2494 inst->offset = base_offset + nir_src_as_uint(offset_src);
2495 } else {
2496 /* Indirect indexing - use per-slot offsets as well. */
2497 unsigned read_components = num_components + first_component;
2498 elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2499
2500 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2501 srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2502 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
2503
2504 if (first_component != 0) {
2505 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2506 srcs, ARRAY_SIZE(srcs));
2507 inst->size_written = read_components *
2508 tmp.component_size(inst->exec_size);
2509 for (unsigned i = 0; i < num_components; i++) {
2510 bld.MOV(offset(dst, bld, i),
2511 offset(tmp, bld, i + first_component));
2512 }
2513 } else {
2514 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst,
2515 srcs, ARRAY_SIZE(srcs));
2516 inst->size_written = num_components *
2517 dst.component_size(inst->exec_size);
2518 }
2519 inst->offset = base_offset;
2520 }
2521 }
2522
2523 static elk_fs_reg
2524 get_indirect_offset(nir_to_elk_state &ntb, nir_intrinsic_instr *instr)
2525 {
2526 nir_src *offset_src = nir_get_io_offset_src(instr);
2527
2528 if (nir_src_is_const(*offset_src)) {
2529 /* The only constant offset we should find is 0. elk_nir.c's
2530 * add_const_offset_to_base() will fold other constant offsets
2531 * into the "base" index.
2532 */
2533 assert(nir_src_as_uint(*offset_src) == 0);
2534 return elk_fs_reg();
2535 }
2536
2537 return get_nir_src(ntb, *offset_src);
2538 }
2539
2540 static void
2541 fs_nir_emit_vs_intrinsic(nir_to_elk_state &ntb,
2542 nir_intrinsic_instr *instr)
2543 {
2544 const fs_builder &bld = ntb.bld;
2545 elk_fs_visitor &s = ntb.s;
2546 assert(s.stage == MESA_SHADER_VERTEX);
2547
2548 elk_fs_reg dest;
2549 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2550 dest = get_nir_def(ntb, instr->def);
2551
2552 switch (instr->intrinsic) {
2553 case nir_intrinsic_load_vertex_id:
2554 case nir_intrinsic_load_base_vertex:
2555 unreachable("should be lowered by nir_lower_system_values()");
2556
2557 case nir_intrinsic_load_input: {
2558 assert(instr->def.bit_size == 32);
2559 const elk_fs_reg src = offset(elk_fs_reg(ATTR, 0, dest.type), bld,
2560 nir_intrinsic_base(instr) * 4 +
2561 nir_intrinsic_component(instr) +
2562 nir_src_as_uint(instr->src[0]));
2563
2564 for (unsigned i = 0; i < instr->num_components; i++)
2565 bld.MOV(offset(dest, bld, i), offset(src, bld, i));
2566 break;
2567 }
2568
2569 case nir_intrinsic_load_vertex_id_zero_base:
2570 case nir_intrinsic_load_instance_id:
2571 case nir_intrinsic_load_base_instance:
2572 case nir_intrinsic_load_draw_id:
2573 case nir_intrinsic_load_first_vertex:
2574 case nir_intrinsic_load_is_indexed_draw:
2575 unreachable("lowered by elk_nir_lower_vs_inputs");
2576
2577 default:
2578 fs_nir_emit_intrinsic(ntb, bld, instr);
2579 break;
2580 }
2581 }
2582
2583 static elk_fs_reg
2584 get_tcs_single_patch_icp_handle(nir_to_elk_state &ntb, const fs_builder &bld,
2585 nir_intrinsic_instr *instr)
2586 {
2587 elk_fs_visitor &s = ntb.s;
2588
2589 struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(s.prog_data);
2590 const nir_src &vertex_src = instr->src[0];
2591 nir_intrinsic_instr *vertex_intrin = nir_src_as_intrinsic(vertex_src);
2592
2593 const elk_fs_reg start = s.tcs_payload().icp_handle_start;
2594
2595 elk_fs_reg icp_handle;
2596
2597 if (nir_src_is_const(vertex_src)) {
2598 /* Emit a MOV to resolve <0,1,0> regioning. */
2599 icp_handle = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2600 unsigned vertex = nir_src_as_uint(vertex_src);
2601 bld.MOV(icp_handle, component(start, vertex));
2602 } else if (tcs_prog_data->instances == 1 && vertex_intrin &&
2603 vertex_intrin->intrinsic == nir_intrinsic_load_invocation_id) {
2604 /* For the common case of only 1 instance, an array index of
2605 * gl_InvocationID means reading the handles from the start. Skip all
2606 * the indirect work.
2607 */
2608 icp_handle = start;
2609 } else {
2610 /* The vertex index is non-constant. We need to use indirect
2611 * addressing to fetch the proper URB handle.
2612 */
2613 icp_handle = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2614
2615 /* Each ICP handle is a single DWord (4 bytes) */
2616 elk_fs_reg vertex_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2617 bld.SHL(vertex_offset_bytes,
2618 retype(get_nir_src(ntb, vertex_src), ELK_REGISTER_TYPE_UD),
2619 elk_imm_ud(2u));
2620
2621 /* We might read up to 4 registers. */
2622 bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2623 start, vertex_offset_bytes,
2624 elk_imm_ud(4 * REG_SIZE));
2625 }
2626
2627 return icp_handle;
2628 }
2629
2630 static elk_fs_reg
2631 get_tcs_multi_patch_icp_handle(nir_to_elk_state &ntb, const fs_builder &bld,
2632 nir_intrinsic_instr *instr)
2633 {
2634 elk_fs_visitor &s = ntb.s;
2635 const intel_device_info *devinfo = s.devinfo;
2636
2637 struct elk_tcs_prog_key *tcs_key = (struct elk_tcs_prog_key *) s.key;
2638 const nir_src &vertex_src = instr->src[0];
2639 const unsigned grf_size_bytes = REG_SIZE * reg_unit(devinfo);
2640
2641 const elk_fs_reg start = s.tcs_payload().icp_handle_start;
2642
2643 if (nir_src_is_const(vertex_src))
2644 return byte_offset(start, nir_src_as_uint(vertex_src) * grf_size_bytes);
2645
2646 /* The vertex index is non-constant. We need to use indirect
2647 * addressing to fetch the proper URB handle.
2648 *
2649 * First, we start with the sequence indicating that channel <n>
2650 * should read the handle from DWord <n>. We convert that to bytes
2651 * by multiplying by 4.
2652 *
2653 * Next, we convert the vertex index to bytes by multiplying
2654 * by the GRF size (by shifting), and add the two together. This is
2655 * the final indirect byte offset.
2656 */
2657 elk_fs_reg icp_handle = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2658 elk_fs_reg sequence = ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
2659 elk_fs_reg channel_offsets = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2660 elk_fs_reg vertex_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2661 elk_fs_reg icp_offset_bytes = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2662
2663 /* Offsets will be 0, 4, 8, ... */
2664 bld.SHL(channel_offsets, sequence, elk_imm_ud(2u));
2665 /* Convert vertex_index to bytes (multiply by the GRF size in bytes) */
2666 assert(util_is_power_of_two_nonzero(grf_size_bytes)); /* for ffs() */
2667 bld.SHL(vertex_offset_bytes,
2668 retype(get_nir_src(ntb, vertex_src), ELK_REGISTER_TYPE_UD),
2669 elk_imm_ud(ffs(grf_size_bytes) - 1));
2670 bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2671
2672 /* Use start of ICP handles as the base offset. There is one register
2673 * of URB handles per vertex, so inform the register allocator that
2674 * we might read up to as many registers as there are TCS input vertices.
2675 */
2676 bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT, icp_handle, start,
2677 icp_offset_bytes,
2678 elk_imm_ud(elk_tcs_prog_key_input_vertices(tcs_key) *
2679 grf_size_bytes));
2680
2681 return icp_handle;
2682 }
2683
2684 static void
2685 emit_barrier(nir_to_elk_state &ntb)
2686 {
2687 const intel_device_info *devinfo = ntb.devinfo;
2688 const fs_builder &bld = ntb.bld;
2689 elk_fs_visitor &s = ntb.s;
2690
2691 /* We are getting the barrier ID from the compute shader header */
2692 assert(gl_shader_stage_uses_workgroup(s.stage));
2693
2694 elk_fs_reg payload = elk_fs_reg(VGRF, s.alloc.allocate(1), ELK_REGISTER_TYPE_UD);
2695
2696 /* Clear the message payload */
2697 bld.exec_all().group(8, 0).MOV(payload, elk_imm_ud(0u));
2698
2699 assert(gl_shader_stage_is_compute(s.stage));
2700
2701 uint32_t barrier_id_mask;
2702 switch (devinfo->ver) {
2703 case 7:
2704 case 8:
2705 barrier_id_mask = 0x0f000000u; break;
2706 default:
2707 unreachable("barrier is only available on gen >= 7");
2708 }
2709
2710 /* Copy the barrier id from r0.2 to the message payload reg.2 */
2711 elk_fs_reg r0_2 = elk_fs_reg(retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD));
2712 bld.exec_all().group(1, 0).AND(component(payload, 2), r0_2,
2713 elk_imm_ud(barrier_id_mask));
2714
2715 /* Emit a gateway "barrier" message using the payload we set up, followed
2716 * by a wait instruction.
2717 */
2718 bld.exec_all().emit(ELK_SHADER_OPCODE_BARRIER, reg_undef, payload);
2719 }
2720
2721 static void
2722 emit_tcs_barrier(nir_to_elk_state &ntb)
2723 {
2724 const fs_builder &bld = ntb.bld;
2725 elk_fs_visitor &s = ntb.s;
2726
2727 assert(s.stage == MESA_SHADER_TESS_CTRL);
2728 struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(s.prog_data);
2729
2730 elk_fs_reg m0 = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2731 elk_fs_reg m0_2 = component(m0, 2);
2732
2733 const fs_builder chanbld = bld.exec_all().group(1, 0);
2734
2735 /* Zero the message header */
2736 bld.exec_all().MOV(m0, elk_imm_ud(0u));
2737
2738 /* Copy "Barrier ID" from r0.2, bits 16:13 */
2739 chanbld.AND(m0_2, retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD),
2740 elk_imm_ud(INTEL_MASK(16, 13)));
2741
2742 /* Shift it up to bits 27:24. */
2743 chanbld.SHL(m0_2, m0_2, elk_imm_ud(11));
2744
2745 /* Set the Barrier Count and the enable bit */
2746 chanbld.OR(m0_2, m0_2,
2747 elk_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
2748
2749 bld.emit(ELK_SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
2750 }
2751
2752 static void
2753 fs_nir_emit_tcs_intrinsic(nir_to_elk_state &ntb,
2754 nir_intrinsic_instr *instr)
2755 {
2756 const intel_device_info *devinfo = ntb.devinfo;
2757 const fs_builder &bld = ntb.bld;
2758 elk_fs_visitor &s = ntb.s;
2759
2760 assert(s.stage == MESA_SHADER_TESS_CTRL);
2761 struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(s.prog_data);
2762 struct elk_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
2763
2764 elk_fs_reg dst;
2765 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2766 dst = get_nir_def(ntb, instr->def);
2767
2768 switch (instr->intrinsic) {
2769 case nir_intrinsic_load_primitive_id:
2770 bld.MOV(dst, s.tcs_payload().primitive_id);
2771 break;
2772 case nir_intrinsic_load_invocation_id:
2773 bld.MOV(retype(dst, s.invocation_id.type), s.invocation_id);
2774 break;
2775
2776 case nir_intrinsic_barrier:
2777 if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
2778 fs_nir_emit_intrinsic(ntb, bld, instr);
2779 if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
2780 if (tcs_prog_data->instances != 1)
2781 emit_tcs_barrier(ntb);
2782 }
2783 break;
2784
2785 case nir_intrinsic_load_input:
2786 unreachable("nir_lower_io should never give us these.");
2787 break;
2788
2789 case nir_intrinsic_load_per_vertex_input: {
2790 assert(instr->def.bit_size == 32);
2791 elk_fs_reg indirect_offset = get_indirect_offset(ntb, instr);
2792 unsigned imm_offset = nir_intrinsic_base(instr);
2793 elk_fs_inst *inst;
2794
2795 const bool multi_patch =
2796 vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH;
2797
2798 elk_fs_reg icp_handle = multi_patch ?
2799 get_tcs_multi_patch_icp_handle(ntb, bld, instr) :
2800 get_tcs_single_patch_icp_handle(ntb, bld, instr);
2801
2802 /* We can only read two double components with each URB read, so
2803 * we send two read messages in that case, each one loading up to
2804 * two double components.
2805 */
2806 unsigned num_components = instr->num_components;
2807 unsigned first_component = nir_intrinsic_component(instr);
2808
2809 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2810 srcs[URB_LOGICAL_SRC_HANDLE] = icp_handle;
2811
2812 if (indirect_offset.file == BAD_FILE) {
2813 /* Constant indexing - use global offset. */
2814 if (first_component != 0) {
2815 unsigned read_components = num_components + first_component;
2816 elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2817 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp, srcs,
2818 ARRAY_SIZE(srcs));
2819 for (unsigned i = 0; i < num_components; i++) {
2820 bld.MOV(offset(dst, bld, i),
2821 offset(tmp, bld, i + first_component));
2822 }
2823 } else {
2824 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst, srcs,
2825 ARRAY_SIZE(srcs));
2826 }
2827 inst->offset = imm_offset;
2828 } else {
2829 /* Indirect indexing - use per-slot offsets as well. */
2830 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
2831
2832 if (first_component != 0) {
2833 unsigned read_components = num_components + first_component;
2834 elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2835 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2836 srcs, ARRAY_SIZE(srcs));
2837 for (unsigned i = 0; i < num_components; i++) {
2838 bld.MOV(offset(dst, bld, i),
2839 offset(tmp, bld, i + first_component));
2840 }
2841 } else {
2842 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst,
2843 srcs, ARRAY_SIZE(srcs));
2844 }
2845 inst->offset = imm_offset;
2846 }
2847 inst->size_written = (num_components + first_component) *
2848 inst->dst.component_size(inst->exec_size);
2849
2850 /* Copy the temporary to the destination to deal with writemasking.
2851 *
2852 * Also attempt to deal with gl_PointSize being in the .w component.
2853 */
2854 if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
2855 assert(type_sz(dst.type) == 4);
2856 inst->dst = bld.vgrf(dst.type, 4);
2857 inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
2858 bld.MOV(dst, offset(inst->dst, bld, 3));
2859 }
2860 break;
2861 }
2862
2863 case nir_intrinsic_load_output:
2864 case nir_intrinsic_load_per_vertex_output: {
2865 assert(instr->def.bit_size == 32);
2866 elk_fs_reg indirect_offset = get_indirect_offset(ntb, instr);
2867 unsigned imm_offset = nir_intrinsic_base(instr);
2868 unsigned first_component = nir_intrinsic_component(instr);
2869
2870 elk_fs_inst *inst;
2871 if (indirect_offset.file == BAD_FILE) {
2872 /* This MOV replicates the output handle to all enabled channels
2873 * in SINGLE_PATCH mode.
2874 */
2875 elk_fs_reg patch_handle = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
2876 bld.MOV(patch_handle, s.tcs_payload().patch_urb_output);
2877
2878 {
2879 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2880 srcs[URB_LOGICAL_SRC_HANDLE] = patch_handle;
2881
2882 if (first_component != 0) {
2883 unsigned read_components =
2884 instr->num_components + first_component;
2885 elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2886 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2887 srcs, ARRAY_SIZE(srcs));
2888 inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
2889 for (unsigned i = 0; i < instr->num_components; i++) {
2890 bld.MOV(offset(dst, bld, i),
2891 offset(tmp, bld, i + first_component));
2892 }
2893 } else {
2894 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst,
2895 srcs, ARRAY_SIZE(srcs));
2896 inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
2897 }
2898 inst->offset = imm_offset;
2899 }
2900 } else {
2901 /* Indirect indexing - use per-slot offsets as well. */
2902 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2903 srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output;
2904 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
2905
2906 if (first_component != 0) {
2907 unsigned read_components =
2908 instr->num_components + first_component;
2909 elk_fs_reg tmp = bld.vgrf(dst.type, read_components);
2910 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
2911 srcs, ARRAY_SIZE(srcs));
2912 inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
2913 for (unsigned i = 0; i < instr->num_components; i++) {
2914 bld.MOV(offset(dst, bld, i),
2915 offset(tmp, bld, i + first_component));
2916 }
2917 } else {
2918 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dst,
2919 srcs, ARRAY_SIZE(srcs));
2920 inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
2921 }
2922 inst->offset = imm_offset;
2923 }
2924 break;
2925 }
2926
2927 case nir_intrinsic_store_output:
2928 case nir_intrinsic_store_per_vertex_output: {
2929 assert(nir_src_bit_size(instr->src[0]) == 32);
2930 elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
2931 elk_fs_reg indirect_offset = get_indirect_offset(ntb, instr);
2932 unsigned imm_offset = nir_intrinsic_base(instr);
2933 unsigned mask = nir_intrinsic_write_mask(instr);
2934
2935 if (mask == 0)
2936 break;
2937
2938 unsigned num_components = util_last_bit(mask);
2939 unsigned first_component = nir_intrinsic_component(instr);
2940 assert((first_component + num_components) <= 4);
2941
2942 mask = mask << first_component;
2943
2944 elk_fs_reg mask_reg;
2945 if (mask != WRITEMASK_XYZW)
2946 mask_reg = elk_imm_ud(mask << 16);
2947
2948 elk_fs_reg sources[4];
2949
2950 unsigned m = first_component;
2951 for (unsigned i = 0; i < num_components; i++) {
2952 int c = i + first_component;
2953 if (mask & (1 << c)) {
2954 sources[m++] = offset(value, bld, i);
2955 } else {
2956 m++;
2957 }
2958 }
2959
2960 assert(m == (first_component + num_components));
2961
2962 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
2963 srcs[URB_LOGICAL_SRC_HANDLE] = s.tcs_payload().patch_urb_output;
2964 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
2965 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = mask_reg;
2966 srcs[URB_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_F, m);
2967 srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(m);
2968 bld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, m, 0);
2969
2970 elk_fs_inst *inst = bld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
2971 srcs, ARRAY_SIZE(srcs));
2972 inst->offset = imm_offset;
2973 break;
2974 }
2975
2976 default:
2977 fs_nir_emit_intrinsic(ntb, bld, instr);
2978 break;
2979 }
2980 }
2981
2982 static void
2983 fs_nir_emit_tes_intrinsic(nir_to_elk_state &ntb,
2984 nir_intrinsic_instr *instr)
2985 {
2986 const intel_device_info *devinfo = ntb.devinfo;
2987 const fs_builder &bld = ntb.bld;
2988 elk_fs_visitor &s = ntb.s;
2989
2990 assert(s.stage == MESA_SHADER_TESS_EVAL);
2991 struct elk_tes_prog_data *tes_prog_data = elk_tes_prog_data(s.prog_data);
2992
2993 elk_fs_reg dest;
2994 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2995 dest = get_nir_def(ntb, instr->def);
2996
2997 switch (instr->intrinsic) {
2998 case nir_intrinsic_load_primitive_id:
2999 bld.MOV(dest, s.tes_payload().primitive_id);
3000 break;
3001
3002 case nir_intrinsic_load_tess_coord:
3003 for (unsigned i = 0; i < 3; i++)
3004 bld.MOV(offset(dest, bld, i), s.tes_payload().coords[i]);
3005 break;
3006
3007 case nir_intrinsic_load_input:
3008 case nir_intrinsic_load_per_vertex_input: {
3009 assert(instr->def.bit_size == 32);
3010 elk_fs_reg indirect_offset = get_indirect_offset(ntb, instr);
3011 unsigned imm_offset = nir_intrinsic_base(instr);
3012 unsigned first_component = nir_intrinsic_component(instr);
3013
3014 elk_fs_inst *inst;
3015 if (indirect_offset.file == BAD_FILE) {
3016 /* Arbitrarily only push up to 32 vec4 slots worth of data,
3017 * which is 16 registers (since each holds 2 vec4 slots).
3018 */
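/* For example, a push read of vec4 slot 5 requires
 * urb_read_length >= (5 / 2) + 1 == 3 registers.
 */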
3019 const unsigned max_push_slots = 32;
3020 if (imm_offset < max_push_slots) {
3021 const elk_fs_reg src = horiz_offset(elk_fs_reg(ATTR, 0, dest.type),
3022 4 * imm_offset + first_component);
3023 for (int i = 0; i < instr->num_components; i++)
3024 bld.MOV(offset(dest, bld, i), component(src, i));
3025
3026 tes_prog_data->base.urb_read_length =
3027 MAX2(tes_prog_data->base.urb_read_length,
3028 (imm_offset / 2) + 1);
3029 } else {
3030 /* Replicate the patch handle to all enabled channels */
3031 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
3032 srcs[URB_LOGICAL_SRC_HANDLE] = s.tes_payload().patch_urb_input;
3033
3034 if (first_component != 0) {
3035 unsigned read_components =
3036 instr->num_components + first_component;
3037 elk_fs_reg tmp = bld.vgrf(dest.type, read_components);
3038 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3039 srcs, ARRAY_SIZE(srcs));
3040 inst->size_written = read_components * REG_SIZE * reg_unit(devinfo);
3041 for (unsigned i = 0; i < instr->num_components; i++) {
3042 bld.MOV(offset(dest, bld, i),
3043 offset(tmp, bld, i + first_component));
3044 }
3045 } else {
3046 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dest,
3047 srcs, ARRAY_SIZE(srcs));
3048 inst->size_written = instr->num_components * REG_SIZE * reg_unit(devinfo);
3049 }
3050 inst->offset = imm_offset;
3051 }
3052 } else {
3053 /* Indirect indexing - use per-slot offsets as well. */
3054
3055 /* We can only read two double components with each URB read, so
3056 * we send two read messages in that case, each one loading up to
3057 * two double components.
3058 */
3059 unsigned num_components = instr->num_components;
3060
3061 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
3062 srcs[URB_LOGICAL_SRC_HANDLE] = s.tes_payload().patch_urb_input;
3063 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = indirect_offset;
3064
3065 if (first_component != 0) {
3066 unsigned read_components =
3067 num_components + first_component;
3068 elk_fs_reg tmp = bld.vgrf(dest.type, read_components);
3069 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, tmp,
3070 srcs, ARRAY_SIZE(srcs));
3071 for (unsigned i = 0; i < num_components; i++) {
3072 bld.MOV(offset(dest, bld, i),
3073 offset(tmp, bld, i + first_component));
3074 }
3075 } else {
3076 inst = bld.emit(ELK_SHADER_OPCODE_URB_READ_LOGICAL, dest,
3077 srcs, ARRAY_SIZE(srcs));
3078 }
3079 inst->offset = imm_offset;
3080 inst->size_written = (num_components + first_component) *
3081 inst->dst.component_size(inst->exec_size);
3082 }
3083 break;
3084 }
3085 default:
3086 fs_nir_emit_intrinsic(ntb, bld, instr);
3087 break;
3088 }
3089 }
3090
3091 static void
3092 fs_nir_emit_gs_intrinsic(nir_to_elk_state &ntb,
3093 nir_intrinsic_instr *instr)
3094 {
3095 const fs_builder &bld = ntb.bld;
3096 elk_fs_visitor &s = ntb.s;
3097
3098 assert(s.stage == MESA_SHADER_GEOMETRY);
3099 elk_fs_reg indirect_offset;
3100
3101 elk_fs_reg dest;
3102 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3103 dest = get_nir_def(ntb, instr->def);
3104
3105 switch (instr->intrinsic) {
3106 case nir_intrinsic_load_primitive_id:
3107 assert(s.stage == MESA_SHADER_GEOMETRY);
3108 assert(elk_gs_prog_data(s.prog_data)->include_primitive_id);
3109 bld.MOV(retype(dest, ELK_REGISTER_TYPE_UD), s.gs_payload().primitive_id);
3110 break;
3111
3112 case nir_intrinsic_load_input:
3113 unreachable("load_input intrinsics are invalid for the GS stage");
3114
3115 case nir_intrinsic_load_per_vertex_input:
3116 emit_gs_input_load(ntb, dest, instr->src[0], nir_intrinsic_base(instr),
3117 instr->src[1], instr->num_components,
3118 nir_intrinsic_component(instr));
3119 break;
3120
3121 case nir_intrinsic_emit_vertex_with_counter:
3122 emit_gs_vertex(ntb, instr->src[0], nir_intrinsic_stream_id(instr));
3123 break;
3124
3125 case nir_intrinsic_end_primitive_with_counter:
3126 emit_gs_end_primitive(ntb, instr->src[0]);
3127 break;
3128
3129 case nir_intrinsic_set_vertex_and_primitive_count:
3130 bld.MOV(s.final_gs_vertex_count, get_nir_src(ntb, instr->src[0]));
3131 break;
3132
3133 case nir_intrinsic_load_invocation_id: {
3134 elk_fs_reg val = ntb.system_values[SYSTEM_VALUE_INVOCATION_ID];
3135 assert(val.file != BAD_FILE);
3136 dest.type = val.type;
3137 bld.MOV(dest, val);
3138 break;
3139 }
3140
3141 default:
3142 fs_nir_emit_intrinsic(ntb, bld, instr);
3143 break;
3144 }
3145 }
3146
3147 /**
3148 * Fetch the current render target layer index.
3149 */
3150 static elk_fs_reg
3151 fetch_render_target_array_index(const fs_builder &bld)
3152 {
3153 if (bld.shader->devinfo->ver >= 6) {
3154 /* The render target array index is provided in the thread payload as
3155 * bits 26:16 of r0.0.
3156 */
3157 const elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_UD);
3158 bld.AND(idx, elk_uw1_reg(ELK_GENERAL_REGISTER_FILE, 0, 1),
3159 elk_imm_uw(0x7ff));
3160 return idx;
3161 } else {
3162 /* Pre-SNB we only ever render into the first layer of the framebuffer
3163 * since layered rendering is not implemented.
3164 */
3165 return elk_imm_ud(0);
3166 }
3167 }
3168
3169 /* Sample from the MCS surface attached to this multisample texture. */
3170 static elk_fs_reg
3171 emit_mcs_fetch(nir_to_elk_state &ntb, const elk_fs_reg &coordinate, unsigned components,
3172 const elk_fs_reg &texture,
3173 const elk_fs_reg &texture_handle)
3174 {
3175 const fs_builder &bld = ntb.bld;
3176
3177 const elk_fs_reg dest = ntb.s.vgrf(glsl_uvec4_type());
3178
3179 elk_fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
3180 srcs[TEX_LOGICAL_SRC_COORDINATE] = coordinate;
3181 srcs[TEX_LOGICAL_SRC_SURFACE] = texture;
3182 srcs[TEX_LOGICAL_SRC_SAMPLER] = elk_imm_ud(0);
3183 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = texture_handle;
3184 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = elk_imm_d(components);
3185 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = elk_imm_d(0);
3186 srcs[TEX_LOGICAL_SRC_RESIDENCY] = elk_imm_d(0);
3187
3188 elk_fs_inst *inst = bld.emit(ELK_SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs,
3189 ARRAY_SIZE(srcs));
3190
3191 /* We only care about one or two regs of response, but the sampler always
3192 * writes 4/8.
3193 */
3194 inst->size_written = 4 * dest.component_size(inst->exec_size);
3195
3196 return dest;
3197 }
3198
3199 /**
3200 * Fake non-coherent framebuffer read implemented using TXF to fetch from the
3201 * framebuffer at the current fragment coordinates and sample index.
3202 */
3203 static elk_fs_inst *
3204 emit_non_coherent_fb_read(nir_to_elk_state &ntb, const fs_builder &bld, const elk_fs_reg &dst,
3205 unsigned target)
3206 {
3207 elk_fs_visitor &s = ntb.s;
3208
3209 assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
3210 const elk_wm_prog_key *wm_key =
3211 reinterpret_cast<const elk_wm_prog_key *>(s.key);
3212 assert(!wm_key->coherent_fb_fetch);
3213
3214 /* Calculate the fragment coordinates. */
3215 const elk_fs_reg coords = bld.vgrf(ELK_REGISTER_TYPE_UD, 3);
3216 bld.MOV(offset(coords, bld, 0), s.pixel_x);
3217 bld.MOV(offset(coords, bld, 1), s.pixel_y);
3218 bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
3219
3220 /* Calculate the sample index and MCS payload when multisampling. Luckily
3221 * the MCS fetch message behaves deterministically for UMS surfaces, so it
3222 * shouldn't be necessary to recompile based on whether the framebuffer is
3223 * CMS or UMS.
3224 */
3225 assert(wm_key->multisample_fbo == ELK_ALWAYS ||
3226 wm_key->multisample_fbo == ELK_NEVER);
3227 if (wm_key->multisample_fbo &&
3228 ntb.system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
3229 ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(ntb);
3230
3231 const elk_fs_reg sample = ntb.system_values[SYSTEM_VALUE_SAMPLE_ID];
3232 const elk_fs_reg mcs = wm_key->multisample_fbo ?
3233 emit_mcs_fetch(ntb, coords, 3, elk_imm_ud(target), elk_fs_reg()) : elk_fs_reg();
3234
3235 /* Use either a normal or a CMS texel fetch message depending on whether
3236 * the framebuffer is single or multisample. On SKL+ use the wide CMS
3237 * message just in case the framebuffer uses 16x multisampling, it should
3238 * be equivalent to the normal CMS fetch for lower multisampling modes.
3239 */
3240 elk_opcode op;
3241 if (wm_key->multisample_fbo) {
3242 op = ELK_SHADER_OPCODE_TXF_CMS_LOGICAL;
3243 } else {
3244 op = ELK_SHADER_OPCODE_TXF_LOGICAL;
3245 }
3246
3247 /* Emit the instruction. */
3248 elk_fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
3249 srcs[TEX_LOGICAL_SRC_COORDINATE] = coords;
3250 srcs[TEX_LOGICAL_SRC_LOD] = elk_imm_ud(0);
3251 srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = sample;
3252 srcs[TEX_LOGICAL_SRC_MCS] = mcs;
3253 srcs[TEX_LOGICAL_SRC_SURFACE] = elk_imm_ud(target);
3254 srcs[TEX_LOGICAL_SRC_SAMPLER] = elk_imm_ud(0);
3255 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = elk_imm_ud(3);
3256 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = elk_imm_ud(0);
3257 srcs[TEX_LOGICAL_SRC_RESIDENCY] = elk_imm_ud(0);
3258
3259 elk_fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
3260 inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3261
3262 return inst;
3263 }
3264
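/* Return a temporary register backing a fragment output.  If regs[0] was
 * already allocated it is simply reused; otherwise a new float VGRF of the
 * requested size is allocated and recorded in all n entries, so outputs that
 * alias the same storage (e.g. gl_FragColor broadcast to every color region)
 * share one temporary.
 */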
3265 static elk_fs_reg
3266 alloc_temporary(const fs_builder &bld, unsigned size, elk_fs_reg *regs, unsigned n)
3267 {
3268 if (n && regs[0].file != BAD_FILE) {
3269 return regs[0];
3270
3271 } else {
3272 const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_F, size);
3273
3274 for (unsigned i = 0; i < n; i++)
3275 regs[i] = tmp;
3276
3277 return tmp;
3278 }
3279 }
3280
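/* Map a packed ELK_NIR_FRAG_OUTPUT_{LOCATION,INDEX} value to the visitor
 * register that backs it: dual-source blend outputs, gl_FragColor/gl_FragData,
 * depth, stencil and the sample mask each get their own temporary.
 */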
3281 static elk_fs_reg
3282 alloc_frag_output(nir_to_elk_state &ntb, unsigned location)
3283 {
3284 elk_fs_visitor &s = ntb.s;
3285
3286 assert(s.stage == MESA_SHADER_FRAGMENT);
3287 const elk_wm_prog_key *const key =
3288 reinterpret_cast<const elk_wm_prog_key *>(s.key);
3289 const unsigned l = GET_FIELD(location, ELK_NIR_FRAG_OUTPUT_LOCATION);
3290 const unsigned i = GET_FIELD(location, ELK_NIR_FRAG_OUTPUT_INDEX);
3291
3292 if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
3293 return alloc_temporary(ntb.bld, 4, &s.dual_src_output, 1);
3294
3295 else if (l == FRAG_RESULT_COLOR)
3296 return alloc_temporary(ntb.bld, 4, s.outputs,
3297 MAX2(key->nr_color_regions, 1));
3298
3299 else if (l == FRAG_RESULT_DEPTH)
3300 return alloc_temporary(ntb.bld, 1, &s.frag_depth, 1);
3301
3302 else if (l == FRAG_RESULT_STENCIL)
3303 return alloc_temporary(ntb.bld, 1, &s.frag_stencil, 1);
3304
3305 else if (l == FRAG_RESULT_SAMPLE_MASK)
3306 return alloc_temporary(ntb.bld, 1, &s.sample_mask, 1);
3307
3308 else if (l >= FRAG_RESULT_DATA0 &&
3309 l < FRAG_RESULT_DATA0 + ELK_MAX_DRAW_BUFFERS)
3310 return alloc_temporary(ntb.bld, 4,
3311 &s.outputs[l - FRAG_RESULT_DATA0], 1);
3312
3313 else
3314 unreachable("Invalid location");
3315 }
3316
3317 static void
3318 emit_is_helper_invocation(nir_to_elk_state &ntb, elk_fs_reg result)
3319 {
3320 const fs_builder &bld = ntb.bld;
3321
3322 /* Unlike the regular gl_HelperInvocation, which is defined at dispatch,
3323 * helperInvocationEXT() (aka SpvOpIsHelperInvocationEXT) takes demoted
3324 * invocations into consideration.
3325 */
3326 result.type = ELK_REGISTER_TYPE_UD;
3327
3328 bld.MOV(result, elk_imm_ud(0));
3329
3330 /* See elk_sample_mask_reg() for why we split SIMD32 into SIMD16 here. */
3331 unsigned width = bld.dispatch_width();
3332 for (unsigned i = 0; i < DIV_ROUND_UP(width, 16); i++) {
3333 const fs_builder b = bld.group(MIN2(width, 16), i);
3334
3335 elk_fs_inst *mov = b.MOV(offset(result, b, i), elk_imm_ud(~0));
3336
3337 /* The at() ensures that any code emitted to get the predicate happens
3338 * before the mov right above. This is not an issue elsewhere because
3339 * lowering code already set up the builder this way.
3340 */
3341 elk_emit_predicate_on_sample_mask(b.at(NULL, mov), mov);
3342 mov->predicate_inverse = true;
3343 }
3344 }
3345
3346 static void
3347 emit_fragcoord_interpolation(nir_to_elk_state &ntb, elk_fs_reg wpos)
3348 {
3349 const intel_device_info *devinfo = ntb.devinfo;
3350 const fs_builder &bld = ntb.bld;
3351 elk_fs_visitor &s = ntb.s;
3352
3353 assert(s.stage == MESA_SHADER_FRAGMENT);
3354
3355 /* gl_FragCoord.x */
3356 bld.MOV(wpos, s.pixel_x);
3357 wpos = offset(wpos, bld, 1);
3358
3359 /* gl_FragCoord.y */
3360 bld.MOV(wpos, s.pixel_y);
3361 wpos = offset(wpos, bld, 1);
3362
3363 /* gl_FragCoord.z */
3364 if (devinfo->ver >= 6) {
3365 bld.MOV(wpos, s.pixel_z);
3366 } else {
3367 bld.emit(ELK_FS_OPCODE_LINTERP, wpos,
3368 s.delta_xy[ELK_BARYCENTRIC_PERSPECTIVE_PIXEL],
3369 s.interp_reg(bld, VARYING_SLOT_POS, 2, 0));
3370 }
3371 wpos = offset(wpos, bld, 1);
3372
3373 /* gl_FragCoord.w: Already set up in emit_interpolation */
3374 bld.MOV(wpos, s.wpos_w);
3375 }
3376
3377 static elk_fs_reg
3378 emit_frontfacing_interpolation(nir_to_elk_state &ntb)
3379 {
3380 const intel_device_info *devinfo = ntb.devinfo;
3381 const fs_builder &bld = ntb.bld;
3382
3383 elk_fs_reg ff = bld.vgrf(ELK_REGISTER_TYPE_D);
3384
3385 if (devinfo->ver >= 6) {
3386 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
3387 * a boolean result from this (~0/true or 0/false).
3388 *
3389 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
3390 * this task in only one instruction:
3391 * - a negation source modifier will flip the bit; and
3392 * - a W -> D type conversion will sign extend the bit into the high
3393 * word of the destination.
3394 *
3395 * An ASR 15 fills the low word of the destination.
3396 */
3397 elk_fs_reg g0 = elk_fs_reg(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_W));
3398 g0.negate = true;
3399
3400 bld.ASR(ff, g0, elk_imm_d(15));
3401 } else {
3402 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
3403 * a boolean result from this (1/true or 0/false).
3404 *
3405 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
3406 * the negation source modifier to flip it. Unfortunately the SHR
3407 * instruction only operates on UD (or D with an abs source modifier)
3408 * sources without negation.
3409 *
3410 * Instead, use ASR (which will give ~0/true or 0/false).
3411 */
3412 elk_fs_reg g1_6 = elk_fs_reg(retype(elk_vec1_grf(1, 6), ELK_REGISTER_TYPE_D));
3413 g1_6.negate = true;
3414
3415 bld.ASR(ff, g1_6, elk_imm_d(31));
3416 }
3417
3418 return ff;
3419 }
3420
3421 static elk_fs_reg
3422 emit_samplepos_setup(nir_to_elk_state &ntb)
3423 {
3424 const intel_device_info *devinfo = ntb.devinfo;
3425 const fs_builder &bld = ntb.bld;
3426 elk_fs_visitor &s = ntb.s;
3427
3428 assert(s.stage == MESA_SHADER_FRAGMENT);
3429 struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(s.prog_data);
3430 assert(devinfo->ver >= 6);
3431
3432 const fs_builder abld = bld.annotate("compute sample position");
3433 elk_fs_reg pos = abld.vgrf(ELK_REGISTER_TYPE_F, 2);
3434
3435 if (wm_prog_data->persample_dispatch == ELK_NEVER) {
3436 /* From ARB_sample_shading specification:
3437 * "When rendering to a non-multisample buffer, or if multisample
3438 * rasterization is disabled, gl_SamplePosition will always be
3439 * (0.5, 0.5)."
3440 */
3441 bld.MOV(offset(pos, bld, 0), elk_imm_f(0.5f));
3442 bld.MOV(offset(pos, bld, 1), elk_imm_f(0.5f));
3443 return pos;
3444 }
3445
3446 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
3447 * mode will be enabled.
3448 *
3449 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
3450 * R31.1:0 Position Offset X/Y for Slot[3:0]
3451 * R31.3:2 Position Offset X/Y for Slot[7:4]
3452 * .....
3453 *
3454 * The X, Y sample positions come in as bytes in thread payload. So, read
3455 * the positions using vstride=16, width=8, hstride=2.
3456 */
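/* Worked example (derived from the loop below): the payload stores each
 * coordinate in 1/16ths of a pixel, so a byte value of 8 scales to
 * 8 * (1/16) = 0.5, i.e. the pixel center, and 12 scales to 0.75.
 */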
3457 const elk_fs_reg sample_pos_reg =
3458 fetch_payload_reg(abld, s.fs_payload().sample_pos_reg, ELK_REGISTER_TYPE_W);
3459
3460 for (unsigned i = 0; i < 2; i++) {
3461 elk_fs_reg tmp_d = bld.vgrf(ELK_REGISTER_TYPE_D);
3462 abld.MOV(tmp_d, subscript(sample_pos_reg, ELK_REGISTER_TYPE_B, i));
3463 /* Convert int_sample_pos to floating point */
3464 elk_fs_reg tmp_f = bld.vgrf(ELK_REGISTER_TYPE_F);
3465 abld.MOV(tmp_f, tmp_d);
3466 /* Scale to the range [0, 1] */
3467 abld.MUL(offset(pos, abld, i), tmp_f, elk_imm_f(1 / 16.0f));
3468 }
3469
3470 if (wm_prog_data->persample_dispatch == ELK_SOMETIMES) {
3471 check_dynamic_msaa_flag(abld, wm_prog_data,
3472 INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
3473 for (unsigned i = 0; i < 2; i++) {
3474 set_predicate(ELK_PREDICATE_NORMAL,
3475 bld.SEL(offset(pos, abld, i), offset(pos, abld, i),
3476 elk_imm_f(0.5f)));
3477 }
3478 }
3479
3480 return pos;
3481 }
3482
3483 static elk_fs_reg
3484 emit_sampleid_setup(nir_to_elk_state &ntb)
3485 {
3486 const intel_device_info *devinfo = ntb.devinfo;
3487 const fs_builder &bld = ntb.bld;
3488 elk_fs_visitor &s = ntb.s;
3489
3490 assert(s.stage == MESA_SHADER_FRAGMENT);
3491 ASSERTED elk_wm_prog_key *key = (elk_wm_prog_key*) s.key;
3492 struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(s.prog_data);
3493 assert(devinfo->ver >= 6);
3494
3495 const fs_builder abld = bld.annotate("compute sample id");
3496 elk_fs_reg sample_id = abld.vgrf(ELK_REGISTER_TYPE_UD);
3497
3498 assert(key->multisample_fbo != ELK_NEVER);
3499
3500 if (devinfo->ver >= 8) {
3501 /* Sample ID comes in as 4-bit numbers in g1.0:
3502 *
3503 * 15:12 Slot 3 SampleID (only used in SIMD16)
3504 * 11:8 Slot 2 SampleID (only used in SIMD16)
3505 * 7:4 Slot 1 SampleID
3506 * 3:0 Slot 0 SampleID
3507 *
3508 * Each slot corresponds to four channels, so we want to replicate each
3509 * half-byte value to 4 channels in a row:
3510 *
3511 * dst+0: .7 .6 .5 .4 .3 .2 .1 .0
3512 * 7:4 7:4 7:4 7:4 3:0 3:0 3:0 3:0
3513 *
3514 * dst+1: .7 .6 .5 .4 .3 .2 .1 .0 (if SIMD16)
3515 * 15:12 15:12 15:12 15:12 11:8 11:8 11:8 11:8
3516 *
3517 * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
3518 * channels to read the first byte (7:0), and the second group of 8
3519 * channels to read the second byte (15:8). Then, we shift right by
3520 * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
3521 * values into place. Finally, we AND with 0xf to keep the low nibble.
3522 *
3523 * shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
3524 * and(16) dst<1>D tmp<8,8,1>W 0xf:W
3525 *
3526 * TODO: These payload bits exist on Gfx7 too, but they appear to always
3527 * be zero, so this code fails to work. We should find out why.
3528 */
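/* Worked example: if the first payload byte reads 0x21, channels 0-3 keep
 * 0x21 >> 0 and channels 4-7 get 0x21 >> 4; after the AND with 0xf this
 * yields sample IDs 1,1,1,1,2,2,2,2 (slot 0 = sample 1, slot 1 = sample 2).
 */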
3529 const elk_fs_reg tmp = abld.vgrf(ELK_REGISTER_TYPE_UW);
3530
3531 for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
3532 const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
3533 /* According to the "PS Thread Payload for Normal Dispatch"
3534 * pages on the BSpec, the sample ids are stored in R1.0/R2.0 on gfx8+.
3535 */
3536 const struct elk_reg id_reg = elk_vec1_grf(i + 1, 0);
3537 hbld.SHR(offset(tmp, hbld, i),
3538 stride(retype(id_reg, ELK_REGISTER_TYPE_UB), 1, 8, 0),
3539 elk_imm_v(0x44440000));
3540 }
3541
3542 abld.AND(sample_id, tmp, elk_imm_w(0xf));
3543 } else {
3544 const elk_fs_reg t1 = component(abld.vgrf(ELK_REGISTER_TYPE_UD), 0);
3545 const elk_fs_reg t2 = abld.vgrf(ELK_REGISTER_TYPE_UW);
3546
3547 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
3548 * 8x multisampling, subspan 0 will represent sample N (where N
3549 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
3550 * 7. We can find the value of N by looking at R0.0 bits 7:6
3551 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
3552 * (since samples are always delivered in pairs). That is, we
3553 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
3554 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
3555 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
3556 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
3557 * populating a temporary variable with the sequence (0, 1, 2, 3),
3558 * and then reading from it using vstride=1, width=4, hstride=0.
3559 * These computations hold good for 4x multisampling as well.
3560 *
3561 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
3562 * the first four slots are sample 0 of subspan 0; the next four
3563 * are sample 1 of subspan 0; the third group is sample 0 of
3564 * subspan 1, and finally sample 1 of subspan 1.
3565 */
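/* Worked example: with R0.0 bits 7:6 = 2 (SSPI = 2), t1 becomes
 * (0x80 & 0xc0) >> 5 = 4, so adding the SIMD8 sequence (0, 0, 0, 0, 1, 1,
 * 1, 1) yields sample IDs 4,4,4,4,5,5,5,5.
 */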
3566
3567 /* SKL+ has an extra bit for the Starting Sample Pair Index to
3568 * accommodate 16x MSAA.
3569 */
3570 abld.exec_all().group(1, 0)
3571 .AND(t1, elk_fs_reg(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UD)),
3572 elk_imm_ud(0xc0));
3573 abld.exec_all().group(1, 0).SHR(t1, t1, elk_imm_d(5));
3574
3575 /* This works for SIMD8-SIMD16. It also works for SIMD32 but only if we
3576 * can assume 4x MSAA. Disallow it on IVB+
3577 *
3578 * FINISHME: One day, we could come up with a way to do this that
3579 * actually works on gfx7.
3580 */
3581 if (devinfo->ver >= 7)
3582 s.limit_dispatch_width(16, "gl_SampleId is unsupported in SIMD32 on gfx7");
3583 abld.exec_all().group(8, 0).MOV(t2, elk_imm_v(0x32103210));
3584
3585 /* This special instruction takes care of setting vstride=1,
3586 * width=4, hstride=0 of t2 during an ADD instruction.
3587 */
3588 abld.emit(ELK_FS_OPCODE_SET_SAMPLE_ID, sample_id, t1, t2);
3589 }
3590
3591 if (key->multisample_fbo == ELK_SOMETIMES) {
3592 check_dynamic_msaa_flag(abld, wm_prog_data,
3593 INTEL_MSAA_FLAG_MULTISAMPLE_FBO);
3594 set_predicate(ELK_PREDICATE_NORMAL,
3595 abld.SEL(sample_id, sample_id, elk_imm_ud(0)));
3596 }
3597
3598 return sample_id;
3599 }
3600
3601 static elk_fs_reg
3602 emit_samplemaskin_setup(nir_to_elk_state &ntb)
3603 {
3604 const intel_device_info *devinfo = ntb.devinfo;
3605 const fs_builder &bld = ntb.bld;
3606 elk_fs_visitor &s = ntb.s;
3607
3608 assert(s.stage == MESA_SHADER_FRAGMENT);
3609 struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(s.prog_data);
3610 assert(devinfo->ver >= 6);
3611
3612 elk_fs_reg coverage_mask =
3613 fetch_payload_reg(bld, s.fs_payload().sample_mask_in_reg, ELK_REGISTER_TYPE_D);
3614
3615 if (wm_prog_data->persample_dispatch == ELK_NEVER)
3616 return coverage_mask;
3617
3618 /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
3619 * and a mask representing which sample is being processed by the
3620 * current shader invocation.
3621 *
3622 * From the OES_sample_variables specification:
3623 * "When per-sample shading is active due to the use of a fragment input
3624 * qualified by "sample" or due to the use of the gl_SampleID or
3625 * gl_SamplePosition variables, only the bit for the current sample is
3626 * set in gl_SampleMaskIn."
3627 */
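/* In other words, the code below computes
 * mask = (1 << gl_SampleID) & coverage_mask, and when per-sample dispatch
 * is only dynamically enabled it SELects between that and the raw coverage
 * mask based on the dynamic MSAA flags.
 */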
3628 const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
3629
3630 if (ntb.system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
3631 ntb.system_values[SYSTEM_VALUE_SAMPLE_ID] = emit_sampleid_setup(ntb);
3632
3633 elk_fs_reg one = s.vgrf(glsl_int_type());
3634 elk_fs_reg enabled_mask = s.vgrf(glsl_int_type());
3635 abld.MOV(one, elk_imm_d(1));
3636 abld.SHL(enabled_mask, one, ntb.system_values[SYSTEM_VALUE_SAMPLE_ID]);
3637 elk_fs_reg mask = bld.vgrf(ELK_REGISTER_TYPE_D);
3638 abld.AND(mask, enabled_mask, coverage_mask);
3639
3640 if (wm_prog_data->persample_dispatch == ELK_ALWAYS)
3641 return mask;
3642
3643 check_dynamic_msaa_flag(abld, wm_prog_data,
3644 INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH);
3645 set_predicate(ELK_PREDICATE_NORMAL, abld.SEL(mask, mask, coverage_mask));
3646
3647 return mask;
3648 }
3649
3650 static void
3651 fs_nir_emit_fs_intrinsic(nir_to_elk_state &ntb,
3652 nir_intrinsic_instr *instr)
3653 {
3654 const intel_device_info *devinfo = ntb.devinfo;
3655 const fs_builder &bld = ntb.bld;
3656 elk_fs_visitor &s = ntb.s;
3657
3658 assert(s.stage == MESA_SHADER_FRAGMENT);
3659
3660 elk_fs_reg dest;
3661 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3662 dest = get_nir_def(ntb, instr->def);
3663
3664 switch (instr->intrinsic) {
3665 case nir_intrinsic_load_front_face:
3666 bld.MOV(retype(dest, ELK_REGISTER_TYPE_D),
3667 emit_frontfacing_interpolation(ntb));
3668 break;
3669
3670 case nir_intrinsic_load_sample_pos:
3671 case nir_intrinsic_load_sample_pos_or_center: {
3672 elk_fs_reg sample_pos = ntb.system_values[SYSTEM_VALUE_SAMPLE_POS];
3673 assert(sample_pos.file != BAD_FILE);
3674 dest.type = sample_pos.type;
3675 bld.MOV(dest, sample_pos);
3676 bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
3677 break;
3678 }
3679
3680 case nir_intrinsic_load_layer_id:
3681 dest.type = ELK_REGISTER_TYPE_UD;
3682 bld.MOV(dest, fetch_render_target_array_index(bld));
3683 break;
3684
3685 case nir_intrinsic_is_helper_invocation:
3686 emit_is_helper_invocation(ntb, dest);
3687 break;
3688
3689 case nir_intrinsic_load_helper_invocation:
3690 case nir_intrinsic_load_sample_mask_in:
3691 case nir_intrinsic_load_sample_id:
3692 case nir_intrinsic_load_frag_shading_rate: {
3693 gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3694 elk_fs_reg val = ntb.system_values[sv];
3695 assert(val.file != BAD_FILE);
3696 dest.type = val.type;
3697 bld.MOV(dest, val);
3698 break;
3699 }
3700
3701 case nir_intrinsic_store_output: {
3702 const elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
3703 const unsigned store_offset = nir_src_as_uint(instr->src[1]);
3704 const unsigned location = nir_intrinsic_base(instr) +
3705 SET_FIELD(store_offset, ELK_NIR_FRAG_OUTPUT_LOCATION);
3706 const elk_fs_reg new_dest = retype(alloc_frag_output(ntb, location),
3707 src.type);
3708
3709 for (unsigned j = 0; j < instr->num_components; j++)
3710 bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
3711 offset(src, bld, j));
3712
3713 break;
3714 }
3715
3716 case nir_intrinsic_load_output: {
3717 const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
3718 ELK_NIR_FRAG_OUTPUT_LOCATION);
3719 assert(l >= FRAG_RESULT_DATA0);
3720 const unsigned load_offset = nir_src_as_uint(instr->src[0]);
3721 const unsigned target = l - FRAG_RESULT_DATA0 + load_offset;
3722 const elk_fs_reg tmp = bld.vgrf(dest.type, 4);
3723
3724 assert(!reinterpret_cast<const elk_wm_prog_key *>(s.key)->coherent_fb_fetch);
3725 emit_non_coherent_fb_read(ntb, bld, tmp, target);
3726
3727 for (unsigned j = 0; j < instr->num_components; j++) {
3728 bld.MOV(offset(dest, bld, j),
3729 offset(tmp, bld, nir_intrinsic_component(instr) + j));
3730 }
3731
3732 break;
3733 }
3734
3735 case nir_intrinsic_demote:
3736 case nir_intrinsic_terminate:
3737 case nir_intrinsic_demote_if:
3738 case nir_intrinsic_terminate_if: {
3739 /* We track our discarded pixels in f0.1/f1.0. By predicating on it, we
3740 * can update just the flag bits that aren't yet discarded. If there's
3741 * no condition, we emit a CMP of g0 != g0, so all currently executing
3742 * channels will get turned off.
3743 */
3744 elk_fs_inst *cmp = NULL;
3745 if (instr->intrinsic == nir_intrinsic_demote_if ||
3746 instr->intrinsic == nir_intrinsic_terminate_if) {
3747 nir_alu_instr *alu = nir_src_as_alu_instr(instr->src[0]);
3748
3749 if (alu != NULL &&
3750 alu->op != nir_op_bcsel &&
3751 (devinfo->ver > 5 ||
3752 (alu->instr.pass_flags & ELK_NIR_BOOLEAN_MASK) != ELK_NIR_BOOLEAN_NEEDS_RESOLVE ||
3753 alu->op == nir_op_fneu32 || alu->op == nir_op_feq32 ||
3754 alu->op == nir_op_flt32 || alu->op == nir_op_fge32 ||
3755 alu->op == nir_op_ine32 || alu->op == nir_op_ieq32 ||
3756 alu->op == nir_op_ilt32 || alu->op == nir_op_ige32 ||
3757 alu->op == nir_op_ult32 || alu->op == nir_op_uge32)) {
3758 /* Re-emit the instruction that generated the Boolean value, but
3759 * do not store it. Since this instruction will be conditional,
3760 * other instructions that want to use the real Boolean value may
3761 * get garbage. This was a problem for piglit's fs-discard-exit-2
3762 * test.
3763 *
3764 * Ideally we'd detect that the instruction cannot have a
3765 * conditional modifier before emitting the instructions. Alas,
3766 * that is nigh impossible. Instead, we're going to assume the
3767 * instruction (or last instruction) generated can have a
3768 * conditional modifier. If it cannot, fall back to the old-style
3769 * compare, and hope dead code elimination will clean up the
3770 * extra instructions generated.
3771 */
3772 fs_nir_emit_alu(ntb, alu, false);
3773
3774 cmp = (elk_fs_inst *) s.instructions.get_tail();
3775 if (cmp->conditional_mod == ELK_CONDITIONAL_NONE) {
3776 if (cmp->can_do_cmod())
3777 cmp->conditional_mod = ELK_CONDITIONAL_Z;
3778 else
3779 cmp = NULL;
3780 } else {
3781 /* The old sequence that would have been generated is,
3782 * basically, bool_result == false. This is equivalent to
3783 * !bool_result, so negate the old modifier.
3784 *
3785 * Unfortunately, we can't do this to most float comparisons
3786 * because of NaN, so we'll have to fall back to the old-style
3787 * compare.
3788 *
3789 * For example, this code (after negation):
3790 * (+f1.0) cmp.ge.f1.0(8) null<1>F g30<8,8,1>F 0x0F
3791 * will provide different results from this:
3792 * cmp.l.f0.0(8) g31<1>F g30<1,1,0>F 0x0F
3793 * (+f1.0) cmp.z.f1.0(8) null<1>D g31<8,8,1>D 0D
3794 * because both (NaN >= 0) == false and (NaN < 0) == false.
3795 *
3796 * It will still work for == and != though, because
3797 * (NaN == x) == false and (NaN != x) == true.
3798 */
3799 if (elk_type_is_float(cmp->src[0].type) &&
3800 cmp->conditional_mod != ELK_CONDITIONAL_EQ &&
3801 cmp->conditional_mod != ELK_CONDITIONAL_NEQ) {
3802 cmp = NULL;
3803 } else {
3804 cmp->conditional_mod = elk_negate_cmod(cmp->conditional_mod);
3805 }
3806 }
3807 }
3808
3809 if (cmp == NULL) {
3810 cmp = bld.CMP(bld.null_reg_f(), get_nir_src(ntb, instr->src[0]),
3811 elk_imm_d(0), ELK_CONDITIONAL_Z);
3812 }
3813 } else {
3814 elk_fs_reg some_reg = elk_fs_reg(retype(elk_vec8_grf(0, 0),
3815 ELK_REGISTER_TYPE_UW));
3816 cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, ELK_CONDITIONAL_NZ);
3817 }
3818
3819 cmp->predicate = ELK_PREDICATE_NORMAL;
3820 cmp->flag_subreg = sample_mask_flag_subreg(s);
3821
3822 elk_fs_inst *jump = bld.emit(ELK_OPCODE_HALT);
3823 jump->flag_subreg = sample_mask_flag_subreg(s);
3824 jump->predicate_inverse = true;
3825
3826 if (instr->intrinsic == nir_intrinsic_terminate ||
3827 instr->intrinsic == nir_intrinsic_terminate_if) {
3828 jump->predicate = ELK_PREDICATE_NORMAL;
3829 } else {
3830 /* Only jump when the whole quad is demoted. For historical
3831 * reasons this is also used for discard.
3832 */
3833 jump->predicate = ELK_PREDICATE_ALIGN1_ANY4H;
3834 }
3835
3836 if (devinfo->ver < 7)
3837 s.limit_dispatch_width(
3838 16, "Fragment discard/demote not implemented in SIMD32 mode.\n");
3839 break;
3840 }
3841
3842 case nir_intrinsic_load_input:
3843 case nir_intrinsic_load_per_primitive_input: {
3844 /* In Fragment Shaders load_input is used either for flat inputs or
3845 * per-primitive inputs.
3846 */
3847 assert(instr->def.bit_size == 32);
3848 unsigned base = nir_intrinsic_base(instr);
3849 unsigned comp = nir_intrinsic_component(instr);
3850 unsigned num_components = instr->num_components;
3851
3852 /* Special case fields in the VUE header */
3853 if (base == VARYING_SLOT_LAYER)
3854 comp = 1;
3855 else if (base == VARYING_SLOT_VIEWPORT)
3856 comp = 2;
3857
3858 if (BITFIELD64_BIT(base) & s.nir->info.per_primitive_inputs) {
3859 assert(base != VARYING_SLOT_PRIMITIVE_INDICES);
3860 for (unsigned int i = 0; i < num_components; i++) {
3861 bld.MOV(offset(dest, bld, i),
3862 retype(s.per_primitive_reg(bld, base, comp + i), dest.type));
3863 }
3864 } else {
3865 const unsigned k = 3;
3866 for (unsigned int i = 0; i < num_components; i++) {
3867 bld.MOV(offset(dest, bld, i),
3868 retype(s.interp_reg(bld, base, comp + i, k), dest.type));
3869 }
3870 }
3871 break;
3872 }
3873
3874 case nir_intrinsic_load_fs_input_interp_deltas: {
3875 assert(s.stage == MESA_SHADER_FRAGMENT);
3876 assert(nir_src_as_uint(instr->src[0]) == 0);
3877 const unsigned base = nir_intrinsic_base(instr);
3878 const unsigned comp = nir_intrinsic_component(instr);
3879 dest.type = ELK_REGISTER_TYPE_F;
3880
3881 bld.MOV(offset(dest, bld, 0), s.interp_reg(bld, base, comp, 3));
3882 bld.MOV(offset(dest, bld, 1), s.interp_reg(bld, base, comp, 1));
3883 bld.MOV(offset(dest, bld, 2), s.interp_reg(bld, base, comp, 0));
3884
3885 break;
3886 }
3887
3888 case nir_intrinsic_load_barycentric_pixel:
3889 case nir_intrinsic_load_barycentric_centroid:
3890 case nir_intrinsic_load_barycentric_sample: {
3891 /* Use the delta_xy values computed from the payload */
3892 enum elk_barycentric_mode bary = elk_barycentric_mode(instr);
3893 const elk_fs_reg srcs[] = { offset(s.delta_xy[bary], bld, 0),
3894 offset(s.delta_xy[bary], bld, 1) };
3895 bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
3896 break;
3897 }
3898
3899 case nir_intrinsic_load_barycentric_at_sample: {
3900 const glsl_interp_mode interpolation =
3901 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3902
3903 elk_fs_reg msg_data;
3904 if (nir_src_is_const(instr->src[0])) {
3905 msg_data = elk_imm_ud(nir_src_as_uint(instr->src[0]) << 4);
3906 } else {
3907 const elk_fs_reg sample_src = retype(get_nir_src(ntb, instr->src[0]),
3908 ELK_REGISTER_TYPE_UD);
3909 const elk_fs_reg sample_id = bld.emit_uniformize(sample_src);
3910 msg_data = component(bld.group(8, 0).vgrf(ELK_REGISTER_TYPE_UD), 0);
3911 bld.exec_all().group(1, 0).SHL(msg_data, sample_id, elk_imm_ud(4u));
3912 }
3913
3914 elk_fs_reg flag_reg;
3915 struct elk_wm_prog_key *wm_prog_key = (struct elk_wm_prog_key *) s.key;
3916 if (wm_prog_key->multisample_fbo == ELK_SOMETIMES) {
3917 struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(s.prog_data);
3918
3919 check_dynamic_msaa_flag(bld.exec_all().group(8, 0),
3920 wm_prog_data,
3921 INTEL_MSAA_FLAG_MULTISAMPLE_FBO);
3922 flag_reg = elk_flag_reg(0, 0);
3923 }
3924
3925 emit_pixel_interpolater_send(bld,
3926 ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3927 dest,
3928 elk_fs_reg(), /* src */
3929 msg_data,
3930 flag_reg,
3931 interpolation);
3932 break;
3933 }
3934
3935 case nir_intrinsic_load_barycentric_at_offset: {
3936 const glsl_interp_mode interpolation =
3937 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3938
3939 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3940
3941 if (const_offset) {
3942 assert(nir_src_bit_size(instr->src[0]) == 32);
3943 unsigned off_x = const_offset[0].u32 & 0xf;
3944 unsigned off_y = const_offset[1].u32 & 0xf;
3945
3946 emit_pixel_interpolater_send(bld,
3947 ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
3948 dest,
3949 elk_fs_reg(), /* src */
3950 elk_imm_ud(off_x | (off_y << 4)),
3951 elk_fs_reg(), /* flag_reg */
3952 interpolation);
3953 } else {
3954 elk_fs_reg src = retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_D);
3955 const enum elk_opcode opcode = ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
3956 emit_pixel_interpolater_send(bld,
3957 opcode,
3958 dest,
3959 src,
3960 elk_imm_ud(0u),
3961 elk_fs_reg(), /* flag_reg */
3962 interpolation);
3963 }
3964 break;
3965 }
3966
3967 case nir_intrinsic_load_frag_coord:
3968 emit_fragcoord_interpolation(ntb, dest);
3969 break;
3970
3971 case nir_intrinsic_load_interpolated_input: {
3972 assert(instr->src[0].ssa &&
3973 instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
3974 nir_intrinsic_instr *bary_intrinsic =
3975 nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3976 nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
3977 enum glsl_interp_mode interp_mode =
3978 (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);
3979 elk_fs_reg dst_xy;
3980
3981 if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
3982 bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
3983 /* Use the result of the PI message. */
3984 dst_xy = retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_F);
3985 } else {
3986 /* Use the delta_xy values computed from the payload */
3987 enum elk_barycentric_mode bary = elk_barycentric_mode(bary_intrinsic);
3988 dst_xy = s.delta_xy[bary];
3989 }
3990
3991 for (unsigned int i = 0; i < instr->num_components; i++) {
3992 elk_fs_reg interp =
3993 s.interp_reg(bld, nir_intrinsic_base(instr),
3994 nir_intrinsic_component(instr) + i, 0);
3995 interp.type = ELK_REGISTER_TYPE_F;
3996 dest.type = ELK_REGISTER_TYPE_F;
3997
3998 if (devinfo->ver < 6 && interp_mode == INTERP_MODE_SMOOTH) {
3999 elk_fs_reg tmp = s.vgrf(glsl_float_type());
4000 bld.emit(ELK_FS_OPCODE_LINTERP, tmp, dst_xy, interp);
4001 bld.MUL(offset(dest, bld, i), tmp, s.pixel_w);
4002 } else {
4003 bld.emit(ELK_FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
4004 }
4005 }
4006 break;
4007 }
4008
4009 default:
4010 fs_nir_emit_intrinsic(ntb, bld, instr);
4011 break;
4012 }
4013 }
4014
4015 static void
4016 fs_nir_emit_cs_intrinsic(nir_to_elk_state &ntb,
4017 nir_intrinsic_instr *instr)
4018 {
4019 const intel_device_info *devinfo = ntb.devinfo;
4020 const fs_builder &bld = ntb.bld;
4021 elk_fs_visitor &s = ntb.s;
4022
4023 assert(gl_shader_stage_uses_workgroup(s.stage));
4024 struct elk_cs_prog_data *cs_prog_data = elk_cs_prog_data(s.prog_data);
4025
4026 elk_fs_reg dest;
4027 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4028 dest = get_nir_def(ntb, instr->def);
4029
4030 switch (instr->intrinsic) {
4031 case nir_intrinsic_barrier:
4032 if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
4033 fs_nir_emit_intrinsic(ntb, bld, instr);
4034 if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
4035 /* The whole workgroup fits in a single HW thread, so all the
4036 * invocations are already executed lock-step. Instead of an actual
4037 * barrier, just emit a scheduling fence, which will generate no code.
4038 */
4039 if (!s.nir->info.workgroup_size_variable &&
4040 s.workgroup_size() <= s.dispatch_width) {
4041 bld.exec_all().group(1, 0).emit(ELK_FS_OPCODE_SCHEDULING_FENCE);
4042 break;
4043 }
4044
4045 emit_barrier(ntb);
4046 cs_prog_data->uses_barrier = true;
4047 }
4048 break;
4049
4050 case nir_intrinsic_load_subgroup_id:
4051 s.cs_payload().load_subgroup_id(bld, dest);
4052 break;
4053
4054 case nir_intrinsic_load_workgroup_id: {
4055 elk_fs_reg val = ntb.system_values[SYSTEM_VALUE_WORKGROUP_ID];
4056 assert(val.file != BAD_FILE);
4057 dest.type = val.type;
4058 for (unsigned i = 0; i < 3; i++)
4059 bld.MOV(offset(dest, bld, i), offset(val, bld, i));
4060 break;
4061 }
4062
4063 case nir_intrinsic_load_num_workgroups: {
4064 assert(instr->def.bit_size == 32);
4065
4066 cs_prog_data->uses_num_work_groups = true;
4067
4068 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4069 srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(0);
4070 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
4071 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(3); /* num components */
4072 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = elk_imm_ud(0);
4073 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
4074 elk_fs_inst *inst =
4075 bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4076 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4077 inst->size_written = 3 * s.dispatch_width * 4;
4078 break;
4079 }
4080
4081 case nir_intrinsic_shared_atomic:
4082 case nir_intrinsic_shared_atomic_swap:
4083 fs_nir_emit_surface_atomic(ntb, bld, instr, elk_imm_ud(GFX7_BTI_SLM),
4084 false /* bindless */);
4085 break;
4086
4087 case nir_intrinsic_load_shared: {
4088 assert(devinfo->ver >= 7);
4089
4090 const unsigned bit_size = instr->def.bit_size;
4091 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4092 srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(GFX7_BTI_SLM);
4093
4094 elk_fs_reg addr = get_nir_src(ntb, instr->src[0]);
4095 int base = nir_intrinsic_base(instr);
4096 if (base) {
4097 elk_fs_reg addr_off = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
4098 bld.ADD(addr_off, addr, elk_imm_d(base));
4099 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr_off;
4100 } else {
4101 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr;
4102 }
4103
4104 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
4105 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
4106
4107 /* Make dest unsigned because that's what the temporary will be */
4108 dest.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
4109
4110 /* Read the vector */
4111 assert(bit_size <= 32);
4112 assert(nir_intrinsic_align(instr) > 0);
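/* Two paths: dword-sized, dword-aligned data can use an untyped surface
 * read of up to 4 components; anything smaller falls back to a single
 * byte-scattered read whose result is then narrowed to the destination
 * type.
 */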
4113 if (bit_size == 32 &&
4114 nir_intrinsic_align(instr) >= 4) {
4115 assert(instr->def.num_components <= 4);
4116 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4117 elk_fs_inst *inst =
4118 bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4119 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4120 inst->size_written = instr->num_components * s.dispatch_width * 4;
4121 } else {
4122 assert(instr->def.num_components == 1);
4123 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
4124
4125 elk_fs_reg read_result = bld.vgrf(ELK_REGISTER_TYPE_UD);
4126 bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
4127 read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
4128 bld.MOV(dest, subscript(read_result, dest.type, 0));
4129 }
4130 break;
4131 }
4132
4133 case nir_intrinsic_store_shared: {
4134 assert(devinfo->ver >= 7);
4135
4136 const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4137 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4138 srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(GFX7_BTI_SLM);
4139
4140 elk_fs_reg addr = get_nir_src(ntb, instr->src[1]);
4141 int base = nir_intrinsic_base(instr);
4142 if (base) {
4143 elk_fs_reg addr_off = bld.vgrf(ELK_REGISTER_TYPE_UD, 1);
4144 bld.ADD(addr_off, addr, elk_imm_d(base));
4145 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr_off;
4146 } else {
4147 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = addr;
4148 }
4149
4150 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
4151 /* No point in masking with the sample mask; here we're handling compute
4152 * intrinsics.
4153 */
4154 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
4155
4156 elk_fs_reg data = get_nir_src(ntb, instr->src[0]);
4157 data.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
4158
4159 assert(bit_size <= 32);
4160 assert(nir_intrinsic_write_mask(instr) ==
4161 (1u << instr->num_components) - 1);
4162 assert(nir_intrinsic_align(instr) > 0);
4163 if (bit_size == 32 &&
4164 nir_intrinsic_align(instr) >= 4) {
4165 assert(nir_src_num_components(instr->src[0]) <= 4);
4166 srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4167 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4168 bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
4169 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4170 } else {
4171 assert(nir_src_num_components(instr->src[0]) == 1);
4172 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
4173
4174 srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_UD);
4175 bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
4176
4177 bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
4178 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4179 }
4180 break;
4181 }
4182
4183 case nir_intrinsic_load_workgroup_size: {
4184 /* Should have been lowered by elk_nir_lower_cs_intrinsics() or
4185 * crocus/iris_setup_uniforms() for the variable group size case.
4186 */
4187 unreachable("Should have been lowered");
4188 break;
4189 }
4190
4191 default:
4192 fs_nir_emit_intrinsic(ntb, bld, instr);
4193 break;
4194 }
4195 }
4196
4197 static elk_fs_reg
4198 elk_nir_reduction_op_identity(const fs_builder &bld,
4199 nir_op op, elk_reg_type type)
4200 {
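/* nir_alu_binop_identity() gives e.g. 0 for iadd, 1 for imul, INT_MAX for
 * imin, -INF for fmax and ~0 for iand; all that is left to do here is wrap
 * the value in an immediate of the right width and type.
 */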
4201 nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);
4202 switch (type_sz(type)) {
4203 case 1:
4204 if (type == ELK_REGISTER_TYPE_UB) {
4205 return elk_imm_uw(value.u8);
4206 } else {
4207 assert(type == ELK_REGISTER_TYPE_B);
4208 return elk_imm_w(value.i8);
4209 }
4210 case 2:
4211 return retype(elk_imm_uw(value.u16), type);
4212 case 4:
4213 return retype(elk_imm_ud(value.u32), type);
4214 case 8:
4215 if (type == ELK_REGISTER_TYPE_DF)
4216 return elk_setup_imm_df(bld, value.f64);
4217 else
4218 return retype(elk_imm_u64(value.u64), type);
4219 default:
4220 unreachable("Invalid type size");
4221 }
4222 }
4223
4224 static elk_opcode
4225 elk_op_for_nir_reduction_op(nir_op op)
4226 {
4227 switch (op) {
4228 case nir_op_iadd: return ELK_OPCODE_ADD;
4229 case nir_op_fadd: return ELK_OPCODE_ADD;
4230 case nir_op_imul: return ELK_OPCODE_MUL;
4231 case nir_op_fmul: return ELK_OPCODE_MUL;
4232 case nir_op_imin: return ELK_OPCODE_SEL;
4233 case nir_op_umin: return ELK_OPCODE_SEL;
4234 case nir_op_fmin: return ELK_OPCODE_SEL;
4235 case nir_op_imax: return ELK_OPCODE_SEL;
4236 case nir_op_umax: return ELK_OPCODE_SEL;
4237 case nir_op_fmax: return ELK_OPCODE_SEL;
4238 case nir_op_iand: return ELK_OPCODE_AND;
4239 case nir_op_ior: return ELK_OPCODE_OR;
4240 case nir_op_ixor: return ELK_OPCODE_XOR;
4241 default:
4242 unreachable("Invalid reduction operation");
4243 }
4244 }
4245
4246 static elk_conditional_mod
4247 elk_cond_mod_for_nir_reduction_op(nir_op op)
4248 {
4249 switch (op) {
4250 case nir_op_iadd: return ELK_CONDITIONAL_NONE;
4251 case nir_op_fadd: return ELK_CONDITIONAL_NONE;
4252 case nir_op_imul: return ELK_CONDITIONAL_NONE;
4253 case nir_op_fmul: return ELK_CONDITIONAL_NONE;
4254 case nir_op_imin: return ELK_CONDITIONAL_L;
4255 case nir_op_umin: return ELK_CONDITIONAL_L;
4256 case nir_op_fmin: return ELK_CONDITIONAL_L;
4257 case nir_op_imax: return ELK_CONDITIONAL_GE;
4258 case nir_op_umax: return ELK_CONDITIONAL_GE;
4259 case nir_op_fmax: return ELK_CONDITIONAL_GE;
4260 case nir_op_iand: return ELK_CONDITIONAL_NONE;
4261 case nir_op_ior: return ELK_CONDITIONAL_NONE;
4262 case nir_op_ixor: return ELK_CONDITIONAL_NONE;
4263 default:
4264 unreachable("Invalid reduction operation");
4265 }
4266 }
4267
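/* State used to rebuild a uniform resource index outside of divergent
 * control flow: add_rebuild_src() below walks the use-def chain depth-first
 * so that "array" ends up listing each nir_def after all of its
 * dependencies.
 */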
4268 struct rebuild_resource {
4269 unsigned idx;
4270 std::vector<nir_def *> array;
4271 };
4272
4273 static bool
4274 add_rebuild_src(nir_src *src, void *state)
4275 {
4276 struct rebuild_resource *res = (struct rebuild_resource *) state;
4277
4278 for (nir_def *def : res->array) {
4279 if (def == src->ssa)
4280 return true;
4281 }
4282
4283 nir_foreach_src(src->ssa->parent_instr, add_rebuild_src, state);
4284 res->array.push_back(src->ssa);
4285 return true;
4286 }
4287
4288 static elk_fs_reg
4289 try_rebuild_resource(nir_to_elk_state &ntb, const elk::fs_builder &bld, nir_def *resource_def)
4290 {
4291 /* Create a builder at the location of the resource_intel intrinsic */
4292 fs_builder ubld8 = bld.exec_all().group(8, 0);
4293
4294 struct rebuild_resource resources = {};
4295 resources.idx = 0;
4296
4297 if (!nir_foreach_src(resource_def->parent_instr,
4298 add_rebuild_src, &resources))
4299 return elk_fs_reg();
4300 resources.array.push_back(resource_def);
4301
4302 if (resources.array.size() == 1) {
4303 nir_def *def = resources.array[0];
4304
4305 if (def->parent_instr->type == nir_instr_type_load_const) {
4306 nir_load_const_instr *load_const =
4307 nir_instr_as_load_const(def->parent_instr);
4308 return elk_imm_ud(load_const->value[0].i32);
4309 } else {
4310 assert(def->parent_instr->type == nir_instr_type_intrinsic &&
4311 (nir_instr_as_intrinsic(def->parent_instr)->intrinsic ==
4312 nir_intrinsic_load_uniform));
4313 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(def->parent_instr);
4314 unsigned base_offset = nir_intrinsic_base(intrin);
4315 unsigned load_offset = nir_src_as_uint(intrin->src[0]);
4316 elk_fs_reg src(UNIFORM, base_offset / 4, ELK_REGISTER_TYPE_UD);
4317 src.offset = load_offset + base_offset % 4;
4318 return src;
4319 }
4320 }
4321
4322 for (unsigned i = 0; i < resources.array.size(); i++) {
4323 nir_def *def = resources.array[i];
4324
4325 nir_instr *instr = def->parent_instr;
4326 switch (instr->type) {
4327 case nir_instr_type_load_const: {
4328 nir_load_const_instr *load_const =
4329 nir_instr_as_load_const(instr);
4330 elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4331 ntb.resource_insts[def->index] =
4332 ubld8.MOV(dst, elk_imm_ud(load_const->value[0].i32));
4333 break;
4334 }
4335
4336 case nir_instr_type_alu: {
4337 nir_alu_instr *alu = nir_instr_as_alu(instr);
4338
4339 if (nir_op_infos[alu->op].num_inputs == 2) {
4340 if (alu->src[0].swizzle[0] != 0 ||
4341 alu->src[1].swizzle[0] != 0)
4342 break;
4343 } else if (nir_op_infos[alu->op].num_inputs == 3) {
4344 if (alu->src[0].swizzle[0] != 0 ||
4345 alu->src[1].swizzle[0] != 0 ||
4346 alu->src[2].swizzle[0] != 0)
4347 break;
4348 } else {
4349 /* Not supported ALU input count */
4350 break;
4351 }
4352
4353 switch (alu->op) {
4354 case nir_op_iadd: {
4355 elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4356 elk_fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst;
4357 elk_fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst;
4358 assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
4359 assert(src0.type == ELK_REGISTER_TYPE_UD);
4360 ntb.resource_insts[def->index] =
4361 ubld8.ADD(dst,
4362 src0.file != IMM ? src0 : src1,
4363 src0.file != IMM ? src1 : src0);
4364 break;
4365 }
4366 case nir_op_ushr: {
4367 elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4368 elk_fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst;
4369 elk_fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst;
4370 assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
4371 assert(src0.type == ELK_REGISTER_TYPE_UD);
4372 ntb.resource_insts[def->index] = ubld8.SHR(dst, src0, src1);
4373 break;
4374 }
4375 case nir_op_ishl: {
4376 elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4377 elk_fs_reg src0 = ntb.resource_insts[alu->src[0].src.ssa->index]->dst;
4378 elk_fs_reg src1 = ntb.resource_insts[alu->src[1].src.ssa->index]->dst;
4379 assert(src0.file != BAD_FILE && src1.file != BAD_FILE);
4380 assert(src0.type == ELK_REGISTER_TYPE_UD);
4381 ntb.resource_insts[def->index] = ubld8.SHL(dst, src0, src1);
4382 break;
4383 }
4384 case nir_op_mov: {
4385 break;
4386 }
4387 default:
4388 break;
4389 }
4390 break;
4391 }
4392
4393 case nir_instr_type_intrinsic: {
4394 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
4395 switch (intrin->intrinsic) {
4396 case nir_intrinsic_resource_intel:
4397 ntb.resource_insts[def->index] =
4398 ntb.resource_insts[intrin->src[1].ssa->index];
4399 break;
4400
4401 case nir_intrinsic_load_uniform: {
4402 if (!nir_src_is_const(intrin->src[0]))
4403 break;
4404
4405 unsigned base_offset = nir_intrinsic_base(intrin);
4406 unsigned load_offset = nir_src_as_uint(intrin->src[0]);
4407 elk_fs_reg dst = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
4408 elk_fs_reg src(UNIFORM, base_offset / 4, ELK_REGISTER_TYPE_UD);
4409 src.offset = load_offset + base_offset % 4;
4410 ntb.resource_insts[def->index] = ubld8.MOV(dst, src);
4411 break;
4412 }
4413
4414 default:
4415 break;
4416 }
4417 break;
4418 }
4419
4420 default:
4421 break;
4422 }
4423
4424 if (ntb.resource_insts[def->index] == NULL)
4425 return elk_fs_reg();
4426 }
4427
4428 assert(ntb.resource_insts[resource_def->index] != NULL);
4429 return component(ntb.resource_insts[resource_def->index]->dst, 0);
4430 }
4431
4432 static elk_fs_reg
4433 get_nir_image_intrinsic_image(nir_to_elk_state &ntb, const elk::fs_builder &bld,
4434 nir_intrinsic_instr *instr)
4435 {
4436 if (is_resource_src(instr->src[0])) {
4437 elk_fs_reg surf_index = get_resource_nir_src(ntb, instr->src[0]);
4438 if (surf_index.file != BAD_FILE)
4439 return surf_index;
4440 }
4441
4442 elk_fs_reg image = retype(get_nir_src_imm(ntb, instr->src[0]), ELK_REGISTER_TYPE_UD);
4443 elk_fs_reg surf_index = image;
4444
4445 return bld.emit_uniformize(surf_index);
4446 }
4447
4448 static elk_fs_reg
4449 get_nir_buffer_intrinsic_index(nir_to_elk_state &ntb, const elk::fs_builder &bld,
4450 nir_intrinsic_instr *instr)
4451 {
4452 /* SSBO stores are weird in that their index is in src[1] */
4453 const bool is_store =
4454 instr->intrinsic == nir_intrinsic_store_ssbo ||
4455 instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
4456 nir_src src = is_store ? instr->src[1] : instr->src[0];
4457
4458 if (nir_src_is_const(src)) {
4459 return elk_imm_ud(nir_src_as_uint(src));
4460 } else if (is_resource_src(src)) {
4461 elk_fs_reg surf_index = get_resource_nir_src(ntb, src);
4462 if (surf_index.file != BAD_FILE)
4463 return surf_index;
4464 }
4465 return bld.emit_uniformize(get_nir_src(ntb, src));
4466 }
4467
4468 /**
4469 * The offsets we get from NIR act as if each SIMD channel has its own blob
4470 * of contiguous space. However, if we actually place each SIMD channel in
4471 * its own space, we end up with terrible cache performance because each SIMD
4472 * channel accesses a different cache line even when they're all accessing the
4473 * same byte offset. To deal with this problem, we swizzle the address using
4474 * a simple algorithm which ensures that any time a SIMD message reads or
4475 * writes the same address, it's all in the same cache line. We have to keep
4476 * the bottom two bits fixed so that we can read/write up to a dword at a time
4477 * and the individual element is contiguous. We do this by splitting the
4478 * address as follows:
4479 *
4480 * 31 4-6 2 0
4481 * +-------------------------------+------------+----------+
4482 * | Hi address bits | chan index | addr low |
4483 * +-------------------------------+------------+----------+
4484 *
4485 * In other words, the bottom two address bits stay, and the top 30 get
4486 * shifted up so that we can stick the SIMD channel index in the middle. This
4487 * way, we can access 8-, 16-, or 32-bit elements and, when accessing a 32-bit
4488 * element at the same logical offset, the scratch read/write instruction acts
4489 * on contiguous elements and we get good cache locality.
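 *
 * Worked example (following the code below): with a SIMD16 dispatch
 * (chan_index_bits = 4), a byte address of 0x24 accessed by channel 3
 * becomes ((0x24 & ~3) << 4) | (3 << 2) | (0x24 & 3) = 0x24c.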
4490 */
4491 static elk_fs_reg
4492 swizzle_nir_scratch_addr(nir_to_elk_state &ntb,
4493 const elk::fs_builder &bld,
4494 const elk_fs_reg &nir_addr,
4495 bool in_dwords)
4496 {
4497 elk_fs_visitor &s = ntb.s;
4498
4499 const elk_fs_reg &chan_index =
4500 ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
4501 const unsigned chan_index_bits = ffs(s.dispatch_width) - 1;
4502
4503 elk_fs_reg addr = bld.vgrf(ELK_REGISTER_TYPE_UD);
4504 if (in_dwords) {
4505 /* In this case, we know the address is aligned to a DWORD and we want
4506 * the final address in DWORDs.
4507 */
4508 bld.SHL(addr, nir_addr, elk_imm_ud(chan_index_bits - 2));
4509 bld.OR(addr, addr, chan_index);
4510 } else {
4511 /* This case is substantially more annoying because we have to pay
4512 * attention to those pesky two bottom bits.
4513 */
4514 elk_fs_reg addr_hi = bld.vgrf(ELK_REGISTER_TYPE_UD);
4515 bld.AND(addr_hi, nir_addr, elk_imm_ud(~0x3u));
4516 bld.SHL(addr_hi, addr_hi, elk_imm_ud(chan_index_bits));
4517 elk_fs_reg chan_addr = bld.vgrf(ELK_REGISTER_TYPE_UD);
4518 bld.SHL(chan_addr, chan_index, elk_imm_ud(2));
4519 bld.AND(addr, nir_addr, elk_imm_ud(0x3u));
4520 bld.OR(addr, addr, addr_hi);
4521 bld.OR(addr, addr, chan_addr);
4522 }
4523 return addr;
4524 }
4525
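/* Pick the largest OWord block size, in dwords, that the remaining length
 * can fill: 64-dword blocks require LSC, otherwise 32, 16 or 8 dwords.
 */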
4526 static unsigned
4527 choose_oword_block_size_dwords(const struct intel_device_info *devinfo,
4528 unsigned dwords)
4529 {
4530 unsigned block;
4531 if (devinfo->has_lsc && dwords >= 64) {
4532 block = 64;
4533 } else if (dwords >= 32) {
4534 block = 32;
4535 } else if (dwords >= 16) {
4536 block = 16;
4537 } else {
4538 block = 8;
4539 }
4540 assert(block <= dwords);
4541 return block;
4542 }
4543
4544 static void
4545 increment_a64_address(const fs_builder &bld, elk_fs_reg address, uint32_t v)
4546 {
4547 if (bld.shader->devinfo->has_64bit_int) {
4548 bld.ADD(address, address, elk_imm_ud(v));
4549 } else {
4550 elk_fs_reg low = retype(address, ELK_REGISTER_TYPE_UD);
4551 elk_fs_reg high = offset(low, bld, 1);
4552
4553 /* Add low and if that overflows, add carry to high. */
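/* E.g. low = 0xfffffff0 and v = 0x20: the first ADD wraps to 0x10 and
 * raises the overflow condition, so the predicated ADD bumps high by one.
 */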
4554 bld.ADD(low, low, elk_imm_ud(v))->conditional_mod = ELK_CONDITIONAL_O;
4555 bld.ADD(high, high, elk_imm_ud(0x1))->predicate = ELK_PREDICATE_NORMAL;
4556 }
4557 }
4558
4559 static elk_fs_reg
4560 emit_fence(const fs_builder &bld, enum elk_opcode opcode,
4561 uint8_t sfid, uint32_t desc,
4562 bool commit_enable, uint8_t bti)
4563 {
4564 assert(opcode == ELK_SHADER_OPCODE_INTERLOCK ||
4565 opcode == ELK_SHADER_OPCODE_MEMORY_FENCE);
4566
4567 elk_fs_reg dst = bld.vgrf(ELK_REGISTER_TYPE_UD);
4568 elk_fs_inst *fence = bld.emit(opcode, dst, elk_vec8_grf(0, 0),
4569 elk_imm_ud(commit_enable),
4570 elk_imm_ud(bti));
4571 fence->sfid = sfid;
4572 fence->desc = desc;
4573
4574 return dst;
4575 }
4576
4577 /**
4578 * Create a MOV to read the timestamp register.
4579 */
4580 static elk_fs_reg
4581 get_timestamp(const fs_builder &bld)
4582 {
4583 elk_fs_visitor &s = *bld.shader;
4584 const intel_device_info *devinfo = s.devinfo;
4585
4586 assert(devinfo->ver >= 7);
4587
4588 elk_fs_reg ts = elk_fs_reg(retype(elk_vec4_reg(ELK_ARCHITECTURE_REGISTER_FILE,
4589 ELK_ARF_TIMESTAMP,
4590 0),
4591 ELK_REGISTER_TYPE_UD));
4592
4593 elk_fs_reg dst = elk_fs_reg(VGRF, s.alloc.allocate(1), ELK_REGISTER_TYPE_UD);
4594
4595 /* We want to read the 3 fields we care about even if it's not enabled in
4596 * the dispatch.
4597 */
4598 bld.group(4, 0).exec_all().MOV(dst, ts);
4599
4600 return dst;
4601 }
4602
4603 static void
4604 fs_nir_emit_intrinsic(nir_to_elk_state &ntb,
4605 const fs_builder &bld, nir_intrinsic_instr *instr)
4606 {
4607 const intel_device_info *devinfo = ntb.devinfo;
4608 elk_fs_visitor &s = ntb.s;
4609
4610 /* We handle this as a special case */
4611 if (instr->intrinsic == nir_intrinsic_decl_reg) {
4612 assert(nir_intrinsic_num_array_elems(instr) == 0);
4613 unsigned bit_size = nir_intrinsic_bit_size(instr);
4614 unsigned num_components = nir_intrinsic_num_components(instr);
4615 const elk_reg_type reg_type =
4616 elk_reg_type_from_bit_size(bit_size, bit_size == 8 ?
4617 ELK_REGISTER_TYPE_D :
4618 ELK_REGISTER_TYPE_F);
4619
4620 /* Re-use the destination's slot in the table for the register */
4621 ntb.ssa_values[instr->def.index] =
4622 bld.vgrf(reg_type, num_components);
4623 return;
4624 }
4625
4626 elk_fs_reg dest;
4627 if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4628 dest = get_nir_def(ntb, instr->def);
4629
4630 switch (instr->intrinsic) {
4631 case nir_intrinsic_resource_intel:
4632 ntb.ssa_bind_infos[instr->def.index].valid = true;
4633 ntb.ssa_bind_infos[instr->def.index].bindless =
4634 (nir_intrinsic_resource_access_intel(instr) &
4635 nir_resource_intel_bindless) != 0;
4636 ntb.ssa_bind_infos[instr->def.index].block =
4637 nir_intrinsic_resource_block_intel(instr);
4638 ntb.ssa_bind_infos[instr->def.index].set =
4639 nir_intrinsic_desc_set(instr);
4640 ntb.ssa_bind_infos[instr->def.index].binding =
4641 nir_intrinsic_binding(instr);
4642
4643 if (nir_intrinsic_resource_access_intel(instr) &
4644 nir_resource_intel_non_uniform) {
4645 ntb.resource_values[instr->def.index] = elk_fs_reg();
4646 } else {
4647 ntb.resource_values[instr->def.index] =
4648 try_rebuild_resource(ntb, bld, instr->src[1].ssa);
4649 }
4650 ntb.ssa_values[instr->def.index] =
4651 ntb.ssa_values[instr->src[1].ssa->index];
4652 break;
4653
4654 case nir_intrinsic_load_reg:
4655 case nir_intrinsic_store_reg:
4656 /* Nothing to do with these. */
4657 break;
4658
4659 case nir_intrinsic_image_load:
4660 case nir_intrinsic_image_store:
4661 case nir_intrinsic_image_atomic:
4662 case nir_intrinsic_image_atomic_swap:
4663 case nir_intrinsic_bindless_image_load:
4664 case nir_intrinsic_bindless_image_store:
4665 case nir_intrinsic_bindless_image_atomic:
4666 case nir_intrinsic_bindless_image_atomic_swap: {
4667 /* Get some metadata from the image intrinsic. */
4668 const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
4669
4670 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4671
4672 switch (instr->intrinsic) {
4673 case nir_intrinsic_image_load:
4674 case nir_intrinsic_image_store:
4675 case nir_intrinsic_image_atomic:
4676 case nir_intrinsic_image_atomic_swap:
4677 srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4678 get_nir_image_intrinsic_image(ntb, bld, instr);
4679 break;
4680
4681 default:
4682 /* Bindless */
4683 srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] =
4684 get_nir_image_intrinsic_image(ntb, bld, instr);
4685 break;
4686 }
4687
4688 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
4689 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] =
4690 elk_imm_ud(nir_image_intrinsic_coord_components(instr));
4691
4692 /* Emit an image load, store or atomic op. */
4693 if (instr->intrinsic == nir_intrinsic_image_load ||
4694 instr->intrinsic == nir_intrinsic_bindless_image_load) {
4695 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4696 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
4697 elk_fs_inst *inst =
4698 bld.emit(ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
4699 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4700 inst->size_written = instr->num_components * s.dispatch_width * 4;
4701 } else if (instr->intrinsic == nir_intrinsic_image_store ||
4702 instr->intrinsic == nir_intrinsic_bindless_image_store) {
4703 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4704 srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(ntb, instr->src[3]);
4705 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
4706 bld.emit(ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
4707 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4708 } else {
4709 unsigned num_srcs = info->num_srcs;
4710 enum elk_lsc_opcode op = elk_lsc_aop_for_nir_intrinsic(instr);
4711 if (op == LSC_OP_ATOMIC_INC || op == LSC_OP_ATOMIC_DEC) {
4712 assert(num_srcs == 4);
4713 num_srcs = 3;
4714 }
4715
4716 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(op);
4717
4718 elk_fs_reg data;
4719 if (num_srcs >= 4)
4720 data = get_nir_src(ntb, instr->src[3]);
4721 if (num_srcs >= 5) {
4722 elk_fs_reg tmp = bld.vgrf(data.type, 2);
4723 elk_fs_reg sources[2] = { data, get_nir_src(ntb, instr->src[4]) };
4724 bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
4725 data = tmp;
4726 }
4727 srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4728 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
4729
4730 bld.emit(ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
4731 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4732 }
4733 break;
4734 }
4735
4736 case nir_intrinsic_image_size:
4737 case nir_intrinsic_bindless_image_size: {
4738 /* Cube image sizes should have previously been lowered to a 2D array */
4739 assert(nir_intrinsic_image_dim(instr) != GLSL_SAMPLER_DIM_CUBE);
4740
4741 /* Unlike the [un]typed load and store opcodes, the TXS that this turns
4742 * into will handle the binding table index for us in the generator.
4743 * Incidentally, this means that we can handle bindless with exactly the
4744 * same code.
4745 */
4746 elk_fs_reg image = retype(get_nir_src_imm(ntb, instr->src[0]),
4747 ELK_REGISTER_TYPE_UD);
4748 image = bld.emit_uniformize(image);
4749
4750 assert(nir_src_as_uint(instr->src[1]) == 0);
4751
4752 elk_fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
4753 if (instr->intrinsic == nir_intrinsic_image_size)
4754 srcs[TEX_LOGICAL_SRC_SURFACE] = image;
4755 else
4756 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image;
4757 srcs[TEX_LOGICAL_SRC_SAMPLER] = elk_imm_d(0);
4758 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = elk_imm_d(0);
4759 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = elk_imm_d(0);
4760 srcs[TEX_LOGICAL_SRC_RESIDENCY] = elk_imm_d(0);
4761
4762 /* Since the image size is always uniform, we can just emit a SIMD8
4763 * query instruction and splat the result out.
4764 */
4765 const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
4766
4767 elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD, 4);
4768 elk_fs_inst *inst = ubld.emit(ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL,
4769 tmp, srcs, ARRAY_SIZE(srcs));
4770 inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
4771
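      /* Splat component c of the uniform SIMD8 result into every channel of
       * the destination.
       */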
4772 for (unsigned c = 0; c < instr->def.num_components; ++c) {
4773 bld.MOV(offset(retype(dest, tmp.type), bld, c),
4774 component(offset(tmp, ubld, c), 0));
4775 }
4776 break;
4777 }
4778
4779 case nir_intrinsic_image_load_raw_intel: {
4780 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4781 srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4782 get_nir_image_intrinsic_image(ntb, bld, instr);
4783 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
4784 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
4785 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4786 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
4787
4788 elk_fs_inst *inst =
4789 bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4790 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4791 inst->size_written = instr->num_components * s.dispatch_width * 4;
4792 break;
4793 }
4794
4795 case nir_intrinsic_image_store_raw_intel: {
4796 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4797 srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4798 get_nir_image_intrinsic_image(ntb, bld, instr);
4799 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
4800 srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(ntb, instr->src[2]);
4801 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
4802 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
4803 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
4804
4805 bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
4806 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4807 break;
4808 }
4809
4810 case nir_intrinsic_barrier:
4811 case nir_intrinsic_begin_invocation_interlock:
4812 case nir_intrinsic_end_invocation_interlock: {
4813 bool ugm_fence, slm_fence, tgm_fence, urb_fence;
4814 enum elk_opcode opcode = ELK_OPCODE_NOP;
4815
4816 /* Handling interlock intrinsics here will allow the logic for IVB
4817 * render cache (see below) to be reused.
4818 */
4819
4820 switch (instr->intrinsic) {
4821 case nir_intrinsic_barrier: {
4822 /* Note we only care about the memory part of the
4823 * barrier. The execution part will be taken care
4824 * of by the stage-specific intrinsic handler functions.
4825 */
4826 nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
4827 ugm_fence = modes & (nir_var_mem_ssbo | nir_var_mem_global);
4828 slm_fence = modes & nir_var_mem_shared;
4829 tgm_fence = modes & nir_var_image;
4830 urb_fence = modes & (nir_var_shader_out | nir_var_mem_task_payload);
4831 if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
4832 opcode = ELK_SHADER_OPCODE_MEMORY_FENCE;
4833 break;
4834 }
4835
4836 case nir_intrinsic_begin_invocation_interlock:
4837 /* For beginInvocationInterlockARB(), we will generate a memory fence
4838 * but with a different opcode so that generator can pick SENDC
4839 * instead of SEND.
4840 */
4841 assert(s.stage == MESA_SHADER_FRAGMENT);
4842 ugm_fence = tgm_fence = true;
4843 slm_fence = urb_fence = false;
4844 opcode = ELK_SHADER_OPCODE_INTERLOCK;
4845 break;
4846
4847 case nir_intrinsic_end_invocation_interlock:
4848 /* For endInvocationInterlockARB(), we need to insert a memory fence which
4849 * stalls in the shader until the memory transactions prior to that
4850 * fence are complete. This ensures that the shader does not end before
4851 * any writes from its critical section have landed. Otherwise, you can
4852 * end up with a case where the next invocation on that pixel properly
4853 * stalls for the previous FS invocation on its pixel to complete but
4854 * doesn't actually wait for the dataport memory transactions from that
4855 * thread to land before submitting its own.
4856 */
4857 assert(s.stage == MESA_SHADER_FRAGMENT);
4858 ugm_fence = tgm_fence = true;
4859 slm_fence = urb_fence = false;
4860 opcode = ELK_SHADER_OPCODE_MEMORY_FENCE;
4861 break;
4862
4863 default:
4864 unreachable("invalid intrinsic");
4865 }
4866
4867 if (opcode == ELK_OPCODE_NOP)
4868 break;
4869
4870 if (s.nir->info.shared_size > 0) {
4871 assert(gl_shader_stage_uses_workgroup(s.stage));
4872 } else {
4873 slm_fence = false;
4874 }
4875
4876 /* If the workgroup fits in a single HW thread, the messages for SLM are
4877 * processed in-order and the shader itself is already synchronized so
4878 * the memory fence is not necessary.
4879 *
4880 * TODO: Check whether this also applies when many HW threads share the same Data Port.
4881 */
4882 if (!s.nir->info.workgroup_size_variable &&
4883 slm_fence && s.workgroup_size() <= s.dispatch_width)
4884 slm_fence = false;
4885
4886 switch (s.stage) {
4887 case MESA_SHADER_TESS_CTRL:
4888 break;
4889 default:
4890 urb_fence = false;
4891 break;
4892 }
4893
4894 unsigned fence_regs_count = 0;
4895 elk_fs_reg fence_regs[4] = {};
4896
4897 const fs_builder ubld = bld.group(8, 0);
4898
4899 /* Prior to Icelake, they're all lumped into a single cache except on
4900 * Ivy Bridge and Bay Trail where typed messages actually go through
4901 * the render cache. There, we need both fences because we may
4902 * access storage images as either typed or untyped.
4903 */
4904 const bool render_fence = tgm_fence && devinfo->verx10 == 70;
4905
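      /* With commit enable set, the fence message writes back to its
       * destination once the fence has committed, giving us a register we
       * can stall on below.
       */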
4906 const bool commit_enable = render_fence ||
4907 instr->intrinsic == nir_intrinsic_end_invocation_interlock;
4908
4909 if (tgm_fence || ugm_fence || slm_fence || urb_fence) {
4910 fence_regs[fence_regs_count++] =
4911 emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 0,
4912 commit_enable, 0 /* BTI */);
4913 }
4914
4915 if (render_fence) {
4916 fence_regs[fence_regs_count++] =
4917 emit_fence(ubld, opcode, GFX6_SFID_DATAPORT_RENDER_CACHE, 0,
4918 commit_enable, /* bti */ 0);
4919 }
4920
4921 assert(fence_regs_count <= ARRAY_SIZE(fence_regs));
4922
4923 /* There are three cases where we want to insert a stall:
4924 *
4925 * 1. If we're a nir_intrinsic_end_invocation_interlock. This is
4926 * required to ensure that the shader EOT doesn't happen until
4927 * after the fence returns. Otherwise, we might end up with the
4928 * next shader invocation for that pixel not respecting our fence
4929 * because it may happen on a different HW thread.
4930 *
4931 * 2. If we have multiple fences. This is required to ensure that
4932 * they all complete and nothing gets weirdly out-of-order.
4933 *
4934 * 3. If we have no fences. In this case, we need at least a
4935 * scheduling barrier to keep the compiler from moving things
4936 * around in an invalid way.
4937 */
4938 if (instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
4939 fence_regs_count != 1) {
4940 ubld.exec_all().group(1, 0).emit(
4941 ELK_FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
4942 fence_regs, fence_regs_count);
4943 }
4944
4945 break;
4946 }
4947
4948 case nir_intrinsic_shader_clock: {
4949 /* We cannot do anything if there is an event, so ignore it for now */
4950 const elk_fs_reg shader_clock = get_timestamp(bld);
4951 const elk_fs_reg srcs[] = { component(shader_clock, 0),
4952 component(shader_clock, 1) };
4953 bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
4954 break;
4955 }
4956
4957 case nir_intrinsic_load_reloc_const_intel: {
4958 uint32_t id = nir_intrinsic_param_idx(instr);
4959
4960 /* Emit the reloc in the smallest SIMD size to limit register usage. */
4961 const fs_builder ubld = bld.exec_all().group(1, 0);
4962 elk_fs_reg small_dest = ubld.vgrf(dest.type);
4963 ubld.UNDEF(small_dest);
4964 ubld.exec_all().group(1, 0).emit(ELK_SHADER_OPCODE_MOV_RELOC_IMM,
4965 small_dest, elk_imm_ud(id));
4966
4967 /* Copy propagation will get rid of this MOV. */
4968 bld.MOV(dest, component(small_dest, 0));
4969 break;
4970 }
4971
4972 case nir_intrinsic_load_uniform: {
4973 /* Offsets are in bytes but they should always be aligned to
4974 * the type size
4975 */
4976 unsigned base_offset = nir_intrinsic_base(instr);
4977 assert(base_offset % 4 == 0 || base_offset % type_sz(dest.type) == 0);
4978
4979 elk_fs_reg src(UNIFORM, base_offset / 4, dest.type);
4980
4981 if (nir_src_is_const(instr->src[0])) {
4982 unsigned load_offset = nir_src_as_uint(instr->src[0]);
4983 assert(load_offset % type_sz(dest.type) == 0);
4984 /* The base offset can only handle 32-bit units, so for 16-bit
4985 * data take the base offset modulo 4 bytes and add it to the load
4986 * offset to read from within the source register.
4987 */
4988 src.offset = load_offset + base_offset % 4;
4989
4990 for (unsigned j = 0; j < instr->num_components; j++) {
4991 bld.MOV(offset(dest, bld, j), offset(src, bld, j));
4992 }
4993 } else {
4994 elk_fs_reg indirect = retype(get_nir_src(ntb, instr->src[0]),
4995 ELK_REGISTER_TYPE_UD);
4996
4997 /* We need to pass a size to the MOV_INDIRECT but we don't want it to
4998 * go past the end of the uniform. In order to keep the n'th
4999 * component from running past, we subtract off the size of all but
5000 * one component of the vector.
5001 */
5002 assert(nir_intrinsic_range(instr) >=
5003 instr->num_components * type_sz(dest.type));
5004 unsigned read_size = nir_intrinsic_range(instr) -
5005 (instr->num_components - 1) * type_sz(dest.type);
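         /* For example, a 32-bit vec4 load with a 32-byte range gives
          * read_size = 32 - 3 * 4 = 20 bytes for each component's indirect
          * read.
          */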
5006
5007 bool supports_64bit_indirects = devinfo->platform != INTEL_PLATFORM_CHV;
5008
5009 if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
5010 for (unsigned j = 0; j < instr->num_components; j++) {
5011 bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT,
5012 offset(dest, bld, j), offset(src, bld, j),
5013 indirect, elk_imm_ud(read_size));
5014 }
5015 } else {
5016 const unsigned num_mov_indirects =
5017 type_sz(dest.type) / type_sz(ELK_REGISTER_TYPE_UD);
5018 /* We read a little bit less per MOV INDIRECT, as they are now
5019 * 32-bit ones instead of 64-bit. Adjust read_size accordingly.
5020 */
5021 const unsigned read_size_32bit = read_size -
5022 (num_mov_indirects - 1) * type_sz(ELK_REGISTER_TYPE_UD);
5023 for (unsigned j = 0; j < instr->num_components; j++) {
5024 for (unsigned i = 0; i < num_mov_indirects; i++) {
5025 bld.emit(ELK_SHADER_OPCODE_MOV_INDIRECT,
5026 subscript(offset(dest, bld, j), ELK_REGISTER_TYPE_UD, i),
5027 subscript(offset(src, bld, j), ELK_REGISTER_TYPE_UD, i),
5028 indirect, elk_imm_ud(read_size_32bit));
5029 }
5030 }
5031 }
5032 }
5033 break;
5034 }
5035
5036 case nir_intrinsic_load_ubo:
5037 case nir_intrinsic_load_ubo_uniform_block_intel: {
5038 elk_fs_reg surface, surface_handle;
5039
5040 if (get_nir_src_bindless(ntb, instr->src[0]))
5041 surface_handle = get_nir_buffer_intrinsic_index(ntb, bld, instr);
5042 else
5043 surface = get_nir_buffer_intrinsic_index(ntb, bld, instr);
5044
5045 if (!nir_src_is_const(instr->src[1])) {
5046 if (instr->intrinsic == nir_intrinsic_load_ubo) {
5047 /* load_ubo with non-uniform offset */
5048 elk_fs_reg base_offset = retype(get_nir_src(ntb, instr->src[1]),
5049 ELK_REGISTER_TYPE_UD);
5050
5051 const unsigned comps_per_load = type_sz(dest.type) == 8 ? 2 : 4;
5052
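            /* Each pull constant load fetches 16 bytes, i.e. a vec4 of
             * 32-bit values or a vec2 of 64-bit values.
             */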
5053 for (int i = 0; i < instr->num_components; i += comps_per_load) {
5054 const unsigned remaining = instr->num_components - i;
5055 s.VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i),
5056 surface, surface_handle,
5057 base_offset,
5058 i * type_sz(dest.type),
5059 instr->def.bit_size / 8,
5060 MIN2(remaining, comps_per_load));
5061 }
5062
5063 s.prog_data->has_ubo_pull = true;
5064 } else {
5065 /* load_ubo with uniform offset */
5066 const fs_builder ubld1 = bld.exec_all().group(1, 0);
5067 const fs_builder ubld8 = bld.exec_all().group(8, 0);
5068 const fs_builder ubld16 = bld.exec_all().group(16, 0);
5069
5070 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5071
5072 srcs[SURFACE_LOGICAL_SRC_SURFACE] = surface;
5073 srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = surface_handle;
5074
5075 const nir_src load_offset = instr->src[1];
5076 if (nir_src_is_const(load_offset)) {
5077 elk_fs_reg addr = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
5078 ubld8.MOV(addr, elk_imm_ud(nir_src_as_uint(load_offset)));
5079 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = component(addr, 0);
5080 } else {
5081 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5082 bld.emit_uniformize(get_nir_src(ntb, load_offset));
5083 }
5084
5085 const unsigned total_dwords =
5086 ALIGN(instr->num_components, REG_SIZE * reg_unit(devinfo) / 4);
5087 unsigned loaded_dwords = 0;
5088
5089 const elk_fs_reg packed_consts =
5090 ubld1.vgrf(ELK_REGISTER_TYPE_UD, total_dwords);
5091
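            /* Load the constants in the largest OWord blocks the remaining
             * size allows, bumping the address by the block size after each
             * read.
             */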
5092 while (loaded_dwords < total_dwords) {
5093 const unsigned block =
5094 choose_oword_block_size_dwords(devinfo,
5095 total_dwords - loaded_dwords);
5096 const unsigned block_bytes = block * 4;
5097
5098 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(block);
5099
5100 const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
5101 ubld.emit(ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
5102 retype(byte_offset(packed_consts, loaded_dwords * 4), ELK_REGISTER_TYPE_UD),
5103 srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written =
5104 align(block_bytes, REG_SIZE * reg_unit(devinfo));
5105
5106 loaded_dwords += block;
5107
5108 ubld1.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5109 srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5110 elk_imm_ud(block_bytes));
5111 }
5112
5113 for (unsigned c = 0; c < instr->num_components; c++) {
5114 bld.MOV(retype(offset(dest, bld, c), ELK_REGISTER_TYPE_UD),
5115 component(packed_consts, c));
5116 }
5117
5118 s.prog_data->has_ubo_pull = true;
5119 }
5120 } else {
5121 /* Even if we are loading doubles, a pull constant load will load
5122 * a 32-bit vec4, so should only reserve vgrf space for that. If we
5123 * need to load a full dvec4 we will have to emit 2 loads. This is
5124 * similar to demote_pull_constants(), except that in that case we
5125 * see individual accesses to each component of the vector and then
5126 * we let CSE deal with duplicate loads. Here we see a vector access
5127 * and we have to split it if necessary.
5128 */
5129 const unsigned type_size = type_sz(dest.type);
5130 const unsigned load_offset = nir_src_as_uint(instr->src[1]);
5131 const unsigned ubo_block =
5132 elk_nir_ubo_surface_index_get_push_block(instr->src[0]);
5133 const unsigned offset_256b = load_offset / 32;
5134 const unsigned end_256b =
5135 DIV_ROUND_UP(load_offset + type_size * instr->num_components, 32);
5136
5137 /* See if we've selected this as a push constant candidate */
5138 elk_fs_reg push_reg;
5139 for (int i = 0; i < 4; i++) {
5140 const struct elk_ubo_range *range = &s.prog_data->ubo_ranges[i];
5141 if (range->block == ubo_block &&
5142 offset_256b >= range->start &&
5143 end_256b <= range->start + range->length) {
5144
5145 push_reg = elk_fs_reg(UNIFORM, UBO_START + i, dest.type);
5146 push_reg.offset = load_offset - 32 * range->start;
5147 break;
5148 }
5149 }
5150
5151 if (push_reg.file != BAD_FILE) {
5152 for (unsigned i = 0; i < instr->num_components; i++) {
5153 bld.MOV(offset(dest, bld, i),
5154 byte_offset(push_reg, i * type_size));
5155 }
5156 break;
5157 }
5158
5159 s.prog_data->has_ubo_pull = true;
5160
5161 const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
5162 const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
5163
5164 for (unsigned c = 0; c < instr->num_components;) {
5165 const unsigned base = load_offset + c * type_size;
5166 /* Number of usable components in the next block-aligned load. */
5167 const unsigned count = MIN2(instr->num_components - c,
5168 (block_sz - base % block_sz) / type_size);
5169
5170 const elk_fs_reg packed_consts = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5171 elk_fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
5172 srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = surface;
5173 srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
5174 srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = elk_imm_ud(base & ~(block_sz - 1));
5175 srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = elk_imm_ud(block_sz);
5176
5177 ubld.emit(ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
5178 srcs, PULL_UNIFORM_CONSTANT_SRCS);
5179
5180 const elk_fs_reg consts =
5181 retype(byte_offset(packed_consts, base & (block_sz - 1)),
5182 dest.type);
5183
5184 for (unsigned d = 0; d < count; d++)
5185 bld.MOV(offset(dest, bld, c + d), component(consts, d));
5186
5187 c += count;
5188 }
5189 }
5190 break;
5191 }
5192
5193 case nir_intrinsic_load_global:
5194 case nir_intrinsic_load_global_constant: {
5195 assert(devinfo->ver >= 8);
5196
5197 assert(instr->def.bit_size <= 32);
5198 assert(nir_intrinsic_align(instr) > 0);
5199 elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
5200 srcs[A64_LOGICAL_ADDRESS] = get_nir_src(ntb, instr->src[0]);
5201 srcs[A64_LOGICAL_SRC] = elk_fs_reg(); /* No source data */
5202 srcs[A64_LOGICAL_ENABLE_HELPERS] =
5203 elk_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS);
5204
5205 if (instr->def.bit_size == 32 &&
5206 nir_intrinsic_align(instr) >= 4) {
5207 assert(instr->def.num_components <= 4);
5208
5209 srcs[A64_LOGICAL_ARG] = elk_imm_ud(instr->num_components);
5210
5211 elk_fs_inst *inst =
5212 bld.emit(ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL, dest,
5213 srcs, A64_LOGICAL_NUM_SRCS);
5214 inst->size_written = instr->num_components *
5215 inst->dst.component_size(inst->exec_size);
5216 } else {
5217 const unsigned bit_size = instr->def.bit_size;
5218 assert(instr->def.num_components == 1);
5219 elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
5220
5221 srcs[A64_LOGICAL_ARG] = elk_imm_ud(bit_size);
5222
5223 bld.emit(ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL, tmp,
5224 srcs, A64_LOGICAL_NUM_SRCS);
5225 bld.MOV(dest, subscript(tmp, dest.type, 0));
5226 }
5227 break;
5228 }
5229
5230 case nir_intrinsic_store_global: {
5231 assert(devinfo->ver >= 8);
5232
5233 assert(nir_src_bit_size(instr->src[0]) <= 32);
5234 assert(nir_intrinsic_write_mask(instr) ==
5235 (1u << instr->num_components) - 1);
5236 assert(nir_intrinsic_align(instr) > 0);
5237
5238 elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
5239 srcs[A64_LOGICAL_ADDRESS] = get_nir_src(ntb, instr->src[1]);
5240 srcs[A64_LOGICAL_ENABLE_HELPERS] =
5241 elk_imm_ud(nir_intrinsic_access(instr) & ACCESS_INCLUDE_HELPERS);
5242
5243 if (nir_src_bit_size(instr->src[0]) == 32 &&
5244 nir_intrinsic_align(instr) >= 4) {
5245 assert(nir_src_num_components(instr->src[0]) <= 4);
5246
5247 srcs[A64_LOGICAL_SRC] = get_nir_src(ntb, instr->src[0]); /* Data */
5248 srcs[A64_LOGICAL_ARG] = elk_imm_ud(instr->num_components);
5249
5250 bld.emit(ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL, elk_fs_reg(),
5251 srcs, A64_LOGICAL_NUM_SRCS);
5252 } else {
5253 assert(nir_src_num_components(instr->src[0]) == 1);
5254 const unsigned bit_size = nir_src_bit_size(instr->src[0]);
5255 elk_reg_type data_type =
5256 elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
5257 elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
5258 bld.MOV(tmp, retype(get_nir_src(ntb, instr->src[0]), data_type));
5259
5260 srcs[A64_LOGICAL_SRC] = tmp;
5261 srcs[A64_LOGICAL_ARG] = elk_imm_ud(bit_size);
5262
5263 bld.emit(ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL, elk_fs_reg(),
5264 srcs, A64_LOGICAL_NUM_SRCS);
5265 }
5266 break;
5267 }
5268
5269 case nir_intrinsic_global_atomic:
5270 case nir_intrinsic_global_atomic_swap:
5271 fs_nir_emit_global_atomic(ntb, bld, instr);
5272 break;
5273
5274 case nir_intrinsic_load_global_constant_uniform_block_intel: {
5275 const unsigned total_dwords = ALIGN(instr->num_components,
5276 REG_SIZE * reg_unit(devinfo) / 4);
5277 unsigned loaded_dwords = 0;
5278
5279 const fs_builder ubld1 = bld.exec_all().group(1, 0);
5280 const fs_builder ubld8 = bld.exec_all().group(8, 0);
5281 const fs_builder ubld16 = bld.exec_all().group(16, 0);
5282
5283 const elk_fs_reg packed_consts =
5284 ubld1.vgrf(ELK_REGISTER_TYPE_UD, total_dwords);
5285 elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[0]));
5286
5287 while (loaded_dwords < total_dwords) {
5288 const unsigned block =
5289 choose_oword_block_size_dwords(devinfo,
5290 total_dwords - loaded_dwords);
5291 const unsigned block_bytes = block * 4;
5292
5293 const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
5294
5295 elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
5296 srcs[A64_LOGICAL_ADDRESS] = address;
5297 srcs[A64_LOGICAL_SRC] = elk_fs_reg(); /* No source data */
5298 srcs[A64_LOGICAL_ARG] = elk_imm_ud(block);
5299 srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(0);
5300 ubld.emit(ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
5301 retype(byte_offset(packed_consts, loaded_dwords * 4), ELK_REGISTER_TYPE_UD),
5302 srcs, A64_LOGICAL_NUM_SRCS)->size_written =
5303 align(block_bytes, REG_SIZE * reg_unit(devinfo));
5304
5305 increment_a64_address(ubld1, address, block_bytes);
5306 loaded_dwords += block;
5307 }
5308
5309 for (unsigned c = 0; c < instr->num_components; c++)
5310 bld.MOV(retype(offset(dest, bld, c), ELK_REGISTER_TYPE_UD),
5311 component(packed_consts, c));
5312
5313 break;
5314 }
5315
5316 case nir_intrinsic_load_ssbo: {
5317 assert(devinfo->ver >= 7);
5318
5319 const unsigned bit_size = instr->def.bit_size;
5320 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5321 srcs[get_nir_src_bindless(ntb, instr->src[0]) ?
5322 SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
5323 SURFACE_LOGICAL_SRC_SURFACE] =
5324 get_nir_buffer_intrinsic_index(ntb, bld, instr);
5325 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
5326 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
5327 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
5328
5329 /* Make dest unsigned because that's what the temporary will be */
5330 dest.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
5331
5332 /* Read the vector */
5333 assert(bit_size <= 32);
5334 assert(nir_intrinsic_align(instr) > 0);
5335 if (bit_size == 32 &&
5336 nir_intrinsic_align(instr) >= 4) {
5337 assert(instr->def.num_components <= 4);
5338 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
5339 elk_fs_inst *inst =
5340 bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
5341 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5342 inst->size_written = instr->num_components * s.dispatch_width * 4;
5343 } else {
5344 assert(instr->def.num_components == 1);
5345 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
5346
5347 elk_fs_reg read_result = bld.vgrf(ELK_REGISTER_TYPE_UD);
5348 bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
5349 read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
5350 bld.MOV(dest, subscript(read_result, dest.type, 0));
5351 }
5352 break;
5353 }
5354
5355 case nir_intrinsic_store_ssbo: {
5356 assert(devinfo->ver >= 7);
5357
5358 const unsigned bit_size = nir_src_bit_size(instr->src[0]);
5359 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5360 srcs[get_nir_src_bindless(ntb, instr->src[1]) ?
5361 SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
5362 SURFACE_LOGICAL_SRC_SURFACE] =
5363 get_nir_buffer_intrinsic_index(ntb, bld, instr);
5364 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[2]);
5365 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
5366 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
5367
5368 elk_fs_reg data = get_nir_src(ntb, instr->src[0]);
5369 data.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
5370
5371 assert(bit_size <= 32);
5372 assert(nir_intrinsic_write_mask(instr) ==
5373 (1u << instr->num_components) - 1);
5374 assert(nir_intrinsic_align(instr) > 0);
5375 if (bit_size == 32 &&
5376 nir_intrinsic_align(instr) >= 4) {
5377 assert(nir_src_num_components(instr->src[0]) <= 4);
5378 srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5379 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(instr->num_components);
5380 bld.emit(ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
5381 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5382 } else {
5383 assert(nir_src_num_components(instr->src[0]) == 1);
5384 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
5385
5386 srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_UD);
5387 bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
5388
5389 bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
5390 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5391 }
5392 break;
5393 }
5394
5395 case nir_intrinsic_load_ssbo_uniform_block_intel:
5396 case nir_intrinsic_load_shared_uniform_block_intel: {
5397 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5398
5399 const bool is_ssbo =
5400 instr->intrinsic == nir_intrinsic_load_ssbo_uniform_block_intel;
5401 if (is_ssbo) {
5402 srcs[get_nir_src_bindless(ntb, instr->src[0]) ?
5403 SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
5404 SURFACE_LOGICAL_SRC_SURFACE] =
5405 get_nir_buffer_intrinsic_index(ntb, bld, instr);
5406 } else {
5407 srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_fs_reg(elk_imm_ud(GFX7_BTI_SLM));
5408 }
5409
5410 const unsigned total_dwords = ALIGN(instr->num_components,
5411 REG_SIZE * reg_unit(devinfo) / 4);
5412 unsigned loaded_dwords = 0;
5413
5414 const fs_builder ubld1 = bld.exec_all().group(1, 0);
5415 const fs_builder ubld8 = bld.exec_all().group(8, 0);
5416 const fs_builder ubld16 = bld.exec_all().group(16, 0);
5417
5418 const elk_fs_reg packed_consts =
5419 ubld1.vgrf(ELK_REGISTER_TYPE_UD, total_dwords);
5420
5421 const nir_src load_offset = is_ssbo ? instr->src[1] : instr->src[0];
5422 if (nir_src_is_const(load_offset)) {
5423 elk_fs_reg addr = ubld8.vgrf(ELK_REGISTER_TYPE_UD);
5424 ubld8.MOV(addr, elk_imm_ud(nir_src_as_uint(load_offset)));
5425 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = component(addr, 0);
5426 } else {
5427 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5428 bld.emit_uniformize(get_nir_src(ntb, load_offset));
5429 }
5430
5431 while (loaded_dwords < total_dwords) {
5432 const unsigned block =
5433 choose_oword_block_size_dwords(devinfo,
5434 total_dwords - loaded_dwords);
5435 const unsigned block_bytes = block * 4;
5436
5437 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(block);
5438
5439 const fs_builder &ubld = block <= 8 ? ubld8 : ubld16;
5440 ubld.emit(ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
5441 retype(byte_offset(packed_consts, loaded_dwords * 4), ELK_REGISTER_TYPE_UD),
5442 srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written =
5443 align(block_bytes, REG_SIZE * reg_unit(devinfo));
5444
5445 loaded_dwords += block;
5446
5447 ubld1.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5448 srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5449 elk_imm_ud(block_bytes));
5450 }
5451
5452 for (unsigned c = 0; c < instr->num_components; c++)
5453 bld.MOV(retype(offset(dest, bld, c), ELK_REGISTER_TYPE_UD),
5454 component(packed_consts, c));
5455
5456 break;
5457 }
5458
5459 case nir_intrinsic_store_output: {
5460 assert(nir_src_bit_size(instr->src[0]) == 32);
5461 elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
5462
5463 unsigned store_offset = nir_src_as_uint(instr->src[1]);
5464 unsigned num_components = instr->num_components;
5465 unsigned first_component = nir_intrinsic_component(instr);
5466
5467 elk_fs_reg new_dest = retype(offset(s.outputs[instr->const_index[0]], bld,
5468 4 * store_offset), src.type);
5469 for (unsigned j = 0; j < num_components; j++) {
5470 bld.MOV(offset(new_dest, bld, j + first_component),
5471 offset(src, bld, j));
5472 }
5473 break;
5474 }
5475
5476 case nir_intrinsic_ssbo_atomic:
5477 case nir_intrinsic_ssbo_atomic_swap:
5478 fs_nir_emit_surface_atomic(ntb, bld, instr,
5479 get_nir_buffer_intrinsic_index(ntb, bld, instr),
5480 get_nir_src_bindless(ntb, instr->src[0]));
5481 break;
5482
5483 case nir_intrinsic_get_ssbo_size: {
5484 assert(nir_src_num_components(instr->src[0]) == 1);
5485
5486 /* A resinfo's sampler message is used to get the buffer size. The
5487 * SIMD8's writeback message consists of four registers and SIMD16's
5488 * writeback message consists of 8 destination registers (two per
5489 * component). Because we are only interested in the first channel of
5490 * the first returned component, where resinfo returns the buffer size
5491 * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
5492 * the dispatch width.
5493 */
5494 const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
5495 elk_fs_reg src_payload = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5496 elk_fs_reg ret_payload = ubld.vgrf(ELK_REGISTER_TYPE_UD, 4);
5497
5498 /* Set LOD = 0 */
5499 ubld.MOV(src_payload, elk_imm_d(0));
5500
5501 elk_fs_reg srcs[GET_BUFFER_SIZE_SRCS];
5502 srcs[get_nir_src_bindless(ntb, instr->src[0]) ?
5503 GET_BUFFER_SIZE_SRC_SURFACE_HANDLE :
5504 GET_BUFFER_SIZE_SRC_SURFACE] =
5505 get_nir_buffer_intrinsic_index(ntb, bld, instr);
5506 srcs[GET_BUFFER_SIZE_SRC_LOD] = src_payload;
5507 elk_fs_inst *inst = ubld.emit(ELK_SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
5508 srcs, GET_BUFFER_SIZE_SRCS);
5509 inst->header_size = 0;
5510 inst->mlen = reg_unit(devinfo);
5511 inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
5512
5513 /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
5514 *
5515 * "Out-of-bounds checking is always performed at a DWord granularity. If
5516 * any part of the DWord is out-of-bounds then the whole DWord is
5517 * considered out-of-bounds."
5518 *
5519 * This implies that types smaller than 4 bytes need to be padded if
5520 * they don't complete the last dword of the buffer. But since we need
5521 * the original size in order to know the number of elements of an
5522 * unsized array, we have to reverse that padding. As the needed
5523 * padding was stored in the last two bits of the surface size, we
5524 * recover the original buffer_size here by reversing the surface_size
5525 * calculation:
5526 *
5527 * surface_size = isl_align(buffer_size, 4) +
5528 * (isl_align(buffer_size, 4) - buffer_size)
5529 *
5530 * buffer_size = (surface_size & ~3) - (surface_size & 3)
5531 */
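      /* For example, an original buffer_size of 5 gets 3 bytes of padding,
       * so the stored surface_size is 11; (11 & ~3) - (11 & 3) = 8 - 3 = 5
       * recovers the original size.
       */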
5532
5533 elk_fs_reg size_aligned4 = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5534 elk_fs_reg size_padding = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5535 elk_fs_reg buffer_size = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5536
5537 ubld.AND(size_padding, ret_payload, elk_imm_ud(3));
5538 ubld.AND(size_aligned4, ret_payload, elk_imm_ud(~3));
5539 ubld.ADD(buffer_size, size_aligned4, negate(size_padding));
5540
5541 bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
5542 break;
5543 }
5544
5545 case nir_intrinsic_load_scratch: {
5546 assert(devinfo->ver >= 7);
5547
5548 assert(instr->def.num_components == 1);
5549 const unsigned bit_size = instr->def.bit_size;
5550 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5551
5552 if (devinfo->ver >= 8) {
5553 srcs[SURFACE_LOGICAL_SRC_SURFACE] =
5554 elk_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
5555 } else {
5556 srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(ELK_BTI_STATELESS);
5557 }
5558
5559 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
5560 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
5561 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
5562 const elk_fs_reg nir_addr = get_nir_src(ntb, instr->src[0]);
5563
5564 /* Make dest unsigned because that's what the temporary will be */
5565 dest.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
5566
5567 /* Read the vector */
5568 assert(instr->def.num_components == 1);
5569 assert(bit_size <= 32);
5570 assert(nir_intrinsic_align(instr) > 0);
5571 if (bit_size == 32 &&
5572 nir_intrinsic_align(instr) >= 4) {
5573 /* The offset for a DWORD scattered message is in dwords. */
5574 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5575 swizzle_nir_scratch_addr(ntb, bld, nir_addr, true);
5576
5577 bld.emit(ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL,
5578 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5579 } else {
5580 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5581 swizzle_nir_scratch_addr(ntb, bld, nir_addr, false);
5582
5583 elk_fs_reg read_result = bld.vgrf(ELK_REGISTER_TYPE_UD);
5584 bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
5585 read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
5586 bld.MOV(dest, read_result);
5587 }
5588
5589 s.shader_stats.fill_count += DIV_ROUND_UP(s.dispatch_width, 16);
5590 break;
5591 }
5592
5593 case nir_intrinsic_store_scratch: {
5594 assert(devinfo->ver >= 7);
5595
5596 assert(nir_src_num_components(instr->src[0]) == 1);
5597 const unsigned bit_size = nir_src_bit_size(instr->src[0]);
5598 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5599
5600 if (devinfo->ver >= 8) {
5601 srcs[SURFACE_LOGICAL_SRC_SURFACE] =
5602 elk_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
5603 } else {
5604 srcs[SURFACE_LOGICAL_SRC_SURFACE] = elk_imm_ud(ELK_BTI_STATELESS);
5605 }
5606
5607 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
5608 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(bit_size);
5609 /**
5610 * While this instruction has side-effects, it should not be predicated
5611 * on sample mask, because otherwise fs helper invocations would
5612 * load undefined values from scratch memory. And scratch memory
5613 * load-stores are produced from operations without side-effects, thus
5614 * they should not have different behaviour in the helper invocations.
5615 */
5616 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(0);
5617 const elk_fs_reg nir_addr = get_nir_src(ntb, instr->src[1]);
5618
5619 elk_fs_reg data = get_nir_src(ntb, instr->src[0]);
5620 data.type = elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_UD);
5621
5622 assert(nir_src_num_components(instr->src[0]) == 1);
5623 assert(bit_size <= 32);
5624 assert(nir_intrinsic_write_mask(instr) == 1);
5625 assert(nir_intrinsic_align(instr) > 0);
5626 if (bit_size == 32 &&
5627 nir_intrinsic_align(instr) >= 4) {
5628 srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5629
5630 /* The offset for a DWORD scattered message is in dwords. */
5631 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5632 swizzle_nir_scratch_addr(ntb, bld, nir_addr, true);
5633
5634 bld.emit(ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL,
5635 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5636 } else {
5637 srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(ELK_REGISTER_TYPE_UD);
5638 bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
5639
5640 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5641 swizzle_nir_scratch_addr(ntb, bld, nir_addr, false);
5642
5643 bld.emit(ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
5644 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5645 }
5646 s.shader_stats.spill_count += DIV_ROUND_UP(s.dispatch_width, 16);
5647 break;
5648 }
5649
5650 case nir_intrinsic_load_subgroup_size:
5651 /* This should only happen for fragment shaders because every other case
5652 * is lowered in NIR so we can optimize on it.
5653 */
5654 assert(s.stage == MESA_SHADER_FRAGMENT);
5655 bld.MOV(retype(dest, ELK_REGISTER_TYPE_D), elk_imm_d(s.dispatch_width));
5656 break;
5657
5658 case nir_intrinsic_load_subgroup_invocation:
5659 bld.MOV(retype(dest, ELK_REGISTER_TYPE_D),
5660 ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
5661 break;
5662
5663 case nir_intrinsic_load_subgroup_eq_mask:
5664 case nir_intrinsic_load_subgroup_ge_mask:
5665 case nir_intrinsic_load_subgroup_gt_mask:
5666 case nir_intrinsic_load_subgroup_le_mask:
5667 case nir_intrinsic_load_subgroup_lt_mask:
5668 unreachable("not reached");
5669
5670 case nir_intrinsic_vote_any: {
5671 const fs_builder ubld1 = bld.exec_all().group(1, 0);
5672
5673 /* The any/all predicates do not consider channel enables. To prevent
5674 * dead channels from affecting the result, we initialize the flag
5675 * with the identity value for the logical operation.
5676 */
5677 if (s.dispatch_width == 32) {
5678 /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
5679 ubld1.MOV(retype(elk_flag_reg(0, 0), ELK_REGISTER_TYPE_UD),
5680 elk_imm_ud(0));
5681 } else {
5682 ubld1.MOV(elk_flag_reg(0, 0), elk_imm_uw(0));
5683 }
5684 bld.CMP(bld.null_reg_d(), get_nir_src(ntb, instr->src[0]), elk_imm_d(0), ELK_CONDITIONAL_NZ);
5685
5686 /* For some reason, the any/all predicates don't work properly with
5687 * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
5688 * doesn't read the correct subset of the flag register and you end up
5689 * getting garbage in the second half. Work around this by using a pair
5690 * of 1-wide MOVs and scattering the result.
5691 */
5692 const fs_builder ubld = ubld1;
5693 elk_fs_reg res1 = ubld.vgrf(ELK_REGISTER_TYPE_D);
5694 ubld.MOV(res1, elk_imm_d(0));
5695 set_predicate(s.dispatch_width == 8 ? ELK_PREDICATE_ALIGN1_ANY8H :
5696 s.dispatch_width == 16 ? ELK_PREDICATE_ALIGN1_ANY16H :
5697 ELK_PREDICATE_ALIGN1_ANY32H,
5698 ubld.MOV(res1, elk_imm_d(-1)));
5699
5700 bld.MOV(retype(dest, ELK_REGISTER_TYPE_D), component(res1, 0));
5701 break;
5702 }
5703 case nir_intrinsic_vote_all: {
5704 const fs_builder ubld1 = bld.exec_all().group(1, 0);
5705
5706 /* The any/all predicates do not consider channel enables. To prevent
5707 * dead channels from affecting the result, we initialize the flag
5708 * with the identity value for the logical operation.
5709 */
5710 if (s.dispatch_width == 32) {
5711 /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
5712 ubld1.MOV(retype(elk_flag_reg(0, 0), ELK_REGISTER_TYPE_UD),
5713 elk_imm_ud(0xffffffff));
5714 } else {
5715 ubld1.MOV(elk_flag_reg(0, 0), elk_imm_uw(0xffff));
5716 }
5717 bld.CMP(bld.null_reg_d(), get_nir_src(ntb, instr->src[0]), elk_imm_d(0), ELK_CONDITIONAL_NZ);
5718
5719 /* For some reason, the any/all predicates don't work properly with
5720 * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
5721 * doesn't read the correct subset of the flag register and you end up
5722 * getting garbage in the second half. Work around this by using a pair
5723 * of 1-wide MOVs and scattering the result.
5724 */
5725 const fs_builder ubld = ubld1;
5726 elk_fs_reg res1 = ubld.vgrf(ELK_REGISTER_TYPE_D);
5727 ubld.MOV(res1, elk_imm_d(0));
5728 set_predicate(s.dispatch_width == 8 ? ELK_PREDICATE_ALIGN1_ALL8H :
5729 s.dispatch_width == 16 ? ELK_PREDICATE_ALIGN1_ALL16H :
5730 ELK_PREDICATE_ALIGN1_ALL32H,
5731 ubld.MOV(res1, elk_imm_d(-1)));
5732
5733 bld.MOV(retype(dest, ELK_REGISTER_TYPE_D), component(res1, 0));
5734 break;
5735 }
5736 case nir_intrinsic_vote_feq:
5737 case nir_intrinsic_vote_ieq: {
5738 elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5739 if (instr->intrinsic == nir_intrinsic_vote_feq) {
5740 const unsigned bit_size = nir_src_bit_size(instr->src[0]);
5741 value.type = bit_size == 8 ? ELK_REGISTER_TYPE_B :
5742 elk_reg_type_from_bit_size(bit_size, ELK_REGISTER_TYPE_F);
5743 }
5744
5745 elk_fs_reg uniformized = bld.emit_uniformize(value);
5746 const fs_builder ubld1 = bld.exec_all().group(1, 0);
5747
5748 /* The any/all predicates do not consider channel enables. To prevent
5749 * dead channels from affecting the result, we initialize the flag
5750 * with the identity value for the logical operation.
5751 */
5752 if (s.dispatch_width == 32) {
5753 /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
5754 ubld1.MOV(retype(elk_flag_reg(0, 0), ELK_REGISTER_TYPE_UD),
5755 elk_imm_ud(0xffffffff));
5756 } else {
5757 ubld1.MOV(elk_flag_reg(0, 0), elk_imm_uw(0xffff));
5758 }
5759 bld.CMP(bld.null_reg_d(), value, uniformized, ELK_CONDITIONAL_Z);
5760
5761 /* For some reason, the any/all predicates don't work properly with
5762 * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
5763 * doesn't read the correct subset of the flag register and you end up
5764 * getting garbage in the second half. Work around this by using a pair
5765 * of 1-wide MOVs and scattering the result.
5766 */
5767 const fs_builder ubld = ubld1;
5768 elk_fs_reg res1 = ubld.vgrf(ELK_REGISTER_TYPE_D);
5769 ubld.MOV(res1, elk_imm_d(0));
5770 set_predicate(s.dispatch_width == 8 ? ELK_PREDICATE_ALIGN1_ALL8H :
5771 s.dispatch_width == 16 ? ELK_PREDICATE_ALIGN1_ALL16H :
5772 ELK_PREDICATE_ALIGN1_ALL32H,
5773 ubld.MOV(res1, elk_imm_d(-1)));
5774
5775 bld.MOV(retype(dest, ELK_REGISTER_TYPE_D), component(res1, 0));
5776 break;
5777 }
5778
5779 case nir_intrinsic_ballot: {
5780 const elk_fs_reg value = retype(get_nir_src(ntb, instr->src[0]),
5781 ELK_REGISTER_TYPE_UD);
5782 struct elk_reg flag = elk_flag_reg(0, 0);
5783 /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well
5784 * as f0.0. This is a problem for fragment programs as we currently use
5785 * f0.1 for discards. Fortunately, we don't support SIMD32 fragment
5786 * programs yet so this isn't a problem. When we do, something will
5787 * have to change.
5788 */
5789 if (s.dispatch_width == 32)
5790 flag.type = ELK_REGISTER_TYPE_UD;
5791
5792 bld.exec_all().group(1, 0).MOV(flag, elk_imm_ud(0u));
5793 bld.CMP(bld.null_reg_ud(), value, elk_imm_ud(0u), ELK_CONDITIONAL_NZ);
5794
5795 if (instr->def.bit_size > 32) {
5796 dest.type = ELK_REGISTER_TYPE_UQ;
5797 } else {
5798 dest.type = ELK_REGISTER_TYPE_UD;
5799 }
5800 bld.MOV(dest, flag);
5801 break;
5802 }
5803
5804 case nir_intrinsic_read_invocation: {
5805 const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5806 const elk_fs_reg invocation = get_nir_src(ntb, instr->src[1]);
5807
5808 elk_fs_reg tmp = bld.vgrf(value.type);
5809
5810 /* When, for some reason, the subgroup_size picked by NIR is larger
5811 * than the dispatch width picked by the backend (this could happen in
5812 * RT or FS), bound the invocation to the dispatch width.
5813 */
5814 elk_fs_reg bound_invocation;
5815 if (s.api_subgroup_size == 0 ||
5816 bld.dispatch_width() < s.api_subgroup_size) {
5817 bound_invocation = bld.vgrf(ELK_REGISTER_TYPE_UD);
5818 bld.AND(bound_invocation, invocation, elk_imm_ud(s.dispatch_width - 1));
5819 } else {
5820 bound_invocation = invocation;
5821 }
5822 bld.exec_all().emit(ELK_SHADER_OPCODE_BROADCAST, tmp, value,
5823 bld.emit_uniformize(bound_invocation));
5824
5825 bld.MOV(retype(dest, value.type), elk_fs_reg(component(tmp, 0)));
5826 break;
5827 }
5828
5829 case nir_intrinsic_read_first_invocation: {
5830 const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5831 bld.MOV(retype(dest, value.type), bld.emit_uniformize(value));
5832 break;
5833 }
5834
5835 case nir_intrinsic_shuffle: {
5836 const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5837 const elk_fs_reg index = get_nir_src(ntb, instr->src[1]);
5838
5839 bld.emit(ELK_SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
5840 break;
5841 }
5842
5843 case nir_intrinsic_first_invocation: {
5844 elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
5845 bld.exec_all().emit(ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
5846 bld.MOV(retype(dest, ELK_REGISTER_TYPE_UD),
5847 elk_fs_reg(component(tmp, 0)));
5848 break;
5849 }
5850
5851 case nir_intrinsic_last_invocation: {
5852 elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
5853 bld.exec_all().emit(ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL, tmp);
5854 bld.MOV(retype(dest, ELK_REGISTER_TYPE_UD),
5855 elk_fs_reg(component(tmp, 0)));
5856 break;
5857 }
5858
5859 case nir_intrinsic_quad_broadcast: {
5860 const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5861 const unsigned index = nir_src_as_uint(instr->src[1]);
5862
5863 bld.emit(ELK_SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),
5864 value, elk_imm_ud(index), elk_imm_ud(4));
5865 break;
5866 }
5867
5868 case nir_intrinsic_quad_swap_horizontal: {
5869 const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5870 const elk_fs_reg tmp = bld.vgrf(value.type);
5871 if (devinfo->ver <= 7) {
5872 /* The hardware doesn't seem to support these crazy regions with
5873 * compressed instructions on gfx7 and earlier so we fall back to
5874 * using quad swizzles. Fortunately, we don't support 64-bit
5875 * anything in Vulkan on gfx7.
5876 */
5877 assert(nir_src_bit_size(instr->src[0]) == 32);
5878 const fs_builder ubld = bld.exec_all();
5879 ubld.emit(ELK_SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
5880 elk_imm_ud(ELK_SWIZZLE4(1,0,3,2)));
5881 bld.MOV(retype(dest, value.type), tmp);
5882 } else {
5883 const fs_builder ubld = bld.exec_all().group(s.dispatch_width / 2, 0);
5884
5885 const elk_fs_reg src_left = horiz_stride(value, 2);
5886 const elk_fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
5887 const elk_fs_reg tmp_left = horiz_stride(tmp, 2);
5888 const elk_fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);
5889
5890 ubld.MOV(tmp_left, src_right);
5891 ubld.MOV(tmp_right, src_left);
5892
5893 }
5894 bld.MOV(retype(dest, value.type), tmp);
5895 break;
5896 }
5897
5898 case nir_intrinsic_quad_swap_vertical: {
5899 const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5900 if (nir_src_bit_size(instr->src[0]) == 32) {
5901 /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
5902 const elk_fs_reg tmp = bld.vgrf(value.type);
5903 const fs_builder ubld = bld.exec_all();
5904 ubld.emit(ELK_SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
5905 elk_imm_ud(ELK_SWIZZLE4(2,3,0,1)));
5906 bld.MOV(retype(dest, value.type), tmp);
5907 } else {
5908 /* For larger data types, we have to either emit dispatch_width many
5909 * MOVs or else fall back to doing indirects.
5910 */
5911 elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_W);
5912 bld.XOR(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
5913 elk_imm_w(0x2));
5914 bld.emit(ELK_SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
5915 }
5916 break;
5917 }
5918
5919 case nir_intrinsic_quad_swap_diagonal: {
5920 const elk_fs_reg value = get_nir_src(ntb, instr->src[0]);
5921 if (nir_src_bit_size(instr->src[0]) == 32) {
5922 /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
5923 const elk_fs_reg tmp = bld.vgrf(value.type);
5924 const fs_builder ubld = bld.exec_all();
5925 ubld.emit(ELK_SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
5926 elk_imm_ud(ELK_SWIZZLE4(3,2,1,0)));
5927 bld.MOV(retype(dest, value.type), tmp);
5928 } else {
5929 /* For larger data types, we have to either emit dispatch_width many
5930 * MOVs or else fall back to doing indirects.
5931 */
5932 elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_W);
5933 bld.XOR(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
5934 elk_imm_w(0x3));
5935 bld.emit(ELK_SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
5936 }
5937 break;
5938 }
5939
5940 case nir_intrinsic_ddx_fine:
5941 bld.emit(ELK_FS_OPCODE_DDX_FINE, retype(dest, ELK_REGISTER_TYPE_F),
5942 retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_F));
5943 break;
5944 case nir_intrinsic_ddx:
5945 case nir_intrinsic_ddx_coarse:
5946 bld.emit(ELK_FS_OPCODE_DDX_COARSE, retype(dest, ELK_REGISTER_TYPE_F),
5947 retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_F));
5948 break;
5949 case nir_intrinsic_ddy_fine:
5950 bld.emit(ELK_FS_OPCODE_DDY_FINE, retype(dest, ELK_REGISTER_TYPE_F),
5951 retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_F));
5952 break;
5953 case nir_intrinsic_ddy:
5954 case nir_intrinsic_ddy_coarse:
5955 bld.emit(ELK_FS_OPCODE_DDY_COARSE, retype(dest, ELK_REGISTER_TYPE_F),
5956 retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_F));
5957 break;
5958
5959 case nir_intrinsic_reduce: {
5960 elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
5961 nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
5962 unsigned cluster_size = nir_intrinsic_cluster_size(instr);
5963 if (cluster_size == 0 || cluster_size > s.dispatch_width)
5964 cluster_size = s.dispatch_width;
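      /* A cluster size of 0, or anything larger than the dispatch width,
       * reduces across the whole subgroup.
       */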
5965
5966 /* Figure out the source type */
5967 src.type = elk_type_for_nir_type(devinfo,
5968 (nir_alu_type)(nir_op_infos[redop].input_types[0] |
5969 nir_src_bit_size(instr->src[0])));
5970
5971 elk_fs_reg identity = elk_nir_reduction_op_identity(bld, redop, src.type);
5972 elk_opcode elk_op = elk_op_for_nir_reduction_op(redop);
5973 elk_conditional_mod cond_mod = elk_cond_mod_for_nir_reduction_op(redop);
5974
5975 /* Set up a register for all of our scratching around and initialize it
5976 * to the reduction operation's identity value.
5977 */
5978 elk_fs_reg scan = bld.vgrf(src.type);
5979 bld.exec_all().emit(ELK_SHADER_OPCODE_SEL_EXEC, scan, src, identity);
5980
5981 bld.emit_scan(elk_op, scan, cluster_size, cond_mod);
5982
5983 dest.type = src.type;
5984 if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {
5985 /* In this case, the CLUSTER_BROADCAST instruction isn't needed
5986 * because the distance between clusters is at least 2 GRFs, which
5987 * means we can skip the weird striding of the CLUSTER_BROADCAST
5988 * instruction and just do regular MOVs.
5989 */
5990 assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);
5991 const unsigned groups =
5992 (s.dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);
5993 const unsigned group_size = s.dispatch_width / groups;
5994 for (unsigned i = 0; i < groups; i++) {
5995 const unsigned cluster = (i * group_size) / cluster_size;
5996 const unsigned comp = cluster * cluster_size + (cluster_size - 1);
5997 bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),
5998 component(scan, comp));
5999 }
6000 } else {
6001 bld.emit(ELK_SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,
6002 elk_imm_ud(cluster_size - 1), elk_imm_ud(cluster_size));
6003 }
6004 break;
6005 }
6006
6007 case nir_intrinsic_inclusive_scan:
6008 case nir_intrinsic_exclusive_scan: {
6009 elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
6010 nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
6011
6012 /* Figure out the source type */
6013 src.type = elk_type_for_nir_type(devinfo,
6014 (nir_alu_type)(nir_op_infos[redop].input_types[0] |
6015 nir_src_bit_size(instr->src[0])));
6016
6017 elk_fs_reg identity = elk_nir_reduction_op_identity(bld, redop, src.type);
6018 elk_opcode elk_op = elk_op_for_nir_reduction_op(redop);
6019 elk_conditional_mod cond_mod = elk_cond_mod_for_nir_reduction_op(redop);
6020
6021 /* Set up a register for all of our scratching around and initialize it
6022 * to the reduction operation's identity value.
6023 */
6024 elk_fs_reg scan = bld.vgrf(src.type);
6025 const fs_builder allbld = bld.exec_all();
6026 allbld.emit(ELK_SHADER_OPCODE_SEL_EXEC, scan, src, identity);
6027
6028 if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
6029 /* Exclusive scan is a bit harder because we have to do an annoying
6030 * shift of the contents before we can begin. To make things worse,
6031 * we can't do this with a normal stride; we have to use indirects.
6032 */
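         /* Shuffle each channel's value in from channel (invocation - 1);
          * channel 0 has no predecessor, so it is filled with the identity.
          */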
6033 elk_fs_reg shifted = bld.vgrf(src.type);
6034 elk_fs_reg idx = bld.vgrf(ELK_REGISTER_TYPE_W);
6035 allbld.ADD(idx, ntb.system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
6036 elk_imm_w(-1));
6037 allbld.emit(ELK_SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
6038 allbld.group(1, 0).MOV(component(shifted, 0), identity);
6039 scan = shifted;
6040 }
6041
6042 bld.emit_scan(elk_op, scan, s.dispatch_width, cond_mod);
6043
6044 bld.MOV(retype(dest, src.type), scan);
6045 break;
6046 }
6047
6048 case nir_intrinsic_load_global_block_intel: {
6049 assert(instr->def.bit_size == 32);
6050
6051 elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[0]));
6052
6053 const fs_builder ubld1 = bld.exec_all().group(1, 0);
6054 const fs_builder ubld8 = bld.exec_all().group(8, 0);
6055 const fs_builder ubld16 = bld.exec_all().group(16, 0);
6056
6057 const unsigned total = instr->num_components * s.dispatch_width;
6058 unsigned loaded = 0;
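      /* 'total' counts dwords across the whole dispatch; each iteration
       * reads the largest OWord block that still fits in the remainder.
       */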
6059
6060 while (loaded < total) {
6061 const unsigned block =
6062 choose_oword_block_size_dwords(devinfo, total - loaded);
6063 const unsigned block_bytes = block * 4;
6064
6065 const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
6066
6067 elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
6068 srcs[A64_LOGICAL_ADDRESS] = address;
6069 srcs[A64_LOGICAL_SRC] = elk_fs_reg(); /* No source data */
6070 srcs[A64_LOGICAL_ARG] = elk_imm_ud(block);
6071 srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(1);
6072 ubld.emit(ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
6073 retype(byte_offset(dest, loaded * 4), ELK_REGISTER_TYPE_UD),
6074 srcs, A64_LOGICAL_NUM_SRCS)->size_written = block_bytes;
6075
6076 increment_a64_address(ubld1, address, block_bytes);
6077 loaded += block;
6078 }
6079
6080 assert(loaded == total);
6081 break;
6082 }
6083
6084 case nir_intrinsic_store_global_block_intel: {
6085 assert(nir_src_bit_size(instr->src[0]) == 32);
6086
6087 elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[1]));
6088 elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
6089
6090 const fs_builder ubld1 = bld.exec_all().group(1, 0);
6091 const fs_builder ubld8 = bld.exec_all().group(8, 0);
6092 const fs_builder ubld16 = bld.exec_all().group(16, 0);
6093
6094 const unsigned total = instr->num_components * s.dispatch_width;
6095 unsigned written = 0;
6096
6097 while (written < total) {
6098 const unsigned block =
6099 choose_oword_block_size_dwords(devinfo, total - written);
6100
6101 elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
6102 srcs[A64_LOGICAL_ADDRESS] = address;
6103 srcs[A64_LOGICAL_SRC] = retype(byte_offset(src, written * 4),
6104 ELK_REGISTER_TYPE_UD);
6105 srcs[A64_LOGICAL_ARG] = elk_imm_ud(block);
6106 srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(0);
6107
6108 const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
6109 ubld.emit(ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL, elk_fs_reg(),
6110 srcs, A64_LOGICAL_NUM_SRCS);
6111
6112 const unsigned block_bytes = block * 4;
6113 increment_a64_address(ubld1, address, block_bytes);
6114 written += block;
6115 }
6116
6117 assert(written == total);
6118 break;
6119 }
6120
6121 case nir_intrinsic_load_shared_block_intel:
6122 case nir_intrinsic_load_ssbo_block_intel: {
6123 assert(instr->def.bit_size == 32);
6124
6125 const bool is_ssbo =
6126 instr->intrinsic == nir_intrinsic_load_ssbo_block_intel;
6127 elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[is_ssbo ? 1 : 0]));
6128
6129 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6130 srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ?
6131 get_nir_buffer_intrinsic_index(ntb, bld, instr) :
6132 elk_fs_reg(elk_imm_ud(GFX7_BTI_SLM));
6133 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address;
6134
6135 const fs_builder ubld1 = bld.exec_all().group(1, 0);
6136 const fs_builder ubld8 = bld.exec_all().group(8, 0);
6137 const fs_builder ubld16 = bld.exec_all().group(16, 0);
6138
6139 const unsigned total = instr->num_components * s.dispatch_width;
6140 unsigned loaded = 0;
6141
6142 while (loaded < total) {
6143 const unsigned block =
6144 choose_oword_block_size_dwords(devinfo, total - loaded);
6145 const unsigned block_bytes = block * 4;
6146
6147 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(block);
6148
6149 const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
6150 ubld.emit(ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
6151 retype(byte_offset(dest, loaded * 4), ELK_REGISTER_TYPE_UD),
6152 srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = block_bytes;
6153
6154 ubld1.ADD(address, address, elk_imm_ud(block_bytes));
6155 loaded += block;
6156 }
6157
6158 assert(loaded == total);
6159 break;
6160 }
6161
6162 case nir_intrinsic_store_shared_block_intel:
6163 case nir_intrinsic_store_ssbo_block_intel: {
6164 assert(nir_src_bit_size(instr->src[0]) == 32);
6165
6166 const bool is_ssbo =
6167 instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
6168
6169 elk_fs_reg address = bld.emit_uniformize(get_nir_src(ntb, instr->src[is_ssbo ? 2 : 1]));
6170 elk_fs_reg src = get_nir_src(ntb, instr->src[0]);
6171
6172 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6173 srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ?
6174 get_nir_buffer_intrinsic_index(ntb, bld, instr) :
6175 elk_fs_reg(elk_imm_ud(GFX7_BTI_SLM));
6176 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address;
6177
6178 const fs_builder ubld1 = bld.exec_all().group(1, 0);
6179 const fs_builder ubld8 = bld.exec_all().group(8, 0);
6180 const fs_builder ubld16 = bld.exec_all().group(16, 0);
6181
6182 const unsigned total = instr->num_components * s.dispatch_width;
6183 unsigned written = 0;
6184
6185 while (written < total) {
6186 const unsigned block =
6187 choose_oword_block_size_dwords(devinfo, total - written);
6188
6189 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(block);
6190 srcs[SURFACE_LOGICAL_SRC_DATA] =
6191 retype(byte_offset(src, written * 4), ELK_REGISTER_TYPE_UD);
6192
6193 const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
6194 ubld.emit(ELK_SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL,
6195 elk_fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
6196
6197 const unsigned block_bytes = block * 4;
6198 ubld1.ADD(address, address, elk_imm_ud(block_bytes));
6199 written += block;
6200 }
6201
6202 assert(written == total);
6203 break;
6204 }
6205
6206 default:
6207 #ifndef NDEBUG
6208 assert(instr->intrinsic < nir_num_intrinsics);
6209 fprintf(stderr, "intrinsic: %s\n", nir_intrinsic_infos[instr->intrinsic].name);
6210 #endif
6211 unreachable("unknown intrinsic");
6212 }
6213 }
6214
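/* Atomic data operands are passed in 32-bit containers, so 16-bit sources
 * are zero-extended into a 32-bit temporary first.
 */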
6215 static elk_fs_reg
6216 expand_to_32bit(const fs_builder &bld, const elk_fs_reg &src)
6217 {
6218 if (type_sz(src.type) == 2) {
6219 elk_fs_reg src32 = bld.vgrf(ELK_REGISTER_TYPE_UD);
6220 bld.MOV(src32, retype(src, ELK_REGISTER_TYPE_UW));
6221 return src32;
6222 } else {
6223 return src;
6224 }
6225 }
6226
6227 static void
6228 fs_nir_emit_surface_atomic(nir_to_elk_state &ntb, const fs_builder &bld,
6229 nir_intrinsic_instr *instr,
6230 elk_fs_reg surface,
6231 bool bindless)
6232 {
6233 const intel_device_info *devinfo = ntb.devinfo;
6234 elk_fs_visitor &s = ntb.s;
6235
6236 enum elk_lsc_opcode op = elk_lsc_aop_for_nir_intrinsic(instr);
6237 int num_data = lsc_op_num_data_values(op);
6238
6239 bool shared = surface.file == IMM && surface.ud == GFX7_BTI_SLM;
6240
6241 /* The BTI untyped atomic messages only support 32-bit atomics. If you
6242 * just look at the big table of messages in Vol 7 of the SKL PRM, QWord
6243 * atomics appear to exist. However, if you look at Vol 2a, there are no
6244 * message descriptors provided for QWord atomic ops except for A64 messages.
6245 *
6246 * 16-bit float atomics are supported, however.
6247 */
6248 assert(instr->def.bit_size == 32 ||
6249 (instr->def.bit_size == 64 && devinfo->has_lsc) ||
6250 (instr->def.bit_size == 16 &&
6251 (devinfo->has_lsc || elk_lsc_opcode_is_atomic_float(op))));
6252
6253 elk_fs_reg dest = get_nir_def(ntb, instr->def);
6254
6255 elk_fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
6256 srcs[bindless ?
6257 SURFACE_LOGICAL_SRC_SURFACE_HANDLE :
6258 SURFACE_LOGICAL_SRC_SURFACE] = surface;
6259 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = elk_imm_ud(1);
6260 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = elk_imm_ud(op);
6261 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = elk_imm_ud(1);
6262
6263 if (shared) {
6264 /* SLM - Get the offset */
6265 if (nir_src_is_const(instr->src[0])) {
6266 srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
6267 elk_imm_ud(nir_intrinsic_base(instr) +
6268 nir_src_as_uint(instr->src[0]));
6269 } else {
6270 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = s.vgrf(glsl_uint_type());
6271 bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
6272 retype(get_nir_src(ntb, instr->src[0]), ELK_REGISTER_TYPE_UD),
6273 elk_imm_ud(nir_intrinsic_base(instr)));
6274 }
6275 } else {
6276 /* SSBOs */
6277 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(ntb, instr->src[1]);
6278 }
6279
6280 elk_fs_reg data;
6281 if (num_data >= 1)
6282 data = expand_to_32bit(bld, get_nir_src(ntb, instr->src[shared ? 1 : 2]));
6283
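   /* Compare-and-swap style atomics carry two data operands; pack both
    * values into a single contiguous payload.
    */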
6284 if (num_data >= 2) {
6285 elk_fs_reg tmp = bld.vgrf(data.type, 2);
6286 elk_fs_reg sources[2] = {
6287 data,
6288 expand_to_32bit(bld, get_nir_src(ntb, instr->src[shared ? 2 : 3]))
6289 };
6290 bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
6291 data = tmp;
6292 }
6293 srcs[SURFACE_LOGICAL_SRC_DATA] = data;
6294
6295 /* Emit the actual atomic operation */
6296
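   /* The 16-bit case below receives its result in a 32-bit temporary and
    * copies only the low word back into the NIR destination.
    */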
6297 switch (instr->def.bit_size) {
6298 case 16: {
6299 elk_fs_reg dest32 = bld.vgrf(ELK_REGISTER_TYPE_UD);
6300 bld.emit(ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
6301 retype(dest32, dest.type),
6302 srcs, SURFACE_LOGICAL_NUM_SRCS);
6303 bld.MOV(retype(dest, ELK_REGISTER_TYPE_UW),
6304 retype(dest32, ELK_REGISTER_TYPE_UD));
6305 break;
6306 }
6307
6308 case 32:
6309 case 64:
6310 bld.emit(ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
6311 dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
6312 break;
6313 default:
6314 unreachable("Unsupported bit size");
6315 }
6316 }
6317
6318 static void
6319 fs_nir_emit_global_atomic(nir_to_elk_state &ntb, const fs_builder &bld,
6320 nir_intrinsic_instr *instr)
6321 {
6322 enum elk_lsc_opcode op = elk_lsc_aop_for_nir_intrinsic(instr);
6323 int num_data = lsc_op_num_data_values(op);
6324
6325 elk_fs_reg dest = get_nir_def(ntb, instr->def);
6326
6327 elk_fs_reg addr = get_nir_src(ntb, instr->src[0]);
6328
6329 elk_fs_reg data;
6330 if (num_data >= 1)
6331 data = expand_to_32bit(bld, get_nir_src(ntb, instr->src[1]));
6332
6333 if (num_data >= 2) {
6334 elk_fs_reg tmp = bld.vgrf(data.type, 2);
6335 elk_fs_reg sources[2] = {
6336 data,
6337 expand_to_32bit(bld, get_nir_src(ntb, instr->src[2]))
6338 };
6339 bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
6340 data = tmp;
6341 }
6342
6343 elk_fs_reg srcs[A64_LOGICAL_NUM_SRCS];
6344 srcs[A64_LOGICAL_ADDRESS] = addr;
6345 srcs[A64_LOGICAL_SRC] = data;
6346 srcs[A64_LOGICAL_ARG] = elk_imm_ud(op);
6347 srcs[A64_LOGICAL_ENABLE_HELPERS] = elk_imm_ud(0);
6348
6349 switch (instr->def.bit_size) {
6350 case 16: {
6351 elk_fs_reg dest32 = bld.vgrf(ELK_REGISTER_TYPE_UD);
6352 bld.emit(ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
6353 retype(dest32, dest.type),
6354 srcs, A64_LOGICAL_NUM_SRCS);
6355 bld.MOV(retype(dest, ELK_REGISTER_TYPE_UW), dest32);
6356 break;
6357 }
6358 case 32:
6359 case 64:
6360 bld.emit(ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL, dest,
6361 srcs, A64_LOGICAL_NUM_SRCS);
6362 break;
6363 default:
6364 unreachable("Unsupported bit size");
6365 }
6366 }
6367
6368 static void
6369 fs_nir_emit_texture(nir_to_elk_state &ntb,
6370 nir_tex_instr *instr)
6371 {
6372 const intel_device_info *devinfo = ntb.devinfo;
6373 const fs_builder &bld = ntb.bld;
6374 elk_fs_visitor &s = ntb.s;
6375
6376 elk_fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
6377
6378 /* SKL PRMs: Volume 7: 3D-Media-GPGPU:
6379 *
6380 * "The Pixel Null Mask field, when enabled via the Pixel Null Mask
6381 * Enable will be incorrect for sample_c when applied to a surface with
6382 * 64-bit per texel format such as R16G16BA16_UNORM. Pixel Null mask
6383 * Enable may incorrectly report pixels as referencing a Null surface."
6384 *
6385 * We'll take care of this in NIR.
6386 */
6387 assert(!instr->is_sparse || srcs[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE);
6388
6389 srcs[TEX_LOGICAL_SRC_RESIDENCY] = elk_imm_ud(instr->is_sparse);
6390
6391 int lod_components = 0;
6392
6393 /* The hardware requires a LOD for buffer textures */
6394 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
6395 srcs[TEX_LOGICAL_SRC_LOD] = elk_imm_d(0);
6396
6397 ASSERTED bool got_lod = false;
6398 ASSERTED bool got_bias = false;
6399 uint32_t header_bits = 0;
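   /* Walk the NIR texture sources and translate each one into the
    * corresponding logical-send source register or header bit.
    */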
6400 for (unsigned i = 0; i < instr->num_srcs; i++) {
6401 nir_src nir_src = instr->src[i].src;
6402 elk_fs_reg src = get_nir_src(ntb, nir_src);
6403 switch (instr->src[i].src_type) {
6404 case nir_tex_src_bias:
6405 assert(!got_lod);
6406 got_bias = true;
6407
6408 srcs[TEX_LOGICAL_SRC_LOD] =
6409 retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_F);
6410 break;
6411 case nir_tex_src_comparator:
6412 srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, ELK_REGISTER_TYPE_F);
6413 break;
6414 case nir_tex_src_coord:
6415 switch (instr->op) {
6416 case nir_texop_txf:
6417 case nir_texop_txf_ms:
6418 case nir_texop_txf_ms_mcs_intel:
6419 case nir_texop_samples_identical:
6420 srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, ELK_REGISTER_TYPE_D);
6421 break;
6422 default:
6423 srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, ELK_REGISTER_TYPE_F);
6424 break;
6425 }
6426 break;
6427 case nir_tex_src_ddx:
6428 srcs[TEX_LOGICAL_SRC_LOD] = retype(src, ELK_REGISTER_TYPE_F);
6429 lod_components = nir_tex_instr_src_size(instr, i);
6430 break;
6431 case nir_tex_src_ddy:
6432 srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, ELK_REGISTER_TYPE_F);
6433 break;
6434 case nir_tex_src_lod:
6435 assert(!got_bias);
6436 got_lod = true;
6437
6438 switch (instr->op) {
6439 case nir_texop_txs:
6440 srcs[TEX_LOGICAL_SRC_LOD] =
6441 retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_UD);
6442 break;
6443 case nir_texop_txf:
6444 srcs[TEX_LOGICAL_SRC_LOD] =
6445 retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_D);
6446 break;
6447 default:
6448 srcs[TEX_LOGICAL_SRC_LOD] =
6449 retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_F);
6450 break;
6451 }
6452 break;
6453 case nir_tex_src_min_lod:
6454 srcs[TEX_LOGICAL_SRC_MIN_LOD] =
6455 retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_F);
6456 break;
6457 case nir_tex_src_ms_index:
6458 srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, ELK_REGISTER_TYPE_UD);
6459 break;
6460
6461 case nir_tex_src_offset: {
6462 uint32_t offset_bits = 0;
6463 if (elk_texture_offset(instr, i, &offset_bits)) {
6464 header_bits |= offset_bits;
6465 } else {
6466 srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
6467 retype(src, ELK_REGISTER_TYPE_D);
6468 }
6469 break;
6470 }
6471
6472 case nir_tex_src_projector:
6473 unreachable("should be lowered");
6474
6475 case nir_tex_src_texture_offset: {
6476 assert(srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE);
6477 /* Emit code to evaluate the actual indexing expression */
6478 if (instr->texture_index == 0 && is_resource_src(nir_src))
6479 srcs[TEX_LOGICAL_SRC_SURFACE] = get_resource_nir_src(ntb, nir_src);
6480 if (srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE) {
6481 elk_fs_reg tmp = s.vgrf(glsl_uint_type());
6482 bld.ADD(tmp, src, elk_imm_ud(instr->texture_index));
6483 srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
6484 }
6485 assert(srcs[TEX_LOGICAL_SRC_SURFACE].file != BAD_FILE);
6486 break;
6487 }
6488
6489 case nir_tex_src_sampler_offset: {
6490 /* Emit code to evaluate the actual indexing expression */
6491 if (instr->sampler_index == 0 && is_resource_src(nir_src))
6492 srcs[TEX_LOGICAL_SRC_SAMPLER] = get_resource_nir_src(ntb, nir_src);
6493 if (srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE) {
6494 elk_fs_reg tmp = s.vgrf(glsl_uint_type());
6495 bld.ADD(tmp, src, elk_imm_ud(instr->sampler_index));
6496 srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
6497 }
6498 break;
6499 }
6500
6501 case nir_tex_src_texture_handle:
6502 assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1);
6503 srcs[TEX_LOGICAL_SRC_SURFACE] = elk_fs_reg();
6504 if (is_resource_src(nir_src))
6505 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = get_resource_nir_src(ntb, nir_src);
6506 if (srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE].file == BAD_FILE)
6507 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src);
6508 break;
6509
6510 case nir_tex_src_sampler_handle:
6511 assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
6512 srcs[TEX_LOGICAL_SRC_SAMPLER] = elk_fs_reg();
6513 if (is_resource_src(nir_src))
6514 srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = get_resource_nir_src(ntb, nir_src);
6515 if (srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE].file == BAD_FILE)
6516 srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src);
6517 break;
6518
6519 case nir_tex_src_ms_mcs_intel:
6520 assert(instr->op == nir_texop_txf_ms);
6521 srcs[TEX_LOGICAL_SRC_MCS] = retype(src, ELK_REGISTER_TYPE_D);
6522 break;
6523
6524 /* If this parameter is present, we are packing either the explicit LOD
6525 * or LOD bias and the array index into a single (32-bit) value when
6526 * 32-bit texture coordinates are used.
6527 */
6528 case nir_tex_src_backend1:
6529 assert(!got_lod && !got_bias);
6530 got_lod = true;
6531
6532 assert(instr->op == nir_texop_txl || instr->op == nir_texop_txb);
6533 srcs[TEX_LOGICAL_SRC_LOD] =
6534 retype(get_nir_src_imm(ntb, instr->src[i].src), ELK_REGISTER_TYPE_F);
6535 break;
6536
6537 default:
6538 unreachable("unknown texture source");
6539 }
6540 }
6541
6542 /* If the surface or sampler were not specified through sources, use the
6543 * instruction index.
6544 */
6545 if (srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE &&
6546 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE].file == BAD_FILE)
6547 srcs[TEX_LOGICAL_SRC_SURFACE] = elk_imm_ud(instr->texture_index);
6548 if (srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE &&
6549 srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE].file == BAD_FILE)
6550 srcs[TEX_LOGICAL_SRC_SAMPLER] = elk_imm_ud(instr->sampler_index);
6551
6552 if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
6553 (instr->op == nir_texop_txf_ms ||
6554 instr->op == nir_texop_samples_identical)) {
6555 if (devinfo->ver >= 7) {
6556 srcs[TEX_LOGICAL_SRC_MCS] =
6557 emit_mcs_fetch(ntb, srcs[TEX_LOGICAL_SRC_COORDINATE],
6558 instr->coord_components,
6559 srcs[TEX_LOGICAL_SRC_SURFACE],
6560 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]);
6561 } else {
6562 srcs[TEX_LOGICAL_SRC_MCS] = elk_imm_ud(0u);
6563 }
6564 }
6565
6566 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = elk_imm_d(instr->coord_components);
6567 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = elk_imm_d(lod_components);
6568
6569 enum elk_opcode opcode;
6570 switch (instr->op) {
6571 case nir_texop_tex:
6572 opcode = ELK_SHADER_OPCODE_TEX_LOGICAL;
6573 break;
6574 case nir_texop_txb:
6575 opcode = ELK_FS_OPCODE_TXB_LOGICAL;
6576 break;
6577 case nir_texop_txl:
6578 opcode = ELK_SHADER_OPCODE_TXL_LOGICAL;
6579 break;
6580 case nir_texop_txd:
6581 opcode = ELK_SHADER_OPCODE_TXD_LOGICAL;
6582 break;
6583 case nir_texop_txf:
6584 opcode = ELK_SHADER_OPCODE_TXF_LOGICAL;
6585 break;
6586 case nir_texop_txf_ms:
6587 opcode = ELK_SHADER_OPCODE_TXF_CMS_LOGICAL;
6588 break;
6589 case nir_texop_txf_ms_mcs_intel:
6590 opcode = ELK_SHADER_OPCODE_TXF_MCS_LOGICAL;
6591 break;
6592 case nir_texop_query_levels:
6593 case nir_texop_txs:
6594 opcode = ELK_SHADER_OPCODE_TXS_LOGICAL;
6595 break;
6596 case nir_texop_lod:
6597 opcode = ELK_SHADER_OPCODE_LOD_LOGICAL;
6598 break;
6599 case nir_texop_tg4:
6600 if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
6601 opcode = ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL;
6602 else
6603 opcode = ELK_SHADER_OPCODE_TG4_LOGICAL;
6604 break;
6605 case nir_texop_texture_samples:
6606 opcode = ELK_SHADER_OPCODE_SAMPLEINFO_LOGICAL;
6607 break;
6608 case nir_texop_samples_identical: {
6609 elk_fs_reg dst = retype(get_nir_def(ntb, instr->def), ELK_REGISTER_TYPE_D);
6610
6611 /* If mcs is an immediate value, it means there is no MCS. In that case
6612 * just return false.
6613 */
6614 if (srcs[TEX_LOGICAL_SRC_MCS].file == ELK_IMMEDIATE_VALUE) {
6615 bld.MOV(dst, elk_imm_ud(0u));
6616 } else {
6617 bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], elk_imm_ud(0u),
6618 ELK_CONDITIONAL_EQ);
6619 }
6620 return;
6621 }
6622 default:
6623 unreachable("unknown texture opcode");
6624 }
6625
6626 if (instr->op == nir_texop_tg4) {
6627 if (instr->component == 1 &&
6628 s.key_tex->gather_channel_quirk_mask & (1 << instr->texture_index)) {
6629 /* gather4 sampler is broken for green channel on RG32F --
6630 * we must ask for blue instead.
6631 */
6632 header_bits |= 2 << 16;
6633 } else {
6634 header_bits |= instr->component << 16;
6635 }
6636 }
6637
6638 elk_fs_reg dst = bld.vgrf(elk_type_for_nir_type(devinfo, instr->dest_type), 4 + instr->is_sparse);
6639 elk_fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
6640 inst->offset = header_bits;
6641
6642 const unsigned dest_size = nir_tex_instr_dest_size(instr);
6643 inst->size_written = 4 * inst->dst.component_size(inst->exec_size) +
6644 (instr->is_sparse ? (reg_unit(devinfo) * REG_SIZE) : 0);
6645
6646 if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
6647 inst->shadow_compare = true;
6648
6649 /* Wa_14012688258:
6650 *
6651 * Don't trim zeros at the end of payload for sample operations
6652 * in cube and cube arrays.
6653 */
6654 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
6655 intel_needs_workaround(devinfo, 14012688258)) {
6656
6657 /* Compiler should send U,V,R parameters even if V,R are 0. */
6658 if (srcs[TEX_LOGICAL_SRC_COORDINATE].file != BAD_FILE)
6659 assert(instr->coord_components >= 3u);
6660
6661 /* See opt_zero_samples(). */
6662 inst->keep_payload_trailing_zeros = true;
6663 }
6664
6665 elk_fs_reg nir_dest[5];
6666 for (unsigned i = 0; i < dest_size; i++)
6667 nir_dest[i] = offset(dst, bld, i);
6668
6669 if (instr->op == nir_texop_query_levels) {
6670 /* # levels is in .w */
6671 /**
6672 * Wa_1940217:
6673 *
6674 * When a surface of type SURFTYPE_NULL is accessed by resinfo, the
6675 * MIPCount returned is undefined instead of 0.
6676 */
6677 elk_fs_inst *mov = bld.MOV(bld.null_reg_d(), dst);
6678 mov->conditional_mod = ELK_CONDITIONAL_NZ;
6679 nir_dest[0] = bld.vgrf(ELK_REGISTER_TYPE_D);
6680 elk_fs_inst *sel = bld.SEL(nir_dest[0], offset(dst, bld, 3), elk_imm_d(0));
6681 sel->predicate = ELK_PREDICATE_NORMAL;
6682 } else if (instr->op == nir_texop_txs &&
6683 dest_size >= 3 && devinfo->ver < 7) {
6684 /* Gfx4-6 return 0 instead of 1 for single layer surfaces. */
6685 elk_fs_reg depth = offset(dst, bld, 2);
6686 nir_dest[2] = s.vgrf(glsl_int_type());
6687 bld.emit_minmax(nir_dest[2], depth, elk_imm_d(1), ELK_CONDITIONAL_GE);
6688 }
6689
6690 /* The residency bits are only in the first component. */
6691 if (instr->is_sparse)
6692 nir_dest[dest_size - 1] = component(offset(dst, bld, dest_size - 1), 0);
6693
6694 bld.LOAD_PAYLOAD(get_nir_def(ntb, instr->def), nir_dest, dest_size, 0);
6695 }
6696
6697 static void
6698 fs_nir_emit_jump(nir_to_elk_state &ntb, nir_jump_instr *instr)
6699 {
6700 switch (instr->type) {
6701 case nir_jump_break:
6702 ntb.bld.emit(ELK_OPCODE_BREAK);
6703 break;
6704 case nir_jump_continue:
6705 ntb.bld.emit(ELK_OPCODE_CONTINUE);
6706 break;
6707 case nir_jump_halt:
6708 ntb.bld.emit(ELK_OPCODE_HALT);
6709 break;
6710 case nir_jump_return:
6711 default:
6712 unreachable("unknown jump");
6713 }
6714 }
6715
6716 /*
6717 * This helper takes a source register and un/shuffles it into the destination
6718 * register.
6719 *
6720 * If the source type size is smaller than the destination type size, the
6721 * operation needed is a component shuffle. The opposite case is an unshuffle.
6722 * If the source and destination type sizes are equal, the shuffle is
6723 * equivalent to a simple MOV.
6724 *
6725 * For example, if the source is a 16-bit type and the destination is 32-bit,
6726 * a 3-component .xyz 16-bit vector on SIMD8 would be:
6727 *
6728 * |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
6729 * |z1|z2|z3|z4|z5|z6|z7|z8| | | | | | | | |
6730 *
6731 * This helper will return the following 2 32-bit components with the 16-bit
6732 * values shuffled:
6733 *
6734 * |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
6735 * |z1 |z2 |z3 |z4 |z5 |z6 |z7 |z8 |
6736 *
6737 * For unshuffle, the example would be the opposite, a 64-bit type source
6738 * and a 32-bit destination. A 2 component .xy 64-bit vector on SIMD8
6739 * would be:
6740 *
6741 * | x1l x1h | x2l x2h | x3l x3h | x4l x4h |
6742 * | x5l x5h | x6l x6h | x7l x7h | x8l x8h |
6743 * | y1l y1h | y2l y2h | y3l y3h | y4l y4h |
6744 * | y5l y5h | y6l y6h | y7l y7h | y8l y8h |
6745 *
6746 * The returned result would be the following 4 32-bit components unshuffled:
6747 *
6748 * | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
6749 * | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
6750 * | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
6751 * | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
6752 *
6753 * - The source and destination registers must not overlap.
6754 * - Component units are measured in terms of the smaller type between
6755 * source and destination, because we un/shuffle the smaller
6756 * components from/into the bigger ones.
6757 * - The first_component parameter allows skipping source components.
6758 */
6759 void
6760 elk_shuffle_src_to_dst(const fs_builder &bld,
6761 const elk_fs_reg &dst,
6762 const elk_fs_reg &src,
6763 uint32_t first_component,
6764 uint32_t components)
6765 {
6766 if (type_sz(src.type) == type_sz(dst.type)) {
6767 assert(!regions_overlap(dst,
6768 type_sz(dst.type) * bld.dispatch_width() * components,
6769 offset(src, bld, first_component),
6770 type_sz(src.type) * bld.dispatch_width() * components));
6771 for (unsigned i = 0; i < components; i++) {
6772 bld.MOV(retype(offset(dst, bld, i), src.type),
6773 offset(src, bld, i + first_component));
6774 }
6775 } else if (type_sz(src.type) < type_sz(dst.type)) {
6776 /* Source is shuffled into destination */
6777 unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
6778 assert(!regions_overlap(dst,
6779 type_sz(dst.type) * bld.dispatch_width() *
6780 DIV_ROUND_UP(components, size_ratio),
6781 offset(src, bld, first_component),
6782 type_sz(src.type) * bld.dispatch_width() * components));
6783
6784 elk_reg_type shuffle_type =
6785 elk_reg_type_from_bit_size(8 * type_sz(src.type),
6786 ELK_REGISTER_TYPE_D);
6787 for (unsigned i = 0; i < components; i++) {
6788 elk_fs_reg shuffle_component_i =
6789 subscript(offset(dst, bld, i / size_ratio),
6790 shuffle_type, i % size_ratio);
6791 bld.MOV(shuffle_component_i,
6792 retype(offset(src, bld, i + first_component), shuffle_type));
6793 }
6794 } else {
6795 /* Source is unshuffled into destination */
6796 unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
6797 assert(!regions_overlap(dst,
6798 type_sz(dst.type) * bld.dispatch_width() * components,
6799 offset(src, bld, first_component / size_ratio),
6800 type_sz(src.type) * bld.dispatch_width() *
6801 DIV_ROUND_UP(components + (first_component % size_ratio),
6802 size_ratio)));
6803
6804 elk_reg_type shuffle_type =
6805 elk_reg_type_from_bit_size(8 * type_sz(dst.type),
6806 ELK_REGISTER_TYPE_D);
6807 for (unsigned i = 0; i < components; i++) {
6808 elk_fs_reg shuffle_component_i =
6809 subscript(offset(src, bld, (first_component + i) / size_ratio),
6810 shuffle_type, (first_component + i) % size_ratio);
6811 bld.MOV(retype(offset(dst, bld, i), shuffle_type),
6812 shuffle_component_i);
6813 }
6814 }
6815 }
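
/* A minimal usage sketch (illustrative only, not invoked by this file):
 * shuffling three 16-bit components into 32-bit destination registers on the
 * current builder could look like
 *
 *    elk_fs_reg src16 = bld.vgrf(ELK_REGISTER_TYPE_UW, 3);
 *    elk_fs_reg dst32 = bld.vgrf(ELK_REGISTER_TYPE_UD, 2);
 *    elk_shuffle_src_to_dst(bld, dst32, src16, 0, 3);
 *
 * which packs .x/.y into the first 32-bit component and .z into the low half
 * of the second, matching the first diagram above.
 */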
6816
6817 void
6818 elk_shuffle_from_32bit_read(const fs_builder &bld,
6819 const elk_fs_reg &dst,
6820 const elk_fs_reg &src,
6821 uint32_t first_component,
6822 uint32_t components)
6823 {
6824 assert(type_sz(src.type) == 4);
6825
6826 /* This function takes components in units of the destination type while
6827 * elk_shuffle_src_to_dst takes components in units of the smaller type.
6828 */
6829 if (type_sz(dst.type) > 4) {
6830 assert(type_sz(dst.type) == 8);
6831 first_component *= 2;
6832 components *= 2;
6833 }
6834
6835 elk_shuffle_src_to_dst(bld, dst, src, first_component, components);
6836 }
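
/* Illustrative example (an assumption, not taken from this file): unpacking a
 * two-component 64-bit destination from a 32-bit read doubles the unit count,
 * so
 *
 *    elk_fs_reg read32 = bld.vgrf(ELK_REGISTER_TYPE_UD, 4);
 *    elk_fs_reg dst64  = bld.vgrf(ELK_REGISTER_TYPE_DF, 2);
 *    elk_shuffle_from_32bit_read(bld, dst64, read32, 0, 2);
 *
 * ends up calling elk_shuffle_src_to_dst() with first_component = 0 and
 * components = 4.
 */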
6837
6838 elk_fs_reg
6839 elk_setup_imm_df(const fs_builder &bld, double v)
6840 {
6841 const struct intel_device_info *devinfo = bld.shader->devinfo;
6842 assert(devinfo->ver >= 7);
6843
6844 if (devinfo->ver >= 8)
6845 return elk_imm_df(v);
6846
6847 /* gfx7.5 does not support DF immediates directly, but the DIM
6848 * instruction allows setting a 64-bit immediate value.
6849 */
6850 if (devinfo->platform == INTEL_PLATFORM_HSW) {
6851 const fs_builder ubld = bld.exec_all().group(1, 0);
6852 elk_fs_reg dst = ubld.vgrf(ELK_REGISTER_TYPE_DF, 1);
6853 ubld.DIM(dst, elk_imm_df(v));
6854 return component(dst, 0);
6855 }
6856
6857 /* gfx7 does not support DF immediates, so we generate a 64-bit constant by
6858 * writing the low 32 bits of the constant to suboffset 0 of a VGRF and
6859 * the high 32 bits to suboffset 4, and then applying a stride of 0.
6860 *
6861 * Alternatively, we could produce a normal VGRF (without stride 0) by
6862 * writing to all the channels of the VGRF. However, that would hit the
6863 * gfx7 bug where we have to split writes that span more than one register
6864 * into instructions with a width of 4 (otherwise the write to the second
6865 * register runs into an execmask hardware bug), which isn't very
6866 * nice.
6867 */
6868 union {
6869 double d;
6870 struct {
6871 uint32_t i1;
6872 uint32_t i2;
6873 };
6874 } di;
6875
6876 di.d = v;
6877
6878 const fs_builder ubld = bld.exec_all().group(1, 0);
6879 const elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD, 2);
6880 ubld.MOV(tmp, elk_imm_ud(di.i1));
6881 ubld.MOV(horiz_offset(tmp, 1), elk_imm_ud(di.i2));
6882
6883 return component(retype(tmp, ELK_REGISTER_TYPE_DF), 0);
6884 }
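
/* Usage sketch (illustrative): a caller needing a double-precision constant
 * would write something like
 *
 *    elk_fs_reg one = elk_setup_imm_df(bld, 1.0);
 *
 * which is a plain elk_imm_df(1.0) on gfx8+, a DIM-initialized scalar on
 * Haswell, and, on gfx7, component 0 of a VGRF filled by two dword MOVs and
 * read with a stride of 0.
 */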
6885
6886 elk_fs_reg
6887 elk_setup_imm_b(const fs_builder &bld, int8_t v)
6888 {
6889 const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_B);
6890 bld.MOV(tmp, elk_imm_w(v));
6891 return tmp;
6892 }
6893
6894 elk_fs_reg
6895 elk_setup_imm_ub(const fs_builder &bld, uint8_t v)
6896 {
6897 const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UB);
6898 bld.MOV(tmp, elk_imm_uw(v));
6899 return tmp;
6900 }
6901
6902 static void
6903 fs_nir_emit_instr(nir_to_elk_state &ntb, nir_instr *instr)
6904 {
6905 ntb.bld = ntb.bld.annotate(NULL, instr);
6906
6907 switch (instr->type) {
6908 case nir_instr_type_alu:
6909 fs_nir_emit_alu(ntb, nir_instr_as_alu(instr), true);
6910 break;
6911
6912 case nir_instr_type_deref:
6913 unreachable("All derefs should've been lowered");
6914 break;
6915
6916 case nir_instr_type_intrinsic:
6917 switch (ntb.s.stage) {
6918 case MESA_SHADER_VERTEX:
6919 fs_nir_emit_vs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6920 break;
6921 case MESA_SHADER_TESS_CTRL:
6922 fs_nir_emit_tcs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6923 break;
6924 case MESA_SHADER_TESS_EVAL:
6925 fs_nir_emit_tes_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6926 break;
6927 case MESA_SHADER_GEOMETRY:
6928 fs_nir_emit_gs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6929 break;
6930 case MESA_SHADER_FRAGMENT:
6931 fs_nir_emit_fs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6932 break;
6933 case MESA_SHADER_COMPUTE:
6934 fs_nir_emit_cs_intrinsic(ntb, nir_instr_as_intrinsic(instr));
6935 break;
6936 default:
6937 unreachable("unsupported shader stage");
6938 }
6939 break;
6940
6941 case nir_instr_type_tex:
6942 fs_nir_emit_texture(ntb, nir_instr_as_tex(instr));
6943 break;
6944
6945 case nir_instr_type_load_const:
6946 fs_nir_emit_load_const(ntb, nir_instr_as_load_const(instr));
6947 break;
6948
6949 case nir_instr_type_undef:
6950 /* We create a new VGRF for undefs on every use (by handling
6951 * them in get_nir_src()), rather than for each definition.
6952 * This helps register coalescing eliminate MOVs from undef.
6953 */
6954 break;
6955
6956 case nir_instr_type_jump:
6957 fs_nir_emit_jump(ntb, nir_instr_as_jump(instr));
6958 break;
6959
6960 default:
6961 unreachable("unknown instruction type");
6962 }
6963 }
6964
6965 static unsigned
6966 elk_rnd_mode_from_nir(unsigned mode, unsigned *mask)
6967 {
6968 unsigned elk_mode = 0;
6969 *mask = 0;
6970
6971 if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
6972 FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
6973 FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
6974 mode) {
6975 elk_mode |= ELK_RND_MODE_RTZ << ELK_CR0_RND_MODE_SHIFT;
6976 *mask |= ELK_CR0_RND_MODE_MASK;
6977 }
6978 if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
6979 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
6980 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
6981 mode) {
6982 elk_mode |= ELK_RND_MODE_RTNE << ELK_CR0_RND_MODE_SHIFT;
6983 *mask |= ELK_CR0_RND_MODE_MASK;
6984 }
6985 if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) {
6986 elk_mode |= ELK_CR0_FP16_DENORM_PRESERVE;
6987 *mask |= ELK_CR0_FP16_DENORM_PRESERVE;
6988 }
6989 if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) {
6990 elk_mode |= ELK_CR0_FP32_DENORM_PRESERVE;
6991 *mask |= ELK_CR0_FP32_DENORM_PRESERVE;
6992 }
6993 if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) {
6994 elk_mode |= ELK_CR0_FP64_DENORM_PRESERVE;
6995 *mask |= ELK_CR0_FP64_DENORM_PRESERVE;
6996 }
6997 if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16)
6998 *mask |= ELK_CR0_FP16_DENORM_PRESERVE;
6999 if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32)
7000 *mask |= ELK_CR0_FP32_DENORM_PRESERVE;
7001 if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64)
7002 *mask |= ELK_CR0_FP64_DENORM_PRESERVE;
7003 if (mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE)
7004 *mask |= ELK_CR0_FP_MODE_MASK;
7005
7006 if (*mask != 0)
7007 assert((*mask & elk_mode) == elk_mode);
7008
7009 return elk_mode;
7010 }
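
/* Worked example (illustrative, assuming the usual CR0 bit definitions): a
 * float-controls mode requesting RTZ rounding for fp32 and flush-to-zero
 * denorms for fp16 would translate as
 *
 *    unsigned mask;
 *    unsigned mode = elk_rnd_mode_from_nir(
 *       FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
 *       FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16, &mask);
 *    // mode == ELK_RND_MODE_RTZ << ELK_CR0_RND_MODE_SHIFT
 *    // mask == ELK_CR0_RND_MODE_MASK | ELK_CR0_FP16_DENORM_PRESERVE
 *
 * i.e. the preserve bit is included in the mask but left clear in the mode,
 * so the FLOAT_CONTROL_MODE opcode flushes fp16 denorms.
 */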
7011
7012 static void
7013 emit_shader_float_controls_execution_mode(nir_to_elk_state &ntb)
7014 {
7015 const fs_builder &bld = ntb.bld;
7016 elk_fs_visitor &s = ntb.s;
7017
7018 unsigned execution_mode = s.nir->info.float_controls_execution_mode;
7019 if (execution_mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE)
7020 return;
7021
7022 fs_builder ubld = bld.exec_all().group(1, 0);
7023 fs_builder abld = ubld.annotate("shader float controls execution mode");
7024 unsigned mask, mode = elk_rnd_mode_from_nir(execution_mode, &mask);
7025
7026 if (mask == 0)
7027 return;
7028
7029 abld.emit(ELK_SHADER_OPCODE_FLOAT_CONTROL_MODE, bld.null_reg_ud(),
7030 elk_imm_d(mode), elk_imm_d(mask));
7031 }
7032
7033 void
7034 nir_to_elk(elk_fs_visitor *s)
7035 {
7036 nir_to_elk_state ntb = {
7037 .s = *s,
7038 .nir = s->nir,
7039 .devinfo = s->devinfo,
7040 .mem_ctx = ralloc_context(NULL),
7041 .bld = fs_builder(s).at_end(),
7042 };
7043
7044 emit_shader_float_controls_execution_mode(ntb);
7045
7046 /* emit the arrays used for inputs and outputs - load/store intrinsics will
7047 * be converted to reads/writes of these arrays
7048 */
7049 fs_nir_setup_outputs(ntb);
7050 fs_nir_setup_uniforms(ntb.s);
7051 fs_nir_emit_system_values(ntb);
7052 ntb.s.last_scratch = ALIGN(ntb.nir->scratch_size, 4) * ntb.s.dispatch_width;
7053
7054 fs_nir_emit_impl(ntb, nir_shader_get_entrypoint((nir_shader *)ntb.nir));
7055
7056 ntb.bld.emit(ELK_SHADER_OPCODE_HALT_TARGET);
7057
7058 ralloc_free(ntb.mem_ctx);
7059 }
7060
7061