/*
 * Copyright © 2014-2015 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "compiler/nir/nir.h"
#include "compiler/nir/nir_deref.h"
#include "nir/nir_to_tgsi.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_from_mesa.h"
#include "tgsi/tgsi_info.h"
#include "tgsi/tgsi_ureg.h"
#include "util/debug.h"
#include "util/u_math.h"
#include "util/u_memory.h"

struct ntt_compile {
   nir_shader *s;
   nir_function_impl *impl;
   struct pipe_screen *screen;
   struct ureg_program *ureg;

   bool needs_texcoord_semantic;
   bool any_reg_as_address;
   bool native_integers;
   bool has_txf_lz;

   int next_addr_reg;
   bool addr_declared[2];
   struct ureg_dst addr_reg[2];

   /* if condition set up at the end of a block, for ntt_emit_if(). */
   struct ureg_src if_cond;

   /* TGSI temps for our NIR SSA and register values. */
   struct ureg_dst *reg_temp;
   struct ureg_src *ssa_temp;

   nir_instr_liveness *liveness;

   /* Mappings from driver_location to TGSI input/output number.
    *
    * We'll be declaring TGSI input/outputs in an arbitrary order, and they get
    * their numbers assigned incrementally, unlike inputs or constants.
    */
   struct ureg_src *input_index_map;
   uint64_t centroid_inputs;

   uint32_t first_ubo;

   struct ureg_src images[PIPE_MAX_SHADER_IMAGES];
};

static void ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list);

/**
 * Interprets a nir_load_const used as a NIR src as a uint.
 *
 * For non-native-integer drivers, nir_load_const_instrs used by an integer ALU
 * instruction (or in a phi-web used by an integer ALU instruction) were
 * converted to floats and the ALU instruction swapped to the float equivalent.
 * However, this means that integer load_consts used by intrinsics (which don't
 * normally get that conversion) may have been reformatted to be floats. Given
 * that all of our intrinsic nir_src_as_uint() calls are expected to be small,
 * we can just look and see if they look like floats and convert them back to
 * ints.
 */
static uint32_t
ntt_src_as_uint(struct ntt_compile *c, nir_src src)
{
   uint32_t val = nir_src_as_uint(src);
   if (!c->native_integers && val >= fui(1.0))
      val = (uint32_t)uif(val);
   return val;
}

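/* Expands a 32-bit-style writemask to a 64-bit one: each 64-bit value takes
 * two 32-bit TGSI channels, so .x maps to .xy and .y maps to .zw.
 */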
static unsigned
ntt_64bit_write_mask(unsigned write_mask)
{
   return ((write_mask & 1) ? 0x3 : 0) | ((write_mask & 2) ? 0xc : 0);
}

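/* Immediate holding 1.0 as a double: 0x3ff0000000000000 split into
 * (low, high) dwords, replicated into both 64-bit channels.
 */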
static struct ureg_src
ntt_64bit_1f(struct ntt_compile *c)
{
   return ureg_imm4u(c->ureg,
                     0x00000000, 0x3ff00000,
                     0x00000000, 0x3ff00000);
}

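/* Returns the per-vertex type of an input for stages where inputs are
 * arrayed over vertices (GS and tessellation stages).
 */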
static const struct glsl_type *
ntt_shader_input_type(struct ntt_compile *c,
                      struct nir_variable *var)
{
   switch (c->s->info.stage) {
   case MESA_SHADER_GEOMETRY:
   case MESA_SHADER_TESS_EVAL:
   case MESA_SHADER_TESS_CTRL:
      if (glsl_type_is_array(var->type))
         return glsl_get_array_element(var->type);
      else
         return var->type;
   default:
      return var->type;
   }
}

static void
ntt_get_gl_varying_semantic(struct ntt_compile *c, unsigned location,
                            unsigned *semantic_name, unsigned *semantic_index)
{
   /* We want to use most of tgsi_get_gl_varying_semantic(), but the
    * !texcoord shifting has already been applied, so avoid that.
    */
   if (!c->needs_texcoord_semantic &&
       (location >= VARYING_SLOT_VAR0 && location < VARYING_SLOT_PATCH0)) {
      *semantic_name = TGSI_SEMANTIC_GENERIC;
      *semantic_index = location - VARYING_SLOT_VAR0;
      return;
   }

   tgsi_get_gl_varying_semantic(location, true,
                                semantic_name, semantic_index);
}

/* TGSI varying declarations have a component usage mask associated (used by
 * r600 and svga).
 */
static uint32_t
ntt_tgsi_usage_mask(unsigned start_component, unsigned num_components,
                    bool is_64)
{
   uint32_t usage_mask =
      u_bit_consecutive(start_component, num_components);

   if (is_64) {
      if (start_component >= 2)
         usage_mask >>= 2;

      uint32_t tgsi_usage_mask = 0;

      if (usage_mask & TGSI_WRITEMASK_X)
         tgsi_usage_mask |= TGSI_WRITEMASK_XY;
      if (usage_mask & TGSI_WRITEMASK_Y)
         tgsi_usage_mask |= TGSI_WRITEMASK_ZW;

      return tgsi_usage_mask;
   } else {
      return usage_mask;
   }
}

/* TGSI varying declarations have a component usage mask associated (used by
 * r600 and svga).
 */
static uint32_t
ntt_tgsi_var_usage_mask(const struct nir_variable *var)
{
   const struct glsl_type *type_without_array =
      glsl_without_array(var->type);
   unsigned num_components = glsl_get_vector_elements(type_without_array);
   if (num_components == 0) /* structs */
      num_components = 4;

   return ntt_tgsi_usage_mask(var->data.location_frac, num_components,
                              glsl_type_is_64bit(type_without_array));
}

static struct ureg_dst
ntt_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_t *frac)
{
   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
   int base = nir_intrinsic_base(instr);
   *frac = nir_intrinsic_component(instr);
   bool is_64 = nir_src_bit_size(instr->src[0]) == 64;

   struct ureg_dst out;
   if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
      unsigned semantic_name, semantic_index;
      tgsi_get_gl_frag_result_semantic(semantics.location,
                                       &semantic_name, &semantic_index);
      semantic_index += semantics.dual_source_blend_index;

      switch (semantics.location) {
      case FRAG_RESULT_DEPTH:
         *frac = 2; /* the z write goes to the .z channel in TGSI */
         break;
      case FRAG_RESULT_STENCIL:
         *frac = 1;
         break;
      default:
         break;
      }

      out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
   } else {
      unsigned semantic_name, semantic_index;

      ntt_get_gl_varying_semantic(c, semantics.location,
                                  &semantic_name, &semantic_index);

      uint32_t usage_mask = ntt_tgsi_usage_mask(*frac,
                                                instr->num_components,
                                                is_64);
      uint32_t gs_streams = semantics.gs_streams;
      for (int i = 0; i < 4; i++) {
         if (!(usage_mask & (1 << i)))
            gs_streams &= ~(0x3 << 2 * i);
      }

      /* No driver appears to use array_id of outputs. */
      unsigned array_id = 0;

      /* This bit is lost in the i/o semantics, but it's unused in in-tree
       * drivers.
       */
      bool invariant = false;

      out = ureg_DECL_output_layout(c->ureg,
                                    semantic_name, semantic_index,
                                    gs_streams,
                                    base,
                                    usage_mask,
                                    array_id,
                                    semantics.num_slots,
                                    invariant);
   }

   unsigned write_mask;
   if (nir_intrinsic_has_write_mask(instr))
      write_mask = nir_intrinsic_write_mask(instr);
   else
      write_mask = ((1 << instr->num_components) - 1) << *frac;

   if (is_64) {
      write_mask = ntt_64bit_write_mask(write_mask);
      if (*frac >= 2)
         write_mask = write_mask << 2;
   } else {
      write_mask = write_mask << *frac;
   }
   return ureg_writemask(out, write_mask);
}

/* If this reg or SSA def is used only for storing an output, then in the simple
 * cases we can write directly to the TGSI output instead of having store_output
 * emit its own MOV.
 */
static bool
ntt_try_store_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
                             struct list_head *uses, struct list_head *if_uses)
{
   *dst = ureg_dst_undef();

   switch (c->s->info.stage) {
   case MESA_SHADER_FRAGMENT:
   case MESA_SHADER_VERTEX:
      break;
   default:
      /* tgsi_exec (at least) requires that output stores happen per vertex
       * emitted, you don't get to reuse a previous output value for the next
       * vertex.
       */
      return false;
   }

   if (!list_is_empty(if_uses) || !list_is_singular(uses))
      return false;

   nir_src *src = list_first_entry(uses, nir_src, use_link);

   if (src->parent_instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(src->parent_instr);
   if (intr->intrinsic != nir_intrinsic_store_output ||
       !nir_src_is_const(intr->src[1])) {
      return false;
   }

   uint32_t frac;
   *dst = ntt_output_decl(c, intr, &frac);
   dst->Index += ntt_src_as_uint(c, intr->src[1]);

   return frac == 0;
}

static void
ntt_setup_inputs(struct ntt_compile *c)
{
   if (c->s->info.stage != MESA_SHADER_FRAGMENT)
      return;

   unsigned num_inputs = 0;
   int num_input_arrays = 0;

   nir_foreach_shader_in_variable(var, c->s) {
      const struct glsl_type *type = ntt_shader_input_type(c, var);
      unsigned array_len =
         glsl_count_attribute_slots(type, false);

      num_inputs = MAX2(num_inputs, var->data.driver_location + array_len);
   }

   c->input_index_map = ralloc_array(c, struct ureg_src, num_inputs);

   nir_foreach_shader_in_variable(var, c->s) {
      const struct glsl_type *type = ntt_shader_input_type(c, var);
      unsigned array_len =
         glsl_count_attribute_slots(type, false);

      unsigned interpolation = TGSI_INTERPOLATE_CONSTANT;
      unsigned sample_loc;
      struct ureg_src decl;

      if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
         interpolation =
            tgsi_get_interp_mode(var->data.interpolation,
                                 var->data.location == VARYING_SLOT_COL0 ||
                                 var->data.location == VARYING_SLOT_COL1);

         if (var->data.location == VARYING_SLOT_POS)
            interpolation = TGSI_INTERPOLATE_LINEAR;
      }

      unsigned semantic_name, semantic_index;
      ntt_get_gl_varying_semantic(c, var->data.location,
                                  &semantic_name, &semantic_index);

      if (var->data.sample) {
         sample_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
      } else if (var->data.centroid) {
         sample_loc = TGSI_INTERPOLATE_LOC_CENTROID;
         c->centroid_inputs |= (BITSET_MASK(array_len) <<
                                var->data.driver_location);
      } else {
         sample_loc = TGSI_INTERPOLATE_LOC_CENTER;
      }

      unsigned array_id = 0;
      if (glsl_type_is_array(type))
         array_id = ++num_input_arrays;

      uint32_t usage_mask = ntt_tgsi_var_usage_mask(var);

      decl = ureg_DECL_fs_input_centroid_layout(c->ureg,
                                                semantic_name,
                                                semantic_index,
                                                interpolation,
                                                sample_loc,
                                                var->data.driver_location,
                                                usage_mask,
                                                array_id, array_len);

      if (semantic_name == TGSI_SEMANTIC_FACE) {
         struct ureg_dst temp = ureg_DECL_temporary(c->ureg);
         /* NIR is ~0 front and 0 back, while TGSI is +1 front */
         ureg_SGE(c->ureg, temp, decl, ureg_imm1f(c->ureg, 0));
         decl = ureg_src(temp);
      }

      for (unsigned i = 0; i < array_len; i++) {
         c->input_index_map[var->data.driver_location + i] = decl;
         c->input_index_map[var->data.driver_location + i].Index += i;
      }
   }
}

static int
ntt_sort_by_location(const nir_variable *a, const nir_variable *b)
{
   return a->data.location - b->data.location;
}

/**
 * Workaround for virglrenderer requiring that TGSI FS output color variables
 * are declared in order. Besides, it's a lot nicer to read the TGSI this way.
 */
static void
ntt_setup_outputs(struct ntt_compile *c)
{
   if (c->s->info.stage != MESA_SHADER_FRAGMENT)
      return;

   nir_sort_variables_with_modes(c->s, ntt_sort_by_location, nir_var_shader_out);

   nir_foreach_shader_out_variable(var, c->s) {
      if (var->data.location == FRAG_RESULT_COLOR)
         ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);

      unsigned semantic_name, semantic_index;
      tgsi_get_gl_frag_result_semantic(var->data.location,
                                       &semantic_name, &semantic_index);

      (void)ureg_DECL_output(c->ureg, semantic_name, semantic_index);
   }
}

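/* Maps a GLSL sampler dim (plus array/shadow-ness) to the TGSI texture
 * target enum.
 */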
static enum tgsi_texture_type
tgsi_texture_type_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array, bool is_shadow)
{
   switch (dim) {
   case GLSL_SAMPLER_DIM_1D:
      if (is_shadow)
         return is_array ? TGSI_TEXTURE_SHADOW1D_ARRAY : TGSI_TEXTURE_SHADOW1D;
      else
         return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D;
   case GLSL_SAMPLER_DIM_2D:
   case GLSL_SAMPLER_DIM_EXTERNAL:
      if (is_shadow)
         return is_array ? TGSI_TEXTURE_SHADOW2D_ARRAY : TGSI_TEXTURE_SHADOW2D;
      else
         return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D;
   case GLSL_SAMPLER_DIM_3D:
      return TGSI_TEXTURE_3D;
   case GLSL_SAMPLER_DIM_CUBE:
      if (is_shadow)
         return is_array ? TGSI_TEXTURE_SHADOWCUBE_ARRAY : TGSI_TEXTURE_SHADOWCUBE;
      else
         return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE;
   case GLSL_SAMPLER_DIM_RECT:
      if (is_shadow)
         return TGSI_TEXTURE_SHADOWRECT;
      else
         return TGSI_TEXTURE_RECT;
   case GLSL_SAMPLER_DIM_MS:
      return is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA;
   case GLSL_SAMPLER_DIM_BUF:
      return TGSI_TEXTURE_BUFFER;
   default:
      unreachable("unknown sampler dim");
   }
}

static enum tgsi_return_type
tgsi_return_type_from_base_type(enum glsl_base_type type)
{
   switch (type) {
   case GLSL_TYPE_INT:
      return TGSI_RETURN_TYPE_SINT;
   case GLSL_TYPE_UINT:
      return TGSI_RETURN_TYPE_UINT;
   case GLSL_TYPE_FLOAT:
      return TGSI_RETURN_TYPE_FLOAT;
   default:
      unreachable("unexpected texture type");
   }
}

static void
ntt_setup_uniforms(struct ntt_compile *c)
{
   nir_foreach_uniform_variable(var, c->s) {
      int image_count = glsl_type_get_image_count(var->type);

      if (glsl_type_is_sampler(glsl_without_array(var->type))) {
         /* Don't use this size for the check for samplers -- arrays of structs
          * containing samplers should be ignored, and just the separate lowered
          * sampler uniform decl used.
          */
         int size = glsl_type_get_sampler_count(var->type);

         const struct glsl_type *stype = glsl_without_array(var->type);
         enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(stype),
                                                                            glsl_sampler_type_is_array(stype),
                                                                            glsl_sampler_type_is_shadow(stype));
         enum tgsi_return_type ret_type = tgsi_return_type_from_base_type(glsl_get_sampler_result_type(stype));
         for (int i = 0; i < size; i++) {
            ureg_DECL_sampler_view(c->ureg, var->data.binding + i,
                                   target, ret_type, ret_type, ret_type, ret_type);
            ureg_DECL_sampler(c->ureg, var->data.binding + i);
         }
      } else if (image_count) {
         const struct glsl_type *itype = glsl_without_array(var->type);
         enum tgsi_texture_type tex_type =
            tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(itype),
                                               glsl_sampler_type_is_array(itype), false);

         for (int i = 0; i < image_count; i++) {
            c->images[var->data.binding] = ureg_DECL_image(c->ureg,
                                                           var->data.binding + i,
                                                           tex_type,
                                                           var->data.image.format,
                                                           !(var->data.access & ACCESS_NON_WRITEABLE),
                                                           false);
         }
      } else if (glsl_contains_atomic(var->type)) {
         uint32_t offset = var->data.offset / 4;
         uint32_t size = glsl_atomic_size(var->type) / 4;
         ureg_DECL_hw_atomic(c->ureg, offset, offset + size - 1, var->data.binding, 0);
      }

      /* lower_uniforms_to_ubo lowered non-sampler uniforms to UBOs, so CB0
       * size declaration happens with other UBOs below.
       */
   }

   c->first_ubo = ~0;

   unsigned ubo_sizes[PIPE_MAX_CONSTANT_BUFFERS] = {0};
   nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ubo) {
      int ubo = var->data.driver_location;
      if (ubo == -1)
         continue;

      if (!(ubo == 0 && c->s->info.first_ubo_is_default_ubo))
         c->first_ubo = MIN2(c->first_ubo, ubo);

      unsigned size = glsl_get_explicit_size(var->interface_type, false);

      int array_size = 1;
      if (glsl_type_is_interface(glsl_without_array(var->type)))
         array_size = MAX2(1, glsl_array_size(var->type));
      for (int i = 0; i < array_size; i++) {
         /* Even if multiple NIR variables are in the same uniform block, their
          * explicit size is the size of the block.
          */
         if (ubo_sizes[ubo + i])
            assert(ubo_sizes[ubo + i] == size);

         ubo_sizes[ubo + i] = size;
      }
   }

   for (int i = 0; i < ARRAY_SIZE(ubo_sizes); i++) {
      if (ubo_sizes[i])
         ureg_DECL_constant2D(c->ureg, 0, DIV_ROUND_UP(ubo_sizes[i], 16) - 1, i);
   }

   for (int i = 0; i < c->s->info.num_ssbos; i++) {
      /* XXX: nv50 uses the atomic flag to set caching for (lowered) atomic
       * counters
       */
      bool atomic = false;
      ureg_DECL_buffer(c->ureg, i, atomic);
   }
}

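/* Declares TGSI temporaries (plain or array) for the NIR registers of the
 * function, storing straight to a TGSI output when the register is only
 * used for an output store.
 */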
static void
ntt_setup_registers(struct ntt_compile *c, struct exec_list *list)
{
   foreach_list_typed(nir_register, nir_reg, node, list) {
      struct ureg_dst decl;
      if (nir_reg->num_array_elems == 0) {
         uint32_t write_mask = BITFIELD_MASK(nir_reg->num_components);
         if (!ntt_try_store_in_tgsi_output(c, &decl, &nir_reg->uses, &nir_reg->if_uses)) {
            if (nir_reg->bit_size == 64) {
               if (nir_reg->num_components > 2) {
                  fprintf(stderr, "NIR-to-TGSI: error: %d-component NIR r%d\n",
                          nir_reg->num_components, nir_reg->index);
               }

               write_mask = ntt_64bit_write_mask(write_mask);
            }

            decl = ureg_writemask(ureg_DECL_temporary(c->ureg), write_mask);
         }
      } else {
         decl = ureg_DECL_array_temporary(c->ureg, nir_reg->num_array_elems,
                                          true);
      }
      c->reg_temp[nir_reg->index] = decl;
   }
}

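/* Builds a ureg immediate for a NIR load_const: floats for non-native-integer
 * drivers, and 64-bit values split into (low, high) dword pairs.
 */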
static struct ureg_src
ntt_get_load_const_src(struct ntt_compile *c, nir_load_const_instr *instr)
{
   int num_components = instr->def.num_components;

   if (!c->native_integers) {
      float values[4];
      assert(instr->def.bit_size == 32);
      for (int i = 0; i < num_components; i++)
         values[i] = uif(instr->value[i].u32);

      return ureg_DECL_immediate(c->ureg, values, num_components);
   } else {
      uint32_t values[4];

      if (instr->def.bit_size == 32) {
         for (int i = 0; i < num_components; i++)
            values[i] = instr->value[i].u32;
      } else {
         assert(num_components <= 2);
         for (int i = 0; i < num_components; i++) {
            values[i * 2 + 0] = instr->value[i].u64 & 0xffffffff;
            values[i * 2 + 1] = instr->value[i].u64 >> 32;
         }
         num_components *= 2;
      }

      return ureg_DECL_immediate_uint(c->ureg, values, num_components);
   }
}

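/* Loads the value into an ADDR register (or uses it directly on
 * any_reg_as_address drivers) for indirect addressing. The two ADDR slots are
 * handed out and returned refcount-style via next_addr_reg.
 */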
static struct ureg_src
ntt_reladdr(struct ntt_compile *c, struct ureg_src addr)
{
   if (c->any_reg_as_address) {
      /* Make sure we're getting the refcounting right even on any_reg
       * drivers.
       */
      c->next_addr_reg++;

      return ureg_scalar(addr, 0);
   }

   assert(c->next_addr_reg < ARRAY_SIZE(c->addr_reg));

   if (!c->addr_declared[c->next_addr_reg]) {
      c->addr_reg[c->next_addr_reg] = ureg_writemask(ureg_DECL_address(c->ureg),
                                                     TGSI_WRITEMASK_X);
      c->addr_declared[c->next_addr_reg] = true;
   }

   if (c->native_integers)
      ureg_UARL(c->ureg, c->addr_reg[c->next_addr_reg], addr);
   else
      ureg_ARL(c->ureg, c->addr_reg[c->next_addr_reg], addr);
   return ureg_scalar(ureg_src(c->addr_reg[c->next_addr_reg++]), 0);
}

static void
ntt_put_reladdr(struct ntt_compile *c)
{
   c->next_addr_reg--;
   assert(c->next_addr_reg >= 0);
}

static void
ntt_reladdr_dst_put(struct ntt_compile *c, struct ureg_dst dst)
{
   if (c->any_reg_as_address)
      return;

   if (dst.Indirect)
      ntt_put_reladdr(c);
   if (dst.DimIndirect)
      ntt_put_reladdr(c);
}

static struct ureg_src
ntt_get_src(struct ntt_compile *c, nir_src src)
{
   if (src.is_ssa) {
      if (src.ssa->parent_instr->type == nir_instr_type_load_const)
         return ntt_get_load_const_src(c, nir_instr_as_load_const(src.ssa->parent_instr));

      return c->ssa_temp[src.ssa->index];
   } else {
      nir_register *reg = src.reg.reg;
      struct ureg_dst reg_temp = c->reg_temp[reg->index];
      reg_temp.Index += src.reg.base_offset;

      if (src.reg.indirect) {
         struct ureg_src offset = ntt_get_src(c, *src.reg.indirect);
         return ureg_src_indirect(ureg_src(reg_temp),
                                  ntt_reladdr(c, offset));
      } else {
         return ureg_src(reg_temp);
      }
   }
}

static struct ureg_src
ntt_get_alu_src(struct ntt_compile *c, nir_alu_instr *instr, int i)
{
   nir_alu_src src = instr->src[i];
   struct ureg_src usrc = ntt_get_src(c, src.src);

   if (nir_src_bit_size(src.src) == 64) {
      int chan0 = 0, chan1 = 1;
      if (nir_op_infos[instr->op].input_sizes[i] == 0) {
         chan0 = ffs(instr->dest.write_mask) - 1;
         chan1 = ffs(instr->dest.write_mask & ~(1 << chan0)) - 1;
         if (chan1 == -1)
            chan1 = chan0;
      }
      usrc = ureg_swizzle(usrc,
                          src.swizzle[chan0] * 2,
                          src.swizzle[chan0] * 2 + 1,
                          src.swizzle[chan1] * 2,
                          src.swizzle[chan1] * 2 + 1);
   } else {
      usrc = ureg_swizzle(usrc,
                          src.swizzle[0],
                          src.swizzle[1],
                          src.swizzle[2],
                          src.swizzle[3]);
   }

   if (src.abs)
      usrc = ureg_abs(usrc);
   if (src.negate)
      usrc = ureg_negate(usrc);

   return usrc;
}

/* Reswizzles a source so that the unset channels in the write mask still refer
 * to one of the channels present in the write mask.
 */
static struct ureg_src
ntt_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask)
{
   assert(write_mask);
   int first_chan = ffs(write_mask) - 1;
   return ureg_swizzle(src,
                       (write_mask & TGSI_WRITEMASK_X) ? TGSI_SWIZZLE_X : first_chan,
                       (write_mask & TGSI_WRITEMASK_Y) ? TGSI_SWIZZLE_Y : first_chan,
                       (write_mask & TGSI_WRITEMASK_Z) ? TGSI_SWIZZLE_Z : first_chan,
                       (write_mask & TGSI_WRITEMASK_W) ? TGSI_SWIZZLE_W : first_chan);
}

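/* Declares the TGSI storage for an SSA def (a temporary, or the TGSI output
 * itself when ntt_try_store_in_tgsi_output() succeeds) and records the
 * swizzled ureg_src that later readers of the def will use.
 */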
static struct ureg_dst
ntt_get_ssa_def_decl(struct ntt_compile *c, nir_ssa_def *ssa)
{
   uint32_t writemask = BITSET_MASK(ssa->num_components);
   if (ssa->bit_size == 64)
      writemask = ntt_64bit_write_mask(writemask);

   struct ureg_dst dst;
   if (!ntt_try_store_in_tgsi_output(c, &dst, &ssa->uses, &ssa->if_uses))
      dst = ureg_DECL_temporary(c->ureg);

   c->ssa_temp[ssa->index] = ntt_swizzle_for_write_mask(ureg_src(dst), writemask);

   return ureg_writemask(dst, writemask);
}

static struct ureg_dst
ntt_get_dest_decl(struct ntt_compile *c, nir_dest *dest)
{
   if (dest->is_ssa)
      return ntt_get_ssa_def_decl(c, &dest->ssa);
   else
      return c->reg_temp[dest->reg.reg->index];
}

static struct ureg_dst
ntt_get_dest(struct ntt_compile *c, nir_dest *dest)
{
   struct ureg_dst dst = ntt_get_dest_decl(c, dest);

   if (!dest->is_ssa) {
      dst.Index += dest->reg.base_offset;

      if (dest->reg.indirect) {
         struct ureg_src offset = ntt_get_src(c, *dest->reg.indirect);
         dst = ureg_dst_indirect(dst, ntt_reladdr(c, offset));
      }
   }

   return dst;
}

/* For an SSA dest being populated by a constant src, replace the storage with
 * a copy of the ureg_src.
 */
static void
ntt_store_def(struct ntt_compile *c, nir_ssa_def *def, struct ureg_src src)
{
   if (!src.Indirect && !src.DimIndirect) {
      switch (src.File) {
      case TGSI_FILE_IMMEDIATE:
      case TGSI_FILE_INPUT:
      case TGSI_FILE_CONSTANT:
      case TGSI_FILE_SYSTEM_VALUE:
         c->ssa_temp[def->index] = src;
         return;
      }
   }

   ureg_MOV(c->ureg, ntt_get_ssa_def_decl(c, def), src);
}

static void
ntt_store(struct ntt_compile *c, nir_dest *dest, struct ureg_src src)
{
   if (dest->is_ssa)
      ntt_store_def(c, &dest->ssa, src);
   else {
      struct ureg_dst dst = ntt_get_dest(c, dest);
      ureg_MOV(c->ureg, dst, src);
   }
}

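/* Emits a TGSI scalar opcode once per enabled channel of the writemask, since
 * these opcodes take a single source channel and replicate the result rather
 * than operating per-channel.
 */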
static void
ntt_emit_scalar(struct ntt_compile *c, unsigned tgsi_op,
                struct ureg_dst dst,
                struct ureg_src src0,
                struct ureg_src src1)
{
   unsigned i;
   int num_src;

   /* POW is the only 2-operand scalar op. */
   if (tgsi_op == TGSI_OPCODE_POW) {
      num_src = 2;
   } else {
      num_src = 1;
      src1 = src0;
   }

   for (i = 0; i < 4; i++) {
      if (dst.WriteMask & (1 << i)) {
         struct ureg_dst this_dst = dst;
         struct ureg_src srcs[2] = {
            ureg_scalar(src0, i),
            ureg_scalar(src1, i),
         };
         this_dst.WriteMask = (1 << i);

         ureg_insn(c->ureg, tgsi_op, &this_dst, 1, srcs, num_src, false);
      }
   }
}

static void
ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr)
{
   struct ureg_src src[4];
   struct ureg_dst dst;
   unsigned i;
   int dst_64 = nir_dest_bit_size(instr->dest.dest) == 64;
   int src_64 = nir_src_bit_size(instr->src[0].src) == 64;
   int num_srcs = nir_op_infos[instr->op].num_inputs;

   assert(num_srcs <= ARRAY_SIZE(src));
   for (i = 0; i < num_srcs; i++)
      src[i] = ntt_get_alu_src(c, instr, i);
   dst = ntt_get_dest(c, &instr->dest.dest);

   if (instr->dest.saturate)
      dst.Saturate = true;

   if (dst_64)
      dst = ureg_writemask(dst, ntt_64bit_write_mask(instr->dest.write_mask));
   else
      dst = ureg_writemask(dst, instr->dest.write_mask);

   static enum tgsi_opcode op_map[][2] = {
      [nir_op_mov] = { TGSI_OPCODE_MOV, TGSI_OPCODE_MOV },

      /* fabs/fneg 32-bit are special-cased below. */
      [nir_op_fabs] = { 0, TGSI_OPCODE_DABS },
      [nir_op_fneg] = { 0, TGSI_OPCODE_DNEG },

      [nir_op_fdot2] = { TGSI_OPCODE_DP2 },
      [nir_op_fdot3] = { TGSI_OPCODE_DP3 },
      [nir_op_fdot4] = { TGSI_OPCODE_DP4 },
      [nir_op_ffloor] = { TGSI_OPCODE_FLR, TGSI_OPCODE_DFLR },
      [nir_op_ffract] = { TGSI_OPCODE_FRC, TGSI_OPCODE_DFRAC },
      [nir_op_fceil] = { TGSI_OPCODE_CEIL, TGSI_OPCODE_DCEIL },
      [nir_op_fround_even] = { TGSI_OPCODE_ROUND, TGSI_OPCODE_DROUND },
      [nir_op_fdiv] = { TGSI_OPCODE_DIV, TGSI_OPCODE_DDIV },
      [nir_op_idiv] = { TGSI_OPCODE_IDIV, TGSI_OPCODE_I64DIV },
      [nir_op_udiv] = { TGSI_OPCODE_UDIV, TGSI_OPCODE_U64DIV },

      [nir_op_frcp] = { 0, TGSI_OPCODE_DRCP },
      [nir_op_frsq] = { 0, TGSI_OPCODE_DRSQ },
      [nir_op_fsqrt] = { 0, TGSI_OPCODE_DSQRT },

      /* The conversions will have one combination of src and dst bitsize. */
      [nir_op_f2f32] = { 0, TGSI_OPCODE_D2F },
      [nir_op_f2f64] = { TGSI_OPCODE_F2D },
      [nir_op_i2i64] = { TGSI_OPCODE_I2I64 },

      [nir_op_f2i32] = { TGSI_OPCODE_F2I, TGSI_OPCODE_D2I },
      [nir_op_f2i64] = { TGSI_OPCODE_F2I64, TGSI_OPCODE_D2I64 },
      [nir_op_f2u32] = { TGSI_OPCODE_F2U, TGSI_OPCODE_D2U },
      [nir_op_f2u64] = { TGSI_OPCODE_F2U64, TGSI_OPCODE_D2U64 },
      [nir_op_i2f32] = { TGSI_OPCODE_I2F, TGSI_OPCODE_I642F },
      [nir_op_i2f64] = { TGSI_OPCODE_I2D, TGSI_OPCODE_I642D },
      [nir_op_u2f32] = { TGSI_OPCODE_U2F, TGSI_OPCODE_U642F },
      [nir_op_u2f64] = { TGSI_OPCODE_U2D, TGSI_OPCODE_U642D },

      [nir_op_slt] = { TGSI_OPCODE_SLT },
      [nir_op_sge] = { TGSI_OPCODE_SGE },
      [nir_op_seq] = { TGSI_OPCODE_SEQ },
      [nir_op_sne] = { TGSI_OPCODE_SNE },

      [nir_op_flt32] = { TGSI_OPCODE_FSLT, TGSI_OPCODE_DSLT },
      [nir_op_fge32] = { TGSI_OPCODE_FSGE, TGSI_OPCODE_DSGE },
      [nir_op_feq32] = { TGSI_OPCODE_FSEQ, TGSI_OPCODE_DSEQ },
      [nir_op_fneu32] = { TGSI_OPCODE_FSNE, TGSI_OPCODE_DSNE },

      [nir_op_ilt32] = { TGSI_OPCODE_ISLT, TGSI_OPCODE_I64SLT },
      [nir_op_ige32] = { TGSI_OPCODE_ISGE, TGSI_OPCODE_I64SGE },
      [nir_op_ieq32] = { TGSI_OPCODE_USEQ, TGSI_OPCODE_U64SEQ },
      [nir_op_ine32] = { TGSI_OPCODE_USNE, TGSI_OPCODE_U64SNE },

      [nir_op_ult32] = { TGSI_OPCODE_USLT, TGSI_OPCODE_U64SLT },
      [nir_op_uge32] = { TGSI_OPCODE_USGE, TGSI_OPCODE_U64SGE },

      [nir_op_iabs] = { TGSI_OPCODE_IABS, TGSI_OPCODE_I64ABS },
      [nir_op_ineg] = { TGSI_OPCODE_INEG, TGSI_OPCODE_I64NEG },
      [nir_op_fsign] = { TGSI_OPCODE_SSG },
      [nir_op_isign] = { TGSI_OPCODE_ISSG },
      [nir_op_ftrunc] = { TGSI_OPCODE_TRUNC, TGSI_OPCODE_DTRUNC },
      [nir_op_fddx] = { TGSI_OPCODE_DDX },
      [nir_op_fddy] = { TGSI_OPCODE_DDY },
      [nir_op_fddx_coarse] = { TGSI_OPCODE_DDX },
      [nir_op_fddy_coarse] = { TGSI_OPCODE_DDY },
      [nir_op_fddx_fine] = { TGSI_OPCODE_DDX_FINE },
      [nir_op_fddy_fine] = { TGSI_OPCODE_DDY_FINE },
      [nir_op_pack_half_2x16] = { TGSI_OPCODE_PK2H },
      [nir_op_unpack_half_2x16] = { TGSI_OPCODE_UP2H },
      [nir_op_ibitfield_extract] = { TGSI_OPCODE_IBFE },
      [nir_op_ubitfield_extract] = { TGSI_OPCODE_UBFE },
      [nir_op_bitfield_insert] = { TGSI_OPCODE_BFI },
      [nir_op_bitfield_reverse] = { TGSI_OPCODE_BREV },
      [nir_op_bit_count] = { TGSI_OPCODE_POPC },
      [nir_op_ifind_msb] = { TGSI_OPCODE_IMSB },
      [nir_op_ufind_msb] = { TGSI_OPCODE_UMSB },
      [nir_op_find_lsb] = { TGSI_OPCODE_LSB },
      [nir_op_fadd] = { TGSI_OPCODE_ADD, TGSI_OPCODE_DADD },
      [nir_op_iadd] = { TGSI_OPCODE_UADD, TGSI_OPCODE_U64ADD },
      [nir_op_fmul] = { TGSI_OPCODE_MUL, TGSI_OPCODE_DMUL },
      [nir_op_imul] = { TGSI_OPCODE_UMUL, TGSI_OPCODE_U64MUL },
      [nir_op_imod] = { TGSI_OPCODE_MOD, TGSI_OPCODE_I64MOD },
      [nir_op_umod] = { TGSI_OPCODE_UMOD, TGSI_OPCODE_U64MOD },
      [nir_op_imul_high] = { TGSI_OPCODE_IMUL_HI },
      [nir_op_umul_high] = { TGSI_OPCODE_UMUL_HI },
      [nir_op_ishl] = { TGSI_OPCODE_SHL, TGSI_OPCODE_U64SHL },
      [nir_op_ishr] = { TGSI_OPCODE_ISHR, TGSI_OPCODE_I64SHR },
      [nir_op_ushr] = { TGSI_OPCODE_USHR, TGSI_OPCODE_U64SHR },

      /* These bitwise ops don't care about 32 vs 64 types, so they have the
       * same TGSI op.
       */
      [nir_op_inot] = { TGSI_OPCODE_NOT, TGSI_OPCODE_NOT },
      [nir_op_iand] = { TGSI_OPCODE_AND, TGSI_OPCODE_AND },
      [nir_op_ior] = { TGSI_OPCODE_OR, TGSI_OPCODE_OR },
      [nir_op_ixor] = { TGSI_OPCODE_XOR, TGSI_OPCODE_XOR },

      [nir_op_fmin] = { TGSI_OPCODE_MIN, TGSI_OPCODE_DMIN },
      [nir_op_imin] = { TGSI_OPCODE_IMIN, TGSI_OPCODE_I64MIN },
      [nir_op_umin] = { TGSI_OPCODE_UMIN, TGSI_OPCODE_U64MIN },
      [nir_op_fmax] = { TGSI_OPCODE_MAX, TGSI_OPCODE_DMAX },
      [nir_op_imax] = { TGSI_OPCODE_IMAX, TGSI_OPCODE_I64MAX },
      [nir_op_umax] = { TGSI_OPCODE_UMAX, TGSI_OPCODE_U64MAX },
      [nir_op_ffma] = { TGSI_OPCODE_MAD, TGSI_OPCODE_DMAD },
      [nir_op_ldexp] = { TGSI_OPCODE_LDEXP, 0 },
   };

   /* TGSI's 64 bit compares storing to 32-bit are weird and write .xz instead
    * of .xy. Store to a temp and move it to the real dst.
    */
   bool tgsi_64bit_compare = src_64 && !dst_64 &&
      (num_srcs == 2 ||
       nir_op_infos[instr->op].output_type == nir_type_bool32) &&
      (dst.WriteMask != TGSI_WRITEMASK_X);

   /* TGSI 64bit-to-32-bit conversions only generate results in the .xy
    * channels and will need to get fixed up.
    */
   bool tgsi_64bit_downconvert = (src_64 && !dst_64 &&
                                  num_srcs == 1 && !tgsi_64bit_compare &&
                                  (dst.WriteMask & ~TGSI_WRITEMASK_XY));

   struct ureg_dst real_dst = ureg_dst_undef();
   if (tgsi_64bit_compare || tgsi_64bit_downconvert) {
      real_dst = dst;
      dst = ureg_DECL_temporary(c->ureg);
   }

   bool table_op64 = src_64;
   if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op][table_op64] != 0) {
      /* The normal path for NIR to TGSI ALU op translation */
      ureg_insn(c->ureg, op_map[instr->op][table_op64],
                &dst, 1, src, num_srcs, false);
   } else {
      /* Special cases for NIR to TGSI ALU op translation. */

      /* TODO: Use something like the ntt_store() path for the MOV calls so we
       * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm.
       */

      switch (instr->op) {
      case nir_op_u2u64:
         ureg_AND(c->ureg, dst, ureg_swizzle(src[0],
                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                                             TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
                  ureg_imm4u(c->ureg, ~0, 0, ~0, 0));
         break;

      case nir_op_i2i32:
      case nir_op_u2u32:
         assert(src_64);
         ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_X));
         break;

      case nir_op_fabs:
         ureg_MOV(c->ureg, dst, ureg_abs(src[0]));
         break;

      case nir_op_fsat:
         if (dst_64) {
            ureg_MIN(c->ureg, dst, src[0], ntt_64bit_1f(c));
            ureg_MAX(c->ureg, dst, ureg_src(dst), ureg_imm1u(c->ureg, 0));
         } else {
            ureg_MOV(c->ureg, ureg_saturate(dst), src[0]);
         }
         break;

      case nir_op_fneg:
         ureg_MOV(c->ureg, dst, ureg_negate(src[0]));
         break;

      /* NOTE: TGSI 32-bit math ops have the old "one source channel
       * replicated to all dst channels" behavior, while 64 is normal mapping
       * of src channels to dst.
       */
      case nir_op_frcp:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], src[1]);
         break;

      case nir_op_frsq:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], src[1]);
         break;

      case nir_op_fsqrt:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_SQRT, dst, src[0], src[1]);
         break;

      case nir_op_fexp2:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], src[1]);
         break;

      case nir_op_flog2:
         assert(!dst_64);
         ntt_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], src[1]);
         break;

      case nir_op_b2f32:
         ureg_AND(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 1.0));
         break;

      case nir_op_b2f64:
         ureg_AND(c->ureg, dst,
                  ureg_swizzle(src[0],
                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
                  ntt_64bit_1f(c));
         break;

      case nir_op_f2b32:
         if (src_64)
            ureg_DSNE(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 0));
         else
            ureg_FSNE(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 0));
         break;

      case nir_op_i2b32:
         if (src_64) {
            ureg_U64SNE(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 0));
         } else
            ureg_USNE(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 0));
         break;

      case nir_op_b2i32:
         ureg_AND(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 1));
         break;

      case nir_op_b2i64:
         ureg_AND(c->ureg, dst,
                  ureg_swizzle(src[0],
                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
                  ureg_imm4u(c->ureg, 1, 0, 1, 0));
         break;

      case nir_op_fsin:
         ntt_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], src[1]);
         break;

      case nir_op_fcos:
         ntt_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], src[1]);
         break;

      case nir_op_fsub:
         assert(!dst_64);
         ureg_ADD(c->ureg, dst, src[0], ureg_negate(src[1]));
         break;

      case nir_op_isub:
         assert(!dst_64);
         ureg_UADD(c->ureg, dst, src[0], ureg_negate(src[1]));
         break;

      case nir_op_fmod:
         unreachable("should be handled by .lower_fmod = true");
         break;

      case nir_op_fpow:
         ntt_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]);
         break;

      case nir_op_flrp:
         ureg_LRP(c->ureg, dst, src[2], src[1], src[0]);
         break;

      case nir_op_pack_64_2x32_split:
         ureg_MOV(c->ureg, ureg_writemask(dst, TGSI_WRITEMASK_XZ),
                  ureg_swizzle(src[0],
                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
         ureg_MOV(c->ureg, ureg_writemask(dst, TGSI_WRITEMASK_YW),
                  ureg_swizzle(src[1],
                               TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                               TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
         break;

      case nir_op_unpack_64_2x32_split_x:
         ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
                                             TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z));
         break;

      case nir_op_unpack_64_2x32_split_y:
         ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
                                             TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W,
                                             TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W));
         break;

      case nir_op_b32csel:
         if (nir_src_bit_size(instr->src[1].src) == 64) {
            ureg_UCMP(c->ureg, dst, ureg_swizzle(src[0],
                                                 TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                                                 TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
                      src[1], src[2]);
         } else {
            ureg_UCMP(c->ureg, dst, src[0], src[1], src[2]);
         }
         break;

      case nir_op_fcsel:
         /* NIR is src0 != 0 ? src1 : src2.
          * TGSI is src0 < 0 ? src1 : src2.
          *
          * However, fcsel so far as I can find only appears on bools-as-floats
          * (1.0 or 0.0), so we can just negate it for the TGSI op. It's
          * important to not have an abs here, as i915g has to make extra
          * instructions to do the abs.
          */
         ureg_CMP(c->ureg, dst, ureg_negate(src[0]), src[1], src[2]);
         break;

      /* It would be nice if we could get this left as scalar in NIR, since
       * the TGSI op is scalar.
       */
      case nir_op_frexp_sig:
      case nir_op_frexp_exp: {
         assert(src_64);
         struct ureg_dst temp = ureg_DECL_temporary(c->ureg);

         for (int chan = 0; chan < 2; chan++) {
            int wm = 1 << chan;

            if (!(instr->dest.write_mask & wm))
               continue;

            struct ureg_dst dsts[2] = { temp, temp };
            if (instr->op == nir_op_frexp_sig) {
               dsts[0] = ureg_writemask(dst, ntt_64bit_write_mask(wm));
            } else {
               dsts[1] = ureg_writemask(dst, wm);
            }

            struct ureg_src chan_src = ureg_swizzle(src[0],
                                                    chan * 2, chan * 2 + 1,
                                                    chan * 2, chan * 2 + 1);

            ureg_insn(c->ureg, TGSI_OPCODE_DFRACEXP,
                      dsts, 2,
                      &chan_src, 1, false);
         }

         ureg_release_temporary(c->ureg, temp);
         break;
      }

      case nir_op_ldexp:
         assert(dst_64); /* 32bit handled in table. */
         ureg_DLDEXP(c->ureg, dst, src[0],
                     ureg_swizzle(src[1],
                                  TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
                                  TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
         break;

      case nir_op_vec4:
      case nir_op_vec3:
      case nir_op_vec2:
         unreachable("covered by nir_lower_vec_to_movs()");

      default:
         fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name);
         unreachable("Unknown NIR opcode");
      }
   }

   /* 64-bit op fixup movs */
   if (!ureg_dst_is_undef(real_dst)) {
      if (tgsi_64bit_compare) {
         ureg_MOV(c->ureg, real_dst,
                  ureg_swizzle(ureg_src(dst), 0, 2, 0, 2));
      } else {
         assert(tgsi_64bit_downconvert);
         uint8_t swizzle[] = {0, 0, 0, 0};
         uint32_t second_bit = real_dst.WriteMask & ~(1 << (ffs(real_dst.WriteMask) - 1));
         if (second_bit)
            swizzle[ffs(second_bit) - 1] = 1;
         ureg_MOV(c->ureg, real_dst, ureg_swizzle(ureg_src(dst),
                                                  swizzle[0],
                                                  swizzle[1],
                                                  swizzle[2],
                                                  swizzle[3]));
      }
      ureg_release_temporary(c->ureg, dst);
   }
}

static struct ureg_src
ntt_ureg_src_indirect(struct ntt_compile *c, struct ureg_src usrc,
                      nir_src src)
{
   if (nir_src_is_const(src)) {
      usrc.Index += ntt_src_as_uint(c, src);
      return usrc;
   } else {
      return ureg_src_indirect(usrc, ntt_reladdr(c, ntt_get_src(c, src)));
   }
}

static struct ureg_dst
ntt_ureg_dst_indirect(struct ntt_compile *c, struct ureg_dst dst,
                      nir_src src)
{
   if (nir_src_is_const(src)) {
      dst.Index += ntt_src_as_uint(c, src);
      return dst;
   } else {
      return ureg_dst_indirect(dst, ntt_reladdr(c, ntt_get_src(c, src)));
   }
}

static struct ureg_src
ntt_ureg_src_dimension_indirect(struct ntt_compile *c, struct ureg_src usrc,
                                nir_src src)
{
   if (nir_src_is_const(src)) {
      return ureg_src_dimension(usrc, ntt_src_as_uint(c, src));
   } else {
      return ureg_src_dimension_indirect(usrc,
                                         ntt_reladdr(c, ntt_get_src(c, src)),
                                         0);
   }
}

static struct ureg_dst
ntt_ureg_dst_dimension_indirect(struct ntt_compile *c, struct ureg_dst udst,
                                nir_src src)
{
   if (nir_src_is_const(src)) {
      return ureg_dst_dimension(udst, ntt_src_as_uint(c, src));
   } else {
      return ureg_dst_dimension_indirect(udst,
                                         ntt_reladdr(c, ntt_get_src(c, src)),
                                         0);
   }
}

/* Some load operations in NIR will have a fractional offset that we need to
 * swizzle down before storing to the result register.
 */
static struct ureg_src
ntt_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components)
{
   return ureg_swizzle(src,
                       frac,
                       frac + MIN2(num_components - 1, 1),
                       frac + MIN2(num_components - 1, 2),
                       frac + MIN2(num_components - 1, 3));
}

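/* Emits a UBO load, either as a direct vec4 reference to the constant file
 * (load_ubo_vec4) or as a TGSI LOAD from the constant buffer resource.
 */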
static void
ntt_emit_load_ubo(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   int bit_size = nir_dest_bit_size(instr->dest);
   assert(bit_size == 32 || instr->num_components <= 2);

   struct ureg_src src = ureg_src_register(TGSI_FILE_CONSTANT, 0);

   struct ureg_dst addr_temp = ureg_dst_undef();

   if (nir_src_is_const(instr->src[0])) {
      src = ureg_src_dimension(src, ntt_src_as_uint(c, instr->src[0]));
   } else {
      /* virglrenderer requires that indirect UBO references have the UBO
       * array's base index in the Index field, not added to the indirect
       * address.
       *
       * Many nir intrinsics have a base address const value for the start of
       * their array indirection, but load_ubo doesn't. We fake it by
       * subtracting it off here.
       */
      addr_temp = ureg_DECL_temporary(c->ureg);
      ureg_UADD(c->ureg, addr_temp, ntt_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, -c->first_ubo));
      src = ureg_src_dimension_indirect(src,
                                        ntt_reladdr(c, ureg_src(addr_temp)),
                                        c->first_ubo);
   }

   if (instr->intrinsic == nir_intrinsic_load_ubo_vec4) {
      /* !PIPE_CAP_LOAD_CONSTBUF: Just emit it as a vec4 reference to the const
       * file.
       */

      if (nir_src_is_const(instr->src[1])) {
         src.Index += ntt_src_as_uint(c, instr->src[1]);
      } else {
         src = ureg_src_indirect(src, ntt_reladdr(c, ntt_get_src(c, instr->src[1])));
      }

      int start_component = nir_intrinsic_component(instr);
      if (bit_size == 64)
         start_component *= 2;

      src = ntt_shift_by_frac(src, start_component,
                              instr->num_components * bit_size / 32);

      ntt_store(c, &instr->dest, src);
   } else {
      /* PIPE_CAP_LOAD_CONSTBUF: Not necessarily vec4 aligned, emit a
       * TGSI_OPCODE_LOAD instruction from the const file.
       */
      struct ureg_dst dst = ntt_get_dest(c, &instr->dest);
      struct ureg_src srcs[2] = {
         src,
         ntt_get_src(c, instr->src[1]),
      };
      ureg_memory_insn(c->ureg, TGSI_OPCODE_LOAD,
                       &dst, 1,
                       srcs, ARRAY_SIZE(srcs),
                       0 /* qualifier */,
                       0 /* tex target */,
                       0 /* format: unused */
                       );
   }

   ureg_release_temporary(c->ureg, addr_temp);
}

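/* Translates the NIR access qualifiers on an SSBO/image intrinsic into
 * TGSI_MEMORY_* bits.
 */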
static unsigned
ntt_get_access_qualifier(nir_intrinsic_instr *instr)
{
   enum gl_access_qualifier access = nir_intrinsic_access(instr);
   unsigned qualifier = 0;

   if (access & ACCESS_COHERENT)
      qualifier |= TGSI_MEMORY_COHERENT;
   if (access & ACCESS_VOLATILE)
      qualifier |= TGSI_MEMORY_VOLATILE;
   if (access & ACCESS_RESTRICT)
      qualifier |= TGSI_MEMORY_RESTRICT;

   return qualifier;
}

static void
ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr,
             nir_variable_mode mode)
{
   bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
                    instr->intrinsic == nir_intrinsic_store_shared);
   bool is_load = (instr->intrinsic == nir_intrinsic_atomic_counter_read ||
                   instr->intrinsic == nir_intrinsic_load_ssbo ||
                   instr->intrinsic == nir_intrinsic_load_shared);
   unsigned opcode;
   struct ureg_src src[4];
   int num_src = 0;
   int nir_src;
   struct ureg_dst addr_temp = ureg_dst_undef();

   struct ureg_src memory;
   switch (mode) {
   case nir_var_mem_ssbo:
      memory = ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_BUFFER, 0),
                                     instr->src[is_store ? 1 : 0]);
      nir_src = 1;
      break;
   case nir_var_mem_shared:
      memory = ureg_src_register(TGSI_FILE_MEMORY, 0);
      nir_src = 0;
      break;
   case nir_var_uniform: { /* HW atomic buffers */
      memory = ureg_src_register(TGSI_FILE_HW_ATOMIC, 0);
      /* ntt_ureg_src_indirect, except dividing by 4 */
      if (nir_src_is_const(instr->src[0])) {
         memory.Index += nir_src_as_uint(instr->src[0]) / 4;
      } else {
         addr_temp = ureg_DECL_temporary(c->ureg);
         ureg_USHR(c->ureg, addr_temp, ntt_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, 2));
         memory = ureg_src_indirect(memory, ntt_reladdr(c, ureg_src(addr_temp)));
      }
      memory = ureg_src_dimension(memory, nir_intrinsic_base(instr));
      nir_src = 0;
      break;
   }

   default:
      unreachable("unknown memory type");
   }

   if (is_store) {
      src[num_src++] = ntt_get_src(c, instr->src[nir_src + 1]); /* offset */
      src[num_src++] = ntt_get_src(c, instr->src[0]); /* value */
   } else {
      src[num_src++] = memory;
      if (instr->intrinsic != nir_intrinsic_get_ssbo_size) {
         src[num_src++] = ntt_get_src(c, instr->src[nir_src++]); /* offset */
         switch (instr->intrinsic) {
         case nir_intrinsic_atomic_counter_inc:
            src[num_src++] = ureg_imm1i(c->ureg, 1);
            break;
         case nir_intrinsic_atomic_counter_post_dec:
            src[num_src++] = ureg_imm1i(c->ureg, -1);
            break;
         default:
            if (!is_load)
               src[num_src++] = ntt_get_src(c, instr->src[nir_src++]); /* value */
            break;
         }
      }
   }


   switch (instr->intrinsic) {
   case nir_intrinsic_atomic_counter_add:
   case nir_intrinsic_atomic_counter_inc:
   case nir_intrinsic_atomic_counter_post_dec:
   case nir_intrinsic_ssbo_atomic_add:
   case nir_intrinsic_shared_atomic_add:
      opcode = TGSI_OPCODE_ATOMUADD;
      break;
   case nir_intrinsic_ssbo_atomic_fadd:
   case nir_intrinsic_shared_atomic_fadd:
      opcode = TGSI_OPCODE_ATOMFADD;
      break;
   case nir_intrinsic_atomic_counter_min:
   case nir_intrinsic_ssbo_atomic_imin:
   case nir_intrinsic_shared_atomic_imin:
      opcode = TGSI_OPCODE_ATOMIMIN;
      break;
   case nir_intrinsic_atomic_counter_max:
   case nir_intrinsic_ssbo_atomic_imax:
   case nir_intrinsic_shared_atomic_imax:
      opcode = TGSI_OPCODE_ATOMIMAX;
      break;
   case nir_intrinsic_ssbo_atomic_umin:
   case nir_intrinsic_shared_atomic_umin:
      opcode = TGSI_OPCODE_ATOMUMIN;
      break;
   case nir_intrinsic_ssbo_atomic_umax:
   case nir_intrinsic_shared_atomic_umax:
      opcode = TGSI_OPCODE_ATOMUMAX;
      break;
   case nir_intrinsic_atomic_counter_and:
   case nir_intrinsic_ssbo_atomic_and:
   case nir_intrinsic_shared_atomic_and:
      opcode = TGSI_OPCODE_ATOMAND;
      break;
   case nir_intrinsic_atomic_counter_or:
   case nir_intrinsic_ssbo_atomic_or:
   case nir_intrinsic_shared_atomic_or:
      opcode = TGSI_OPCODE_ATOMOR;
      break;
   case nir_intrinsic_atomic_counter_xor:
   case nir_intrinsic_ssbo_atomic_xor:
   case nir_intrinsic_shared_atomic_xor:
      opcode = TGSI_OPCODE_ATOMXOR;
      break;
   case nir_intrinsic_atomic_counter_exchange:
   case nir_intrinsic_ssbo_atomic_exchange:
   case nir_intrinsic_shared_atomic_exchange:
      opcode = TGSI_OPCODE_ATOMXCHG;
      break;
   case nir_intrinsic_atomic_counter_comp_swap:
   case nir_intrinsic_ssbo_atomic_comp_swap:
   case nir_intrinsic_shared_atomic_comp_swap:
      opcode = TGSI_OPCODE_ATOMCAS;
      src[num_src++] = ntt_get_src(c, instr->src[nir_src++]);
      break;
   case nir_intrinsic_atomic_counter_read:
   case nir_intrinsic_load_ssbo:
   case nir_intrinsic_load_shared:
      opcode = TGSI_OPCODE_LOAD;
      break;
   case nir_intrinsic_store_ssbo:
   case nir_intrinsic_store_shared:
      opcode = TGSI_OPCODE_STORE;
      break;
   case nir_intrinsic_get_ssbo_size:
      opcode = TGSI_OPCODE_RESQ;
      break;
   default:
      unreachable("unknown memory op");
   }

   unsigned qualifier = 0;
   if (mode == nir_var_mem_ssbo &&
       instr->intrinsic != nir_intrinsic_get_ssbo_size) {
      qualifier = ntt_get_access_qualifier(instr);
   }

   struct ureg_dst dst;
   if (is_store) {
      dst = ureg_dst(memory);

      unsigned write_mask = nir_intrinsic_write_mask(instr);
      if (nir_src_bit_size(instr->src[0]) == 64)
         write_mask = ntt_64bit_write_mask(write_mask);
      dst = ureg_writemask(dst, write_mask);
   } else {
      dst = ntt_get_dest(c, &instr->dest);
   }

   ureg_memory_insn(c->ureg, opcode,
                    &dst, 1,
                    src, num_src,
                    qualifier,
                    TGSI_TEXTURE_BUFFER,
                    0 /* format: unused */);

   ureg_release_temporary(c->ureg, addr_temp);
}

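/* Emits TGSI LOAD/STORE/RESQ/ATOM* for NIR image intrinsics. For MSAA images,
 * the sample index is folded into the coordinate temporary.
 */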
static void
ntt_emit_image_load_store(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   unsigned op;
   struct ureg_src srcs[4];
   int num_src = 0;
   enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
   bool is_array = nir_intrinsic_image_array(instr);

   struct ureg_dst temp = ureg_dst_undef();

   enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(dim, is_array, false);

   struct ureg_src resource =
      ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_IMAGE, 0),
                            instr->src[0]);

   struct ureg_dst dst;
   if (instr->intrinsic == nir_intrinsic_image_store) {
      dst = ureg_dst(resource);
   } else {
      srcs[num_src++] = resource;
      dst = ntt_get_dest(c, &instr->dest);
   }

   if (instr->intrinsic != nir_intrinsic_image_size) {
      struct ureg_src coord = ntt_get_src(c, instr->src[1]);

      if (dim == GLSL_SAMPLER_DIM_MS) {
         temp = ureg_DECL_temporary(c->ureg);
         ureg_MOV(c->ureg, temp, coord);
         ureg_MOV(c->ureg, ureg_writemask(temp, 1 << (is_array ? 3 : 2)),
                  ureg_scalar(ntt_get_src(c, instr->src[2]), TGSI_SWIZZLE_X));
         coord = ureg_src(temp);
      }
      srcs[num_src++] = coord;

      if (instr->intrinsic != nir_intrinsic_image_load) {
         srcs[num_src++] = ntt_get_src(c, instr->src[3]); /* data */
         if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap)
            srcs[num_src++] = ntt_get_src(c, instr->src[4]); /* data2 */
      }
   }

   switch (instr->intrinsic) {
   case nir_intrinsic_image_load:
      op = TGSI_OPCODE_LOAD;
      break;
   case nir_intrinsic_image_store:
      op = TGSI_OPCODE_STORE;
      break;
   case nir_intrinsic_image_size:
      op = TGSI_OPCODE_RESQ;
      break;
   case nir_intrinsic_image_atomic_add:
      op = TGSI_OPCODE_ATOMUADD;
      break;
   case nir_intrinsic_image_atomic_fadd:
      op = TGSI_OPCODE_ATOMFADD;
      break;
   case nir_intrinsic_image_atomic_imin:
      op = TGSI_OPCODE_ATOMIMIN;
      break;
   case nir_intrinsic_image_atomic_umin:
      op = TGSI_OPCODE_ATOMUMIN;
      break;
   case nir_intrinsic_image_atomic_imax:
      op = TGSI_OPCODE_ATOMIMAX;
      break;
   case nir_intrinsic_image_atomic_umax:
      op = TGSI_OPCODE_ATOMUMAX;
      break;
   case nir_intrinsic_image_atomic_and:
      op = TGSI_OPCODE_ATOMAND;
      break;
   case nir_intrinsic_image_atomic_or:
      op = TGSI_OPCODE_ATOMOR;
      break;
   case nir_intrinsic_image_atomic_xor:
      op = TGSI_OPCODE_ATOMXOR;
      break;
   case nir_intrinsic_image_atomic_exchange:
      op = TGSI_OPCODE_ATOMXCHG;
      break;
   case nir_intrinsic_image_atomic_comp_swap:
      op = TGSI_OPCODE_ATOMCAS;
      break;
   default:
      unreachable("bad op");
   }

   ureg_memory_insn(c->ureg, op, &dst, 1, srcs, num_src,
                    ntt_get_access_qualifier(instr),
                    target,
                    nir_intrinsic_format(instr));

   if (!ureg_dst_is_undef(temp))
      ureg_release_temporary(c->ureg, temp);
}

static void
ntt_emit_load_input(struct ntt_compile *c, nir_intrinsic_instr *instr)
{
   uint32_t frac = nir_intrinsic_component(instr);
   uint32_t num_components = instr->num_components;
   unsigned base = nir_intrinsic_base(instr);
   struct ureg_src input;
   nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
   bool is_64 = nir_dest_bit_size(instr->dest) == 64;

   if (c->s->info.stage == MESA_SHADER_VERTEX) {
      input = ureg_DECL_vs_input(c->ureg, base);
      for (int i = 1; i < semantics.num_slots; i++)
         ureg_DECL_vs_input(c->ureg, base + i);
   } else if (c->s->info.stage != MESA_SHADER_FRAGMENT) {
      unsigned semantic_name, semantic_index;
      ntt_get_gl_varying_semantic(c, semantics.location,
                                  &semantic_name, &semantic_index);

      /* XXX: ArrayID is used in r600 gs inputs */
      uint32_t array_id = 0;

      input = ureg_DECL_input_layout(c->ureg,
                                     semantic_name,
                                     semantic_index,
                                     base,
                                     ntt_tgsi_usage_mask(frac,
                                                         instr->num_components,
                                                         is_64),
                                     array_id,
                                     semantics.num_slots);
   } else {
      input = c->input_index_map[base];
   }

   if (is_64)
      num_components *= 2;

   input = ntt_shift_by_frac(input, frac, num_components);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_input:
      input = ntt_ureg_src_indirect(c, input, instr->src[0]);
      ntt_store(c, &instr->dest, input);
      break;

   case nir_intrinsic_load_per_vertex_input:
      input = ntt_ureg_src_indirect(c, input, instr->src[1]);
      input = ntt_ureg_src_dimension_indirect(c, input, instr->src[0]);
      ntt_store(c, &instr->dest, input);
      break;

   case nir_intrinsic_load_interpolated_input: {
      input = ntt_ureg_src_indirect(c, input, instr->src[1]);

      nir_intrinsic_instr *bary_instr =
         nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);

      switch (bary_instr->intrinsic) {
      case nir_intrinsic_load_barycentric_pixel:
      case nir_intrinsic_load_barycentric_sample:
         /* For these, we know that the barycentric load matches the
          * interpolation on the input declaration, so we can use it directly.
          */
         ntt_store(c, &instr->dest, input);
         break;

      case nir_intrinsic_load_barycentric_centroid:
         /* If the input was declared centroid, then there's no need to
          * emit the extra TGSI interp instruction, we can just read the
          * input.
          */
         if (c->centroid_inputs & (1ull << nir_intrinsic_base(instr))) {
            ntt_store(c, &instr->dest, input);
         } else {
            ureg_INTERP_CENTROID(c->ureg, ntt_get_dest(c, &instr->dest),
                                 input);
         }
         break;

      case nir_intrinsic_load_barycentric_at_sample:
         /* We stored the sample in the fake "bary" dest. */
         ureg_INTERP_SAMPLE(c->ureg, ntt_get_dest(c, &instr->dest), input,
                            ntt_get_src(c, instr->src[0]));
         break;

      case nir_intrinsic_load_barycentric_at_offset:
         /* We stored the offset in the fake "bary" dest. */
         ureg_INTERP_OFFSET(c->ureg, ntt_get_dest(c, &instr->dest), input,
                            ntt_get_src(c, instr->src[0]));
         break;

      default:
         unreachable("bad barycentric interp intrinsic\n");
      }
      break;
   }

   default:
      unreachable("bad load input intrinsic\n");
   }
}
1770
1771 static void
1772 ntt_emit_store_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
1773 {
1774 struct ureg_src src = ntt_get_src(c, instr->src[0]);
1775
1776 if (src.File == TGSI_FILE_OUTPUT) {
1777 /* If our src is the output file, that's an indication that we were able
1778 * to emit the output stores in the generating instructions and we have
1779 * nothing to do here.
1780 */
1781 return;
1782 }
1783
1784 uint32_t frac;
1785 struct ureg_dst out = ntt_output_decl(c, instr, &frac);
1786
1787 if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) {
1788 out = ntt_ureg_dst_indirect(c, out, instr->src[2]);
1789 out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[1]);
1790 } else {
1791 out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
1792 }
1793
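/* The block below remaps the source components onto the (possibly
 * fractional) output writemask.  As an illustrative example based on this
 * code: for a two-component store with frac == 2, the writemask covers .zw,
 * swizzle becomes {0, 0, 0, 1}, and the MOV reads src.xxxy so that src.x
 * lands in out.z and src.y in out.w.
 */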
1794 uint8_t swizzle[4] = { 0, 0, 0, 0 };
1795 for (int i = frac; i < 4; i++) {
1796 if (out.WriteMask & (1 << i))
1797 swizzle[i] = i - frac;
1798 }
1799
1800 src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1801
1802 ureg_MOV(c->ureg, out, src);
1803 ntt_reladdr_dst_put(c, out);
1804 }
1805
1806 static void
1807 ntt_emit_load_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
1808 {
1809 /* ntt_try_store_in_tgsi_output() optimization is not valid if load_output
1810 * is present.
1811 */
1812 assert(c->s->info.stage != MESA_SHADER_VERTEX &&
1813 c->s->info.stage != MESA_SHADER_FRAGMENT);
1814
1815 uint32_t frac;
1816 struct ureg_dst out = ntt_output_decl(c, instr, &frac);
1817
1818 if (instr->intrinsic == nir_intrinsic_load_per_vertex_output) {
1819 out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
1820 out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[0]);
1821 } else {
1822 out = ntt_ureg_dst_indirect(c, out, instr->src[0]);
1823 }
1824
1825 ureg_MOV(c->ureg, ntt_get_dest(c, &instr->dest), ureg_src(out));
1826 ntt_reladdr_dst_put(c, out);
1827 }
1828
1829 static void
1830 ntt_emit_load_sysval(struct ntt_compile *c, nir_intrinsic_instr *instr)
1831 {
1832 gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic);
1833 enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval);
1834 struct ureg_src sv = ureg_DECL_system_value(c->ureg, semantic, 0);
1835
1836 /* virglrenderer doesn't like references to channels of the sysval that
1837 * aren't defined, even if they aren't really read. (GLSL compile fails on
1838 * gl_NumWorkGroups.w, for example).
1839 */
1840 uint32_t write_mask = BITSET_MASK(nir_dest_num_components(instr->dest));
1841 sv = ntt_swizzle_for_write_mask(sv, write_mask);
1842
1843 /* TGSI and NIR define these intrinsics as always loading ints, but they can
1844 * still appear on hardware with non-native-integers fragment shaders using
1845 * the draw path (i915g). In that case, having called nir_lower_int_to_float
1846 * means that we actually want floats instead.
1847 */
1848 if (!c->native_integers) {
1849 switch (instr->intrinsic) {
1850 case nir_intrinsic_load_vertex_id:
1851 case nir_intrinsic_load_instance_id:
1852 ureg_U2F(c->ureg, ntt_get_dest(c, &instr->dest), sv);
1853 return;
1854
1855 default:
1856 break;
1857 }
1858 }
1859
1860 ntt_store(c, &instr->dest, sv);
1861 }
1862
1863 static void
1864 ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
1865 {
1866 switch (instr->intrinsic) {
1867 case nir_intrinsic_load_ubo:
1868 case nir_intrinsic_load_ubo_vec4:
1869 ntt_emit_load_ubo(c, instr);
1870 break;
1871
1872 /* System values */
1873 case nir_intrinsic_load_vertex_id:
1874 case nir_intrinsic_load_vertex_id_zero_base:
1875 case nir_intrinsic_load_base_vertex:
1876 case nir_intrinsic_load_base_instance:
1877 case nir_intrinsic_load_instance_id:
1878 case nir_intrinsic_load_draw_id:
1879 case nir_intrinsic_load_invocation_id:
1880 case nir_intrinsic_load_frag_coord:
1881 case nir_intrinsic_load_point_coord:
1882 case nir_intrinsic_load_front_face:
1883 case nir_intrinsic_load_sample_id:
1884 case nir_intrinsic_load_sample_pos:
1885 case nir_intrinsic_load_sample_mask_in:
1886 case nir_intrinsic_load_helper_invocation:
1887 case nir_intrinsic_load_tess_coord:
1888 case nir_intrinsic_load_patch_vertices_in:
1889 case nir_intrinsic_load_primitive_id:
1890 case nir_intrinsic_load_tess_level_outer:
1891 case nir_intrinsic_load_tess_level_inner:
1892 case nir_intrinsic_load_local_invocation_id:
1893 case nir_intrinsic_load_workgroup_id:
1894 case nir_intrinsic_load_num_workgroups:
1895 case nir_intrinsic_load_workgroup_size:
1896 case nir_intrinsic_load_subgroup_size:
1897 case nir_intrinsic_load_subgroup_invocation:
1898 case nir_intrinsic_load_subgroup_eq_mask:
1899 case nir_intrinsic_load_subgroup_ge_mask:
1900 case nir_intrinsic_load_subgroup_gt_mask:
1901 case nir_intrinsic_load_subgroup_lt_mask:
1902 ntt_emit_load_sysval(c, instr);
1903 break;
1904
1905 case nir_intrinsic_load_input:
1906 case nir_intrinsic_load_per_vertex_input:
1907 case nir_intrinsic_load_interpolated_input:
1908 ntt_emit_load_input(c, instr);
1909 break;
1910
1911 case nir_intrinsic_store_output:
1912 case nir_intrinsic_store_per_vertex_output:
1913 ntt_emit_store_output(c, instr);
1914 break;
1915
1916 case nir_intrinsic_load_output:
1917 case nir_intrinsic_load_per_vertex_output:
1918 ntt_emit_load_output(c, instr);
1919 break;
1920
1921 case nir_intrinsic_discard:
1922 ureg_KILL(c->ureg);
1923 break;
1924
1925 case nir_intrinsic_discard_if: {
1926 struct ureg_src cond = ureg_scalar(ntt_get_src(c, instr->src[0]), 0);
1927
1928 if (c->native_integers) {
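/* With native integers the NIR boolean is 0 or ~0.  ANDing it with the bit
 * pattern of 1.0f yields 0.0f or 1.0f, and negating that lets KILL_IF
 * (which kills on values less than zero) discard exactly when the
 * condition is true.
 */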
1929 struct ureg_dst temp = ureg_writemask(ureg_DECL_temporary(c->ureg), 1);
1930 ureg_AND(c->ureg, temp, cond, ureg_imm1f(c->ureg, 1.0));
1931 ureg_KILL_IF(c->ureg, ureg_scalar(ureg_negate(ureg_src(temp)), 0));
1932 ureg_release_temporary(c->ureg, temp);
1933 } else {
1934 /* For !native_integers, the bool got lowered to 1.0 or 0.0. */
1935 ureg_KILL_IF(c->ureg, ureg_negate(cond));
1936 }
1937 break;
1938 }
1939
1940 case nir_intrinsic_load_ssbo:
1941 case nir_intrinsic_store_ssbo:
1942 case nir_intrinsic_ssbo_atomic_add:
1943 case nir_intrinsic_ssbo_atomic_fadd:
1944 case nir_intrinsic_ssbo_atomic_imin:
1945 case nir_intrinsic_ssbo_atomic_imax:
1946 case nir_intrinsic_ssbo_atomic_umin:
1947 case nir_intrinsic_ssbo_atomic_umax:
1948 case nir_intrinsic_ssbo_atomic_and:
1949 case nir_intrinsic_ssbo_atomic_or:
1950 case nir_intrinsic_ssbo_atomic_xor:
1951 case nir_intrinsic_ssbo_atomic_exchange:
1952 case nir_intrinsic_ssbo_atomic_comp_swap:
1953 case nir_intrinsic_get_ssbo_size:
1954 ntt_emit_mem(c, instr, nir_var_mem_ssbo);
1955 break;
1956
1957 case nir_intrinsic_load_shared:
1958 case nir_intrinsic_store_shared:
1959 case nir_intrinsic_shared_atomic_add:
1960 case nir_intrinsic_shared_atomic_fadd:
1961 case nir_intrinsic_shared_atomic_imin:
1962 case nir_intrinsic_shared_atomic_imax:
1963 case nir_intrinsic_shared_atomic_umin:
1964 case nir_intrinsic_shared_atomic_umax:
1965 case nir_intrinsic_shared_atomic_and:
1966 case nir_intrinsic_shared_atomic_or:
1967 case nir_intrinsic_shared_atomic_xor:
1968 case nir_intrinsic_shared_atomic_exchange:
1969 case nir_intrinsic_shared_atomic_comp_swap:
1970 ntt_emit_mem(c, instr, nir_var_mem_shared);
1971 break;
1972
1973 case nir_intrinsic_atomic_counter_read:
1974 case nir_intrinsic_atomic_counter_add:
1975 case nir_intrinsic_atomic_counter_inc:
1976 case nir_intrinsic_atomic_counter_post_dec:
1977 case nir_intrinsic_atomic_counter_min:
1978 case nir_intrinsic_atomic_counter_max:
1979 case nir_intrinsic_atomic_counter_and:
1980 case nir_intrinsic_atomic_counter_or:
1981 case nir_intrinsic_atomic_counter_xor:
1982 case nir_intrinsic_atomic_counter_exchange:
1983 case nir_intrinsic_atomic_counter_comp_swap:
1984 ntt_emit_mem(c, instr, nir_var_uniform);
1985 break;
1986 case nir_intrinsic_atomic_counter_pre_dec:
1987 unreachable("Should be lowered by ntt_lower_atomic_pre_dec()");
1988 break;
1989
1990 case nir_intrinsic_image_load:
1991 case nir_intrinsic_image_store:
1992 case nir_intrinsic_image_size:
1993 case nir_intrinsic_image_atomic_add:
1994 case nir_intrinsic_image_atomic_fadd:
1995 case nir_intrinsic_image_atomic_imin:
1996 case nir_intrinsic_image_atomic_umin:
1997 case nir_intrinsic_image_atomic_imax:
1998 case nir_intrinsic_image_atomic_umax:
1999 case nir_intrinsic_image_atomic_and:
2000 case nir_intrinsic_image_atomic_or:
2001 case nir_intrinsic_image_atomic_xor:
2002 case nir_intrinsic_image_atomic_exchange:
2003 case nir_intrinsic_image_atomic_comp_swap:
2004 ntt_emit_image_load_store(c, instr);
2005 break;
2006
2007 case nir_intrinsic_control_barrier:
2008 case nir_intrinsic_memory_barrier_tcs_patch:
2009 ureg_BARRIER(c->ureg);
2010 break;
2011
2012 case nir_intrinsic_memory_barrier:
2013 ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg,
2014 TGSI_MEMBAR_SHADER_BUFFER |
2015 TGSI_MEMBAR_ATOMIC_BUFFER |
2016 TGSI_MEMBAR_SHADER_IMAGE |
2017 TGSI_MEMBAR_SHARED));
2018 break;
2019
2020 case nir_intrinsic_memory_barrier_atomic_counter:
2021 ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_ATOMIC_BUFFER));
2022 break;
2023
2024 case nir_intrinsic_memory_barrier_buffer:
2025 ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_BUFFER));
2026 break;
2027
2028 case nir_intrinsic_memory_barrier_image:
2029 ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_IMAGE));
2030 break;
2031
2032 case nir_intrinsic_memory_barrier_shared:
2033 ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHARED));
2034 break;
2035
2036 case nir_intrinsic_group_memory_barrier:
2037 ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg,
2038 TGSI_MEMBAR_SHADER_BUFFER |
2039 TGSI_MEMBAR_ATOMIC_BUFFER |
2040 TGSI_MEMBAR_SHADER_IMAGE |
2041 TGSI_MEMBAR_SHARED |
2042 TGSI_MEMBAR_THREAD_GROUP));
2043 break;
2044
2045 case nir_intrinsic_end_primitive:
2046 ureg_ENDPRIM(c->ureg, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
2047 break;
2048
2049 case nir_intrinsic_emit_vertex:
2050 ureg_EMIT(c->ureg, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
2051 break;
2052
2053 /* In TGSI we don't actually generate the barycentric coords, and emit
2054 * interp intrinsics later. However, we do need to store the
2055 * load_barycentric_at_* argument so that we can use it at that point.
2056 */
2057 case nir_intrinsic_load_barycentric_pixel:
2058 case nir_intrinsic_load_barycentric_centroid:
2059 case nir_intrinsic_load_barycentric_sample:
2060 break;
2061 case nir_intrinsic_load_barycentric_at_sample:
2062 case nir_intrinsic_load_barycentric_at_offset:
2063 ntt_store(c, &instr->dest, ntt_get_src(c, instr->src[0]));
2064 break;
2065
2066 default:
2067 fprintf(stderr, "Unknown intrinsic: ");
2068 nir_print_instr(&instr->instr, stderr);
2069 fprintf(stderr, "\n");
2070 break;
2071 }
2072 }
2073
2074 struct ntt_tex_operand_state {
2075 struct ureg_src srcs[4];
2076 unsigned i;
2077 };
2078
2079 static void
2080 ntt_push_tex_arg(struct ntt_compile *c,
2081 nir_tex_instr *instr,
2082 nir_tex_src_type tex_src_type,
2083 struct ntt_tex_operand_state *s)
2084 {
2085 int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
2086 if (tex_src < 0)
2087 return;
2088
2089 s->srcs[s->i++] = ntt_get_src(c, instr->src[tex_src].src);
2090 }
2091
2092 static void
2093 ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr)
2094 {
2095 struct ureg_dst dst = ntt_get_dest(c, &instr->dest);
2096 enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(instr->sampler_dim, instr->is_array, instr->is_shadow);
2097 unsigned tex_opcode;
2098
2099 struct ureg_src sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index);
2100 int sampler_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset);
2101 if (sampler_src >= 0) {
2102 struct ureg_src reladdr = ntt_get_src(c, instr->src[sampler_src].src);
2103 sampler = ureg_src_indirect(sampler, ntt_reladdr(c, reladdr));
2104 }
2105
2106 switch (instr->op) {
2107 case nir_texop_tex:
2108 if (nir_tex_instr_src_size(instr, nir_tex_instr_src_index(instr, nir_tex_src_backend1)) >
2109 instr->coord_components + instr->is_shadow)
2110 tex_opcode = TGSI_OPCODE_TXP;
2111 else
2112 tex_opcode = TGSI_OPCODE_TEX;
2113 break;
2114 case nir_texop_txf:
2115 case nir_texop_txf_ms:
2116 tex_opcode = TGSI_OPCODE_TXF;
2117
2118 if (c->has_txf_lz) {
2119 int lod_src = nir_tex_instr_src_index(instr, nir_tex_src_lod);
2120 if (lod_src >= 0 &&
2121 nir_src_is_const(instr->src[lod_src].src) &&
2122 ntt_src_as_uint(c, instr->src[lod_src].src) == 0) {
2123 tex_opcode = TGSI_OPCODE_TXF_LZ;
2124 }
2125 }
2126 break;
2127 case nir_texop_txl:
2128 tex_opcode = TGSI_OPCODE_TXL;
2129 break;
2130 case nir_texop_txb:
2131 tex_opcode = TGSI_OPCODE_TXB;
2132 break;
2133 case nir_texop_txd:
2134 tex_opcode = TGSI_OPCODE_TXD;
2135 break;
2136 case nir_texop_txs:
2137 tex_opcode = TGSI_OPCODE_TXQ;
2138 break;
2139 case nir_texop_tg4:
2140 tex_opcode = TGSI_OPCODE_TG4;
2141 break;
2142 case nir_texop_query_levels:
2143 tex_opcode = TGSI_OPCODE_TXQ;
2144 break;
2145 case nir_texop_lod:
2146 tex_opcode = TGSI_OPCODE_LODQ;
2147 break;
2148 case nir_texop_texture_samples:
2149 tex_opcode = TGSI_OPCODE_TXQS;
2150 break;
2151 default:
2152 unreachable("unsupported tex op");
2153 }
2154
2155 struct ntt_tex_operand_state s = { .i = 0 };
2156 ntt_push_tex_arg(c, instr, nir_tex_src_backend1, &s);
2157 ntt_push_tex_arg(c, instr, nir_tex_src_backend2, &s);
2158
2159 /* non-coord arg for TXQ */
2160 if (tex_opcode == TGSI_OPCODE_TXQ) {
2161 ntt_push_tex_arg(c, instr, nir_tex_src_lod, &s);
2162 /* virglrenderer mistakenly looks at .w instead of .x, so make sure it's
2163 * scalar
2164 */
2165 s.srcs[s.i - 1] = ureg_scalar(s.srcs[s.i - 1], 0);
2166 }
2167
2168 if (s.i > 1) {
2169 if (tex_opcode == TGSI_OPCODE_TEX)
2170 tex_opcode = TGSI_OPCODE_TEX2;
2171 if (tex_opcode == TGSI_OPCODE_TXB)
2172 tex_opcode = TGSI_OPCODE_TXB2;
2173 if (tex_opcode == TGSI_OPCODE_TXL)
2174 tex_opcode = TGSI_OPCODE_TXL2;
2175 }
2176
2177 if (instr->op == nir_texop_txd) {
2178 /* Derivs appear in their own src args */
2179 int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
2180 int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
2181 s.srcs[s.i++] = ntt_get_src(c, instr->src[ddx].src);
2182 s.srcs[s.i++] = ntt_get_src(c, instr->src[ddy].src);
2183 }
2184
2185 if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
2186 if (c->screen->get_param(c->screen,
2187 PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE)) {
2188 sampler = ureg_scalar(sampler, instr->component);
2189 s.srcs[s.i++] = ureg_src_undef();
2190 } else {
2191 s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component);
2192 }
2193 }
2194
2195 s.srcs[s.i++] = sampler;
2196
2197 enum tgsi_return_type tex_type;
2198 switch (instr->dest_type) {
2199 case nir_type_float32:
2200 tex_type = TGSI_RETURN_TYPE_FLOAT;
2201 break;
2202 case nir_type_int32:
2203 tex_type = TGSI_RETURN_TYPE_SINT;
2204 break;
2205 case nir_type_uint32:
2206 tex_type = TGSI_RETURN_TYPE_UINT;
2207 break;
2208 default:
2209 unreachable("unknown texture type");
2210 }
2211
2212 struct tgsi_texture_offset tex_offsets[4];
2213 unsigned num_tex_offsets = 0;
2214 int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset);
2215 if (tex_offset_src >= 0) {
2216 struct ureg_src offset = ntt_get_src(c, instr->src[tex_offset_src].src);
2217
2218 tex_offsets[0].File = offset.File;
2219 tex_offsets[0].Index = offset.Index;
2220 tex_offsets[0].SwizzleX = offset.SwizzleX;
2221 tex_offsets[0].SwizzleY = offset.SwizzleY;
2222 tex_offsets[0].SwizzleZ = offset.SwizzleZ;
2223 tex_offsets[0].Padding = 0;
2224
2225 num_tex_offsets = 1;
2226 }
2227
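/* TXQ reports the mipmap level count in .w, so for query_levels we write
 * the result into a temporary's .w channel and broadcast it to the real
 * destination with the MOV after the instruction.
 */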
2228 struct ureg_dst tex_dst;
2229 if (instr->op == nir_texop_query_levels)
2230 tex_dst = ureg_writemask(ureg_DECL_temporary(c->ureg), TGSI_WRITEMASK_W);
2231 else
2232 tex_dst = dst;
2233
2234 ureg_tex_insn(c->ureg, tex_opcode,
2235 &tex_dst, 1,
2236 target,
2237 tex_type,
2238 tex_offsets, num_tex_offsets,
2239 s.srcs, s.i);
2240
2241 if (instr->op == nir_texop_query_levels) {
2242 ureg_MOV(c->ureg, dst, ureg_scalar(ureg_src(tex_dst), 3));
2243 ureg_release_temporary(c->ureg, tex_dst);
2244 }
2245 }
2246
2247 static void
2248 ntt_emit_jump(struct ntt_compile *c, nir_jump_instr *jump)
2249 {
2250 switch (jump->type) {
2251 case nir_jump_break:
2252 ureg_BRK(c->ureg);
2253 break;
2254
2255 case nir_jump_continue:
2256 ureg_CONT(c->ureg);
2257 break;
2258
2259 default:
2260 fprintf(stderr, "Unknown jump instruction: ");
2261 nir_print_instr(&jump->instr, stderr);
2262 fprintf(stderr, "\n");
2263 abort();
2264 }
2265 }
2266
2267 static void
2268 ntt_emit_ssa_undef(struct ntt_compile *c, nir_ssa_undef_instr *instr)
2269 {
2270 /* Nothing to do but make sure that we have some storage to deref. */
2271 (void)ntt_get_ssa_def_decl(c, &instr->def);
2272 }
2273
2274 static void
2275 ntt_emit_instr(struct ntt_compile *c, nir_instr *instr)
2276 {
2277 /* There is no addr reg in use before we start emitting an instr. */
2278 c->next_addr_reg = 0;
2279
2280 switch (instr->type) {
2281 case nir_instr_type_deref:
2282 /* ignored, will be walked by nir_intrinsic_image_*_deref. */
2283 break;
2284
2285 case nir_instr_type_alu:
2286 ntt_emit_alu(c, nir_instr_as_alu(instr));
2287 break;
2288
2289 case nir_instr_type_intrinsic:
2290 ntt_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
2291 break;
2292
2293 case nir_instr_type_load_const:
2294 /* Nothing to do here, as load consts are done directly from
2295 * ntt_get_src() (since many constant NIR srcs will often get folded
2296 * directly into a register file index instead of as a TGSI src).
2297 */
2298 break;
2299
2300 case nir_instr_type_tex:
2301 ntt_emit_texture(c, nir_instr_as_tex(instr));
2302 break;
2303
2304 case nir_instr_type_jump:
2305 ntt_emit_jump(c, nir_instr_as_jump(instr));
2306 break;
2307
2308 case nir_instr_type_ssa_undef:
2309 ntt_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
2310 break;
2311
2312 default:
2313 fprintf(stderr, "Unknown NIR instr type: ");
2314 nir_print_instr(instr, stderr);
2315 fprintf(stderr, "\n");
2316 abort();
2317 }
2318 }
2319
2320 static void
2321 ntt_emit_if(struct ntt_compile *c, nir_if *if_stmt)
2322 {
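/* Rough shape of the TGSI emitted here (labels start as placeholders and
 * ureg_fixup_label() later points them at the instruction following the
 * corresponding block):
 *
 *    UIF if_cond
 *       ...then_list...
 *    ELSE               (only when else_list is non-empty)
 *       ...else_list...
 *    ENDIF
 */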
2323 unsigned label;
2324 ureg_UIF(c->ureg, c->if_cond, &label);
2325 ntt_emit_cf_list(c, &if_stmt->then_list);
2326
2327 if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
2328 ureg_fixup_label(c->ureg, label, ureg_get_instruction_number(c->ureg));
2329 ureg_ELSE(c->ureg, &label);
2330 ntt_emit_cf_list(c, &if_stmt->else_list);
2331 }
2332
2333 ureg_fixup_label(c->ureg, label, ureg_get_instruction_number(c->ureg));
2334 ureg_ENDIF(c->ureg);
2335 }
2336
2337 static void
2338 ntt_emit_loop(struct ntt_compile *c, nir_loop *loop)
2339 {
2340 /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
2341 * does reference BGNLOOP's label. Follow the former behavior unless something comes up
2342 * with a need.
2343 */
2344 unsigned begin_label;
2345 ureg_BGNLOOP(c->ureg, &begin_label);
2346 ntt_emit_cf_list(c, &loop->body);
2347
2348 unsigned end_label;
2349 ureg_ENDLOOP(c->ureg, &end_label);
2350 }
2351
2352 static void
2353 ntt_free_ssa_temp_by_index(struct ntt_compile *c, int index)
2354 {
2355 /* We do store CONST/IMM/INPUT/etc. in ssa_temp[] */
2356 if (c->ssa_temp[index].File != TGSI_FILE_TEMPORARY)
2357 return;
2358
2359 ureg_release_temporary(c->ureg, ureg_dst(c->ssa_temp[index]));
2360 memset(&c->ssa_temp[index], 0, sizeof(c->ssa_temp[index]));
2361 }
2362
2363 /* Releases any temporaries for SSA defs with a live interval ending at this
2364 * instruction.
2365 */
2366 static bool
2367 ntt_src_live_interval_end_cb(nir_src *src, void *state)
2368 {
2369 struct ntt_compile *c = state;
2370
2371 if (src->is_ssa) {
2372 nir_ssa_def *def = src->ssa;
2373
2374 if (c->liveness->defs[def->index].end == src->parent_instr->index)
2375 ntt_free_ssa_temp_by_index(c, def->index);
2376 }
2377
2378 return true;
2379 }
2380
2381 static void
2382 ntt_emit_block(struct ntt_compile *c, nir_block *block)
2383 {
2384 nir_foreach_instr(instr, block) {
2385 ntt_emit_instr(c, instr);
2386
2387 nir_foreach_src(instr, ntt_src_live_interval_end_cb, c);
2388 }
2389
2390 /* Set up the if condition for ntt_emit_if(), which we have to do before
2391 * freeing up the temps (the "if" is treated as inside the block for liveness
2392 * purposes, despite not being an instruction)
2393 *
2394 * Note that, while IF and UIF are supposed to look at only .x, virglrenderer
2395 * looks at all of .xyzw. No harm in working around the bug.
2396 */
2397 nir_if *nif = nir_block_get_following_if(block);
2398 if (nif)
2399 c->if_cond = ureg_scalar(ntt_get_src(c, nif->condition), TGSI_SWIZZLE_X);
2400
2401 /* Free up any SSA temps that are unused at the end of the block. */
2402 unsigned index;
2403 BITSET_FOREACH_SET(index, block->live_out, BITSET_WORDS(c->impl->ssa_alloc)) {
2404 unsigned def_end_ip = c->liveness->defs[index].end;
2405 if (def_end_ip == block->end_ip)
2406 ntt_free_ssa_temp_by_index(c, index);
2407 }
2408 }
2409
2410 static void
2411 ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list)
2412 {
2413 /* There is no addr reg in use before we start emitting any part of a CF
2414 * node (such as an if condition)
2415 */
2416 c->next_addr_reg = 0;
2417
2418 foreach_list_typed(nir_cf_node, node, node, list) {
2419 switch (node->type) {
2420 case nir_cf_node_block:
2421 ntt_emit_block(c, nir_cf_node_as_block(node));
2422 break;
2423
2424 case nir_cf_node_if:
2425 ntt_emit_if(c, nir_cf_node_as_if(node));
2426 break;
2427
2428 case nir_cf_node_loop:
2429 ntt_emit_loop(c, nir_cf_node_as_loop(node));
2430 break;
2431
2432 default:
2433 unreachable("unknown CF type");
2434 }
2435 }
2436 }
2437
2438 static void
2439 ntt_emit_impl(struct ntt_compile *c, nir_function_impl *impl)
2440 {
2441 c->impl = impl;
2442 c->liveness = nir_live_ssa_defs_per_instr(impl);
2443
2444 c->ssa_temp = rzalloc_array(c, struct ureg_src, impl->ssa_alloc);
2445 c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->reg_alloc);
2446
2447 ntt_setup_registers(c, &impl->registers);
2448 ntt_emit_cf_list(c, &impl->body);
2449
2450 ralloc_free(c->liveness);
2451 c->liveness = NULL;
2452 }
2453
2454 static int
2455 type_size(const struct glsl_type *type, bool bindless)
2456 {
2457 return glsl_count_attribute_slots(type, false);
2458 }
2459
2460 /* Allow vectorizing of ALU instructions, but avoid vectorizing past what we
2461 * can handle for 64-bit values in TGSI.
2462 */
2463 static bool
2464 ntt_should_vectorize_instr(const nir_instr *instr, void *data)
2465 {
2466 if (instr->type != nir_instr_type_alu)
2467 return false;
2468
2469 nir_alu_instr *alu = nir_instr_as_alu(instr);
2470
2471 switch (alu->op) {
2472 case nir_op_ibitfield_extract:
2473 case nir_op_ubitfield_extract:
2474 case nir_op_bitfield_insert:
2475 /* virglrenderer only looks at the .x channel of the offset/bits operands
2476 * when translating to GLSL. tgsi.rst doesn't seem to require scalar
2477 * offset/bits operands.
2478 *
2479 * https://gitlab.freedesktop.org/virgl/virglrenderer/-/issues/195
2480 */
2481 return false;
2482
2483 default:
2484 break;
2485 }
2486
2487 unsigned num_components = alu->dest.dest.ssa.num_components;
2488
2489 int src_bit_size = nir_src_bit_size(alu->src[0].src);
2490 int dst_bit_size = nir_dest_bit_size(alu->dest.dest);
2491
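/* 64-bit ALU ops were scalarized for TGSI translation (see
 * scalarize_64bit()), so don't let the vectorizer rebuild wider 64-bit
 * vectors here.
 */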
2492 if (src_bit_size == 64 || dst_bit_size == 64) {
2493 if (num_components > 1)
2494 return false;
2495 }
2496
2497 return true;
2498 }
2499
2500 static bool
2501 ntt_should_vectorize_io(unsigned align, unsigned bit_size,
2502 unsigned num_components, unsigned high_offset,
2503 nir_intrinsic_instr *low, nir_intrinsic_instr *high,
2504 void *data)
2505 {
2506 if (bit_size != 32)
2507 return false;
2508
2509 /* Our offset alignment should always be at least 4 bytes */
2510 if (align < 4)
2511 return false;
2512
2513 /* No wrapping off the end of a TGSI reg. We could do a bit better by
2514 * looking at low's actual offset. XXX: With LOAD_CONSTBUF maybe we don't
2515 * need this restriction.
2516 */
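/* For example, following the check below: with align == 4 the load could
 * start at any component, so only a single component is guaranteed to fit;
 * with align == 8 it can start no later than .z, so up to two components
 * fit.
 */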
2517 unsigned worst_start_component = align == 4 ? 3 : align / 4;
2518 if (worst_start_component + num_components > 4)
2519 return false;
2520
2521 return true;
2522 }
2523
2524 static nir_variable_mode
2525 ntt_no_indirects_mask(nir_shader *s, struct pipe_screen *screen)
2526 {
2527 unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
2528 unsigned indirect_mask = 0;
2529
2530 if (!screen->get_shader_param(screen, pipe_stage,
2531 PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR)) {
2532 indirect_mask |= nir_var_shader_in;
2533 }
2534
2535 if (!screen->get_shader_param(screen, pipe_stage,
2536 PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR)) {
2537 indirect_mask |= nir_var_shader_out;
2538 }
2539
2540 if (!screen->get_shader_param(screen, pipe_stage,
2541 PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) {
2542 indirect_mask |= nir_var_function_temp;
2543 }
2544
2545 return indirect_mask;
2546 }
2547
2548 static void
2549 ntt_optimize_nir(struct nir_shader *s, struct pipe_screen *screen)
2550 {
2551 bool progress;
2552 unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
2553 unsigned control_flow_depth =
2554 screen->get_shader_param(screen, pipe_stage,
2555 PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
2556 do {
2557 progress = false;
2558
2559 NIR_PASS_V(s, nir_lower_vars_to_ssa);
2560
2561 NIR_PASS(progress, s, nir_copy_prop);
2562 NIR_PASS(progress, s, nir_opt_algebraic);
2563 NIR_PASS(progress, s, nir_opt_constant_folding);
2564 NIR_PASS(progress, s, nir_opt_remove_phis);
2565 NIR_PASS(progress, s, nir_opt_conditional_discard);
2566 NIR_PASS(progress, s, nir_opt_dce);
2567 NIR_PASS(progress, s, nir_opt_dead_cf);
2568 NIR_PASS(progress, s, nir_opt_cse);
2569 NIR_PASS(progress, s, nir_opt_find_array_copies);
2570 NIR_PASS(progress, s, nir_opt_if, true);
2571 NIR_PASS(progress, s, nir_opt_peephole_select,
2572 control_flow_depth == 0 ? ~0 : 8, true, true);
2573 NIR_PASS(progress, s, nir_opt_algebraic);
2574 NIR_PASS(progress, s, nir_opt_constant_folding);
2575 nir_load_store_vectorize_options vectorize_opts = {
2576 .modes = nir_var_mem_ubo,
2577 .callback = ntt_should_vectorize_io,
2578 .robust_modes = 0,
2579 };
2580 NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts);
2581 NIR_PASS(progress, s, nir_opt_shrink_vectors, true);
2582 NIR_PASS(progress, s, nir_opt_trivial_continues);
2583 NIR_PASS(progress, s, nir_opt_vectorize, ntt_should_vectorize_instr, NULL);
2584 NIR_PASS(progress, s, nir_opt_undef);
2585 NIR_PASS(progress, s, nir_opt_loop_unroll);
2586
2587 } while (progress);
2588 }
2589
2590 /* Scalarizes all 64-bit ALU ops. Note that we only actually need to
2591 * scalarize vec3/vec4s; that should probably be fixed.
2592 */
2593 static bool
2594 scalarize_64bit(const nir_instr *instr, const void *data)
2595 {
2596 const nir_alu_instr *alu = nir_instr_as_alu(instr);
2597
2598 return (nir_dest_bit_size(alu->dest.dest) == 64 ||
2599 nir_src_bit_size(alu->src[0].src) == 64);
2600 }
2601
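/* Splits 64-bit load/store intrinsics wider than a vec2 into a vec2 half
 * plus a remainder, since we only expose 64-bit values as vec2s to TGSI.
 * As a rough sketch of what this pass does (not exact NIR syntax):
 *
 *    vec4 64-bit load_ubo(buf, offset)
 *
 * becomes
 *
 *    a = vec2 64-bit load_ubo(buf, offset)
 *    b = vec2 64-bit load_ubo(buf, offset + 16)
 *    result = vec4(a.x, a.y, b.x, b.y)
 *
 * with io_semantics/base adjusted instead of the byte offset for shader
 * input/output intrinsics.
 */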
2602 static bool
2603 nir_to_tgsi_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr)
2604 {
2605 b->cursor = nir_after_instr(&instr->instr);
2606
2607 switch (instr->intrinsic) {
2608 case nir_intrinsic_load_ubo:
2609 case nir_intrinsic_load_ubo_vec4:
2610 case nir_intrinsic_load_ssbo:
2611 case nir_intrinsic_load_input:
2612 case nir_intrinsic_load_interpolated_input:
2613 case nir_intrinsic_load_per_vertex_input:
2614 case nir_intrinsic_store_output:
2615 case nir_intrinsic_store_ssbo:
2616 break;
2617 default:
2618 return false;
2619 }
2620
2621 if (instr->num_components <= 2)
2622 return false;
2623
2624 bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
2625 if (has_dest) {
2626 if (nir_dest_bit_size(instr->dest) != 64)
2627 return false;
2628 } else {
2629 if (nir_src_bit_size(instr->src[0]) != 64)
2630 return false;
2631 }
2632
2633 nir_intrinsic_instr *first =
2634 nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
2635 nir_intrinsic_instr *second =
2636 nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
2637
2638 switch (instr->intrinsic) {
2639 case nir_intrinsic_load_ubo:
2640 case nir_intrinsic_load_ubo_vec4:
2641 case nir_intrinsic_load_ssbo:
2642 case nir_intrinsic_store_ssbo:
2643 break;
2644
2645 default: {
2646 nir_io_semantics semantics = nir_intrinsic_io_semantics(second);
2647 semantics.location++;
2648 semantics.num_slots--;
2649 nir_intrinsic_set_io_semantics(second, semantics);
2650
2651 nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
2652 break;
2653 }
2654 }
2655
2656 first->num_components = 2;
2657 second->num_components -= 2;
2658 if (has_dest) {
2659 first->dest.ssa.num_components = 2;
2660 second->dest.ssa.num_components -= 2;
2661 }
2662
2663 nir_builder_instr_insert(b, &first->instr);
2664 nir_builder_instr_insert(b, &second->instr);
2665
2666 if (has_dest) {
2667 /* Merge the two loads' results back into a vector. */
2668 nir_ssa_def *channels[4] = {
2669 nir_channel(b, &first->dest.ssa, 0),
2670 nir_channel(b, &first->dest.ssa, 1),
2671 nir_channel(b, &second->dest.ssa, 0),
2672 second->num_components > 1 ? nir_channel(b, &second->dest.ssa, 1) : NULL,
2673 };
2674 nir_ssa_def *new = nir_vec(b, channels, instr->num_components);
2675 nir_ssa_def_rewrite_uses(&instr->dest.ssa, new);
2676 } else {
2677 /* Split the src value across the two stores. */
2678 b->cursor = nir_before_instr(&instr->instr);
2679
2680 nir_ssa_def *src0 = instr->src[0].ssa;
2681 nir_ssa_def *channels[4] = { 0 };
2682 for (int i = 0; i < instr->num_components; i++)
2683 channels[i] = nir_channel(b, src0, i);
2684
2685 nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3);
2686 nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2);
2687
2688 nir_instr_rewrite_src(&first->instr, &first->src[0],
2689 nir_src_for_ssa(nir_vec(b, channels, 2)));
2690 nir_instr_rewrite_src(&second->instr, &second->src[0],
2691 nir_src_for_ssa(nir_vec(b, &channels[2],
2692 second->num_components)));
2693 }
2694
2695 int offset_src = -1;
2696 uint32_t offset_amount = 16;
2697
2698 switch (instr->intrinsic) {
2699 case nir_intrinsic_load_ssbo:
2700 case nir_intrinsic_load_ubo:
2701 offset_src = 1;
2702 break;
2703 case nir_intrinsic_load_ubo_vec4:
2704 offset_src = 1;
2705 offset_amount = 1;
2706 break;
2707 case nir_intrinsic_store_ssbo:
2708 offset_src = 2;
2709 break;
2710 default:
2711 break;
2712 }
2713 if (offset_src != -1) {
2714 b->cursor = nir_before_instr(&second->instr);
2715 nir_ssa_def *second_offset =
2716 nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount);
2717 nir_instr_rewrite_src(&second->instr, &second->src[offset_src],
2718 nir_src_for_ssa(second_offset));
2719 }
2720
2721 /* DCE stores we generated with no writemask (nothing else does this
2722 * currently).
2723 */
2724 if (!has_dest) {
2725 if (nir_intrinsic_write_mask(first) == 0)
2726 nir_instr_remove(&first->instr);
2727 if (nir_intrinsic_write_mask(second) == 0)
2728 nir_instr_remove(&second->instr);
2729 }
2730
2731 nir_instr_remove(&instr->instr);
2732
2733 return true;
2734 }
2735
2736 static bool
2737 nir_to_tgsi_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr)
2738 {
2739 int num_components = instr->def.num_components;
2740
2741 if (instr->def.bit_size != 64 || num_components <= 2)
2742 return false;
2743
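/* Same vec2 split as the intrinsic path above: a 64-bit vec3/vec4 constant
 * becomes a vec2 constant plus a vec1/vec2 constant, re-vectored below.
 */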
2744 b->cursor = nir_before_instr(&instr->instr);
2745
2746 nir_load_const_instr *first =
2747 nir_load_const_instr_create(b->shader, 2, 64);
2748 nir_load_const_instr *second =
2749 nir_load_const_instr_create(b->shader, num_components - 2, 64);
2750
2751 first->value[0] = instr->value[0];
2752 first->value[1] = instr->value[1];
2753 second->value[0] = instr->value[2];
2754 if (num_components == 4)
2755 second->value[1] = instr->value[3];
2756
2757 nir_builder_instr_insert(b, &first->instr);
2758 nir_builder_instr_insert(b, &second->instr);
2759
2760 nir_ssa_def *channels[4] = {
2761 nir_channel(b, &first->def, 0),
2762 nir_channel(b, &first->def, 1),
2763 nir_channel(b, &second->def, 0),
2764 num_components == 4 ? nir_channel(b, &second->def, 1) : NULL,
2765 };
2766 nir_ssa_def *new = nir_vec(b, channels, num_components);
2767 nir_ssa_def_rewrite_uses(&instr->def, new);
2768 nir_instr_remove(&instr->instr);
2769
2770 return true;
2771 }
2772
2773 static bool
2774 nir_to_tgsi_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr,
2775 void *data)
2776 {
2777 switch (instr->type) {
2778 case nir_instr_type_load_const:
2779 return nir_to_tgsi_lower_64bit_load_const(b, nir_instr_as_load_const(instr));
2780
2781 case nir_instr_type_intrinsic:
2782 return nir_to_tgsi_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr));
2783 default:
2784 return false;
2785 }
2786 }
2787
2788 static bool
2789 nir_to_tgsi_lower_64bit_to_vec2(nir_shader *s)
2790 {
2791 return nir_shader_instructions_pass(s,
2792 nir_to_tgsi_lower_64bit_to_vec2_instr,
2793 nir_metadata_block_index |
2794 nir_metadata_dominance,
2795 NULL);
2796 }
2797
2798 struct ntt_lower_tex_state {
2799 nir_ssa_def *channels[8];
2800 unsigned i;
2801 };
2802
2803 static void
2804 nir_to_tgsi_lower_tex_instr_arg(nir_builder *b,
2805 nir_tex_instr *instr,
2806 nir_tex_src_type tex_src_type,
2807 struct ntt_lower_tex_state *s)
2808 {
2809 int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
2810 if (tex_src < 0)
2811 return;
2812
2813 assert(instr->src[tex_src].src.is_ssa);
2814
2815 nir_ssa_def *def = instr->src[tex_src].src.ssa;
2816 for (int i = 0; i < def->num_components; i++) {
2817 s->channels[s->i++] = nir_channel(b, def, i);
2818 }
2819
2820 nir_tex_instr_remove_src(instr, tex_src);
2821 }
2822
2823 /**
2824 * Merges together a vec4 of tex coordinate/compare/bias/lod into a backend tex
2825 * src. This lets NIR handle the coalescing of the vec4 rather than trying to
2826 * manage it on our own, and may lead to more vectorization.
2827 */
2828 static bool
2829 nir_to_tgsi_lower_tex_instr(nir_builder *b, nir_instr *instr, void *data)
2830 {
2831 if (instr->type != nir_instr_type_tex)
2832 return false;
2833
2834 nir_tex_instr *tex = nir_instr_as_tex(instr);
2835
2836 if (nir_tex_instr_src_index(tex, nir_tex_src_coord) < 0)
2837 return false;
2838
2839 /* NIR after lower_tex will have LOD set to 0 for tex ops that wanted
2840 * implicit lod in shader stages that don't have quad-based derivatives.
2841 * TGSI doesn't want that; it requires that the backend do implicit LOD 0 for
2842 * those stages.
2843 */
2844 if (!nir_shader_supports_implicit_lod(b->shader) && tex->op == nir_texop_txl) {
2845 int lod_index = nir_tex_instr_src_index(tex, nir_tex_src_lod);
2846 nir_src *lod_src = &tex->src[lod_index].src;
2847 if (nir_src_is_const(*lod_src) && nir_src_as_uint(*lod_src) == 0) {
2848 nir_tex_instr_remove_src(tex, lod_index);
2849 tex->op = nir_texop_tex;
2850 }
2851 }
2852
2853 b->cursor = nir_before_instr(instr);
2854
2855 struct ntt_lower_tex_state s = {0};
2856
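/* Pack the operands in the order the TGSI tex opcodes expect: the
 * coordinate first (padded to at least two channels), then the shadow
 * comparator, bias, LOD, projector, and MSAA sample index; anything past
 * four channels spills from backend1 into backend2.  (This just summarizes
 * the calls below.)
 */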
2857 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_coord, &s);
2858 /* We always have at least two slots for the coordinate, even on 1D. */
2859 s.i = MAX2(s.i, 2);
2860
2861 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_comparator, &s);
2862 s.i = MAX2(s.i, 3);
2863
2864 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_bias, &s);
2865
2866 /* XXX: LZ */
2867 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_lod, &s);
2868 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_projector, &s);
2869 nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_ms_index, &s);
2870
2871 /* No need to pack undefs in unused channels of the tex instr */
2872 while (!s.channels[s.i - 1])
2873 s.i--;
2874
2875 /* Instead of putting undefs in the unused slots of the vecs, just put in
2876 * another used channel. Otherwise, we'll get unnecessary moves into
2877 * registers.
2878 */
2879 assert(s.channels[0] != NULL);
2880 for (int i = 1; i < s.i; i++) {
2881 if (!s.channels[i])
2882 s.channels[i] = s.channels[0];
2883 }
2884
2885 nir_tex_instr_add_src(tex, nir_tex_src_backend1, nir_src_for_ssa(nir_vec(b, s.channels, MIN2(s.i, 4))));
2886 if (s.i > 4)
2887 nir_tex_instr_add_src(tex, nir_tex_src_backend2, nir_src_for_ssa(nir_vec(b, &s.channels[4], s.i - 4)));
2888
2889 return true;
2890 }
2891
2892 static bool
2893 nir_to_tgsi_lower_tex(nir_shader *s)
2894 {
2895 return nir_shader_instructions_pass(s,
2896 nir_to_tgsi_lower_tex_instr,
2897 nir_metadata_block_index |
2898 nir_metadata_dominance,
2899 NULL);
2900 }
2901
2902 static void
2903 ntt_fix_nir_options(struct pipe_screen *screen, struct nir_shader *s)
2904 {
2905 const struct nir_shader_compiler_options *options = s->options;
2906 bool lower_fsqrt =
2907 !screen->get_shader_param(screen, pipe_shader_type_from_mesa(s->info.stage),
2908 PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED);
2909
2910 nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
2911
2912 if (!options->lower_extract_byte ||
2913 !options->lower_extract_word ||
2914 !options->lower_insert_byte ||
2915 !options->lower_insert_word ||
2916 !options->lower_fdph ||
2917 !options->lower_flrp64 ||
2918 !options->lower_fmod ||
2919 !options->lower_rotate ||
2920 !options->lower_uniforms_to_ubo ||
2921 !options->lower_vector_cmp ||
2922 options->lower_fsqrt != lower_fsqrt ||
2923 options->force_indirect_unrolling != no_indirects_mask) {
2924 nir_shader_compiler_options *new_options = ralloc(s, nir_shader_compiler_options);
2925 *new_options = *s->options;
2926
2927 new_options->lower_extract_byte = true;
2928 new_options->lower_extract_word = true;
2929 new_options->lower_insert_byte = true;
2930 new_options->lower_insert_word = true;
2931 new_options->lower_fdph = true;
2932 new_options->lower_flrp64 = true;
2933 new_options->lower_fmod = true;
2934 new_options->lower_rotate = true;
2935 new_options->lower_uniforms_to_ubo = true;
2936 new_options->lower_vector_cmp = true;
2937 new_options->lower_fsqrt = lower_fsqrt;
2938 new_options->force_indirect_unrolling = no_indirects_mask;
2939
2940 s->options = new_options;
2941 }
2942 }
2943
2944 static bool
2945 ntt_lower_atomic_pre_dec_filter(const nir_instr *instr, const void *_data)
2946 {
2947 return (instr->type == nir_instr_type_intrinsic &&
2948 nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_atomic_counter_pre_dec);
2949 }
2950
2951 static nir_ssa_def *
2952 ntt_lower_atomic_pre_dec_lower(nir_builder *b, nir_instr *instr, void *_data)
2953 {
2954 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
2955
2956 nir_ssa_def *old_result = &intr->dest.ssa;
2957 intr->intrinsic = nir_intrinsic_atomic_counter_post_dec;
2958
2959 return nir_iadd_imm(b, old_result, -1);
2960 }
2961
2962 static bool
2963 ntt_lower_atomic_pre_dec(nir_shader *s)
2964 {
2965 return nir_shader_lower_instructions(s,
2966 ntt_lower_atomic_pre_dec_filter,
2967 ntt_lower_atomic_pre_dec_lower, NULL);
2968 }
2969
2970 /* Lowers texture projectors if we can't do them as TGSI_OPCODE_TXP. */
2971 static void
2972 nir_to_tgsi_lower_txp(nir_shader *s)
2973 {
2974 nir_lower_tex_options lower_tex_options = {
2975 .lower_txp = 0,
2976 };
2977
2978 nir_foreach_block(block, nir_shader_get_entrypoint(s)) {
2979 nir_foreach_instr(instr, block) {
2980 if (instr->type != nir_instr_type_tex)
2981 continue;
2982 nir_tex_instr *tex = nir_instr_as_tex(instr);
2983
2984 if (nir_tex_instr_src_index(tex, nir_tex_src_projector) < 0)
2985 continue;
2986
2987 bool has_compare = nir_tex_instr_src_index(tex, nir_tex_src_comparator) >= 0;
2988 bool has_lod = nir_tex_instr_src_index(tex, nir_tex_src_lod) >= 0 || s->info.stage != MESA_SHADER_FRAGMENT;
2989 bool has_offset = nir_tex_instr_src_index(tex, nir_tex_src_offset) >= 0;
2990
2991 /* We can do TXP for any tex (not tg4) where we can fit all the
2992 * coordinates and comparator and projector in one vec4 without any
2993 * other modifiers to add on.
2994 *
2995 * nir_lower_tex() only handles the lowering on a sampler-dim basis, so
2996 * if we get any funny projectors then we just blow them all away.
2997 */
2998 if (tex->op != nir_texop_tex || has_lod || has_offset || (tex->coord_components >= 3 && has_compare))
2999 lower_tex_options.lower_txp |= 1 << tex->sampler_dim;
3000 }
3001 }
3002
3003 /* nir_lower_tex must be run even if no options are set, because we need the
3004 * LOD to be set for query_levels and for non-fragment shaders.
3005 */
3006 NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);
3007 }
3008
3009 static bool
3010 nir_lower_primid_sysval_to_input_filter(const nir_instr *instr, const void *_data)
3011 {
3012 return (instr->type == nir_instr_type_intrinsic &&
3013 nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_primitive_id);
3014 }
3015
3016 static nir_ssa_def *
3017 nir_lower_primid_sysval_to_input_lower(nir_builder *b, nir_instr *instr, void *data)
3018 {
3019 nir_variable *var = *(nir_variable **)data;
3020 if (!var) {
3021 var = nir_variable_create(b->shader, nir_var_shader_in, glsl_uint_type(), "gl_PrimitiveID");
3022 var->data.location = VARYING_SLOT_PRIMITIVE_ID;
3023 b->shader->info.inputs_read |= VARYING_BIT_PRIMITIVE_ID;
3024 var->data.driver_location = b->shader->num_outputs++;
3025
3026 *(nir_variable **)data = var;
3027 }
3028
3029 nir_io_semantics semantics = {
3030 .location = var->data.location,
3031 .num_slots = 1
3032 };
3033 return nir_load_input(b, 1, 32, nir_imm_int(b, 0),
3034 .base = var->data.driver_location,
3035 .io_semantics = semantics);
3036 }
3037
3038 static bool
3039 nir_lower_primid_sysval_to_input(nir_shader *s)
3040 {
3041 nir_variable *input = NULL;
3042
3043 return nir_shader_lower_instructions(s,
3044 nir_lower_primid_sysval_to_input_filter,
3045 nir_lower_primid_sysval_to_input_lower, &input);
3046 }
3047
3048 /**
3049 * Translates the NIR shader to TGSI.
3050 *
3051 * This requires some lowering of the NIR shader to prepare it for translation.
3052 * We take ownership of the NIR shader passed, returning a reference to the new
3053 * TGSI tokens instead. If you need to keep the NIR, then pass us a clone.
3054 */
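/* A minimal usage sketch (names like "my_nir" and "my_screen" are
 * placeholders, not Mesa symbols):
 *
 *    const void *tokens = nir_to_tgsi(my_nir, my_screen);
 *    ...build the driver's shader state from the TGSI tokens...
 *
 * The NIR shader has been freed by the call, so it must not be used
 * afterwards.
 */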
3055 const void *
3056 nir_to_tgsi(struct nir_shader *s,
3057 struct pipe_screen *screen)
3058 {
3059 struct ntt_compile *c;
3060 const void *tgsi_tokens;
3061 bool debug = env_var_as_boolean("NIR_TO_TGSI_DEBUG", false);
3062 nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
3063 bool native_integers = screen->get_shader_param(screen,
3064 pipe_shader_type_from_mesa(s->info.stage),
3065 PIPE_SHADER_CAP_INTEGERS);
3066 const struct nir_shader_compiler_options *original_options = s->options;
3067
3068 ntt_fix_nir_options(screen, s);
3069
3070 NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
3071 type_size, (nir_lower_io_options)0);
3072 NIR_PASS_V(s, nir_lower_regs_to_ssa);
3073
3074 nir_to_tgsi_lower_txp(s);
3075 NIR_PASS_V(s, nir_to_tgsi_lower_tex);
3076
3077 /* While TGSI can represent PRIMID as either an input or a system value,
3078 * glsl-to-tgsi had the GS (not TCS or TES) primid as an input, and drivers
3079 * depend on that.
3080 */
3081 if (s->info.stage == MESA_SHADER_GEOMETRY)
3082 NIR_PASS_V(s, nir_lower_primid_sysval_to_input);
3083
3084 if (s->info.num_abos)
3085 NIR_PASS_V(s, ntt_lower_atomic_pre_dec);
3086
3087 if (!original_options->lower_uniforms_to_ubo) {
3088 NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
3089 screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS),
3090 !native_integers);
3091 }
3092
3093 /* Do lowering so we can directly translate f64/i64 NIR ALU ops to TGSI --
3094 * TGSI stores up to a vec2 in each slot, so to avoid a whole bunch of op
3095 * duplication logic we just make it so that we only see vec2s.
3096 */
3097 NIR_PASS_V(s, nir_lower_alu_to_scalar, scalarize_64bit, NULL);
3098 NIR_PASS_V(s, nir_to_tgsi_lower_64bit_to_vec2);
3099
3100 if (!screen->get_param(screen, PIPE_CAP_LOAD_CONSTBUF))
3101 NIR_PASS_V(s, nir_lower_ubo_vec4);
3102
3103 ntt_optimize_nir(s, screen);
3104
3105 NIR_PASS_V(s, nir_lower_indirect_derefs, no_indirects_mask, UINT32_MAX);
3106
3107 bool progress;
3108 do {
3109 progress = false;
3110 NIR_PASS(progress, s, nir_opt_algebraic_late);
3111 if (progress) {
3112 NIR_PASS_V(s, nir_copy_prop);
3113 NIR_PASS_V(s, nir_opt_dce);
3114 NIR_PASS_V(s, nir_opt_cse);
3115 }
3116 } while (progress);
3117
3118 if (screen->get_shader_param(screen,
3119 pipe_shader_type_from_mesa(s->info.stage),
3120 PIPE_SHADER_CAP_INTEGERS)) {
3121 NIR_PASS_V(s, nir_lower_bool_to_int32);
3122 } else {
3123 NIR_PASS_V(s, nir_lower_int_to_float);
3124 NIR_PASS_V(s, nir_lower_bool_to_float);
3125 /* bool_to_float generates MOVs for b2f32 that we want to clean up. */
3126 NIR_PASS_V(s, nir_copy_prop);
3127 NIR_PASS_V(s, nir_opt_dce);
3128 }
3129
3130 /* Only lower 32-bit floats. The only other modifier type officially
3131 * supported by TGSI is 32-bit integer negates, but even those are broken on
3132 * virglrenderer, so skip lowering all integer and f64 float mods.
3133 */
3134 NIR_PASS_V(s, nir_lower_to_source_mods, nir_lower_float_source_mods);
3135 NIR_PASS_V(s, nir_convert_from_ssa, true);
3136 NIR_PASS_V(s, nir_lower_vec_to_movs, NULL, NULL);
3137
3138 /* locals_to_regs will leave dead derefs that are good to clean up. */
3139 NIR_PASS_V(s, nir_lower_locals_to_regs);
3140 NIR_PASS_V(s, nir_opt_dce);
3141
3142 if (debug) {
3143 fprintf(stderr, "NIR before translation to TGSI:\n");
3144 nir_print_shader(s, stderr);
3145 }
3146
3147 c = rzalloc(NULL, struct ntt_compile);
3148 c->screen = screen;
3149
3150 c->needs_texcoord_semantic =
3151 screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD);
3152 c->any_reg_as_address =
3153 screen->get_param(screen, PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS);
3154 c->has_txf_lz =
3155 screen->get_param(screen, PIPE_CAP_TGSI_TEX_TXF_LZ);
3156
3157 c->s = s;
3158 c->native_integers = native_integers;
3159 c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage));
3160 ureg_setup_shader_info(c->ureg, &s->info);
3161
3162 ntt_setup_inputs(c);
3163 ntt_setup_outputs(c);
3164 ntt_setup_uniforms(c);
3165
3166 if (s->info.stage == MESA_SHADER_FRAGMENT) {
3167 /* The draw module's polygon stipple layer doesn't respect the chosen
3168 * coordinate mode, so leave it as unspecified unless we're actually
3169 * reading the position in the shader already. See
3170 * gl-2.1-polygon-stipple-fs on softpipe.
3171 */
3172 if ((s->info.inputs_read & VARYING_BIT_POS) ||
3173 BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
3174 ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
3175 s->info.fs.origin_upper_left ?
3176 TGSI_FS_COORD_ORIGIN_UPPER_LEFT :
3177 TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
3178
3179 ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
3180 s->info.fs.pixel_center_integer ?
3181 TGSI_FS_COORD_PIXEL_CENTER_INTEGER :
3182 TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER);
3183 }
3184 }
3185 /* Emit the main function */
3186 nir_function_impl *impl = nir_shader_get_entrypoint(c->s);
3187 ntt_emit_impl(c, impl);
3188 ureg_END(c->ureg);
3189
3190 tgsi_tokens = ureg_get_tokens(c->ureg, NULL);
3191
3192 if (debug) {
3193 fprintf(stderr, "TGSI after translation from NIR:\n");
3194 tgsi_dump(tgsi_tokens, 0);
3195 }
3196
3197 ureg_destroy(c->ureg);
3198
3199 ralloc_free(c);
3200 ralloc_free(s);
3201
3202 return tgsi_tokens;
3203 }
3204
3205 static const nir_shader_compiler_options nir_to_tgsi_compiler_options = {
3206 .fuse_ffma32 = true,
3207 .fuse_ffma64 = true,
3208 .lower_extract_byte = true,
3209 .lower_extract_word = true,
3210 .lower_insert_byte = true,
3211 .lower_insert_word = true,
3212 .lower_fdph = true,
3213 .lower_flrp64 = true,
3214 .lower_fmod = true,
3215 .lower_rotate = true,
3216 .lower_uniforms_to_ubo = true,
3217 .lower_vector_cmp = true,
3218 .use_interpolated_input_intrinsics = true,
3219 };
3220
3221 /* Returns a default compiler options for drivers with only nir-to-tgsi-based
3222 * NIR support.
3223 */
3224 const void *
3225 nir_to_tgsi_get_compiler_options(struct pipe_screen *pscreen,
3226 enum pipe_shader_ir ir,
3227 unsigned shader)
3228 {
3229 assert(ir == PIPE_SHADER_IR_NIR);
3230 return &nir_to_tgsi_compiler_options;
3231 }
3232